22from PyQt5 .QtGui import QIcon
33from datetime import datetime
44from requests_html import HTMLSession
5- from requests .exceptions import ConnectionError
5+ import requests .exceptions
66import os
77import sys
88
@@ -13,48 +13,63 @@ def __init__(self):
1313 uic .loadUi (self .scriptDir + os .path .sep + 'WebScrapper.ui' ,self )
1414 self .setWindowIcon (QIcon (self .scriptDir + os .path .sep + 'icon.png' ))
1515 self .setWindowTitle ('WebScrapper by [@Nishant Ghanate]' )
16- self .buttonLoadUrl .clicked .connect (self .getUrl )
17- self .buttonGetSource .clicked .connect (self .getSource )
18- self .buttonGetSource .setEnabled (False )
16+ self .buttonGetInput .clicked .connect (self .getInput )
1917 self .buttonSaveLogs .clicked .connect (self .saveLogs )
2018 self .buttonClearLogs .clicked .connect (self .clearLogs )
19+ self .oldUrl = "_"
2120
2221 QtCore .pyqtSlot ()
def getTimeStamp(self):
    """Return the current local time as a human-readable log timestamp.

    The layout is ``<weekday>, <day>. <month> <year> <hh:mm:ss> <AM/PM>``
    using a 12-hour clock (weekday/month names follow the active locale).
    """
    stamp_format = "%A, %d. %B %Y %I:%M:%S %p"
    current_time = datetime.now()
    return current_time.strftime(stamp_format)
2324
def validateUrl(self, url):
    """Fetch ``url`` into ``self.r`` unless it is the URL fetched last time.

    Args:
        url: Address of the page to download.

    Returns:
        True when a new page was downloaded and stored in ``self.r``;
        False when ``url`` equals the previously fetched URL or when the
        request failed (the error text is appended to the log list).
    """
    # Skip the page we already hold. getSource relies on this to stop
    # paginating when the "next" link points back at the current page.
    if self.oldUrl == url:
        return False

    try:
        response = HTMLSession().get(url)
    except requests.exceptions.RequestException as e:
        self.listWidgetLogs.addItem(str(e))
        return False

    # Commit state only after a successful fetch, so a URL whose request
    # failed is not remembered as "already fetched" and can be retried.
    self.r = response
    self.oldUrl = url
    return True
3938
4039
def getSource(self):
    """Log every XPath match on the current page, then follow pagination.

    Evaluates ``self.xpathSrc`` against ``self.r.html`` and appends each
    match (as text) to the log list. While the page advertises a next
    page, that page is fetched through ``validateUrl`` and processed the
    same way. Processing stops when the XPath matches nothing, there is
    no next page, ``validateUrl`` declines the URL, or the response
    status is not 200.

    NOTE(review): the pagination step is assumed to run only when the
    XPath produced matches (nested under the match check) -- confirm
    against the original indentation.
    """
    # Iterate instead of recursing once per page, so a long chain of
    # pages cannot exhaust the interpreter's recursion limit.
    while True:
        matches = self.r.html.xpath(self.xpathSrc)
        if not matches:
            return

        for match in matches:
            print(match)
            self.listWidgetLogs.addItem(str(match))

        # Look up the next-page link once per page and reuse the result.
        next_url = self.r.html._next()
        if not next_url:
            return
        print(next_url)

        if not self.validateUrl(next_url):
            return
        if self.r.status_code != 200:
            return
55+
56+
57+
def getInput(self):
    """Read the URL and XPath fields, validate them, and start scraping.

    Stores the XPath text in ``self.xpathSrc``. When the URL is shorter
    than five characters a timestamp and an error line are logged; when
    the XPath is blank a hint is logged. Otherwise the URL is fetched via
    ``validateUrl`` and, if that succeeds, ``getSource`` is run.
    """
    # Trim stray whitespace/newlines typed or pasted around the URL.
    url = (self.textEditUrl.toPlainText() or "").strip()
    self.xpathSrc = self.textEditSource.toPlainText()

    if len(url) < 5:
        self.listWidgetLogs.addItem(self.getTimeStamp())
        self.listWidgetLogs.addItem('Url cannot be empty ¯\\_(ツ)_/¯ \n ')
        return

    # An empty field yields an empty string rather than None, so test for
    # blank text; otherwise an empty XPath would be passed to getSource.
    if not (self.xpathSrc or "").strip():
        self.listWidgetLogs.addItem('xpath input \\_(ʘ_ʘ)_/ ? ')
        return

    if self.validateUrl(url):
        self.getSource()
71+
5672
57-
5873 def saveLogs (self ):
5974 timestampDay = datetime .now ().strftime ("%A, %d. %B %Y %I:%M:%S %p" )
6075 file = open (self .scriptDir + os .path .sep + 'WebScrappingLogs_' + timestampDay + '.txt' ,'a+' )
@@ -76,4 +91,16 @@ def clearLogs(self):
7691 sys .exit (app .exec_ ())
7792
7893# https://www.reddit.com/r/ProgrammerHumor/
79- # r = r.html.xpath('//*[@class="y8HYJ-y_lTUHkQIc1mdCq"]//h2//text()')
94+ # r = r.html.xpath('//*[@class="y8HYJ-y_lTUHkQIc1mdCq"]//h2//text()')
95+
96+
97+ # if self.r.html._next():
98+ # url = validateUrl(self.r.html._next())
99+ # if url:
100+ # if self.r.status_code == 200:
101+ # self.getSource()
102+ # else:
103+ # self.listWidgetLogs.addItem('Sorry we are unable to get next page')
104+
105+ # else:
106+ # self.listWidgetLogs.addItem('Could not find given xpath, (•◡•) please try a different one')
0 commit comments