66import os
77import sys
88
9+ #Init PythonUI
910class Web (QtWidgets .QMainWindow ):
1011 def __init__ (self ):
1112 super (Web ,self ).__init__ ()
@@ -16,36 +17,48 @@ def __init__(self):
1617 self .buttonGetInput .clicked .connect (self .getInput )
1718 self .buttonSaveLogs .clicked .connect (self .saveLogs )
1819 self .buttonClearLogs .clicked .connect (self .clearLogs )
20+ # self.buttonClear.clicked.connect()
21+ # self.buttonStop.clicked.connect()
22+
1923 self .oldUrl = "_"
20-
24+
# Activate UI functions
2126 QtCore .pyqtSlot ()
27+
# Returns the current time formatted like "Monday, 01. October 2018 09:30:00 PM"
def getTimeStamp(self):
    """Return the current time as a human-readable string.

    The layout is weekday, day, month name, year and a 12-hour clock,
    e.g. ``Monday, 01. October 2018 09:30:00 PM``.
    """
    stamp_layout = "%A, %d. %B %Y %I:%M:%S %p"
    current_moment = datetime.now()
    return current_moment.strftime(stamp_layout)
2431
# Validate the input URL; request errors are logged and reported as False
def validateUrl(self, url):
    """Make sure ``self.r`` holds a response for *url*, fetching it if needed.

    A new ``HTMLSession`` request is issued only when *url* differs from the
    last successfully fetched URL (``self.oldUrl``) or when no response has
    been stored yet. When the URL is unchanged the stored response is reused,
    so that changing only the XPath keeps scraping the same page.

    Args:
        url: Address of the page to fetch.

    Returns:
        True when a response for *url* is available in ``self.r``;
        False when the request raised ``requests.exceptions.RequestException``
        (the error text is appended to ``self.listWidgetLogs``).
    """
    # Cache hit requires an actual stored response: the initial placeholder
    # value of ``oldUrl`` must never be mistaken for a fetched page.
    if url == self.oldUrl and getattr(self, "r", None) is not None:
        return True

    try:
        session = HTMLSession()
        response = session.get(url)
    except requests.exceptions.RequestException as e:
        self.listWidgetLogs.addItem(str(e))
        return False

    # Remember the URL only after the fetch succeeded; otherwise a retry of
    # a failed URL would be treated as "unchanged" and reuse a stale response.
    self.r = response
    self.oldUrl = url
    return True
3848
39-
40- def getSource (self ):
# Scrape the current page with the given XPath; recurses when a next page is found
50+ def getSource (self ):
51+ # returns list for single Xpath item
4152 src = self .r .html .xpath (self .xpathSrc )
42-
53+ # proceed further if list is not empty
4354 if src :
4455 timeStamp = self .getTimeStamp ()
4556 for s in src :
4657 print (s )
4758 s = str (s )
59+ # List widget only supports string
4860 self .listWidgetLogs .addItem (s )
61+
4962 if self .r .html ._next ():
5063 print (self .r .html ._next ())
5164 url = self .validateUrl (self .r .html ._next ())
@@ -54,22 +67,23 @@ def getSource(self):
5467 self .getSource ()
5568
5669
57-
# Handler connected to the Get button's clicked signal
def getInput(self):
    """Read the URL and XPath fields and start scraping when both are usable.

    The URL text is taken from ``self.textEditUrl`` and the XPath from
    ``self.textEditSource`` (stored in ``self.xpathSrc``). Problems are
    reported by appending a message to ``self.listWidgetLogs``:

    * a URL shorter than 5 characters logs a timestamp and the URL message;
    * an empty XPath logs the XPath message.

    Only when both inputs pass is ``validateUrl`` called, followed by
    ``getSource`` if validation succeeds.
    """
    # Qt's toPlainText() returns a string, so an "is None" test can never
    # reject an empty field; normalise to a stripped string and test emptiness.
    url = (self.textEditUrl.toPlainText() or "").strip()
    self.xpathSrc = (self.textEditSource.toPlainText() or "").strip()

    if len(url) < 5:
        self.listWidgetLogs.addItem(self.getTimeStamp())
        # Backslash is escaped explicitly; the displayed text is unchanged.
        self.listWidgetLogs.addItem('Url cannot be empty ¯\\_(ツ)_/¯ \n ')
        return

    if not self.xpathSrc:
        self.listWidgetLogs.addItem('xpath input \\_(ʘ_ʘ)_/ ? ')
        return

    if self.validateUrl(url):
        self.getSource()
71-
72-
85+
86+ # Saves logs from Preserved logs
7387 def saveLogs (self ):
7488 timestampDay = datetime .now ().strftime ("%A, %d. %B %Y %I:%M:%S %p" )
7589 file = open (self .scriptDir + os .path .sep + 'WebScrappingLogs_' + timestampDay + '.txt' ,'a+' )
@@ -80,7 +94,7 @@ def saveLogs(self):
8094 file .close ()
8195 self .listWidgetLogs .addItem ('Logs saved Sucessfully' )
8296
83-
97+ # clear preservedLogs
def clearLogs(self):
    """Remove every entry from the log list widget (``self.listWidgetLogs``)."""
    log_view = self.listWidgetLogs
    log_view.clear()
86100
@@ -92,15 +106,3 @@ def clearLogs(self):
92106
93107# https://www.reddit.com/r/ProgrammerHumor/
94108# r = r.html.xpath('//*[@class="y8HYJ-y_lTUHkQIc1mdCq"]//h2//text()')
95-
96-
97- # if self.r.html._next():
98- # url = validateUrl(self.r.html._next())
99- # if url:
100- # if self.r.status_code == 200:
101- # self.getSource()
102- # else:
103- # self.listWidgetLogs.addItem('Sorry we are unable to get next page')
104-
105- # else:
106- # self.listWidgetLogs.addItem('Could not find given xpath, (•◡•) please try again diffrent')
0 commit comments