Skip to content

Commit c6cb6e4

Browse files
Altered to dynamic functions and removed url button
1 parent e6cdc65 commit c6cb6e4

5 files changed

Lines changed: 283 additions & 195 deletions

File tree

PyQtDesigner/WebScrapper/WebScapper.py

Lines changed: 61 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from PyQt5.QtGui import QIcon
33
from datetime import datetime
44
from requests_html import HTMLSession
5-
from requests.exceptions import ConnectionError
5+
import requests.exceptions
66
import os
77
import sys
88

@@ -13,48 +13,63 @@ def __init__(self):
1313
uic.loadUi(self.scriptDir + os.path.sep + 'WebScrapper.ui',self)
1414
self.setWindowIcon(QIcon(self.scriptDir + os.path.sep + 'icon.png'))
1515
self.setWindowTitle('WebScrapper by [@Nishant Ghanate]')
16-
self.buttonLoadUrl.clicked.connect(self.getUrl)
17-
self.buttonGetSource.clicked.connect(self.getSource)
18-
self.buttonGetSource.setEnabled(False)
16+
self.buttonGetInput.clicked.connect(self.getInput)
1917
self.buttonSaveLogs.clicked.connect(self.saveLogs)
2018
self.buttonClearLogs.clicked.connect(self.clearLogs)
19+
self.oldUrl = "_"
2120

2221
QtCore.pyqtSlot()
22+
def getTimeStamp(self):
    """Return the current local date and time as a log-friendly string.

    The text is laid out as weekday name, day of month, month name,
    four-digit year, then a 12-hour clock time with an AM/PM marker.
    """
    stamp_format = "%A, %d. %B %Y %I:%M:%S %p"
    current_moment = datetime.now()
    return current_moment.strftime(stamp_format)
2324

24-
def getUrl(self):
25-
url = self.textEditUrl.toPlainText()
26-
session = HTMLSession()
25+
def validateUrl(self, url):
    """Fetch ``url`` unless it repeats the last URL that was fetched.

    On success the response is stored in ``self.r`` and ``url`` is
    remembered in ``self.oldUrl``.

    Args:
        url: Address to request.

    Returns:
        True when a new response was stored in ``self.r``; False when
        ``url`` equals ``self.oldUrl`` or the request raised a
        ``requests`` exception (its text is appended to
        ``self.listWidgetLogs``).
    """
    # A repeat of the last fetched URL is skipped without any request.
    if self.oldUrl == url:
        return False

    # Reuse one session for every request instead of creating (and never
    # closing) a new one per page.
    session = getattr(self, "_session", None)
    if session is None:
        session = self._session = HTMLSession()

    try:
        # Without a timeout a stalled server would block the caller forever.
        response = session.get(url, timeout=30)
    except requests.exceptions.RequestException as e:
        self.listWidgetLogs.addItem(str(e))
        return False

    # Remember the URL only after the fetch succeeded, so that a failed
    # request can be retried with the same address.
    self.r = response
    self.oldUrl = url
    return True
3938

4039

4140
def getSource(self):
    """Log every XPath match on the current page, then follow "next" links.

    Evaluates ``self.xpathSrc`` against ``self.r.html`` and appends each
    match, converted to ``str``, to ``self.listWidgetLogs``. While the
    page reports a next-page URL, that URL is fetched through
    ``validateUrl`` and processed the same way.

    The walk stops when the XPath matches nothing, no next-page URL is
    found, ``validateUrl`` returns False, or the new response status is
    not 200.

    NOTE(review): the stripped indentation in the diff does not show
    whether the next-page check sits inside the "matches found" branch;
    it is kept inside, matching the commented-out sketch at the end of
    the file -- confirm.
    """
    # Iterate rather than recurse once per page, so a long paginated
    # site cannot exhaust the interpreter's recursion limit.
    while True:
        try:
            matches = self.r.html.xpath(self.xpathSrc)
        except Exception as e:
            # The expression is typed by the user; report a bad one in
            # the log instead of letting the exception escape.
            self.listWidgetLogs.addItem(str(e))
            return

        if not matches:
            return

        for match in matches:
            print(match)
            self.listWidgetLogs.addItem(str(match))

        # _next() is a private requests_html helper; call it once per
        # page and reuse the result.
        next_url = self.r.html._next()
        if not next_url:
            return
        print(next_url)

        if not self.validateUrl(next_url):
            return
        if self.r.status_code != 200:
            return
55+
56+
57+
58+
def getInput(self):
    """Read the URL and XPath fields and start scraping when both are usable.

    The XPath text (with surrounding whitespace removed) is stored in
    ``self.xpathSrc``. A URL shorter than five characters logs a
    timestamp and an "empty URL" message; an empty XPath logs a prompt.
    Otherwise the URL is fetched through ``validateUrl`` and, when that
    returns True, ``getSource`` is called.
    """
    # toPlainText() yields a string, so emptiness is tested on the
    # stripped text rather than with "is None".
    url = self.textEditUrl.toPlainText().strip()
    self.xpathSrc = self.textEditSource.toPlainText().strip()

    if len(url) < 5:
        self.listWidgetLogs.addItem(self.getTimeStamp())
        # Backslashes are escaped explicitly; the displayed text is unchanged.
        self.listWidgetLogs.addItem('Url cannot be empty ¯\\_(ツ)_/¯ \n')
    elif not self.xpathSrc:
        self.listWidgetLogs.addItem('xpath input \\_(ʘ_ʘ)_/ ? ')
    elif self.validateUrl(url):
        self.getSource()
71+
5672

57-
5873
def saveLogs(self):
5974
timestampDay = datetime.now().strftime("%A, %d. %B %Y %I:%M:%S %p")
6075
file = open(self.scriptDir + os.path.sep +'WebScrappingLogs_'+timestampDay+'.txt','a+')
@@ -76,4 +91,16 @@ def clearLogs(self):
7691
sys.exit(app.exec_())
7792

7893
# https://www.reddit.com/r/ProgrammerHumor/
79-
# r = r.html.xpath('//*[@class="y8HYJ-y_lTUHkQIc1mdCq"]//h2//text()')
94+
# r = r.html.xpath('//*[@class="y8HYJ-y_lTUHkQIc1mdCq"]//h2//text()')
95+
96+
97+
# if self.r.html._next():
98+
# url = validateUrl(self.r.html._next())
99+
# if url:
100+
# if self.r.status_code == 200:
101+
# self.getSource()
102+
# else:
103+
# self.listWidgetLogs.addItem('Sorry we are unable to get next page')
104+
105+
# else:
106+
# self.listWidgetLogs.addItem('Could not find given xpath, (•◡•) please try again diffrent')

0 commit comments

Comments
 (0)