forked from kyle-tapang-illinois/CourseProject
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathScrapeHackerNews.py
More file actions
162 lines (121 loc) · 5.51 KB
/
ScrapeHackerNews.py
File metadata and controls
162 lines (121 loc) · 5.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# -*- coding: utf-8 -*-
"""
Created on Mon Oct 24 21:33:31 2022
@author: WilliamKiger
"""
import requests
import urllib.request
from urllib.error import HTTPError
from socket import timeout
from bs4 import BeautifulSoup
import pprint
import ssl
# SECURITY WARNING: this disables TLS certificate verification for EVERY
# https request made by urllib in this process, so fetched pages cannot be
# trusted to come from the claimed host. Prefer a proper CA bundle
# (e.g. certifi) instead of an unverified context.
ssl._create_default_https_context = ssl._create_unverified_context
# Global counter of pages fetched so far; used for simple progress logging.
COUNT = 1


def increment():
    """Increment the global page-fetch counter.

    Returns:
        int: the new value of COUNT. (The original returned None, which
        made the counter impossible to observe from callers or tests;
        returning the new value is backward compatible since existing
        call sites ignore the result.)
    """
    global COUNT
    COUNT = COUNT + 1
    return COUNT
class ScrapeHackerNews:
    """Scrape Hacker News listing pages and rank articles by vote count.

    Typical use: ``get_data(depth, follow_links)`` returns a list of dicts
    with keys ``title``, ``link``, ``votes``, ``contents``, sorted by votes
    (highest first).
    """

    def getPageText(self, url):
        """Download *url* and return its visible text as UTF-8 bytes.

        Script/style elements are stripped and blank lines removed.
        Returns the empty string on any fetch error or non-http(s) URL.
        """
        print(url)
        increment()
        print(COUNT)
        # Guard clause: only http(s) URLs are fetched.
        if not url.startswith(('http://', 'https://')):
            print("invalid url found is " + url)
            return ''
        try:
            html = urllib.request.urlopen(url, timeout=3).read()
        except HTTPError as e:
            print(e)
            return ''
        except urllib.error.URLError as e:
            # Connection/DNS failures were previously uncaught and
            # crashed the whole scrape; treat them like fetch errors.
            print(e)
            return ''
        except timeout:
            print('timeout error occured')
            return ''
        soup = BeautifulSoup(html, features="lxml")
        # Remove non-visible content before extracting text.
        for script in soup(["script", "style"]):
            script.extract()
        text = soup.get_text()
        # Strip per-line whitespace, then drop empty chunks.
        # https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup
        lines = (line.strip() for line in text.splitlines())
        # NOTE(review): splitting on a single space puts every word on its
        # own line; the upstream snippet splits on a double space — verify
        # this is the intended behavior before changing it.
        chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
        text = '\n'.join(chunk for chunk in chunks if chunk)
        return text.encode('utf-8')

    def rankByVotes(self, data_list):
        """Return *data_list* sorted by its 'votes' key, highest first."""
        return sorted(data_list, key=lambda k: k['votes'], reverse=True)

    def extract_metadata(self, links, subtext, scraping_depth, search_external_bool):
        """Build {title, link, votes, contents} dicts from scraped elements.

        Args:
            links: per-page lists of ``.titleline`` elements.
            subtext: per-page lists of ``.subtext`` elements (parallel to links).
            scraping_depth: number of pages to process.
            search_external_bool: when truthy, also download each article's text.

        Returns:
            list[dict]: entries sorted by votes, highest first.
        """
        data = []
        # Clamp to the pages actually scraped, and use each page's OWN
        # article count: the original used len(links[0]) for every page,
        # which raises IndexError when a later page has fewer entries.
        for i in range(min(scraping_depth, len(links))):
            for j in range(len(links[i])):
                title = links[i][j].getText()
                anchor = links[i][j].a
                # A titleline without an <a> yields href=None; guarding here
                # also prevents getPageText from crashing on None.
                href = anchor.get('href') if anchor is not None else None
                if search_external_bool and href:
                    page_contents = self.getPageText(href)
                else:
                    page_contents = ''  # contents stay empty when not following links
                vote = subtext[i][j].select('.score')
                # Entries without a score element (e.g. job posts) are skipped.
                if vote:
                    points = int(vote[0].getText().replace(' points', ''))
                    data.append({'title': title, 'link': href, 'votes': points, 'contents': page_contents})
        return self.rankByVotes(data)

    def ScrapePages(self, url_list):
        """Fetch each listing page in *url_list*.

        Returns:
            tuple(list, list): per-page lists of ``.titleline`` elements and
            ``.subtext`` elements (the latter holds the score).
        """
        mega_links = []
        mega_subtext = []
        for url in url_list:
            get_page = requests.get(url)
            soup = BeautifulSoup(get_page.text, 'html.parser')
            mega_links.append(soup.select('.titleline'))
            mega_subtext.append(soup.select('.subtext'))
        return mega_links, mega_subtext

    def PagesToScrape(self, search_depth):
        """Return the listing-page URLs for the first *search_depth* pages.

        Page 1 is the plain /news URL; pages 2..depth use ``?p=N``.
        """
        urls = ['https://news.ycombinator.com/news']
        for num in range(2, search_depth + 1):
            urls.append('https://news.ycombinator.com/news?p={}'.format(num))
        return urls

    def get_data(self, scraping_depth, search_external_bool):
        """Scrape *scraping_depth* pages and return vote-ranked article dicts."""
        urls = self.PagesToScrape(scraping_depth)
        links, subtext = self.ScrapePages(urls)
        return self.extract_metadata(links, subtext, scraping_depth, search_external_bool)
def main():
    """Scrape Hacker News listing pages and pretty-print articles ranked by votes."""
    scraper = ScrapeHackerNews()

    # How many Hacker News listing pages to scrape.
    depth = 1

    # Build the listing-page URLs for the requested depth.
    page_urls = scraper.PagesToScrape(depth)
    print(page_urls)

    # Grab the title and subtext element lists for each listing page
    # (the linked articles themselves are not downloaded in this step).
    titles, subtexts = scraper.ScrapePages(page_urls)

    # ****** this boolean controls visiting the linked pages ******
    # When True, the text of each linked article is fetched and stored in
    # the 'contents' field of every entry of the vote-sorted result.
    follow_links = True

    # Each entry holds: link, title, votes, contents — sorted by votes.
    ranked = scraper.extract_metadata(titles, subtexts, depth, follow_links)

    # Output can be very large when follow_links is enabled, since it then
    # includes the scraped text of every external article.
    pprint.pprint(ranked)


if __name__ == '__main__':
    main()