forked from azk0019/CourseProject
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathreuters_scraper.py
More file actions
105 lines (85 loc) · 3.56 KB
/
reuters_scraper.py
File metadata and controls
105 lines (85 loc) · 3.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import requests
import json
import nltk
import re
import urllib
import time
#create a webdriver object and set options for headless browsing
# Module-level side effect: launches a headless Chrome session used by every
# function below (get_js_soup, scrape_article reference this `driver`).
options = Options()
options.headless = True
# NOTE(review): passing the executable path positionally is deprecated in
# Selenium 4 (use Service('./chromedriver')) — confirm the pinned selenium version.
driver = webdriver.Chrome('./chromedriver',options=options)
# helper functions from MP2.1
def get_js_soup(url, driver):
    """Load *url* in the Selenium driver and parse the rendered DOM.

    Runs JavaScript in the page to grab document.body.innerHTML after
    dynamic content has loaded, then wraps it in a BeautifulSoup object.
    """
    driver.get(url)
    rendered_html = driver.execute_script('return document.body.innerHTML')
    return BeautifulSoup(rendered_html, "lxml")
# tidies extracted text
def process_article(article):
    """Normalize scraped article text.

    Drops all non-ASCII characters, then collapses every run of whitespace
    (spaces, tabs, newlines) into a single space.

    :param article: raw text extracted from the page
    :return: cleaned single-line string
    """
    # encode/ignore removes non-ASCII; the bytes are pure ASCII afterwards,
    # so decode as 'ascii' (the original 'utf-8' worked only by coincidence).
    article = article.encode('ascii', errors='ignore').decode('ascii')
    # raw string: '\s' is an invalid escape in a plain string literal
    article = re.sub(r'\s+', ' ', article)  # replaces repeated whitespace with a single space
    return article
''' More tidying
Sometimes the text extracted HTML webpage may contain javascript code and some style elements.
This function removes script and style tags from HTML so that extracted text does not contain them.
'''
def remove_script(soup):
    """Remove every <script> and <style> element from *soup* in place; return *soup*."""
    for unwanted in soup.find_all(["script", "style"]):
        unwanted.decompose()
    return soup
# Adapted from MP2.1
def scrape_links(dir_url, driver):
    """Scrape one Reuters archive listing page.

    For every story block found, appends a "publish_date,headline,url" row to
    reuters_urls.csv (opened in append mode) and collects the absolute
    article URL.

    :param dir_url: URL of the archive page to scrape
    :param driver: Selenium webdriver used to render the page
    :return: list of absolute article URLs found on the page
    """
    url_list = []  # for easier access later when obtaining content
    base_url = "https://www.reuters.com"
    soup = get_js_soup(dir_url, driver)
    content = soup.find('div', class_='column1 col col-10')
    with open('reuters_urls.csv', 'a+') as f:
        for link_holder in content.find_all('div', class_='story-content'):  # get list of all <div>
            try:
                rel_link = link_holder.find('a')['href']  # get url
                url_list.append(base_url + rel_link)
                headline = link_holder.find('h3', class_='story-title').string.strip()
                headline = headline.replace("'", "\\'")  # add escape chars for quotes
                headline = headline.replace("\"", '\\' + '\"')
                headline = headline.replace(",", ";")  # replace commas in headline with semicolon for CSV purposes
                publishDate = link_holder.find('span', class_='timestamp').string.strip()
                output_line = [publishDate, headline, base_url + rel_link]
                f.write(','.join(map(str, output_line)) + "\n")
            except (AttributeError, TypeError, KeyError):
                # story block is missing an expected element (find() returned
                # None, .string was None, or <a> had no href) — skip it.
                # Narrowed from a bare `except:` that also swallowed
                # KeyboardInterrupt/SystemExit.
                continue
    return url_list
def scrape_article(url):
    """Fetch one article page and append its body text to reuters_body.txt.

    Uses the module-level headless `driver` to render the page, extracts all
    paragraph tags carrying Reuters' article-body CSS classes, cleans each
    one with process_article, and writes the concatenated text as one line.

    :param url: absolute URL of the article to scrape
    """
    soup = get_js_soup(url, driver)
    body = soup.find_all('p', class_='Paragraph-paragraph-2Bgue ArticleBody-para-TD_9x')
    txt = ''
    for p in body:
        try:
            # p.string is None when the <p> contains nested markup;
            # process_article then raises AttributeError — skip those.
            # Narrowed from a bare `except:`.
            txt += process_article(p.string)
        except (TypeError, AttributeError):
            continue
    with open('reuters_body.txt', 'a+') as t:
        t.write(txt.replace("\n", '') + "\n")
# main scrape
reuters_links = []
# Write the CSV header (append mode: re-runs will add a duplicate header row).
with open('reuters_urls.csv', 'a+') as f:
    f.write(','.join(['publish_date', 'headline', 'url']) + "\n")
# Walk every page of the us-elections-2020 archive (346 pages at pageSize=10).
for page in range(1, 347):
    print('-' * 20, 'Page ' + str(page), '-' * 20)
    tgt_url = "https://www.reuters.com/news/archive/us-elections-2020?view=page&page={}&pageSize=10".format(page)
    reuters_links += scrape_links(tgt_url, driver)
print(str(len(reuters_links)), 'links scraped')
# Iterate the links directly instead of indexing with range(len(...)).
for link in reuters_links:
    scrape_article(link)
# quit() (not close()) ends the whole WebDriver session; close() only closes
# the current window and leaves the chromedriver process running.
driver.quit()