-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathscraper.py
More file actions
220 lines (204 loc) · 9.71 KB
/
scraper.py
File metadata and controls
220 lines (204 loc) · 9.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import requests
import feedparser
import lxml.html as lh
from datetime import datetime
from models import db, Article, EventType
import time
import os
def parseMOHFeed():
    """Scrape the MOH news-highlights RSS feed and persist new
    'Update on local COVID-19 situation' articles to the database.

    Stops at the first article already present in the DB, assuming the
    feed is ordered newest-first.

    Returns:
        list: always empty at present; kept for interface compatibility
        with the other scrapers in this module.
    """
    NewsFeed = feedparser.parse("https://www.moh.gov.sg/feeds/news-highlights")
    outputList = []
    for entry in NewsFeed.entries:
        if not entry.title.lower().strip().startswith('update on local covid-19 situation'):
            continue
        # Extract the 'Summary ... <strong>' section of the description.
        # Offsets are found on the lowercased copy but sliced from the
        # original so the extracted text keeps its casing.
        desc_lower = entry.description.lower()
        start = desc_lower.find('summary')
        end = desc_lower.find('<strong>', start)
        text = entry.description[start:end]
        text = lh.fromstring(text).text_content().replace('\xa0', ' ').replace('·', '')
        if text.startswith('Summary'):
            text = text[len("Summary of local situation"):].strip()
        if len(text) < 10:
            continue
        # Feed is assumed newest-first: once we see a stored article,
        # everything after it is already stored too.
        if Article.query.filter_by(articleId=entry.link).first() is not None:
            break
        # e.g. 'Thu, 14 Oct 2021 15:30:00 +8' -> 'Thu 14 Oct 2021 15:30:00'
        datePublished = entry.published.strip().replace(',', '')[:-2]
        datePublishedObj = datetime.strptime(datePublished, '%a %d %b %Y %X')
        print(datePublishedObj.isoformat())
        # NOTE: a distinct name avoids shadowing the loop variable (the
        # original rebound `article` to the model instance mid-loop).
        newArticle = Article(articleId=entry.link,  # link serves as the id of the article
                             title=entry.title,
                             bodyText=text,
                             datePublished=entry.published,
                             description="")
        db.session.add(newArticle)
        db.session.commit()
    return outputList
# Tags:
# Health
# COVID-19
# Social and Community
# POFMA
# Environment
# Economy and Finance
# not to include : 'Others'
def gov_sg_api_scrape():
    """Pull the latest articles from the gov.sg search API and persist
    any that are not yet in the database.

    Stops at the first article already present in the DB, assuming the
    API returns results newest-first (sorted by publish_date_tdt desc).

    Returns:
        list: always empty at present; kept for interface compatibility.
        Returns None early on timeout, non-200 status, or malformed JSON.
    """
    NUM_ROWS_GOV_SG_API = str(50)
    GOV_SG_API = "https://www.gov.sg/api/v1/search?fq=contenttype_s:[*%20TO%20*]&fq=isfeatured_b:false&fq=primarytopic_s:[*%20TO%20*]%20OR%20secondarytopic_sm:[*%20TO%20*]&sort=publish_date_tdt%20desc&start=0&rows={}".format(NUM_ROWS_GOV_SG_API)
    headers = {"accept": "application/json, text/plain, */*",
               "accept-language": "en-US,en;q=0.9",
               "accept-encoding": "gzip, deflate, br",
               "connection": "keep-alive",
               "sec-ch-ua": "\" Not;A Brand\";v=\"99\", \"Google Chrome\";v=\"91\", \"Chromium\";v=\"91\"",
               "sec-ch-ua-mobile": "?0",
               "sec-fetch-dest": "empty",
               "sec-fetch-mode": "cors",
               "sec-fetch-site": "same-origin",
               # BUG FIX: was 'https://www/gov.sg/health' (slash instead of dot)
               "referer": "https://www.gov.sg/health",
               "host": "www.gov.sg",
               "User-Agent": 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.76 Safari/537.36', }
    try:
        response = requests.get(GOV_SG_API, headers=headers, timeout=15)
    except requests.exceptions.Timeout:
        return
    if response.status_code != 200:
        print(response.status_code)
        return
    try:
        data = response.json()
    except ValueError:
        print('ValueError')
        return
    # Narrowed from a bare `except:` — only a missing key is expected here.
    try:
        data['response']['docs']
    except KeyError:
        print('Json Issue')
        return
    print("Total Num responses:", len(data['response']['docs']))
    outputList = []
    for doc in data['response']['docs']:
        # Other available fields, currently unused:
        #   doc['imageurl_s'], doc['minuteread_s'], doc['itemid_s']
        articleUrl = "https://www.gov.sg/" + doc['pageurl_s']
        articleTitle = doc['title_t']
        articleDescription = doc.get('short_description_t', "")
        articleMainText = doc['bodytext_t']
        datePublished = doc['publishdate_s']
        # Example: '06 Aug 2021' -> parse with a synthetic midnight time
        datePublishedObj = datetime.strptime(f'{datePublished} 00:01AM', '%d %b %Y %H:%M%p')
        print(f'DatePublished gov_sg_api_scrape : {datePublishedObj.isoformat()}')
        # Results are newest-first: stop at the first already-stored article.
        if Article.query.filter_by(articleId=articleUrl).first() is not None:
            break
        newArticle = Article(articleId=articleUrl,  # URL serves as the id
                             title=articleTitle,
                             bodyText=articleMainText,
                             datePublished=datePublished,
                             description=articleDescription)
        db.session.add(newArticle)
        db.session.commit()
    return outputList
def checkTags():
    """Scrape the gobusiness.gov.sg safe-management sector page and push a
    notification for each tracked sector whose bracketed '[...]' update
    string has changed since the last run.

    Accordion index reference (page layout as scraped):
      Attractions: 3, Country and Recreational Clubs: 6,
      Enterprises in Finance Sector: 13, Funeral Events: 15, F&B: 16,
      Hotels: 20, Solemnisation and Reception: 22, MICE Events: 26,
      Nightlife: 27, Property Show Galleries: 31, Public Entertainment: 32,
      Religious organisations: 33, Sports Sector Enterprises: 36, Tours: 38
    """
    try:
        response = requests.get('https://www.gobusiness.gov.sg/safemanagement/sector/', timeout=15)
    except requests.exceptions.Timeout:
        return
    if response.status_code != 200:
        print(response.status_code)
        return
    root = lh.fromstring(response.text)
    # TODO: find len 'jekyllcodex_accordion', replace with 42
    SectorList = [
        'Attractions',
        'Country and recreation clubs',
        'Funeral events',
        'Marriage solemnisations and wedding receptions',
        'MICE events',
        'Hotels',
        'Property show galleries',
        'Sports sector enterprises, sports education, and premises with sports facilities',
        'Religious organisations']
    postList = []
    for num in range(1, 42):
        sectorContent = root.xpath(f'//*[@id="main-content"]/section[3]/div/div/div[2]/div/div/div/ul/li[{num}]')
        # Guard against page-layout changes (fewer accordions than expected).
        if not sectorContent:
            continue
        text = sectorContent[0].text_content()
        sectorName = text[:text.find('\n')]
        if sectorName not in SectorList:
            continue
        text = text[len(sectorName):]
        # Concatenate every bracketed '[...]' fragment in this accordion;
        # the combined string acts as a change-detection fingerprint.
        check = 0
        lastUpdatedStr = ""
        while check != -1:
            lastUpdatedStr += text[text.find('[', check): text.find(']', check) + 1]
            text = text[text.find(']', check):]
            check = text.find('[')
        cleanName = sectorName.strip()
        updated = lastUpdatedStr.strip()
        checker = EventType.query.filter_by(eventTypeName=cleanName).first()
        if checker is None:
            # BUG FIX: store the stripped name — the lookup above queries the
            # stripped name, so an unstripped stored name would never match
            # on subsequent runs.
            db.session.add(EventType(cleanName, updated))
            db.session.commit()
        elif checker.currString != updated:
            checker.currString = updated
            # Queue for the notification POST below
            postList.append(updated)
            db.session.commit()
    if postList:
        # timeout added for consistency with the other requests in this module
        requests.post("https://locus.social:8080/event/type/notification",
                      json={'eventTypes': postList}, timeout=15)
# def find_nth(haystack, needle, n):
# start = haystack.find(needle)
# while start >= 0 and n > 1:
# start = haystack.find(needle, start + len(needle))
# n -= 1
# return start
# limited to 20000 requests a month, $0.01 aft that
# def meaningCloudSummarizer(text):
# # print(f'Original text: {text}.')
# numSentences = text.count('.')
# print(f'Number of Sentences initially: {numSentences}')
# url = "https://meaningcloud-summarization-v1.p.rapidapi.com/summarization-1.0"
# querystring = {"sentences": "10", "txt": text}
# smmrizeHeaders = {
# 'accept': "application/json",
# 'x-rapidapi-host': "meaningcloud-summarization-v1.p.rapidapi.com",
# 'x-rapidapi-key': os.environ.get('SMMRIZE_API_KEY')
# }
# try:
# response = requests.request("GET", url, headers=smmrizeHeaders, params=querystring, timeout=15)
# except requests.exceptions.Timeout:
# return
# if response.status_code != 200:
# print(response.status_code)
# return
# try:
# summaryJSON = response.json()
# except ValueError:
# print(response.text)
# return
# try:
# summarizedText = summaryJSON["summary"]
# except:
# if response.text.find('"summary"') == -1: return
# summarizedText = response.text[response.text.find('"summary"') + len("summary"): -2]
# pass
# # print(response.text)
# return summarizedText.replace('[...] ', '')