-
Notifications
You must be signed in to change notification settings - Fork 14
Expand file tree
/
Copy pathSemanticScholarMetaCrawler.py
More file actions
236 lines (190 loc) · 10.4 KB
/
SemanticScholarMetaCrawler.py
File metadata and controls
236 lines (190 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
import os
import re
import requests
import sys
from typing import List,Set
import datetime
from Artigo import Artigo
from Autor import Autor
from ExcelExporter import ExcelExporter
from Gerenciador import Gerenciador
import Timer
from findQualis import find_similar_journal
class Crawler:
    def __init__(self, root_directory):
        """Crawler that collects paper metadata from Semantic Scholar.

        root_directory: base directory used for all file loading/saving.
        """
        self.root_directory = root_directory  # remembered so exports/saves land in the right place
        # both timestamps start at "now"; end_time is refreshed when a search finishes
        self.start_time = Timer.timeNow()
        self.end_time = Timer.timeNow()
        self.manager = None  # Gerenciador instance, created per search
        self.list_authors: List[Autor] = []
        self.list_articles: List[Artigo] = []
        self.input_search = ''  # current query string, set via update_search_parameters
        self.input_pages = 0    # requested result count, set via update_search_parameters
        self.gui = None  # GUI controller, attached externally before start_search runs
        self.index_progress_bar = 1
def update_search_parameters(self, input_search, input_pages):
self.input_search = input_search
self.input_pages = input_pages
# extract the type of the article from the BibText cite text and returns it as a single word string
# TODO: extract it from "publicationTypes" attribute
def return_type_cite(self, string_cite):
list_cite = string_cite.split('{')
type_cite = list_cite[0][1:]
return type_cite
def start_search(self):
self.start_time = Timer.timeNow()
# loads files for the inputted search if they exist, otherwise, the files are created
self.manager = Gerenciador(self.input_search, self.root_directory)
self.list_authors = self.manager.loadAutores()
self.list_articles = set(self.manager.loadArtigos())
# runs the following code 3 times, one for each type os search
# for k in range(0, 2):
# TODO: replicate old functionality with API,
# i.e., search in different ways.
for k in range(0,2):
# label gui
self.gui.app.queueFunction(self.gui.app.setLabel, 'progress_bar_label', 'Crawling with '
+ str(k+1) + '/2 parameter...')
# + str(k+1) + '/3 parameter...')
#self.gui.app.queueFunction(self.gui.app.setMeter, 'progress_bar', 0)
# access switched to api calls,
# input now sent directly via request url,
# with extra whitespace striped
_search_query = re.sub(r"\s+", " ", self.input_search.strip())
_article_count = self.input_pages
_articles_query_params = {
"query": _search_query,
"fields": "abstract,authors,citationCount,citationStyles,title,url,venue,year",
"offset": 0,
"limit": _article_count
}
if k == 1:
_articles_query_params["year"] = str(datetime.date.today().year - 5) + '-'
# TODO: model expected JSON object types
# e.g., search :: String -> [Paper]
# Paper :: { title: String, Author: Author, ...}
_articles_endpoint = 'https://api.semanticscholar.org/graph/v1/paper/search'
with requests.get(_articles_endpoint, params=_articles_query_params) as articles_res:
_articles_res = articles_res.json()
# search for now happens only once
# TODO: mimic the old behavior,
# i.e., searching with different filters applied:
# results from the last five years;
# results with Reviews marked.
# runs the code for the amount of articles desired
self.index_progress_bar = 1
# no need for pagination yet,
# as number of articles is explicitly set,
# in API request.
# TODO: sync progress bar to other inputs,
# previously was number of pages crawled.
# for pag in range(0, self.input_pages):
# progress bar
# self.gui.app.queueFunction(self.gui.app.setMeter, 'progress_bar',
# (100 * self.index_progress_bar)/self.input_pages)
# self.index_progress_bar += 1
# from API response,
# iterates over each article in the articles list
if "data" in _articles_res.keys():
for item in _articles_res["data"]:
# now takes it directly from JSON
# saves the article title as a string
title = item["title"]
# now authors field is present in API response
_paper_authors = item["authors"]
# TODO: get html links for authors who have it,
# probably will require another API call.
# creates a set list of the authors for the article
list_authors_in_article: Set[Autor] = set()
self.list_authors = set(self.list_authors)
# iterates over each author in the list.
for temp in _paper_authors:
# author name now comes in "name" field
name = temp["name"]
# saves the author name as a string
# no link comes with author from Papers API
# TODO: fetch their link somehow.
link = None
# saves the author page html link as a string
# creates temporary author
author = Autor(name, link)
# adds new authors to the set lists
self.list_authors.add(author)
list_authors_in_article.add(author)
self.list_authors = list(self.list_authors)
self.list_authors.sort()
list_authors_in_article = list(list_authors_in_article)
list_authors_in_article.sort()
# origin comes as "venue" in API response.
_venue = item["venue"]
origin = _venue if _venue else "-"
# saves the article origin as a string
# TODO: log when field comes empty,
# print paper id, field name and its value.
# date comes as "year" in API response.
_year = item["year"]
# TODO: log when field comes empty,
# print paper id, field name and its value.
date = str(_year) if _year else "0"
# saves the article date as a string
# citationCount comes as a number in API response.
_citationCount = item["citationCount"]
citationCount = str(int(_citationCount if _citationCount else "0"))
# saves the article total citations as a string
# TODO: log when field comes empty,
# print paper id, field name and its value.
# link comes as "url" in API response.
_url = item["url"]
link = _url if _url else "-"
# saves the article html link as a string
# TODO: log when field comes empty,
# print paper id, field name and its value.
# currently no type comes in API response.
_citationStyles = item["citationStyles"]
_bibtex = _citationStyles["bibtex"] if _citationStyles else _citationStyles
bibtex = '-'
cite = '-'
if _bibtex:
bibtex = _bibtex
# TODO: format bibtex,
# it comes in a weird list-like style,
# e.g., @["Journal Article", "Review"]{...}
cite = self.return_type_cite(bibtex)
# saves the article type as a string
# TODO: get type from "publicationTypes" field.
# synopsis comes as "abstract" in API response.
_abstract = item["abstract"]
synopsis = _abstract.replace(" Expand", "") if _abstract else "No synopsis"
synopsis = synopsis.replace("TLDR\n", "")
qualis_score = find_similar_journal(target_text=origin)
# creates a new instance of a Article object
new_article = Artigo(title, list_authors_in_article, origin, date,
citationCount, link, cite, bibtex, synopsis, qualis_score)
# adds new article to set list (set list does not allow duplicates)
before = len(self.list_articles)
self.list_articles.add(new_article)
after = len(self.list_articles)
# add article to the author's article list if the article is not repeated
if before is not after:
for autorTemp in list_authors_in_article:
autorTemp.addArtigo(new_article)
# no need to switch pages when using API.
# TODO: add pagination to API calls.
else:
print("From Semantic Scholar API:", file=sys.stderr)
for k in _articles_res.keys():
print(k, ": ", _articles_res[k], sep="", file=sys.stderr)
self.end_time = Timer.timeNow()
self.list_articles = list(self.list_articles)
# saves the list of articles and authors as .pkl files
self.list_authors = list(self.list_authors)
self.manager.saveArtigos(self.list_articles)
self.manager.saveAutores(self.list_authors)
self.gui.show_search_done_alert(Timer.totalTime(self.start_time, self.end_time), str(len(self.list_articles)))
def saves_excel(self, parameter):
# creates the excel file
os.chdir(self.root_directory)
excelExporter = ExcelExporter(self.input_search, self.gui.single_or_merge, self.root_directory)
excelExporter.gui = self.gui
excelExporter.order_type(parameter)