Skip to content

Commit 0a568ba

Browse files
committed
Script for adding bibtex to Neotoma.
1 parent bb2ddc4 commit 0a568ba

File tree

1 file changed

+104
-0
lines changed

1 file changed

+104
-0
lines changed
Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
# Read in all Neotoma publications from a Postgres table export
2+
# Attempt to clean DOI fields to match the regex `\b10\.\d{4,9}/[-.;()/:\w]+`
3+
# For records with DOIs, call out to doi.org (DOI content negotiation) to get a formatted BibTeX entry
4+
# For records without, call CrossRef with the article title to find a DOI.
5+
# Return all objects to a CSV for hand validation.
6+
7+
import requests
8+
import csv
9+
import sys
10+
import urllib
11+
import re
12+
import subprocess
13+
import json
14+
from crossref.restful import Works, Etiquette
15+
16+
csv.field_size_limit(sys.maxsize)
17+
18+
def clean_doi(doi_string:str):
    """Extract a bare DOI from a free-text DOI field.

    The field may carry a resolver prefix or other surrounding text (for
    example ``https://doi.org/10.1000/xyz``); only the part matching
    ``\\b10\\.\\d{4,9}/[-.;()/:\\w]+`` is returned.

    Args:
        doi_string: The raw DOI field value.

    Returns:
        The first DOI-shaped substring, or ``None`` when there is none.

    Raises:
        TypeError: If ``doi_string`` is not a string.
    """
    if not isinstance(doi_string, str):
        # A real TypeError (rather than a generic Exception carrying the text
        # 'TypeError') so that callers using ``except TypeError`` catch it.
        raise TypeError(f'The doi passed -- {doi_string} -- is not a string.')
    # re.search takes the first match and keeps the whole suffix. A greedy
    # ``.*`` prefix with re.match would instead return the *last* DOI-like
    # substring, cutting short a DOI whose suffix itself contains '10.NNNN/'.
    outcome = re.search(r'\b10\.\d{4,9}/[-.;()/:\w]+', doi_string)
    if outcome is None:
        return None
    return outcome.group(0)
26+
27+
def break_citation(citation:str):
    """Parse a free-text citation into structured fields with ``anystyle``.

    The citation is written to a temporary file, which is handed to the
    ``anystyle`` command-line tool (``anystyle -f json parse <file>``); the
    tool's JSON output is decoded and returned.

    Args:
        citation: The citation text to parse.

    Returns:
        The decoded JSON produced by ``anystyle``.

    Raises:
        ValueError: If ``anystyle`` writes nothing to stdout.
    """
    import os
    import tempfile

    # A uniquely named temporary file avoids clobbering (or leaving behind) a
    # fixed 'temp.txt' in the working directory. delete=False lets the file be
    # closed before the external process reads it.
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False,
                                     encoding='utf-8') as wr:
        wr.write(citation)
        temp_path = wr.name
    try:
        outcome = subprocess.run(['anystyle', '-f', 'json', 'parse', temp_path],
                                 capture_output=True)
    finally:
        os.remove(temp_path)
    if not outcome.stdout:
        raise ValueError(f'Could not perform extraction from: {citation}')
    return json.loads(outcome.stdout)
35+
36+
def return_bibtex(doi_string:str):
    """Fetch a BibTeX record for a DOI from ``https://doi.org/``.

    The request asks for ``application/x-bibtex`` through the ``Accept``
    header.

    Args:
        doi_string: The DOI to look up (without a resolver prefix).

    Returns:
        The response body with surrounding whitespace stripped, or an empty
        string when the request fails or returns a non-success status.
    """
    # Import quote from its home module: ``import urllib`` alone does not
    # guarantee that the ``urllib.request`` submodule has been loaded.
    from urllib.parse import quote

    url = 'https://doi.org/' + quote(doi_string)
    # NOTE(review): the mailto address looks like a redacted placeholder --
    # confirm the real contact address before running against the service.
    header = {
        'Accept': 'application/x-bibtex',
        'User-Agent': 'Neotoma Publication Augmenter; mailto:[email protected]'
    }
    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, headers=header, timeout=30)
    except requests.RequestException as err:
        print(f'BibTeX request failed for {doi_string}: {err}')
        return ''
    if not response.ok:
        # An error page body is not BibTeX; do not hand it back as if it were.
        print(f'BibTeX request for {doi_string} returned HTTP {response.status_code}.')
        return ''
    return response.text.strip()
44+
45+
def check_crossref(cite_object:str):
    """Look up a title in the Crossref ``/works`` endpoint.

    Requests a single result (``rows=1``) restricted to the DOI, title,
    container-title and published fields.

    Args:
        cite_object: The title text to search for.

    Returns:
        The first item of the response's ``message.items`` list as a dict, or
        ``None`` when the request fails, the status is not 200, or no items
        are returned.
    """
    url = 'https://api.crossref.org/works'
    try:
        url_call = requests.get(
            url,
            headers={'Accept': 'application/json',
                     'User-Agent': 'Neotoma Publication Augmenter; mailto:[email protected]'},
            params={'rows': 1,
                    'mailto': '[email protected]',
                    'select': 'DOI,title,container-title,published',
                    # Send the title as a proper field query. Putting
                    # 'query.title=<title>' inside the free-text ``query``
                    # value made that literal text part of the search terms.
                    'query.bibliographic': cite_object},
            # A timeout keeps one stalled connection from hanging the whole run.
            timeout=30)
    except requests.RequestException as err:
        print(f'Crossref request failed: {err}')
        return None
    if url_call.status_code != 200:
        return None
    message = url_call.json().get('message') or {}
    # Checking the list itself (not only 'total-results') avoids an
    # IndexError if the service reports hits but returns no items.
    items = message.get('items') or []
    if not items:
        return None
    return items[0]
62+
63+
# Columns written to the hand-validation CSV, in output order.
output_fields = ['publicationid', 'citation', 'doi', 'notes', 'newdoi', 'json', 'bibtex']

# newline='' is what the csv module requires so that newlines embedded in
# quoted fields are read correctly.
with open('data/neotoma_publications_202410071440.csv', newline='') as file:
    db_data = list(csv.DictReader(file))

# For each row
for i in db_data:
    # Rows that already carry any of the augmentation columns are left alone.
    if any(j in ('bibtex', 'newdoi', 'json') for j in i):
        continue
    doi = i.get('doi')
    if doi:
        # Record has a DOI: validate it and, when it is already clean, fetch BibTeX.
        # Only clean_doi is inside the try, so a TypeError raised elsewhere
        # is not misreported as a bad DOI.
        try:
            outcome = clean_doi(doi)
        except TypeError:
            print('DOI present but not of the correct type.')
            continue
        if outcome != doi:
            # ``or ''`` covers a notes value of None (short CSV row).
            i['notes'] = (i.get('notes') or '') + f'DOI mismatch, regex returns {outcome}; '
            print('DOI mismatch.')
        else:
            print('DOI match:')
            bibtex = return_bibtex(outcome)
            i['bibtex'] = (i.get('bibtex') or '') + bibtex
    else:
        # Record has no DOI: find a title and ask Crossref for a candidate.
        print('Trying to pull in new information:')
        title = i.get('articletitle') or i.get('booktitle')
        citation = i.get('citation')
        if not title and citation:
            # No explicit title: try to recover one from the citation text.
            try:
                parsed = break_citation(citation)
            except Exception as err:
                # break_citation signals an empty parse by raising; one
                # unparseable citation should not abort the whole batch.
                print(f'Could not parse citation: {err}')
                continue
            first = parsed[0] if parsed else {}
            title = (first.get('title') or [''])[0]
        if not title:
            # Covers both '' and None (missing column), so Crossref is never
            # queried without a real title.
            continue
        outcome = check_crossref(title)
        if outcome is not None:
            print('New match found.')
            i['newdoi'] = outcome.get('DOI')
            i['json'] = json.dumps(outcome)
        else:
            print('No new match.')

# newline='' stops the csv writer from emitting blank lines between rows on
# platforms that translate '\n' on write.
with open('output.csv', 'w', newline='') as file:
    writer = csv.DictWriter(file, fieldnames=output_fields)
    writer.writeheader()
    for i in db_data:
        row = {j: i.get(j) for j in output_fields}
        writer.writerow(row)

0 commit comments

Comments
 (0)