|
| 1 | +# Read in all Neotoma publications from a Postgres table export |
| 2 | +# Attempt to clean DOI fields to match the regex `\b10\.\d{4,9}/[-.;()/:\w]+` |
| 3 | +# For records with DOIs, call out to doi.org (content negotiation) to get a formatted BibTeX object |
| 4 | +# For records without, call CrossRef with the article title to find a DOI. |
| 5 | +# Return all objects to a CSV for hand validation. |
| 6 | + |
| 7 | +import requests |
| 8 | +import csv |
| 9 | +import sys |
| 10 | +import urllib |
| 11 | +import re |
| 12 | +import subprocess |
| 13 | +import json |
| 14 | +from crossref.restful import Works, Etiquette |
| 15 | + |
| 16 | +csv.field_size_limit(sys.maxsize) |
| 17 | + |
def clean_doi(doi_string: str):
    """Extract a bare DOI from a free-form DOI field.

    The field may hold a resolver URL, a ``doi:`` prefix or other text
    around the identifier; the first substring matching
    ``\\b10\\.\\d{4,9}/[-.;()/:\\w]+`` is returned.

    Args:
        doi_string: Raw contents of the DOI field.

    Returns:
        The matched DOI string, or ``None`` when nothing DOI-like is found.

    Raises:
        TypeError: If ``doi_string`` is not a string.
    """
    if not isinstance(doi_string, str):
        # Raise a real TypeError so that callers using `except TypeError`
        # actually catch it (a generic Exception would slip past them).
        raise TypeError(f'The doi passed -- {doi_string} -- is not a string.')
    # `search` takes the leftmost match.  A greedy `.*` prefix would prefer
    # the last possible start and truncate DOIs whose suffix itself looks
    # like a DOI prefix; it also could not cross newlines.
    outcome = re.search(r'\b10\.\d{4,9}/[-.;()/:\w]+', doi_string)
    if outcome is None:
        return None
    return outcome.group(0)
| 26 | + |
def break_citation(citation: str):
    """Parse a free-text citation into structured fields with ``anystyle``.

    The citation is written to a private temporary file, which is passed to
    ``anystyle -f json parse <file>``; the JSON printed on stdout is decoded
    and returned.

    Args:
        citation: The citation text to parse.

    Returns:
        The decoded JSON output of ``anystyle``.

    Raises:
        ValueError: If ``anystyle`` produced no output for the citation.
    """
    import os
    import tempfile

    # A unique temporary file (instead of a fixed 'temp.txt' in the working
    # directory) avoids clobbering between concurrent runs and leaves no
    # stray file behind.  The encoding is explicit so non-ASCII citations do
    # not depend on the platform default.
    with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False,
                                     encoding='utf-8') as handle:
        handle.write(citation)
        temp_path = handle.name
    try:
        outcome = subprocess.run(
            ['anystyle', '-f', 'json', 'parse', temp_path],
            capture_output=True,
        )
    finally:
        os.remove(temp_path)
    if not outcome.stdout.strip():
        raise ValueError(f'Could not perform extraction from: {citation}')
    return json.loads(outcome.stdout)
| 35 | + |
def return_bibtex(doi_string: str):
    """Fetch the BibTeX record for a DOI via doi.org content negotiation.

    Args:
        doi_string: A bare DOI (no resolver prefix).

    Returns:
        The BibTeX text with surrounding whitespace stripped, or an empty
        string when the resolver does not answer with a success status (so
        an HTML error page is never stored as if it were BibTeX).
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.parse` / `urllib.request` are loaded.
    from urllib.parse import quote

    url = 'https://doi.org/' + quote(doi_string)
    header = {
        'Accept': 'application/x-bibtex',
        'User-Agent': 'Neotoma Publication Augmenter; mailto:[email protected]'
    }
    # Without a timeout a stalled connection would hang the whole batch.
    response = requests.get(url, headers=header, timeout=30)
    if not response.ok:
        return ''
    return response.text.strip()
| 44 | + |
def check_crossref(cite_object: str):
    """Look up the best Crossref match for a publication title.

    Args:
        cite_object: The title (or other bibliographic text) to search for.

    Returns:
        The first item of the Crossref response (a dict restricted to the
        ``DOI``, ``title``, ``container-title`` and ``published`` fields),
        or ``None`` when the request fails or nothing is found.
    """
    url = 'https://api.crossref.org/works'
    response = requests.get(
        url,
        headers={
            'Accept': 'application/json',
            'User-Agent': 'Neotoma Publication Augmenter; mailto:[email protected]',
        },
        params={
            'rows': 1,
            'select': 'DOI,title,container-title,published',
            # Send a real field query.  Passing 'query.title=<title>' as the
            # value of the generic `query` parameter only adds noise tokens
            # to a free-text search.
            'query.bibliographic': cite_object,
        },
        # Without a timeout a stalled connection would hang the whole batch.
        timeout=30,
    )
    if response.status_code != 200:
        return None
    message = response.json().get('message') or {}
    items = message.get('items') or []
    # Guard the index so an empty or absent item list yields None.
    return items[0] if items else None
| 62 | + |
# Load the publication export into memory as a list of dicts (one per row).
# newline='' is required by the csv module so that newlines embedded in
# quoted fields (e.g. multi-line citations) are parsed correctly.
with open('data/neotoma_publications_202410071440.csv', newline='') as file:
    db_data = list(csv.DictReader(file))
| 65 | + |
# For each row: validate an existing DOI and fetch its BibTeX, or, when no
# DOI is recorded, search Crossref for one using the best available title.
result_columns = {'bibtex', 'newdoi', 'json'}
for record in db_data:
    # Rows that already carry any result column are left untouched.
    if result_columns.intersection(record):
        continue

    # csv.DictReader fills missing trailing fields with None, so normalise
    # to '' before testing or concatenating.
    doi = record.get('doi') or ''
    if doi:
        try:
            cleaned = clean_doi(doi)
        except TypeError:
            print('DOI present but not of the correct type.')
            continue
        if cleaned != doi:
            record['notes'] = (
                (record.get('notes') or '')
                + f'DOI mismatch, regex returns {cleaned}; '
            )
            print('DOI mismatch.')
        else:
            print('DOI match:')
            # BibTeX is only fetched when the stored DOI is already clean.
            record['bibtex'] = (record.get('bibtex') or '') + return_bibtex(cleaned)
        continue

    print('Trying to pull in new information:')
    title = record.get('articletitle') or record.get('booktitle')
    if not title and record.get('citation'):
        # No explicit title: fall back to parsing the free-text citation.
        parsed = break_citation(record['citation'])
        titles = parsed[0].get('title') if parsed else None
        title = titles[0] if titles else ''
    if not title:
        # Nothing to search with (also covers None from a missing column,
        # which would otherwise be sent to Crossref as the text "None").
        continue

    match = check_crossref(title)
    if match is not None:
        print('New match found.')
        record['newdoi'] = match.get('DOI')
        record['json'] = json.dumps(match)
    else:
        print('No new match.')
| 98 | + |
# Write the augmented records to a CSV for hand validation.
output_fields = ['publicationid', 'citation', 'doi', 'notes', 'newdoi', 'json', 'bibtex']
# newline='' lets the csv module control line endings; without it, rows are
# separated by blank lines on Windows and embedded newlines can be mangled.
with open('output.csv', 'w', newline='') as file:
    # extrasaction='ignore' drops columns not listed in output_fields and
    # restval='' fills in the ones a record does not have.
    writer = csv.DictWriter(file, fieldnames=output_fields, restval='',
                            extrasaction='ignore')
    writer.writeheader()
    writer.writerows(db_data)
0 commit comments