1111import re
1212import subprocess
1313import json
14+ from time import sleep
1415from crossref .restful import Works , Etiquette
1516
1617csv .field_size_limit (sys .maxsize )
1718
19+
def clean_doi(doi_string: str):
    """Extract a DOI from a text string.

    Args:
        doi_string (str): A text string that purportedly contains a DOI.

    Raises:
        TypeError: If the object passed is not a string.

    Returns:
        str | None: The DOI found in the string, or None when the string
            contains nothing matching the DOI pattern.
    """
    if not isinstance(doi_string, str):
        # TypeError is a subclass of Exception, so callers that catch the
        # generic Exception keep working.
        raise TypeError('The doi passed is not a string.')
    # The greedy leading `.*` makes the capture group start at the right-most
    # position where a DOI-like substring begins.
    outcome = re.match(r'.*(\b10\.\d{4,9}/[-.;()/:\w]+)', doi_string)
    if outcome is None:
        return None
    return outcome.group(1)
2639
40+
def break_citation(citation: str):
    """Break a citation string apart with the `anystyle` command-line tool.

    The citation is written to a file inside a private temporary directory,
    `anystyle -f json parse <file>` is run on it, and the JSON printed on
    stdout is decoded and returned.

    Args:
        citation (str): A citation string from the Neotoma Database.

    Raises:
        ValueError: If anystyle produced no output for the citation.

    Returns:
        The decoded JSON output of anystyle (presumably a list/dict of parsed
        citation fields -- confirm against the anystyle version in use).
    """
    import os
    import tempfile

    # A fresh directory per call avoids collisions on a fixed, shared /tmp
    # filename and is removed automatically when the block exits.
    with tempfile.TemporaryDirectory() as work_dir:
        # One variable feeds both the write and the parse step, so the two
        # can never point at different files.
        citation_path = os.path.join(work_dir, 'temp.txt')
        with open(citation_path, 'w') as wr:
            wr.write(citation)
        outcome = subprocess.run(
            ['anystyle', '-f', 'json', 'parse', citation_path],
            capture_output=True,
        )
    if outcome.stdout == b'':
        # ValueError is a subclass of Exception, so callers that catch the
        # generic Exception keep working.
        raise ValueError(f'Could not perform extraction from: {citation}')
    return json.loads(outcome.stdout)
3560
61+
3662def return_bibtex (doi_string :str ):
3763 url = 'https://doi.org/' + urllib .request .quote (doi_string )
3864 header = {
@@ -42,6 +68,7 @@ def return_bibtex(doi_string:str):
4268 response = requests .get (url , headers = header )
4369 return response .text .strip ()
4470
71+
4572def check_crossref (cite_object :str ):
4673 url = 'https://api.crossref.org/works'
4774 url_call = requests .get (url ,
@@ -60,14 +87,26 @@ def check_crossref(cite_object:str):
6087 else :
6188 return None
6289
63- with open ('data/neotoma_publications_202410071440.csv' ) as file :
64- db_data = list (csv .DictReader (file ))
90+
def call_publications():
    """Get publications from the Neotoma API.

    Issues a GET request to the Neotoma v2.0 publications endpoint and
    extracts the value stored under `data` -> `result` in the JSON response.

    Returns:
        The `data.result` entry of the response (presumably a list of
        publication records -- confirm against the API), or an empty list
        when the request does not return HTTP 200 or the entry is missing.
    """
    result = requests.get("https://api.neotomadb.org/v2.0/data/publications?limit=100000")
    if result.status_code != 200:
        # Return an empty list instead of falling through to an implicit
        # None, so callers that iterate the result do not crash.
        return []
    payload = json.loads(result.content)
    # Tolerate a missing or null `data` / `result` entry in the payload.
    return (payload.get('data') or {}).get('result') or []
101+
102+ db_data = [i .get ('publication' ) for i in call_publications ()]
65103
66104# For each row
67105for i in db_data :
106+ print (f'publicationid: { i .get ('publicationid' )} ' )
68107 if any ([j in ['bibtex' , 'newdoi' , 'json' ] for j in i .keys ()]):
69108 continue
70- if i .get ('doi' , '' ) != '' :
109+ if i .get ('doi' , '' ) or '' != '' :
71110 try :
72111 outcome = clean_doi (i .get ('doi' ))
73112 if outcome != i .get ('doi' ):
@@ -93,8 +132,12 @@ def check_crossref(cite_object:str):
93132 print ('New match found.' )
94133 i ['newdoi' ] = outcome .get ('DOI' )
95134 i ['json' ] = json .dumps (outcome )
135+ bibtex = return_bibtex (outcome .get ('DOI' ))
136+ i ['bibtex' ] = i .get ('bibtex' , '' ) + bibtex
96137 else :
97138 print ('No new match.' )
139+ sleep (2 )
140+
98141
99142with open ('output.csv' , 'w' ) as file :
100143 writer = csv .DictWriter (file , fieldnames = ['publicationid' , 'citation' , 'doi' , 'notes' , 'newdoi' , 'json' , 'bibtex' ])
0 commit comments