Skip to content

Commit 7eb735d

Browse files
committed
Using uv for package management locally.
The qmd file will ultimately show how to manage these publication tools to discover DOIs, manage metadata and upload to Neotoma.
1 parent f749d2c commit 7eb735d

24 files changed

+512
-23
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
3.12

Proposals/publications/README.md

Whitespace-only changes.
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
# Container image for the publication tooling: installs Python, Ruby (for the
# `anystyle` gem), uv, and the project's locked Python dependencies, then runs
# src/update_doi_bibtex.py.
FROM ubuntu:latest
WORKDIR /usr/local/app

# Install system packages in a single layer and remove the apt caches in the
# same layer so they do not persist in the image.
# NOTE(review): apt reports `--force-yes` as deprecated in favour of the
# `--allow-*` options; confirm whether plain `-y` is sufficient here.
RUN apt-get update --fix-missing \
    && apt-get upgrade -y --force-yes \
    && apt-get install -y --force-yes \
        python3 \
        libssl-dev \
        wget \
        ruby-full \
        curl \
        git \
        libreadline-dev \
        build-essential \
        autoconf \
        automake \
        libtool \
        make \
        python3-pip \
    && apt-get clean \
    && rm -rf /var/cache/apt/archives/* /var/lib/apt/lists/*

# Install a pinned uv release using the official installer script.
ADD https://astral.sh/uv/0.4.28/install.sh /uv-installer.sh
RUN sh /uv-installer.sh && rm /uv-installer.sh

# The installer places `uv` in a per-user bin directory that is not on PATH by
# default; without this, the `uv sync` and `uv run` steps below cannot find it.
# Both locations are listed because the install directory differs between uv
# releases (~/.cargo/bin for 0.4.x, ~/.local/bin for later versions).
ENV PATH="/root/.local/bin:/root/.cargo/bin:$PATH"

RUN gem install anystyle

ADD . /app

WORKDIR /app
# --frozen: install exactly what uv.lock specifies without re-resolving.
RUN uv sync --frozen

# Exec form must be a valid JSON array; the original was missing the comma
# between "run" and the script path, which made Docker treat the line as a
# shell-form command.
CMD ["uv", "run", "src/update_doi_bibtex.py"]
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
[project]
2+
name = "publications"
3+
version = "0.1.0"
4+
description = "Add your description here"
5+
readme = "README.md"
6+
authors = [
7+
{ name = "Simon", email = "[email protected]" }
8+
]
9+
requires-python = ">=3.12"
10+
dependencies = [
11+
"crossref>=0.1.2",
12+
"psycopg2>=2.9.10",
13+
"pybtex",
14+
"pybtex-apa-style>=1.3",
15+
"python-dotenv>=1.0.1",
16+
"requests>=2.32.3",
17+
"six>=1.16.0",
18+
]
19+
20+
[build-system]
21+
requires = ["hatchling"]
22+
build-backend = "hatchling.build"
23+
24+
[tool.uv.sources]
25+
pybtex = { git = "https://bitbucket.org/pybtex-devs/pybtex/" }
Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
# Read in all Neotoma publications from a Postgres table export
# Attempt to clean DOI fields to match the regex `\b10\.\d{4,9}/[-.;()/:\w]+`
# For records with DOIs, call out to CrossRef to get a formatted BibTeX object
# For records without, call CrossRef with the article title to find a DOI.
# Return all objects to a CSV for hand validation.

import csv
import sys
import json
from publications import clean_doi, return_bibtex, call_publications, break_citation, check_crossref
import argparse

# Columns written to the output CSV, in order.
OUTPUT_FIELDS = ['publicationid', 'citation', 'doi', 'newdoi', 'bibtex']

# A record that already carries any of these keys is skipped.
PROCESSED_KEYS = frozenset({'bibtex', 'newdoi', 'json'})


def _append_note(record, note):
    """Append *note* to ``record['notes']``, treating a missing or None value as empty."""
    record['notes'] = (record.get('notes') or '') + note


def _check_existing_doi(record):
    """Clean the record's DOI and attach the BibTeX that ``return_bibtex`` gives for it.

    Adds a note when the cleaned DOI differs from the stored one, or when
    ``return_bibtex`` returns None for the cleaned DOI.
    """
    doi = record.get('doi')
    try:
        outcome = clean_doi(doi)
        if outcome != doi:
            _append_note(record, f'DOI mismatch, regex returns {outcome}; ')
            print('DOI mismatch.')
        else:
            print('DOI match:')
        bibtex = return_bibtex(outcome)
        if bibtex is None:
            print(f'Issue with DOI {outcome}')
            _append_note(record, ' CrossRef DOI does not exists; ')
        else:
            record['bibtex'] = bibtex
    except TypeError:
        print('DOI present but not of the correct type.')


def _find_title(record):
    """Return a title to search on, or '' when none can be determined.

    Prefers the article title, then the book title; only when both are
    empty or missing is the free-text citation parsed with ``break_citation``.
    """
    title = record.get('articletitle') or record.get('booktitle')
    if title:
        return title
    citation = record.get('citation')
    if not citation:
        return ''
    # presumably break_citation returns a sequence of dicts whose 'title'
    # value is a list of strings -- verify against its implementation.
    parsed = break_citation(citation)
    if not parsed:
        return ''
    titles = parsed[0].get('title') or ['']
    return titles[0]


def _find_new_doi(record):
    """Look the record's title up with ``check_crossref`` and store any match.

    On a match, sets ``newdoi`` and ``json`` on the record, and ``bibtex``
    when ``return_bibtex`` yields a value for the new DOI.
    """
    print('Trying to pull in new information:')
    title = _find_title(record)
    if not title:
        return
    outcome = check_crossref(title)
    if outcome is None:
        print('No new match.')
        return
    print('New match found.')
    new_doi = outcome.get('DOI')
    record['newdoi'] = new_doi
    record['json'] = json.dumps(outcome)
    bibtex = return_bibtex(new_doi)
    # return_bibtex may return None; leave 'bibtex' unset in that case
    # instead of failing on str + None.
    if bibtex is not None:
        record['bibtex'] = bibtex


def main():
    """Parse the command line, process the publication records, and write the CSV."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--output', '-O', help="A valid output filename.", type=str, default='output.csv')
    parser.add_argument('--limit', '-l', help="How many Neotoma publications to process?", type=int, default=100)
    args = parser.parse_args()

    csv.field_size_limit(sys.maxsize)

    publications = (entry.get('publication') for entry in call_publications(limit=args.limit))
    db_data = [pub for pub in publications if pub.get('doi') is None]

    # For each Neotoma publication record without a DOI:
    for record in db_data:
        print(f"publicationid: {record.get('publicationid')}")
        if not PROCESSED_KEYS.isdisjoint(record):
            continue
        # NOTE(review): db_data only keeps records whose 'doi' is None, so the
        # existing-DOI branch is never taken as written -- confirm whether the
        # filter above or this branch reflects the intended behaviour.
        if record.get('doi'):
            _check_existing_doi(record)
        else:
            _find_new_doi(record)

    # newline='' is what the csv module expects of files it writes; the
    # explicit encoding keeps non-ASCII BibTeX text independent of the locale.
    with open(args.output, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=OUTPUT_FIELDS)
        writer.writeheader()
        for record in db_data:
            writer.writerow({field: record.get(field) for field in OUTPUT_FIELDS})


if __name__ == '__main__':
    main()
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+

Proposals/publications/post_doi_bibtex.py renamed to Proposals/publications/src/post_doi_bibtex.py

Lines changed: 8 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
import os
99
import json
1010
import csv
11-
import requests
12-
import urllib
11+
import sys
12+
from publications import return_bibtex
1313

1414
dotenv.load_dotenv()
1515

@@ -23,34 +23,19 @@
2323
DO UPDATE SET doi = EXCLUDED.doi,
2424
bibtex = EXCLUDED.bibtex;"""
2525

26-
27-
def return_bibtex(doi_string:str):
28-
url = 'https://doi.org/' + urllib.request.quote(doi_string)
29-
header = {
30-
'Accept': 'application/x-bibtex',
31-
'User-Agent': 'Neotoma Publication Augmenter; mailto:[email protected]'
32-
}
33-
try:
34-
response = requests.get(url, headers=header, timeout = 10)
35-
except requests.exceptions.ReadTimeout as e:
36-
return None
37-
if response.status_code == 200:
38-
return response.text.strip()
39-
else:
40-
return None
41-
42-
43-
with open('output.csv', 'r') as newdois:
26+
with open(sys.argv[1], 'r') as newdois:
4427
reader = csv.DictReader(newdois)
28+
if sys.argv[2] not in reader.fieldnames:
29+
raise KeyError(f'The value {sys.argv[2]} is not a column heading in {sys.argv[1]}')
4530
for i in reader:
46-
bibtex = return_bibtex(i.get('doi'))
31+
bibtex = return_bibtex(i.get(sys.argv[2]))
4732
if bibtex is not None:
4833
with conn.cursor() as cur:
49-
cur.execute(QUERY, {'doi': i.get('doi'),
34+
cur.execute(QUERY, {'doi': i.get(sys.argv[2]),
5035
'bibtex': bibtex,
5136
'publicationid': int(i.get('publicationid'))})
5237
conn.commit()
5338
cur.close()
5439
print(f'Added bibtex and DOI for publication {i.get('publicationid')}.')
5540
else:
56-
print(f'Could not resolve bibtex/ DOI for publication {i.get('publicationid')}: {i.get('doi')}.')
41+
print(f'Could not resolve bibtex/ DOI for publication {i.get('publicationid')}: {i.get(sys.argv[2])}.')
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from .call_publications import call_publications as call_publications
2+
from .break_citation import break_citation as break_citation
3+
from .check_crossref import check_crossref as check_crossref
4+
from .clean_doi import clean_doi as clean_doi
5+
from .parse_raw import parse_raw as parse_raw
6+
from .return_bibtex import return_bibtex as return_bibtex
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)