Skip to content

Commit 7341c11

Browse files
committed
docker README updated
1 parent 85a9d7a commit 7341c11

File tree

4 files changed

+51
-8
lines changed

4 files changed

+51
-8
lines changed

docker/article-relevance/README.md

Lines changed: 29 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@ Arguments for controlling the xDD API Query:
2828
- `N_RECENT`: This variable can be set to a number to retrieve the n most recently added articles. When this variable is set, no MIN_DATE or MAX_DATE should be set.
2929
- `MIN_DATE`: This variable can be set to establish an earliest date for the range of articles to be included. The date should follow the format yyyy-mm-dd.
3030
- `MAX_DATE`: This variable can be set to establish a latest date for the range of articles to be included. The date should follow the format yyyy-mm-dd.
31+
- `TERM`: This variable can be set to a word to search for in the article.
3132
- `AUTO_MIN_DATE`: This variable can be set to True or False. If set to True, the pipeline will scan the dates of the existing processed parquet files and use the latest date as the earliest date for this run.
3233
- `AUTO_CHECK_DUP`: This variable can be set to True or False. If set to True, the pipeline will scan the existing processed parquet files and exclude the already-processed articles from the list.
3334

@@ -39,7 +40,9 @@ Arguments for controlling the relevance prediction:
3940

4041
## Sample Docker Compose Setup
4142

42-
Below is a sample docker compose configuration for running the image:
43+
Below are sample docker compose configurations for running the image.
44+
45+
Sample 1: Query by number of most recently added articles
4346
```yaml
4447
version: "0.0.1"
4548
services:
@@ -52,6 +55,31 @@ services:
5255
- N_RECENT=10
5356
- MIN_DATE=
5457
- MAX_DATE=
58+
- TERM=
59+
- AUTO_MIN_DATE=False
60+
- AUTO_CHECK_DUP=False
61+
62+
# Arguments for relevance prediction script
63+
- DOI_FILE_PATH=data/article-relevance/raw/gdd_api_return.json
64+
- MODEL_PATH=models/article-relevance/logistic_regression_model.joblib
65+
- OUTPUT_PATH=data/article-relevance/processed
66+
- SEND_XDD=False
67+
```
68+
69+
Sample 2: Query by date range
70+
```yaml
71+
version: "0.0.1"
72+
services:
73+
article-relevance-prediction:
74+
image: metaextractor-article-relevance-prediction:v0.0.1
75+
environment:
76+
# Arguments for xDD API Query
77+
- DOI_PATH=data/article-relevance/raw
78+
- PARQUET_PATH=data/article-relevance/processed/prediction_parquet
79+
- N_RECENT=
80+
- MIN_DATE=2023-06-04
81+
- MAX_DATE=2023-06-05
82+
- TERM=
5583
- AUTO_MIN_DATE=False
5684
- AUTO_CHECK_DUP=False
5785

docker/article-relevance/docker-compose.yml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,11 +9,12 @@ services:
99
- N_RECENT=10
1010
- MIN_DATE=
1111
- MAX_DATE=
12+
- TERM=
1213
- AUTO_MIN_DATE=False
1314
- AUTO_CHECK_DUP=False
1415

1516
# Arguments for relevance prediction script
1617
- DOI_FILE_PATH=data/article-relevance/raw/gdd_api_return.json
1718
- MODEL_PATH=models/article-relevance/logistic_regression_model.joblib
1819
- OUTPUT_PATH=data/article-relevance/processed
19-
- SEND_XDD=False
20+
- SEND_XDD=False

src/article_relevance/gdd_api_query.py

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -110,24 +110,29 @@ def get_new_gdd_articles(output_path,
110110

111111
# ========== Query API ==========
112112
if n_recent_articles is not None:
113+
logger.info(f'Querying by n_recent = {n_recent_articles}')
113114
api_call = "https://geodeepdive.org/api/articles?recent" + f"&max={n_recent_articles}"
114115

115116
# Query API by date range
116117
elif (min_date is not None) and (max_date is not None):
118+
logger.info(f'Querying by min_date = {min_date} and max_date = {max_date}')
117119
api_call = f"https://xdd.wisc.edu/api/articles?min_acquired={min_date}&max_acquired={max_date}&full_results=true"
118120

119121
elif (min_date is not None) and (max_date is None):
122+
logger.info(f'Querying by min_date = {min_date}.')
120123
api_call = f"https://xdd.wisc.edu/api/articles?min_acquired={min_date}&full_results=true"
121124

122125
elif (min_date is None) and (max_date is not None):
126+
logger.info(f'Querying by max_date = {max_date}.')
123127
api_call = f"https://xdd.wisc.edu/api/articles?max_acquired={max_date}&full_results=true"
124128

125129
else:
126130
raise ValueError("Please check input parameter values.")
127131

128132
if term is not None:
129-
api_extend = f"&term={term}"
130-
api_call += api_extend
133+
logger.info(f'Search term = {term}.')
134+
api_extend = f"&term={term}"
135+
api_call += api_extend
131136

132137

133138
# =========== Query xDD API to get data ==========
@@ -193,7 +198,7 @@ def get_new_gdd_articles(output_path,
193198

194199

195200
# ========== Get list of existing gddids from the parquet files =========
196-
if auto_check_dup == "True":
201+
if auto_check_dup.lower() == "true":
197202
# Get the list of existing IDs from the Parquet files
198203
logger.info(f'auto_check_dup is True. Removing duplicates.')
199204

@@ -281,14 +286,17 @@ def main():
281286
parquet_file_path = opt["--parquet_path"]
282287
param_n_recent = opt["--n_recent"]
283288

289+
if param_n_recent == '': # case when n_recent is left empty in the ENV variable
290+
param_n_recent = None
291+
284292
if param_n_recent is not None:
285-
param_n_recent = int(opt["--n_recent"])
293+
param_n_recent = int(param_n_recent)
286294

287295
param_min_date = opt["--min_date"]
288296

289297
param_auto_min_date = opt['--auto_min_date']
290298

291-
if param_auto_min_date == 'True':
299+
if param_auto_min_date.lower() == 'true':
292300
file_list = os.listdir(parquet_file_path)
293301
if len(file_list) == 0:
294302
logger.warning(f'auto_min_date is True, but no existing parquet file found. All queried articles up to max_date will be returned.')

src/article_relevance/relevance_prediction_parquet.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77

88
"""This script takes a list of DOI as input and output a dataframe containing all metadata, predicted relevance, predict_proba of each article.
99
10-
Usage: relevance_prediction.py --doi_file_path=<doi_path> --model_path=<model_path> --output_path=<output_path> --send_xdd=<send_xdd>
10+
Usage: relevance_prediction_parquet.py --doi_file_path=<doi_path> --model_path=<model_path> --output_path=<output_path> --send_xdd=<send_xdd>
1111
1212
Options:
1313
--doi_file_path=<doi_file_path> The path to where the list of DOI is.
@@ -63,6 +63,12 @@ def crossref_extract(doi_path):
6363
data_dictionary = json.load(json_file)
6464

6565
df = pd.DataFrame(data_dictionary['data'])
66+
67+
if df.shape[0] == 0:
68+
logger.warning(f'Last xDD API query did not retrieve any article. Please verify the arguments.')
69+
raise ValueError("No article to process. Script terminated.")
70+
71+
6672
doi_col = 'DOI'
6773

6874
# a list of doi

0 commit comments

Comments
 (0)