docker README updated

kellywujy · kellywujy · commit 7341c11d5ee4 · 2023-06-21T10:47:54.000-07:00
diff --git a/docker/article-relevance/README.md b/docker/article-relevance/README.md
@@ -28,6 +28,7 @@ Arguments for controlling the xDD API Query:
 - `N_RECENT`: This variable can be set to a number to retrieve the n most recently added articles. When this variable is set, no MIN_DATE or MAX_DATE should be set.
 - `MIN_DATE`: This variable can be set to establish a earliest date for the range of articles to be included. The date should follow format yyyy-mm-dd.
 - `MAX_DATE`: This variable can be set to establish a latest date for the range of articles to be included. The date should follow format yyyy-mm-dd.
+- `TERM`: This variable can be set to a word to search for in the article.
 - `AUTO_MIN_DATE`: This variable can be set to True or False. If set to True, the pipeline will screen through the date of existing processed parquet files and use the latest date as the earliest date for this run.
 - `AUTO_CHECK_DUP`:  This variable can be set to True or False. If set to True, the pipeline will screen through the date of existing processed parquet files and exclude the already-processed articles from the list.
 
@@ -39,7 +40,9 @@ Arguments for controlling the relevance prediction:
 
 ## Sample Docker Compose Setup
 
-Below is a sample docker compose configuration for running the image:
+Below are sample docker compose configurations for running the image:
+
+Sample 1: Query by number of most recently added articles
 ```yaml
 version: "0.0.1"
 services:
@@ -52,6 +55,31 @@ services:
       - N_RECENT=10
       - MIN_DATE=
       - MAX_DATE=
+      - TERM=
+      - AUTO_MIN_DATE=False
+      - AUTO_CHECK_DUP=False
+
+      # Arguments for relevance prediction script
+      - DOI_FILE_PATH=data/article-relevance/raw/gdd_api_return.json
+      - MODEL_PATH=models/article-relevance/logistic_regression_model.joblib
+      - OUTPUT_PATH=data/article-relevance/processed
+      - SEND_XDD=False
+```
+
+Sample 2: Query by date range
+```yaml
+version: "0.0.1"
+services:
+  article-relevance-prediction:
+    image: metaextractor-article-relevance-prediction:v0.0.1
+    environment:
+      # Arguments for xDD API Query
+      - DOI_PATH=data/article-relevance/raw
+      - PARQUET_PATH=data/article-relevance/processed/prediction_parquet
+      - N_RECENT=
+      - MIN_DATE=2023-06-04
+      - MAX_DATE=2023-06-05
+      - TERM=
       - AUTO_MIN_DATE=False
       - AUTO_CHECK_DUP=False
 
diff --git a/docker/article-relevance/docker-compose.yml b/docker/article-relevance/docker-compose.yml
@@ -9,11 +9,12 @@ services:
       - N_RECENT=10
       - MIN_DATE=
       - MAX_DATE=
+      - TERM=
       - AUTO_MIN_DATE=False
       - AUTO_CHECK_DUP=False
 
       # Arguments for relevance prediction script
       - DOI_FILE_PATH=data/article-relevance/raw/gdd_api_return.json
       - MODEL_PATH=models/article-relevance/logistic_regression_model.joblib
       - OUTPUT_PATH=data/article-relevance/processed
-      - SEND_XDD=False
+      - SEND_XDD=False
diff --git a/src/article_relevance/gdd_api_query.py b/src/article_relevance/gdd_api_query.py
@@ -110,24 +110,29 @@ def get_new_gdd_articles(output_path,
             
     # ========== Query API ==========
     if n_recent_articles is not None:
+        logger.info(f'Querying by n_recent = {n_recent_articles}')
         api_call = "https://geodeepdive.org/api/articles?recent" + f"&max={n_recent_articles}"
 
     # Query API by date range
     elif (min_date is not None) and (max_date is not None):
+        logger.info(f'Querying by min_date = {min_date} and max_date = {max_date}')
         api_call = f"https://xdd.wisc.edu/api/articles?min_acquired={min_date}&max_acquired={max_date}&full_results=true"
     
     elif (min_date is not None) and (max_date is None):
+        logger.info(f'Querying by min_date = {min_date}.')
         api_call = f"https://xdd.wisc.edu/api/articles?min_acquired={min_date}&full_results=true"
 
     elif (min_date is None) and (max_date is not None):
+        logger.info(f'Querying by max_date = {max_date}.')
         api_call = f"https://xdd.wisc.edu/api/articles?max_acquired={max_date}&full_results=true"
     
     else:
         raise ValueError("Please check input parameter values.")
     
     if term is not None:
-         api_extend = f"&term={term}"
-         api_call += api_extend
+        logger.info(f'Search term = {term}.')
+        api_extend = f"&term={term}"
+        api_call += api_extend
 
 
     # =========== Query xDD API to get data ==========
@@ -193,7 +198,7 @@ def get_new_gdd_articles(output_path,
 
 
     # ========== Get list of existing gddids from the parquet files =========
-    if auto_check_dup == "True":
+    if auto_check_dup.lower() == "true":
         # Get the list of existing IDs from the Parquet files
         logger.info(f'auto_check_dup is True. Removing duplicates.')
 
@@ -281,14 +286,17 @@ def main():
     parquet_file_path = opt["--parquet_path"]
     param_n_recent = opt["--n_recent"]
 
+    if param_n_recent == '': # case when n_recent is left empty in the ENV variable
+         param_n_recent = None
+
     if param_n_recent is not None:
-        param_n_recent = int(opt["--n_recent"])
+        param_n_recent = int(param_n_recent)
 
     param_min_date = opt["--min_date"]
 
     param_auto_min_date = opt['--auto_min_date']
     
-    if param_auto_min_date == 'True':
+    if param_auto_min_date.lower() == 'true':
         file_list = os.listdir(parquet_file_path)
         if len(file_list) == 0:
              logger.warning(f'auto_min_date is True, but no existing parquet file found. All queried articles up to max_date will be returned.')
diff --git a/src/article_relevance/relevance_prediction_parquet.py b/src/article_relevance/relevance_prediction_parquet.py
@@ -7,7 +7,7 @@
 
 """This script takes a list of DOI as input and output a dataframe containing all metadata, predicted relevance, predict_proba of each article.
 
-Usage: relevance_prediction.py --doi_file_path=<doi_path> --model_path=<model_path> --output_path=<output_path> --send_xdd=<send_xdd>
+Usage: relevance_prediction_parquet.py --doi_file_path=<doi_path> --model_path=<model_path> --output_path=<output_path> --send_xdd=<send_xdd>
 
 Options:
     --doi_file_path=<doi_file_path>         The path to where the list of DOI is.
@@ -63,6 +63,12 @@ def crossref_extract(doi_path):
         data_dictionary = json.load(json_file)
 
     df = pd.DataFrame(data_dictionary['data'])
+
+    if df.shape[0] == 0:
+        logger.warning(f'Last xDD API query did not retrieve any article. Please verify the arguments.')
+        raise ValueError("No article to process. Script terminated.")
+
+
     doi_col = 'DOI'
 
     # a list of doi