Skip to content

Commit c3aa52c

Browse files
committed
merge: bring commits from br 76 into reorg
1 parent a24a567 commit c3aa52c

5 files changed

Lines changed: 174 additions & 108 deletions

File tree

src/entity_extraction/preprocessing/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ This script takes labelled dataset in JSONLines format as input and splits it in
9494
The resulting train, validation, and test sets can be used for training and evaluating machine learning models.
9595

9696
#### **Options**
97-
- `--raw_label_path=<raw_label_path>`: Specify the path to the directory where the raw label files are located.
97+
- `--raw_label_path=<raw_label_path>`: Specify the path to the directory where the raw label files exported from Label Studio and the parquet files containing the reviewed entities are located.
9898

9999
- `--output_path=<output_path>`: Specify the path to the directory where the output files will be written.
100100

@@ -126,4 +126,4 @@ This script manages the creation of custom data artifacts required for training
126126
4. Creates the custom data artifacts that can be used for training or fine-tuning spaCy models.
127127

128128
#### **Options**
129-
- `--data_path=<data_path>`: Specify the path to the folder containing files in JSONLines format.
129+
- `--data_path=<data_path>`: Specify the path to the folder containing annotation files in txt or json format.

src/entity_extraction/preprocessing/labelling_data_split.py

Lines changed: 114 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,16 @@
1717
import numpy as np
1818
import shutil
1919
import json
20-
20+
from collections import defaultdict
21+
from datetime import datetime
2122
from docopt import docopt
2223

2324
sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
2425

2526
from src.logs import get_logger
26-
logger = get_logger(__name__)
27+
from src.preprocessing.labelling_preprocessing import get_hash
2728

29+
logger = get_logger(__name__)
2830

2931
def separate_labels_to_train_val_test(
3032
labelled_file_path: str,
@@ -74,6 +76,9 @@ def separate_labels_to_train_val_test(
7476
os.makedirs(os.path.join(output_path, "val"), exist_ok=True)
7577
os.makedirs(os.path.join(output_path, "test"), exist_ok=True)
7678

79+
# Checks for parquet files and extracts them
80+
extract_parquet_file(labelled_file_path)
81+
7782
gdd_ids = get_article_gdd_ids(labelled_file_path)
7883

7984
logger.info(f"Found {len(gdd_ids)} unique GDD IDs in the labelled data.")
@@ -156,20 +161,24 @@ def separate_labels_to_train_val_test(
156161
},
157162
}
158163

159-
# iterate through the files in the folder and convert them to the hf format
160164
for file in os.listdir(labelled_file_path):
161165
# if file doesn't end with txt skip it
162-
if not file.endswith(".txt"):
163-
continue
164-
165-
with open(os.path.join(labelled_file_path, file), "r") as f:
166-
task = json.load(f)
167-
168-
try:
169-
gdd_id = task["task"]["data"]["gdd_id"]
170-
raw_text = task["task"]["data"]["text"]
171-
annotation_result = task["result"]
172-
166+
try:
167+
if file.endswith(".txt"):
168+
with open(os.path.join(labelled_file_path, file), "r") as f:
169+
task = json.load(f)
170+
annotation_result = task["result"]
171+
gdd_id = task["task"]["data"]["gdd_id"]
172+
raw_text = task["task"]["data"]["text"]
173+
elif file.endswith(".json"):
174+
with open(os.path.join(labelled_file_path, file), "r") as f:
175+
task = json.load(f)
176+
annotation_result = task["result"]
177+
gdd_id = task["data"]["gdd_id"]
178+
raw_text = task["data"]["text"]
179+
else:
180+
continue
181+
173182
# get the number of words in the article
174183
num_words = len(raw_text.split())
175184

@@ -229,8 +238,16 @@ def separate_labels_to_train_val_test(
229238
json.dump(data_metrics, f, indent=2)
230239

231240
logger.info("Finished separating files into train, val and test sets.")
232-
233-
241+
logger.info(
242+
f"Found {data_metrics['train']['entity_counts']} entities in {data_metrics['train']['article_count']} articles in train set."
243+
)
244+
logger.info(
245+
f"Found {data_metrics['val']['entity_counts']} entities in {data_metrics['val']['article_count']} articles in val set."
246+
)
247+
logger.info(
248+
f"Found {data_metrics['test']['entity_counts']} entities in {data_metrics['test']['article_count']} articles in test set."
249+
)
250+
234251
def get_article_gdd_ids(labelled_file_path: str):
235252
"""
236253
Parameters
@@ -256,24 +273,96 @@ def get_article_gdd_ids(labelled_file_path: str):
256273

257274
# iterate through the files and get the unique gdd_ids
258275
gdd_ids = []
276+
259277
for file in os.listdir(labelled_file_path):
260-
# if file doesn't end with txt skip it
261-
if not file.endswith(".txt"):
262-
continue
263-
264-
with open(os.path.join(labelled_file_path, file), "r") as f:
265-
task = json.load(f)
266-
278+
267279
try:
268-
gdd_id = task["task"]["data"]["gdd_id"]
280+
if file.endswith(".txt"):
281+
with open(os.path.join(labelled_file_path, file), "r") as f:
282+
task = json.load(f)
283+
gdd_id = task["task"]["data"]["gdd_id"]
284+
elif file.endswith(".json"):
285+
with open(os.path.join(labelled_file_path, file), "r") as f:
286+
task = json.load(f)
287+
gdd_id = task["data"]["gdd_id"]
288+
else:
289+
continue
269290
except Exception as e:
270291
logger.warning(f"Issue with file data: {file}, {e}")
271-
292+
continue
293+
272294
if gdd_id not in gdd_ids:
273295
gdd_ids.append(gdd_id)
274296

275297
return gdd_ids
276298

299+
def extract_parquet_file(labelled_file_path: str):
300+
"""Checks the directory for parquet files and extracts the corrected entities
301+
302+
Parameters
303+
----------
304+
labelled_file_path: str
305+
Directory containing the data files
306+
"""
307+
308+
files = os.listdir(labelled_file_path)
309+
310+
# Iterate through the files and check if they are parquet files
311+
for fin in files:
312+
if fin.endswith(".parquet"):
313+
df = pd.read_parquet(os.path.join(labelled_file_path, fin))
314+
315+
logger.info(f"Read parquet file {fin} with {len(df)} rows.")
316+
317+
for index, row in df.iterrows():
318+
319+
output_files = defaultdict(list)
320+
all_sentences = {}
321+
gdd_id = row["gddid"]
322+
if row["corrected_entities"] != "None":
323+
324+
logger.info(f"Entities found in xDD ID: {gdd_id}")
325+
326+
corrected_entities = json.loads(row["corrected_entities"])
327+
328+
for ent_type in corrected_entities.keys():
329+
for entity in corrected_entities[ent_type].keys():
330+
for sentence in corrected_entities[ent_type][entity]['sentence']:
331+
if (sentence['char_index']['start'] != -1 and
332+
sentence['char_index']['end'] != -1):
333+
all_sentences[sentence['sentid']] = sentence['text']
334+
output_files[sentence['sentid']].append({
335+
"value": {
336+
"text": corrected_entities[ent_type][entity]['corrected_name'],
337+
"start": sentence['char_index']['start'],
338+
"end": sentence['char_index']['end'],
339+
"labels": [ent_type]
340+
}
341+
})
342+
343+
logger.info(f"Number of sentences extracted for training: {len(output_files)}")
344+
345+
# Iterate through each sentence and create a json file
346+
for sentid in output_files.keys():
347+
text = all_sentences[sentid]
348+
article_data = {
349+
"text": text,
350+
"global_index": sentid,
351+
"local_index": sentid,
352+
"gdd_id": gdd_id,
353+
"doi": row['DOI'],
354+
"timestamp": str(datetime.today()),
355+
"chunk_hash": get_hash(text),
356+
"article_hash": get_hash(text),
357+
}
358+
output_data = {
359+
"data": article_data,
360+
"result": output_files[sentid]
361+
}
362+
file_name = os.path.join(labelled_file_path, f"{gdd_id}_{sentid}.json")
363+
# Save the dictionary as a json file
364+
with open(file_name, "w") as f:
365+
json.dump(output_data, f, indent=2)
277366

278367
def main():
279368
opt = docopt(__doc__)

src/entity_extraction/preprocessing/spacy_preprocess.py

Lines changed: 44 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,6 @@
2020
# ensure that the parent directory is on the path for relative imports
2121
sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
2222

23-
from src.logs import get_logger
24-
logger = get_logger(__name__)
25-
2623
def preprocess_data(data_path: str):
2724
"""Creates data artifacts used by the Spacy model for training
2825
@@ -36,56 +33,56 @@ def preprocess_data(data_path: str):
3633
nlp = spacy.blank("en")
3734
train_files = glob.glob(os.path.join(data_path, "train", "*.txt"))
3835
val_files = glob.glob(os.path.join(data_path, "val", "*.txt"))
36+
train_files.extend(glob.glob(os.path.join(data_path, "train", "*.json")))
37+
val_files.extend(glob.glob(os.path.join(data_path, "val", "*.json")))
38+
39+
train_doc_bin = get_doc(nlp, train_files)
40+
train_doc_bin.to_disk(os.path.join(data_path, "train.spacy"))
3941

40-
logger.info(
41-
f"Number of files found under the train dir: {len(train_files)}")
42-
logger.info(
43-
f"Number of files found under the val dir: {len(val_files)}")
42+
val_doc_bin = get_doc(nlp, val_files)
43+
val_doc_bin.to_disk(os.path.join(data_path, "val.spacy"))
44+
45+
def get_doc(nlp, files):
46+
"""Creates and saves a doc bin object for training
4447
45-
def get_doc(files):
46-
"""Creates and saves a doc bin object for training
48+
Parameters
49+
----------
50+
nlp: spacy.lang
51+
A blank nlp object for english language
52+
files: list
53+
List of files that contain labelled entities
4754
48-
Parameters
49-
----------
50-
files: list
51-
List of files that contain labelled entities
52-
53-
Returns
54-
----------
55-
doc_bin: DocBin
56-
DocBin object that can be used for training the spacy model
57-
"""
58-
doc_bin = DocBin()
59-
for labelled_file in files:
60-
entities = []
61-
with open(labelled_file, 'r') as fin:
62-
article = fin.readlines()
63-
article_data = json.loads(article[0])
64-
text = article_data['task']['data']["text"]
55+
Returns
56+
----------
57+
doc_bin: DocBin
58+
DocBin object that can be used for training the spacy model
59+
"""
60+
doc_bin = DocBin()
61+
for labelled_file in files:
62+
63+
with open(labelled_file, 'r') as fin:
64+
task = json.load(fin)
6565

66-
doc = nlp.make_doc(text)
66+
if labelled_file.endswith(".txt"):
67+
text = task['task']['data']["text"]
68+
else:
69+
text = task['data']["text"]
70+
71+
entities = []
72+
doc = nlp.make_doc(text)
6773

68-
for label in article_data['result']:
69-
start = label['value']['start']
70-
end = label['value']['end']
71-
ent = label['value']['labels'][0]
72-
span = doc.char_span(start, end, label=ent)
73-
if span is not None:
74-
entities.append(span)
75-
76-
doc.ents = entities
77-
doc_bin.add(doc)
78-
79-
return doc_bin
80-
81-
train_doc_bin = get_doc(train_files)
82-
train_doc_bin.to_disk(os.path.join(data_path, "train.spacy"))
83-
84-
val_doc_bin = get_doc(val_files)
85-
val_doc_bin.to_disk(os.path.join(data_path, "val.spacy"))
86-
87-
# TODO: Else If the data_path consists of parquet files, load JSON files from all parquet files in the directory
74+
for label in task['result']:
75+
start = label['value']['start']
76+
end = label['value']['end']
77+
ent = label['value']['labels'][0]
78+
span = doc.char_span(start, end, label=ent)
79+
if span is not None:
80+
entities.append(span)
81+
82+
doc.ents = entities
83+
doc_bin.add(doc)
8884

85+
return doc_bin
8986
if __name__ == "__main__":
9087
opt = docopt(__doc__)
9188
assert os.path.exists(opt['--data_path']), \

src/entity_extraction/training/spacy/README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,10 @@ This folder contains the training and evaluation scripts for the SpaCy Transform
1212
## Training Workflow
1313

1414
A bash script is used to initialize a training job. Model training is fully customizable and users are encouraged to update the parameters in the `run_spacy_training.sh` and `spacy_transformer_train.cfg` files prior to training. The training workflow is as follows:
15-
1. Create a new data directory and dump all the TXT files (contains annotations in the JSONLines format) from Label Studio.
15+
1. Create a new data directory and dump all the JSON files containing annotations from Label Studio and any reviewed parquet files.
1616
2. Most parameters can be used with the default value, open the `run_spacy_training.sh` bash script and update the following fields with absolute paths or relative paths from the root of the repository:
1717
- `DATA_PATH`: path to directory with Label Studio labelled data
1818
- `DATA_OUTPUT_PATH`: path to directory to store the split dataset (train/val/test) as well as other data artifacts required for training.
19-
- `MODEL_PATH`: If retraining, specify path to model artifacts. If training a model from scratch, pass empty string `""`
2019
- `MODEL_OUTPUT_PATH`: path to store new model artifacts
2120
- `VERSION`: Version can be updated to keep track of different training runs.
2221
- `--gpu-id`: While executing the `spacy train` command, GPU can be used, if available, by setting this flag to **0**.

src/entity_extraction/training/spacy/run_spacy_training.sh

Lines changed: 13 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ echo "Current working directory: $(pwd)"
99

1010
DATA_PATH="/path/to/sample input folder"
1111
DATA_OUTPUT_PATH="/path/to/sample output folder"
12-
MODEL_PATH="/path/to/model artifacts"
1312
MODEL_OUTPUT_PATH="/path/to/new model artifacts"
1413
VERSION="v1"
1514
TRAIN_SPLIT=0.7
@@ -28,34 +27,16 @@ python3 src/entity_extraction/preprocessing/labelling_data_split.py \
2827

2928
python3 src/preprocessing/spacy_preprocess.py --data_path $DATA_OUTPUT_PATH
3029

31-
if [ -z "$MODEL_PATH" ]; then
32-
# If the model path is null, then start training from scratch
33-
34-
# Fill configuration with required fields
35-
python -m spacy init fill-config \
36-
src/entity_extraction/training/spacy/spacy_transformer_train.cfg \
37-
src/entity_extraction/training/spacy/spacy_transformer_$VERSION.cfg
38-
39-
# Execute the training job by pointing to the new config file
40-
python -m spacy train \
41-
src/entity_extraction/training/spacy/spacy_transformer_$VERSION.cfg \
42-
--paths.train $DATA_OUTPUT_PATH/train.spacy \
43-
--paths.dev $DATA_OUTPUT_PATH/val.spacy \
44-
--output $MODEL_OUTPUT_PATH \
45-
--gpu-id -1
46-
47-
else
48-
# Else create a new config file to resume training
49-
python src/entity_extraction/training/spacy/create_config.py \
50-
--model_path $MODEL_PATH \
51-
--output_path src/entity_extraction/training/spacy/spacy_transformer_$VERSION.cfg
52-
53-
python -m spacy train \
54-
src/entity_extraction/training/spacy/spacy_transformer_$VERSION.cfg \
55-
--paths.train $DATA_OUTPUT_PATH/train.spacy \
56-
--paths.dev $DATA_OUTPUT_PATH/val.spacy \
57-
--components.ner.source $MODEL_PATH \
58-
--components.transformer.source $MODEL_PATH \
59-
--output $MODEL_OUTPUT_PATH \
60-
--gpu-id -1
61-
fi
30+
# Start training from scratch
31+
# Fill configuration with required fields
32+
python -m spacy init fill-config \
33+
src/entity_extraction/training/spacy_ner/spacy_transformer_train.cfg \
34+
src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg
35+
36+
# Execute spacy CLI training
37+
python -m spacy train \
38+
src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg \
39+
--paths.train $DATA_OUTPUT_PATH/train.spacy \
40+
--paths.dev $DATA_OUTPUT_PATH/val.spacy \
41+
--output $MODEL_OUTPUT_PATH \
42+
--gpu-id -1

0 commit comments

Comments
 (0)