enhancement: train spacy model from scratch

brabbit61 · brabbit61 · commit b76378d1fe2e · 2023-06-27T18:04:35.000-07:00
diff --git a/src/entity_extraction/training/spacy_ner/README.md b/src/entity_extraction/training/spacy_ner/README.md
@@ -12,11 +12,10 @@ This folder contains the training and evaluation scripts for the SpaCy Transform
 ## Training Workflow
 
 A bash script is used to initialize a training job. Model training is fully customizable and users are encouraged to update the parameters in the `run_spacy_training.sh` and `spacy_transfomer_train.cfg` files prior to training. The training workflow is as follows:
-1. Create a new data directory and dump all the TXT files (contains annotations in the JSONLines format) from Label Studio.
+1. Create a new data directory and place in it all the JSON annotation files exported from Label Studio, along with any reviewed parquet files.
 2. Most parameters can be used with the default value, open the `run_spacy_training.sh` bash script and update the following fields with absolute paths or relative paths from the root of the repository:
    - `DATA_PATH`: path to directory with Label Studio labelled data
    - `DATA_OUTPUT_PATH`: path to directory to store the split dataset (train/val/test) as well as other data artifacts required for training.
-   - `MODEL_PATH`: If retraining, specify path to model artifacts. If training a model from scratch, pass empty string `""`
    - `MODEL_OUTPUT_PATH`: path to store new model artifacts
    - `VERSION`: Version can be updated to keep track of different training runs.
    - `--gpu-id`: While executing the `spacy train` command, GPU can be used, if available, by setting this flag to **0**.
diff --git a/src/entity_extraction/training/spacy_ner/run_spacy_training.sh b/src/entity_extraction/training/spacy_ner/run_spacy_training.sh
@@ -9,7 +9,6 @@ echo "Current working directory: $(pwd)"
 
 DATA_PATH="/path/to/sample input folder"
 DATA_OUTPUT_PATH="/path/to/sample output folder"
-MODEL_PATH="/path/to/model artifacts"
 MODEL_OUTPUT_PATH="/path/to/new model artifacts"
 VERSION="v1"
 TRAIN_SPLIT=0.7
@@ -28,34 +27,17 @@ python3 src/preprocessing/labelling_data_split.py \
 
 python3 src/preprocessing/spacy_preprocess.py --data_path $DATA_OUTPUT_PATH
 
-if [ -z "$MODEL_PATH" ]; then
-    # If the model path is null, then start training from scratch
-
-    # Fill configuration with required fields
-    python -m spacy init fill-config \
-            src/entity_extraction/training/spacy_ner/spacy_transformer_train.cfg \
-            src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg
-
-    # Execute the training job by pointing to the new config file
-    python -m spacy train \
-        src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg \
-        --paths.train $DATA_OUTPUT_PATH/train.spacy \
-        --paths.dev $DATA_OUTPUT_PATH/val.spacy \
-        --output $MODEL_OUTPUT_PATH \
-        --gpu-id -1
-
-else
-    # Else create a new config file to resume training
-    python src/entity_extraction/training/spacy_ner/create_config.py \
-        --model_path $MODEL_PATH \
-        --output_path src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg
-
-    python -m spacy train \
-        src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg \
-        --paths.train $DATA_OUTPUT_PATH/train.spacy \
-        --paths.dev $DATA_OUTPUT_PATH/val.spacy \
-        --components.ner.source $MODEL_PATH \
-        --components.transformer.source $MODEL_PATH \
-        --output $MODEL_OUTPUT_PATH \
-        --gpu-id -1
-fi
+# Train from scratch: complete the base config, then run `spacy train` on it.
+# Expansions are quoted so paths containing spaces stay a single argument.
+# Fill configuration with required fields
+python -m spacy init fill-config \
+        src/entity_extraction/training/spacy_ner/spacy_transformer_train.cfg \
+        "src/entity_extraction/training/spacy_ner/spacy_transformer_${VERSION}.cfg"
+
+# Execute spacy CLI training (--gpu-id -1 runs on CPU; set to 0 to use a GPU)
+python -m spacy train \
+    "src/entity_extraction/training/spacy_ner/spacy_transformer_${VERSION}.cfg" \
+    --paths.train "${DATA_OUTPUT_PATH}/train.spacy" \
+    --paths.dev "${DATA_OUTPUT_PATH}/val.spacy" \
+    --output "${MODEL_OUTPUT_PATH}" \
+    --gpu-id -1