Skip to content

Commit 410a1a6

Browse files
committed
Merge evaluation changes from branch 22
1 parent 1e082ce commit 410a1a6

5 files changed

Lines changed: 192 additions & 32 deletions

File tree

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@ data/**/*.json
66
data/**/*.csv
77
!data/raw/taxa.csv
88

9+
# ignore files in models folder but keep .gitkeep
10+
models/ner/*
11+
results/ner/*
12+
!.gitkeep
13+
914
# Byte-compiled / optimized / DLL files
1015
__pycache__/
1116
*.py[cod]

models/ner/.gitkeep

Whitespace-only changes.

results/ner/.gitkeep

Whitespace-only changes.

src/entity_extraction/entity_extraction_evaluation.py

Lines changed: 47 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from spacy.tokens import Doc
1313
import spacy
1414
import json
15-
15+
import copy
1616

1717
def load_json_label_files(labelled_file_path:str):
1818
"""
@@ -35,24 +35,27 @@ def load_json_label_files(labelled_file_path:str):
3535
combined_text = ""
3636
all_labelled_entities = []
3737
for file in os.listdir(labelled_file_path):
38+
39+
# if file is a txt file load it
40+
if file.endswith(".txt"):
3841

39-
with open(os.path.join(labelled_file_path, file), "r") as f:
40-
task = json.load(f)
42+
with open(os.path.join(labelled_file_path, file), "r") as f:
43+
task = json.load(f)
4144

42-
raw_text = task['task']['data']['text']
45+
raw_text = task['task']['data']['text']
4346

44-
annotation_result = task['result']
45-
labelled_entities = [annotation['value'] for annotation in annotation_result]
47+
annotation_result = task['result']
48+
labelled_entities = [annotation['value'] for annotation in annotation_result]
4649

47-
# add the current text length to the start and end indices of labels plus one for the space
48-
for entity in labelled_entities:
49-
entity['start'] += len(combined_text)
50-
entity['end'] += len(combined_text)
50+
# add the current text length to the start and end indices of labels plus one for the space
51+
for entity in labelled_entities:
52+
entity['start'] += len(combined_text)
53+
entity['end'] += len(combined_text)
5154

52-
all_labelled_entities += labelled_entities
55+
all_labelled_entities += labelled_entities
5356

54-
# add the current text to the combined text with space in between
55-
combined_text += raw_text + " "
57+
# add the current text to the combined text with space in between
58+
combined_text += raw_text + " "
5659

5760
return combined_text, all_labelled_entities
5861

@@ -71,8 +74,6 @@ def get_token_labels(labelled_entities, raw_text):
7174
7275
Returns
7376
-------
74-
tokens : list
75-
A list of tokens in the raw text.
7677
token_labels : list
7778
A list of labels per token in the raw text.
7879
"""
@@ -115,9 +116,9 @@ def plot_token_classification_report(
115116
116117
Parameters
117118
----------
118-
labelled_tokens : list
119+
labelled_tokens : list[lists]
119120
A list of labels per token in the raw text.
120-
predicted_tokens : list
121+
predicted_tokens : list[lists]
121122
A list of labels per token in the raw text.
122123
title : str
123124
The title of the plot.
@@ -136,13 +137,20 @@ def plot_token_classification_report(
136137
"""
137138

138139
if method == "tokens":
140+
# copy the lists so they aren't modified outside this function
141+
labelled_tokens = copy.deepcopy(labelled_tokens)
142+
predicted_tokens = copy.deepcopy(predicted_tokens)
139143
# in each list replace all I- labels with B- labels so each token is
140-
# considered a separate entity
141-
labelled_tokens = [label.replace("I-", "B-") for label in labelled_tokens]
142-
predicted_tokens = [label.replace("I-", "B-") for label in predicted_tokens]
144+
# considered a separate entity and update the token label objects
145+
for i, document in enumerate(labelled_tokens):
146+
document = [label.replace("I-", "B-") for label in document]
147+
labelled_tokens[i] = document
148+
for i, document in enumerate(predicted_tokens):
149+
document = [label.replace("I-", "B-") for label in document]
150+
predicted_tokens[i] = document
143151

144152
clf_report = classification_report(
145-
[labelled_tokens], [predicted_tokens], output_dict=True, zero_division=0
153+
labelled_tokens, predicted_tokens, output_dict=True, zero_division=0
146154
)
147155

148156
fig, ax = plt.subplots(figsize=(8, 6))
@@ -172,10 +180,10 @@ def calculate_entity_classification_metrics(
172180
173181
Parameters
174182
----------
175-
labelled_tokens : list
176-
The labelled tokens.
177-
predicted_tokens : list
178-
The predicted tokens.
183+
labelled_tokens : list[lists]
184+
The labelled tokens per document.
185+
predicted_tokens : list[lists]
186+
The predicted tokens per document.
179187
method : str, optional
180188
The method to use to calculate the scores, by default "entities"
181189
which calculates the scores based on complete entities extracted from BIO
@@ -193,18 +201,25 @@ def calculate_entity_classification_metrics(
193201
"""
194202

195203
if method == "tokens":
204+
# copy the lists so they aren't modified outside this function
205+
labelled_tokens = copy.deepcopy(labelled_tokens)
206+
predicted_tokens = copy.deepcopy(predicted_tokens)
196207
# in each list replace all I- labels with B- labels so each token is
197-
# considered a separate entity
198-
labelled_tokens = [label.replace("I-", "B-") for label in labelled_tokens]
199-
predicted_tokens = [label.replace("I-", "B-") for label in predicted_tokens]
208+
# considered a separate entity and update the token label objects
209+
for i, document in enumerate(labelled_tokens):
210+
document = [label.replace("I-", "B-") for label in document]
211+
labelled_tokens[i] = document
212+
for i, document in enumerate(predicted_tokens):
213+
document = [label.replace("I-", "B-") for label in document]
214+
predicted_tokens[i] = document
200215

201-
accuracy = accuracy_score([labelled_tokens], [predicted_tokens])
216+
accuracy = accuracy_score(labelled_tokens, predicted_tokens)
202217

203-
f1 = f1_score([labelled_tokens], [predicted_tokens])
218+
f1 = f1_score(labelled_tokens, predicted_tokens)
204219

205-
recall = recall_score([labelled_tokens], [predicted_tokens])
220+
recall = recall_score(labelled_tokens, predicted_tokens)
206221

207-
precision = precision_score([labelled_tokens], [predicted_tokens])
222+
precision = precision_score(labelled_tokens, predicted_tokens)
208223

209224
return accuracy, f1, recall, precision
210225

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
<!---
2+
Copyright 2020 The HuggingFace Team. All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
-->
16+
17+
Original code adapted from here on May 23, 2023: https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification
18+
19+
# Improvements/Adjustments
20+
The following improvements were added to the tools linked above:
21+
1. The ability to log models and metrics to an Azure ML workspace MLflow instance was added and requiring environment variables to be set and the azureml-mlflow package to be installed to log.
22+
1. The environment variables AZURE_TENANT_ID, AZURE_CLIENT_ID, AZURE_CLIENT_SECRET and AZURE_MLFLOW_TRACKING_URI must be set using the .env file in the root of the repo.
23+
2. Adding in the automated text preprocessing from labelstudio outputs in the `labelstudio_preprocessing.py` file which is added as a bash script target.
24+
25+
# Token classification
26+
27+
## PyTorch version
28+
29+
Fine-tuning the library models for token classification tasks such as Named Entity Recognition (NER), Parts-of-speech
30+
tagging (POS) or phrase extraction (CHUNKS). The main script `run_ner.py` leverages the 🤗 Datasets library and the Trainer API. You can easily
31+
customize it to your needs if you need extra processing on your datasets.
32+
33+
It will either run on a dataset hosted on our [hub](https://huggingface.co/datasets) or with your own text files for
34+
training and validation, you might just need to add some tweaks in the data preprocessing.
35+
36+
The following example fine-tunes BERT on CoNLL-2003:
37+
38+
```bash
39+
python run_ner.py \
40+
--model_name_or_path bert-base-uncased \
41+
--dataset_name conll2003 \
42+
--output_dir /tmp/test-ner \
43+
--do_train \
44+
--do_eval
45+
```
46+
47+
or you can just run the bash script `run.sh`.
48+
49+
To run on your own training and validation files, use the following command:
50+
51+
```bash
52+
python run_ner.py \
53+
--model_name_or_path bert-base-uncased \
54+
--train_file path_to_train_file \
55+
--validation_file path_to_validation_file \
56+
--output_dir /tmp/test-ner \
57+
--do_train \
58+
--do_eval
59+
```
60+
61+
**Note:** This script only works with models that have a fast tokenizer (backed by the 🤗 Tokenizers library) as it
62+
uses special features of those tokenizers. You can check if your favorite model has a fast tokenizer in
63+
[this table](https://huggingface.co/transformers/index.html#supported-frameworks), if it doesn't you can still use the old version
64+
of the script.
65+
66+
> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it.
67+
68+
## Old version of the script
69+
70+
You can find the old version of the PyTorch script [here](https://github.com/huggingface/transformers/blob/main/examples/legacy/token-classification/run_ner.py).
71+
72+
## Pytorch version, no Trainer
73+
74+
Based on the script [run_ner_no_trainer.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/token-classification/run_ner_no_trainer.py).
75+
76+
Like `run_ner.py`, this script allows you to fine-tune any of the models on the [hub](https://huggingface.co/models) on a
77+
token classification task, either NER, POS or CHUNKS tasks or your own data in a csv or a JSON file. The main difference is that this
78+
script exposes the bare training loop, to allow you to quickly experiment and add any customization you would like.
79+
80+
It offers fewer options than the script with `Trainer` (for instance you can easily change the options for the optimizer
81+
or the dataloaders directly in the script) but still runs in a distributed setup, on TPU, and supports mixed precision by
82+
means of the [🤗 `Accelerate`](https://github.com/huggingface/accelerate) library. You can use the script normally
83+
after installing it:
84+
85+
```bash
86+
pip install git+https://github.com/huggingface/accelerate
87+
```
88+
89+
then
90+
91+
```bash
92+
export TASK_NAME=ner
93+
94+
python run_ner_no_trainer.py \
95+
--model_name_or_path bert-base-cased \
96+
--dataset_name conll2003 \
97+
--task_name $TASK_NAME \
98+
--max_length 128 \
99+
--per_device_train_batch_size 32 \
100+
--learning_rate 2e-5 \
101+
--num_train_epochs 3 \
102+
--output_dir /tmp/$TASK_NAME/
103+
```
104+
105+
You can then use your usual launchers to run it in a distributed environment, but the easiest way is to run
106+
107+
```bash
108+
accelerate config
109+
```
110+
111+
and reply to the questions asked. Then
112+
113+
```bash
114+
accelerate test
115+
```
116+
117+
that will check everything is ready for training. Finally, you can launch training with
118+
119+
```bash
120+
export TASK_NAME=ner
121+
122+
accelerate launch run_ner_no_trainer.py \
123+
--model_name_or_path bert-base-cased \
124+
--dataset_name conll2003 \
125+
--task_name $TASK_NAME \
126+
--max_length 128 \
127+
--per_device_train_batch_size 32 \
128+
--learning_rate 2e-5 \
129+
--num_train_epochs 3 \
130+
--output_dir /tmp/$TASK_NAME/
131+
```
132+
133+
This command is the same and will work for:
134+
135+
- a CPU-only setup
136+
- a setup with one GPU
137+
- a distributed training with several GPUs (single or multi node)
138+
- a training on TPUs
139+
140+
Note that this library is in alpha release so your feedback is more than welcome if you encounter any problem using it.

0 commit comments

Comments
 (0)