Skip to content

Commit 410a1a6

Browse files
committed
Merge evaluation changes from branch 22
1 parent 1e082ce commit 410a1a6

5 files changed

Lines changed: 192 additions & 32 deletions

File tree

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,11 @@ data/**/*.json
66
data/**/*.csv
77
!data/raw/taxa.csv
88

9+
# ignore files in models folder but keep .gitkeep
10+
models/ner/*
11+
results/ner/*
12+
!.gitkeep
13+
914
# Byte-compiled / optimized / DLL files
1015
__pycache__/
1116
*.py[cod]

models/ner/.gitkeep

Whitespace-only changes.

results/ner/.gitkeep

Whitespace-only changes.

src/entity_extraction/entity_extraction_evaluation.py

Lines changed: 47 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
from spacy.tokens import Doc
1313
import spacy
1414
import json
15-
15+
import copy
1616

1717
def load_json_label_files(labelled_file_path:str):
1818
"""
@@ -35,24 +35,27 @@ def load_json_label_files(labelled_file_path:str):
3535
combined_text = ""
3636
all_labelled_entities = []
3737
for file in os.listdir(labelled_file_path):
38+
39+
# if file is a txt file load it
40+
if file.endswith(".txt"):
3841

39-
with open(os.path.join(labelled_file_path, file), "r") as f:
40-
task = json.load(f)
42+
with open(os.path.join(labelled_file_path, file), "r") as f:
43+
task = json.load(f)
4144

42-
raw_text = task['task']['data']['text']
45+
raw_text = task['task']['data']['text']
4346

44-
annotation_result = task['result']
45-
labelled_entities = [annotation['value'] for annotation in annotation_result]
47+
annotation_result = task['result']
48+
labelled_entities = [annotation['value'] for annotation in annotation_result]
4649

47-
# add the current text length to the start and end indices of labels plus one for the space
48-
for entity in labelled_entities:
49-
entity['start'] += len(combined_text)
50-
entity['end'] += len(combined_text)
50+
# add the current text length to the start and end indices of labels plus one for the space
51+
for entity in labelled_entities:
52+
entity['start'] += len(combined_text)
53+
entity['end'] += len(combined_text)
5154

52-
all_labelled_entities += labelled_entities
55+
all_labelled_entities += labelled_entities
5356

54-
# add the current text to the combined text with space in between
55-
combined_text += raw_text + " "
57+
# add the current text to the combined text with space in between
58+
combined_text += raw_text + " "
5659

5760
return combined_text, all_labelled_entities
5861

@@ -71,8 +74,6 @@ def get_token_labels(labelled_entities, raw_text):
7174
7275
Returns
7376
-------
74-
tokens : list
75-
A list of tokens in the raw text.
7677
token_labels : list
7778
A list of labels per token in the raw text.
7879
"""
@@ -115,9 +116,9 @@ def plot_token_classification_report(
115116
116117
Parameters
117118
----------
118-
labelled_tokens : list
119+
labelled_tokens : list[lists]
119120
A list of labels per token in the raw text.
120-
predicted_tokens : list
121+
predicted_tokens : list[lists]
121122
A list of labels per token in the raw text.
122123
title : str
123124
The title of the plot.
@@ -136,13 +137,20 @@ def plot_token_classification_report(
136137
"""
137138

138139
if method == "tokens":
140+
# copy the lists so they aren't modified outside this function
141+
labelled_tokens = copy.deepcopy(labelled_tokens)
142+
predicted_tokens = copy.deepcopy(predicted_tokens)
139143
# in each list replace all I- labels with B- labels so each token is
140-
# considered a separate entity
141-
labelled_tokens = [label.replace("I-", "B-") for label in labelled_tokens]
142-
predicted_tokens = [label.replace("I-", "B-") for label in predicted_tokens]
144+
# considered a separate entity and update the token label objects
145+
for i, document in enumerate(labelled_tokens):
146+
document = [label.replace("I-", "B-") for label in document]
147+
labelled_tokens[i] = document
148+
for i, document in enumerate(predicted_tokens):
149+
document = [label.replace("I-", "B-") for label in document]
150+
predicted_tokens[i] = document
143151

144152
clf_report = classification_report(
145-
[labelled_tokens], [predicted_tokens], output_dict=True, zero_division=0
153+
labelled_tokens, predicted_tokens, output_dict=True, zero_division=0
146154
)
147155

148156
fig, ax = plt.subplots(figsize=(8, 6))
@@ -172,10 +180,10 @@ def calculate_entity_classification_metrics(
172180
173181
Parameters
174182
----------
175-
labelled_tokens : list
176-
The labelled tokens.
177-
predicted_tokens : list
178-
The predicted tokens.
183+
labelled_tokens : list[lists]
184+
The labelled tokens per document.
185+
predicted_tokens : list[lists]
186+
The predicted tokens per document.
179187
method : str, optional
180188
The method to use to calculate the scores, by default "entities"
181189
which calculates the scores based on complete entities extracted from BIO
@@ -193,18 +201,25 @@ def calculate_entity_classification_metrics(
193201
"""
194202

195203
if method == "tokens":
204+
# copy the lists so they aren't modified outside this function
205+
labelled_tokens = copy.deepcopy(labelled_tokens)
206+
predicted_tokens = copy.deepcopy(predicted_tokens)
196207
# in each list replace all I- labels with B- labels so each token is
197-
# considered a separate entity
198-
labelled_tokens = [label.replace("I-", "B-") for label in labelled_tokens]
199-
predicted_tokens = [label.replace("I-", "B-") for label in predicted_tokens]
208+
# considered a separate entity and update the token label objects
209+
for i, document in enumerate(labelled_tokens):
210+
document = [label.replace("I-", "B-") for label in document]
211+
labelled_tokens[i] = document
212+
for i, document in enumerate(predicted_tokens):
213+
document = [label.replace("I-", "B-") for label in document]
214+
predicted_tokens[i] = document
200215

201-
accuracy = accuracy_score([labelled_tokens], [predicted_tokens])
216+
accuracy = accuracy_score(labelled_tokens, predicted_tokens)
202217

203-
f1 = f1_score([labelled_tokens], [predicted_tokens])
218+
f1 = f1_score(labelled_tokens, predicted_tokens)
204219

205-
recall = recall_score([labelled_tokens], [predicted_tokens])
220+
recall = recall_score(labelled_tokens, predicted_tokens)
206221

207-
precision = precision_score([labelled_tokens], [predicted_tokens])
222+
precision = precision_score(labelled_tokens, predicted_tokens)
208223

209224
return accuracy, f1, recall, precision
210225

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
<!---
2+
Copyright 2020 The HuggingFace Team. All rights reserved.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
-->
16+
17+
Original code adapted from here on May 23, 2023: https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification
18+
19+
# Improvements/Adjustments
20+
The following improvements were added to the tools linked above:
21+
1. The ability to log models and metrics to an Azure ML workspace MLflow instance was added and requiring environment variables to be set and the azureml-mlflow package to be installed to log.
22+
1. The environment variables AZURE_TENANT_ID, AZURE_CLIENT_ID, AZURE_CLIENT_SECRET and AZURE_MLFLOW_TRACKING_URI must be set using the .env file in the root of the repo.
23+
2. Adding in the automated text preprocessing from labelstudio outputs in the `labelstudio_preprocessing.py` file which is added as a bash script target.
24+
25+
# Token classification
26+
27+
## PyTorch version
28+
29+
Fine-tuning the library models for token classification tasks such as Named Entity Recognition (NER), Parts-of-speech
30+
tagging (POS) or phrase extraction (CHUNKS). The main script `run_ner.py` leverages the 🤗 Datasets library and the Trainer API. You can easily
31+
customize it to your needs if you need extra processing on your datasets.
32+
33+
It will either run on a dataset hosted on our [hub](https://huggingface.co/datasets) or with your own text files for
34+
training and validation, you might just need to add some tweaks in the data preprocessing.
35+
36+
The following example fine-tunes BERT on CoNLL-2003:
37+
38+
```bash
39+
python run_ner.py \
40+
--model_name_or_path bert-base-uncased \
41+
--dataset_name conll2003 \
42+
--output_dir /tmp/test-ner \
43+
--do_train \
44+
--do_eval
45+
```
46+
47+
or you can just run the bash script `run.sh`.
48+
49+
To run on your own training and validation files, use the following command:
50+
51+
```bash
52+
python run_ner.py \
53+
--model_name_or_path bert-base-uncased \
54+
--train_file path_to_train_file \
55+
--validation_file path_to_validation_file \
56+
--output_dir /tmp/test-ner \
57+
--do_train \
58+
--do_eval
59+
```
60+
61+
**Note:** This script only works with models that have a fast tokenizer (backed by the 🤗 Tokenizers library) as it
62+
uses special features of those tokenizers. You can check if your favorite model has a fast tokenizer in
63+
[this table](https://huggingface.co/transformers/index.html#supported-frameworks), if it doesn't you can still use the old version
64+
of the script.
65+
66+
> If your model classification head dimensions do not fit the number of labels in the dataset, you can specify `--ignore_mismatched_sizes` to adapt it.
67+
68+
## Old version of the script
69+
70+
You can find the old version of the PyTorch script [here](https://github.com/huggingface/transformers/blob/main/examples/legacy/token-classification/run_ner.py).
71+
72+
## Pytorch version, no Trainer
73+
74+
Based on the script [run_ner_no_trainer.py](https://github.com/huggingface/transformers/blob/main/examples/pytorch/token-classification/run_ner_no_trainer.py).
75+
76+
Like `run_ner.py`, this script allows you to fine-tune any of the models on the [hub](https://huggingface.co/models) on a
77+
token classification task, either NER, POS or CHUNKS tasks or your own data in a csv or a JSON file. The main difference is that this
78+
script exposes the bare training loop, to allow you to quickly experiment and add any customization you would like.
79+
80+
It offers fewer options than the script with `Trainer` (for instance you can easily change the options for the optimizer
81+
or the dataloaders directly in the script) but still runs in a distributed setup, on TPU, and supports mixed precision by
82+
means of the [🤗 `Accelerate`](https://github.com/huggingface/accelerate) library. You can use the script normally
83+
after installing it:
84+
85+
```bash
86+
pip install git+https://github.com/huggingface/accelerate
87+
```
88+
89+
then
90+
91+
```bash
92+
export TASK_NAME=ner
93+
94+
python run_ner_no_trainer.py \
95+
--model_name_or_path bert-base-cased \
96+
--dataset_name conll2003 \
97+
--task_name $TASK_NAME \
98+
--max_length 128 \
99+
--per_device_train_batch_size 32 \
100+
--learning_rate 2e-5 \
101+
--num_train_epochs 3 \
102+
--output_dir /tmp/$TASK_NAME/
103+
```
104+
105+
You can then use your usual launchers to run it in a distributed environment, but the easiest way is to run
106+
107+
```bash
108+
accelerate config
109+
```
110+
111+
and reply to the questions asked. Then
112+
113+
```bash
114+
accelerate test
115+
```
116+
117+
that will check everything is ready for training. Finally, you can launch training with
118+
119+
```bash
120+
export TASK_NAME=ner
121+
122+
accelerate launch run_ner_no_trainer.py \
123+
--model_name_or_path bert-base-cased \
124+
--dataset_name conll2003 \
125+
--task_name $TASK_NAME \
126+
--max_length 128 \
127+
--per_device_train_batch_size 32 \
128+
--learning_rate 2e-5 \
129+
--num_train_epochs 3 \
130+
--output_dir /tmp/$TASK_NAME/
131+
```
132+
133+
This command is the same and will work for:
134+
135+
- a CPU-only setup
136+
- a setup with one GPU
137+
- a distributed training with several GPUs (single or multi node)
138+
- a training on TPUs
139+
140+
Note that this library is in alpha release so your feedback is more than welcome if you encounter any problem using it.

0 commit comments

Comments
 (0)