Skip to content

Commit c3aa52c

Browse files
committed
merge: bring commits from br 76 into reorg
1 parent a24a567 commit c3aa52c

5 files changed

Lines changed: 174 additions & 108 deletions

File tree

src/entity_extraction/preprocessing/README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ This script takes labelled dataset in JSONLines format as input and splits it in
9494
The resulting train, validation, and test sets can be used for training and evaluating machine learning models.
9595

9696
#### **Options**
97-
- `--raw_label_path=<raw_label_path>`: Specify the path to the directory where the raw label files are located.
97+
- `--raw_label_path=<raw_label_path>`: Specify the path to the directory where the raw label files exported from Label Studio and the parquet files containing the reviewed entities are located.
9898

9999
- `--output_path=<output_path>`: Specify the path to the directory where the output files will be written.
100100

@@ -126,4 +126,4 @@ This script manages the creation of custom data artifacts required for training
126126
4. Creates the custom data artifacts that can be used for training or fine-tuning spaCy models.
127127

128128
#### **Options**
129-
- `--data_path=<data_path>`: Specify the path to the folder containing files in JSONLines format.
129+
- `--data_path=<data_path>`: Specify the path to the folder containing annotation files in txt or json format.

src/entity_extraction/preprocessing/labelling_data_split.py

Lines changed: 114 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,16 @@
1717
import numpy as np
1818
import shutil
1919
import json
20-
20+
from collections import defaultdict
21+
from datetime import datetime
2122
from docopt import docopt
2223

2324
sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
2425

2526
from src.logs import get_logger
26-
logger = get_logger(__name__)
27+
from src.preprocessing.labelling_preprocessing import get_hash
2728

29+
logger = get_logger(__name__)
2830

2931
def separate_labels_to_train_val_test(
3032
labelled_file_path: str,
@@ -74,6 +76,9 @@ def separate_labels_to_train_val_test(
7476
os.makedirs(os.path.join(output_path, "val"), exist_ok=True)
7577
os.makedirs(os.path.join(output_path, "test"), exist_ok=True)
7678

79+
# Checks for parquet files and extracts them
80+
extract_parquet_file(labelled_file_path)
81+
7782
gdd_ids = get_article_gdd_ids(labelled_file_path)
7883

7984
logger.info(f"Found {len(gdd_ids)} unique GDD IDs in the labelled data.")
@@ -156,20 +161,24 @@ def separate_labels_to_train_val_test(
156161
},
157162
}
158163

159-
# iterate through the files in the folder and convert them to the hf format
160164
for file in os.listdir(labelled_file_path):
161165
# if file doesn't end with txt skip it
162-
if not file.endswith(".txt"):
163-
continue
164-
165-
with open(os.path.join(labelled_file_path, file), "r") as f:
166-
task = json.load(f)
167-
168-
try:
169-
gdd_id = task["task"]["data"]["gdd_id"]
170-
raw_text = task["task"]["data"]["text"]
171-
annotation_result = task["result"]
172-
166+
try:
167+
if file.endswith(".txt"):
168+
with open(os.path.join(labelled_file_path, file), "r") as f:
169+
task = json.load(f)
170+
annotation_result = task["result"]
171+
gdd_id = task["task"]["data"]["gdd_id"]
172+
raw_text = task["task"]["data"]["text"]
173+
elif file.endswith(".json"):
174+
with open(os.path.join(labelled_file_path, file), "r") as f:
175+
task = json.load(f)
176+
annotation_result = task["result"]
177+
gdd_id = task["data"]["gdd_id"]
178+
raw_text = task["data"]["text"]
179+
else:
180+
continue
181+
173182
# get the number of words in the article
174183
num_words = len(raw_text.split())
175184

@@ -229,8 +238,16 @@ def separate_labels_to_train_val_test(
229238
json.dump(data_metrics, f, indent=2)
230239

231240
logger.info("Finished separating files into train, val and test sets.")
232-
233-
241+
logger.info(
242+
f"Found {data_metrics['train']['entity_counts']} entities in {data_metrics['train']['article_count']} articles in train set."
243+
)
244+
logger.info(
245+
f"Found {data_metrics['val']['entity_counts']} entities in {data_metrics['val']['article_count']} articles in val set."
246+
)
247+
logger.info(
248+
f"Found {data_metrics['test']['entity_counts']} entities in {data_metrics['test']['article_count']} articles in test set."
249+
)
250+
234251
def get_article_gdd_ids(labelled_file_path: str):
235252
"""
236253
Parameters
@@ -256,24 +273,96 @@ def get_article_gdd_ids(labelled_file_path: str):
256273

257274
# iterate through the files and get the unique gdd_ids
258275
gdd_ids = []
276+
259277
for file in os.listdir(labelled_file_path):
260-
# if file doesn't end with txt skip it
261-
if not file.endswith(".txt"):
262-
continue
263-
264-
with open(os.path.join(labelled_file_path, file), "r") as f:
265-
task = json.load(f)
266-
278+
267279
try:
268-
gdd_id = task["task"]["data"]["gdd_id"]
280+
if file.endswith(".txt"):
281+
with open(os.path.join(labelled_file_path, file), "r") as f:
282+
task = json.load(f)
283+
gdd_id = task["task"]["data"]["gdd_id"]
284+
elif file.endswith(".json"):
285+
with open(os.path.join(labelled_file_path, file), "r") as f:
286+
task = json.load(f)
287+
gdd_id = task["data"]["gdd_id"]
288+
else:
289+
continue
269290
except Exception as e:
270291
logger.warning(f"Issue with file data: {file}, {e}")
271-
292+
continue
293+
272294
if gdd_id not in gdd_ids:
273295
gdd_ids.append(gdd_id)
274296

275297
return gdd_ids
276298

299+
def extract_parquet_file(labelled_file_path: str):
300+
"""Checks the directory for parquet files and extracts the corrected entities
301+
302+
Parameters
303+
----------
304+
labelled_file_path: str
305+
Directory containing the data files
306+
"""
307+
308+
files = os.listdir(labelled_file_path)
309+
310+
# Iterate through the files and check if they are parquet files
311+
for fin in files:
312+
if fin.endswith(".parquet"):
313+
df = pd.read_parquet(os.path.join(labelled_file_path, fin))
314+
315+
logger.info(f"Read parquet file {fin} with {len(df)} rows.")
316+
317+
for index, row in df.iterrows():
318+
319+
output_files = defaultdict(list)
320+
all_sentences = {}
321+
gdd_id = row["gddid"]
322+
if row["corrected_entities"] != "None":
323+
324+
logger.info(f"Entities found in xDD ID: {gdd_id}")
325+
326+
corrected_entities = json.loads(row["corrected_entities"])
327+
328+
for ent_type in corrected_entities.keys():
329+
for entity in corrected_entities[ent_type].keys():
330+
for sentence in corrected_entities[ent_type][entity]['sentence']:
331+
if (sentence['char_index']['start'] != -1 and
332+
sentence['char_index']['end'] != -1):
333+
all_sentences[sentence['sentid']] = sentence['text']
334+
output_files[sentence['sentid']].append({
335+
"value": {
336+
"text": corrected_entities[ent_type][entity]['corrected_name'],
337+
"start": sentence['char_index']['start'],
338+
"end": sentence['char_index']['end'],
339+
"labels": [ent_type]
340+
}
341+
})
342+
343+
logger.info(f"Number of sentences extracted for training: {len(output_files)}")
344+
345+
# Iterate through each sentence and create a json file
346+
for sentid in output_files.keys():
347+
text = all_sentences[sentid]
348+
article_data = {
349+
"text": text,
350+
"global_index": sentid,
351+
"local_index": sentid,
352+
"gdd_id": gdd_id,
353+
"doi": row['DOI'],
354+
"timestamp": str(datetime.today()),
355+
"chunk_hash": get_hash(text),
356+
"article_hash": get_hash(text),
357+
}
358+
output_data = {
359+
"data": article_data,
360+
"result": output_files[sentid]
361+
}
362+
file_name = os.path.join(labelled_file_path, f"{gdd_id}_{sentid}.json")
363+
# Save the dictionary as a json file
364+
with open(file_name, "w") as f:
365+
json.dump(output_data, f, indent=2)
277366

278367
def main():
279368
opt = docopt(__doc__)

src/entity_extraction/preprocessing/spacy_preprocess.py

Lines changed: 44 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -20,9 +20,6 @@
2020
# ensure that the parent directory is on the path for relative imports
2121
sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
2222

23-
from src.logs import get_logger
24-
logger = get_logger(__name__)
25-
2623
def preprocess_data(data_path: str):
2724
"""Creates data artifacts used by the Spacy model for training
2825
@@ -36,56 +33,56 @@ def preprocess_data(data_path: str):
3633
nlp = spacy.blank("en")
3734
train_files = glob.glob(os.path.join(data_path, "train", "*.txt"))
3835
val_files = glob.glob(os.path.join(data_path, "val", "*.txt"))
36+
train_files.extend(glob.glob(os.path.join(data_path, "train", "*.json")))
37+
val_files.extend(glob.glob(os.path.join(data_path, "val", "*.json")))
38+
39+
train_doc_bin = get_doc(nlp, train_files)
40+
train_doc_bin.to_disk(os.path.join(data_path, "train.spacy"))
3941

40-
logger.info(
41-
f"Number of files found under the train dir: {len(train_files)}")
42-
logger.info(
43-
f"Number of files found under the val dir: {len(val_files)}")
42+
val_doc_bin = get_doc(nlp, val_files)
43+
val_doc_bin.to_disk(os.path.join(data_path, "val.spacy"))
44+
45+
def get_doc(nlp, files):
46+
"""Creates and saves a doc bin object for training
4447
45-
def get_doc(files):
46-
"""Creates and saves a doc bin object for training
48+
Parameters
49+
----------
50+
nlp: spacy.lang
51+
A blank nlp object for english language
52+
files: list
53+
List of files that contain labelled entities
4754
48-
Parameters
49-
----------
50-
files: list
51-
List of files that contain labelled entities
52-
53-
Returns
54-
----------
55-
doc_bin: DocBin
56-
DocBin object that can be used for training the spacy model
57-
"""
58-
doc_bin = DocBin()
59-
for labelled_file in files:
60-
entities = []
61-
with open(labelled_file, 'r') as fin:
62-
article = fin.readlines()
63-
article_data = json.loads(article[0])
64-
text = article_data['task']['data']["text"]
55+
Returns
56+
----------
57+
doc_bin: DocBin
58+
DocBin object that can be used for training the spacy model
59+
"""
60+
doc_bin = DocBin()
61+
for labelled_file in files:
62+
63+
with open(labelled_file, 'r') as fin:
64+
task = json.load(fin)
6565

66-
doc = nlp.make_doc(text)
66+
if labelled_file.endswith(".txt"):
67+
text = task['task']['data']["text"]
68+
else:
69+
text = task['data']["text"]
70+
71+
entities = []
72+
doc = nlp.make_doc(text)
6773

68-
for label in article_data['result']:
69-
start = label['value']['start']
70-
end = label['value']['end']
71-
ent = label['value']['labels'][0]
72-
span = doc.char_span(start, end, label=ent)
73-
if span is not None:
74-
entities.append(span)
75-
76-
doc.ents = entities
77-
doc_bin.add(doc)
78-
79-
return doc_bin
80-
81-
train_doc_bin = get_doc(train_files)
82-
train_doc_bin.to_disk(os.path.join(data_path, "train.spacy"))
83-
84-
val_doc_bin = get_doc(val_files)
85-
val_doc_bin.to_disk(os.path.join(data_path, "val.spacy"))
86-
87-
# TODO: Else If the data_path consists of parquet files, load JSON files from all parquet files in the directory
74+
for label in task['result']:
75+
start = label['value']['start']
76+
end = label['value']['end']
77+
ent = label['value']['labels'][0]
78+
span = doc.char_span(start, end, label=ent)
79+
if span is not None:
80+
entities.append(span)
81+
82+
doc.ents = entities
83+
doc_bin.add(doc)
8884

85+
return doc_bin
8986
if __name__ == "__main__":
9087
opt = docopt(__doc__)
9188
assert os.path.exists(opt['--data_path']), \

src/entity_extraction/training/spacy/README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,10 @@ This folder contains the training and evaluation scripts for the SpaCy Transform
1212
## Training Workflow
1313

1414
A bash script is used to initialize a training job. Model training is fully customizable and users are encouraged to update the parameters in the `run_spacy_training.sh` and `spacy_transformer_train.cfg` files prior to training. The training workflow is as follows:
15-
1. Create a new data directory and dump all the TXT files (contains annotations in the JSONLines format) from Label Studio.
15+
1. Create a new data directory and dump all the JSON files containing annotations from Label Studio and any reviewed parquet files.
1616
2. Most parameters can be used with the default value, open the `run_spacy_training.sh` bash script and update the following fields with absolute paths or relative paths from the root of the repository:
1717
- `DATA_PATH`: path to directory with Label Studio labelled data
1818
- `DATA_OUTPUT_PATH`: path to directory to store the split dataset (train/val/test) as well as other data artifacts required for training.
19-
- `MODEL_PATH`: If retraining, specify path to model artifacts. If training a model from scratch, pass empty string `""`
2019
- `MODEL_OUTPUT_PATH`: path to store new model artifacts
2120
- `VERSION`: Version can be updated to keep track of different training runs.
2221
- `--gpu-id`: While executing the `spacy train` command, GPU can be used, if available, by setting this flag to **0**.

src/entity_extraction/training/spacy/run_spacy_training.sh

Lines changed: 13 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ echo "Current working directory: $(pwd)"
99

1010
DATA_PATH="/path/to/sample input folder"
1111
DATA_OUTPUT_PATH="/path/to/sample output folder"
12-
MODEL_PATH="/path/to/model artifacts"
1312
MODEL_OUTPUT_PATH="/path/to/new model artifacts"
1413
VERSION="v1"
1514
TRAIN_SPLIT=0.7
@@ -28,34 +27,16 @@ python3 src/entity_extraction/preprocessing/labelling_data_split.py \
2827

2928
python3 src/preprocessing/spacy_preprocess.py --data_path $DATA_OUTPUT_PATH
3029

31-
if [ -z "$MODEL_PATH" ]; then
32-
# If the model path is null, then start training from scratch
33-
34-
# Fill configuration with required fields
35-
python -m spacy init fill-config \
36-
src/entity_extraction/training/spacy/spacy_transformer_train.cfg \
37-
src/entity_extraction/training/spacy/spacy_transformer_$VERSION.cfg
38-
39-
# Execute the training job by pointing to the new config file
40-
python -m spacy train \
41-
src/entity_extraction/training/spacy/spacy_transformer_$VERSION.cfg \
42-
--paths.train $DATA_OUTPUT_PATH/train.spacy \
43-
--paths.dev $DATA_OUTPUT_PATH/val.spacy \
44-
--output $MODEL_OUTPUT_PATH \
45-
--gpu-id -1
46-
47-
else
48-
# Else create a new config file to resume training
49-
python src/entity_extraction/training/spacy/create_config.py \
50-
--model_path $MODEL_PATH \
51-
--output_path src/entity_extraction/training/spacy/spacy_transformer_$VERSION.cfg
52-
53-
python -m spacy train \
54-
src/entity_extraction/training/spacy/spacy_transformer_$VERSION.cfg \
55-
--paths.train $DATA_OUTPUT_PATH/train.spacy \
56-
--paths.dev $DATA_OUTPUT_PATH/val.spacy \
57-
--components.ner.source $MODEL_PATH \
58-
--components.transformer.source $MODEL_PATH \
59-
--output $MODEL_OUTPUT_PATH \
60-
--gpu-id -1
61-
fi
30+
# Start training from scratch
31+
# Fill configuration with required fields
32+
python -m spacy init fill-config \
33+
src/entity_extraction/training/spacy_ner/spacy_transformer_train.cfg \
34+
src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg
35+
36+
# Execute spacy CLI training
37+
python -m spacy train \
38+
src/entity_extraction/training/spacy_ner/spacy_transformer_$VERSION.cfg \
39+
--paths.train $DATA_OUTPUT_PATH/train.spacy \
40+
--paths.dev $DATA_OUTPUT_PATH/val.spacy \
41+
--output $MODEL_OUTPUT_PATH \
42+
--gpu-id -1

0 commit comments

Comments
 (0)