Skip to content

Commit 21b711a

Browse files
committed
docs: setup preprocessing folder
1 parent 6d253c9 commit 21b711a

File tree

5 files changed

+136
-3
lines changed

5 files changed

+136
-3
lines changed
File renamed without changes.
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# Author: Ty Andrews
2+
# Date: 2023-05-24
3+
"""This script procsses the labelstudio output data into a format used by huggingface.
4+
5+
Usage: labelstudio_preprocessing.py --label_files=<label_files> [--max_seq_length=<max_seq_length>] [--stride=<stride>]
6+
7+
Options:
8+
--label_files=<label_files> The path to where the label files are. [default: all]
9+
--max_seq_length=<max_seq_length> How many tokens the text is split into per training example. [default: 256]
10+
--stride=<stride> How many tokens to move the window by. [default: 192]
11+
"""
12+
13+
import os, sys
14+
15+
import pandas as pd
16+
import numpy as np
17+
import json
18+
from docopt import docopt
19+
20+
sys.path.append(
21+
os.path.join(os.path.dirname(__file__), os.pardir, os.pardir, os.pardir, os.pardir)
22+
)
23+
24+
from src.logs import get_logger
25+
26+
logger = get_logger(__name__)
27+
28+
from src.entity_extraction.evaluation.entity_extraction_evaluation import get_token_labels
29+
30+
31+
def convert_labelled_data_to_hf_format(
    labelled_file_path: str,
    max_seq_length: int = 256,
    stride: int = 192,
):
    """
    Processes train/val/test data from labelstudio into a format used by huggingface.

    Each annotation export file in the train/test/val sub-folders is tokenized,
    split into overlapping windows of at most ``max_seq_length`` tokens, and the
    windows for each split are written to ``<split>.json`` in newline-delimited
    JSON with keys ``ner_tags`` and ``tokens``.

    Parameters
    ----------
    labelled_file_path : str
        The path to the folder containing the labelled data. Must contain
        train/, test/ and val/ sub-folders.
    max_seq_length : int, optional
        The maximum number of tokens per training example, by default 256.
    stride : int, optional
        How many tokens the chunking window advances per chunk, by default 192.
        A stride smaller than max_seq_length produces overlapping chunks.

    Returns
    -------
    None. Writes train.json/test.json/val.json into ``labelled_file_path``.

    Raises
    ------
    FileNotFoundError
        If ``labelled_file_path`` or any of its train/test/val sub-folders
        is missing.
    """
    # check the folder exists
    if not os.path.exists(labelled_file_path):
        raise FileNotFoundError(f"The folder {labelled_file_path} does not exist.")

    # check the folder contains folders train/test/val
    for split in ("train", "test", "val"):
        if not os.path.exists(os.path.join(labelled_file_path, split)):
            raise FileNotFoundError(
                f"The folder {labelled_file_path} does not contain a {split} folder."
            )

    for folder in ["train", "test", "val"]:
        data_folder = os.path.join(labelled_file_path, folder)

        logger.info(f"Processing {folder} data.")

        labelled_chunks = []

        for file in os.listdir(data_folder):
            # if file doesn't end with txt skip it
            # NOTE(review): the exports carry a .txt extension but contain JSON
            if not file.endswith(".txt"):
                continue

            # Parse inside the try so one malformed export is skipped with a
            # warning instead of aborting the whole run.
            try:
                with open(os.path.join(data_folder, file), "r") as f:
                    task = json.load(f)

                raw_text = task["task"]["data"]["text"]
                annotation_result = task["result"]
                gdd_id = task["task"]["data"]["gdd_id"]

                labelled_entities = [
                    annotation["value"] for annotation in annotation_result
                ]

                tokens, token_labels = get_token_labels(labelled_entities, raw_text)

                # split the data into chunks of tokens and labels; windows
                # overlap by max_seq_length - stride tokens
                chunked_tokens = [
                    tokens[i : i + max_seq_length]
                    for i in range(0, len(tokens), stride)
                ]
                chunked_labels = [
                    token_labels[i : i + max_seq_length]
                    for i in range(0, len(token_labels), stride)
                ]

                # make each chunk a dict with keys ner_tags and tokens
                chunked_data = [
                    {
                        "ner_tags": chunked_labels[i],
                        "tokens": chunked_tokens[i],
                    }
                    for i in range(len(chunked_tokens))
                ]

                labelled_chunks.extend(chunked_data)

            except Exception as e:
                logger.warning(f"Issue detected with file, skipping: {file}, {e}")
            else:
                # Only reached when this file parsed cleanly — previously this
                # line ran after a failure too, raising NameError on the first
                # failing file or reporting the prior file's chunk count.
                logger.debug(f"Processed {file}, generated {len(chunked_data)} chunks.")

        # save the data to the hf_processed folder with each list item in a new line delimited json
        with open(os.path.join(labelled_file_path, f"{folder}.json"), "w") as f:
            for item in labelled_chunks:
                f.write(json.dumps(item) + "\n")
125+
126+
127+
# Script entry point: parse CLI options (declared in the module docstring)
# with docopt and run the preprocessing.
if __name__ == "__main__":
    args = docopt(__doc__)
    convert_labelled_data_to_hf_format(
        labelled_file_path=args["--label_files"],
        max_seq_length=int(args["--max_seq_length"]),
        stride=int(args["--stride"]),
    )
File renamed without changes.

src/preprocessing/labelling_preprocessing.py renamed to src/entity_extraction/preprocessing/labelling_preprocessing.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,9 @@
3333
from src.logs import get_logger
3434
# logger = logging.getLogger(__name__)
3535
logger = get_logger(__name__)
36-
logger.setLevel(logging.INFO)
3736

38-
from src.entity_extraction.baseline_entity_extraction import baseline_extract_all
39-
from src.entity_extraction.spacy_entity_extraction import spacy_extract_all
37+
from src.entity_extraction.prediction.baseline_entity_extraction import baseline_extract_all
38+
from src.entity_extraction.prediction.spacy_entity_extraction import spacy_extract_all
4039

4140

4241
def clean_words(words: list):
File renamed without changes.

0 commit comments

Comments
 (0)