Merge pull request #99 from NeotomaDB/dev

tieandrews · web-flow · commit a6833c66ff2d · 2023-06-28T12:57:22.000-07:00
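Update entity name: fall back to the original entity text when no corrected name is set, and accept .json label files in Hugging Face preprocessing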
diff --git a/src/entity_extraction/preprocessing/huggingface_preprocess.py b/src/entity_extraction/preprocessing/huggingface_preprocess.py
@@ -74,18 +74,22 @@ def convert_labelled_data_to_hf_format(
         labelled_chunks = []
 
         for file in os.listdir(data_folder):
-            # if file doesn't end with txt skip it
-            if not file.endswith(".txt"):
-                continue
-
-            with open(os.path.join(data_folder, file), "r") as f:
-                task = json.load(f)
-
             try:
-                raw_text = task["task"]["data"]["text"]
-                annotation_result = task["result"]
-                gdd_id = task["task"]["data"]["gdd_id"]
-
+                if file.endswith(".txt"):
+                    with open(os.path.join(data_folder, file), "r") as f:
+                        task = json.load(f)
+                    annotation_result = task["result"]
+                    gdd_id = task["task"]["data"]["gdd_id"]
+                    raw_text = task["task"]["data"]["text"]
+                elif file.endswith(".json"):
+                    with open(os.path.join(data_folder, file), "r") as f:
+                        task = json.load(f)
+                    annotation_result = task["result"]
+                    gdd_id = task["data"]["gdd_id"]
+                    raw_text = task["data"]["text"]
+                else:
+                    continue      
+                
                 labelled_entities = [
                     annotation["value"] for annotation in annotation_result
                 ]
diff --git a/src/entity_extraction/preprocessing/labelling_data_split.py b/src/entity_extraction/preprocessing/labelling_data_split.py
@@ -327,13 +327,17 @@ def extract_parquet_file(labelled_file_path: str):
                     
                     for ent_type in corrected_entities.keys():
                         for entity in corrected_entities[ent_type].keys():
+                            if corrected_entities[ent_type][entity]['corrected_name']:
+                                entity_text = corrected_entities[ent_type][entity]['corrected_name']
+                            else:
+                                entity_text = entity
                             for sentence in corrected_entities[ent_type][entity]['sentence']:
                                 if (sentence['char_index']['start'] != -1 and
                                     sentence['char_index']['end'] != -1):
                                     all_sentences[sentence['sentid']] = sentence['text']
                                     output_files[sentence['sentid']].append({
                                         "value": {
-                                            "text": corrected_entities[ent_type][entity]['corrected_name'],
+                                            "text": entity_text,
                                             "start": sentence['char_index']['start'],
                                             "end": sentence['char_index']['end'],
                                             "labels": [ent_type]