File tree Expand file tree Collapse file tree
Expand file tree Collapse file tree Original file line number Diff line number Diff line change 2828
2929logger = get_logger (__name__ )
3030
31-
3231def separate_labels_to_train_val_test (
3332 labelled_file_path : str ,
3433 output_path : str ,
@@ -239,8 +238,16 @@ def separate_labels_to_train_val_test(
239238 json .dump (data_metrics , f , indent = 2 )
240239
241240 logger .info ("Finished separating files into train, val and test sets." )
242-
243-
241+ logger .info (
242+ f"Found { data_metrics ['train' ]['entity_counts' ]} entities in { data_metrics ['train' ]['article_count' ]} articles in train set."
243+ )
244+ logger .info (
245+ f"Found { data_metrics ['val' ]['entity_counts' ]} entities in { data_metrics ['val' ]['article_count' ]} articles in val set."
246+ )
247+ logger .info (
248+ f"Found { data_metrics ['test' ]['entity_counts' ]} entities in { data_metrics ['test' ]['article_count' ]} articles in test set."
249+ )
250+
244251def get_article_gdd_ids (labelled_file_path : str ):
245252 """
246253 Parameters
Original file line number Diff line number Diff line change 2020# ensure that the parent directory is on the path for relative imports
2121sys .path .append (os .path .join (os .path .dirname (__file__ ), os .pardir , os .pardir ))
2222
23- from src .logs import get_logger
24- logger = get_logger (__name__ )
25-
2623def preprocess_data (data_path : str ):
2724 """Creates data artifacts used by the Spacy model for training
2825
@@ -38,11 +35,6 @@ def preprocess_data(data_path: str):
3835 val_files = glob .glob (os .path .join (data_path , "val" , "*.txt" ))
3936 train_files .extend (glob .glob (os .path .join (data_path , "train" , "*.json" )))
4037 val_files .extend (glob .glob (os .path .join (data_path , "val" , "*.json" )))
41-
42- logger .info (
43- f"Number of files found under the train dir: { len (train_files )} " )
44- logger .info (
45- f"Number of files found under the val dir: { len (val_files )} " )
4638
4739 train_doc_bin = get_doc (nlp , train_files )
4840 train_doc_bin .to_disk (os .path .join (data_path , "train.spacy" ))
You can’t perform that action at this time.
0 commit comments