Skip to content

Commit d761fcc

Browse files
committed
Update logging statements
1 parent 45e5a79 commit d761fcc

2 files changed

Lines changed: 10 additions & 11 deletions

File tree

src/preprocessing/labelling_data_split.py

Lines changed: 10 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -28,7 +28,6 @@
2828

2929
logger = get_logger(__name__)
3030

31-
3231
def separate_labels_to_train_val_test(
3332
labelled_file_path: str,
3433
output_path: str,
@@ -239,8 +238,16 @@ def separate_labels_to_train_val_test(
239238
json.dump(data_metrics, f, indent=2)
240239

241240
logger.info("Finished separating files into train, val and test sets.")
242-
243-
241+
logger.info(
242+
f"Found {data_metrics['train']['entity_counts']} entities in {data_metrics['train']['article_count']} articles in train set."
243+
)
244+
logger.info(
245+
f"Found {data_metrics['val']['entity_counts']} entities in {data_metrics['val']['article_count']} articles in val set."
246+
)
247+
logger.info(
248+
f"Found {data_metrics['test']['entity_counts']} entities in {data_metrics['test']['article_count']} articles in test set."
249+
)
250+
244251
def get_article_gdd_ids(labelled_file_path: str):
245252
"""
246253
Parameters

src/preprocessing/spacy_preprocess.py

Lines changed: 0 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -20,9 +20,6 @@
2020
# ensure that the parent directory is on the path for relative imports
2121
sys.path.append(os.path.join(os.path.dirname(__file__), os.pardir, os.pardir))
2222

23-
from src.logs import get_logger
24-
logger = get_logger(__name__)
25-
2623
def preprocess_data(data_path: str):
2724
"""Creates data artifacts used by the Spacy model for training
2825
@@ -38,11 +35,6 @@ def preprocess_data(data_path: str):
3835
val_files = glob.glob(os.path.join(data_path, "val", "*.txt"))
3936
train_files.extend(glob.glob(os.path.join(data_path, "train", "*.json")))
4037
val_files.extend(glob.glob(os.path.join(data_path, "val", "*.json")))
41-
42-
logger.info(
43-
f"Number of files found under the train dir: {len(train_files)}")
44-
logger.info(
45-
f"Number of files found under the val dir: {len(val_files)}")
4638

4739
train_doc_bin = get_doc(nlp, train_files)
4840
train_doc_bin.to_disk(os.path.join(data_path, "train.spacy"))

0 commit comments

Comments (0)