|
1 | 1 | # Author: Ty Andrews |
2 | 2 | # Date: 2023-06-05 |
3 | 3 | """ |
4 | | -Usage: entity_extraction_pipeline.py --article_text_path=<article_text_path> --output_path=<output_path> |
| 4 | +Usage: entity_extraction_pipeline.py --article_text_path=<article_text_path> --output_path=<output_path> [--max_sentences=<max_sentences>] [--max_articles=<max_articles>] |
5 | 5 |
|
6 | 6 | Options: |
7 | 7 | --article_text_path=<article_text_path> The path to the article text data file. |
8 | 8 | --output_path=<output_path> The path to export the extracted entities to. |
| 9 | +--max_sentences=<max_sentences> The maximum number of sentences to extract entities from. [default: -1] |
| 10 | +--max_articles=<max_articles>          The maximum number of articles to extract entities from. [default: -1] |
9 | 11 | """ |
10 | 12 |
|
11 | 13 | import os |
|
32 | 34 | load_dotenv(find_dotenv()) |
33 | 35 |
|
34 | 36 | # get the MODEL_NAME from environment variables |
35 | | -HF_NER_MODEL_NAME = os.getenv("HF_NER_MODEL_NAME", "finding-fossils/metaextractor") |
| 37 | +HF_NER_MODEL_PATH = os.getenv("HF_NER_MODEL_PATH", "./models/ner/metaextractor") |
36 | 38 | SPACY_NER_MODEL_NAME = os.getenv("SPACY_NER_MODEL_NAME", "en_metaextractor_spacy") |
37 | 39 | USE_NER_MODEL_TYPE = os.getenv("USE_NER_MODEL_TYPE", "huggingface") |
38 | 40 | MAX_SENTENCES = os.getenv("MAX_SENTENCES", "-1") |
@@ -286,7 +288,7 @@ def recreate_original_sentences_with_labels(row): |
286 | 288 | def extract_entities( |
287 | 289 | article_text_data: pd.DataFrame, |
288 | 290 | model_type: str = "huggingface", |
289 | | - model_path: str = "finding-fossils/metaextractor", |
| 291 | + model_path: str = "metaextractor", |
290 | 292 | ) -> pd.DataFrame: |
291 | 293 | """ |
292 | 294 | Extracts the entities from the article text data. |
@@ -562,19 +564,30 @@ def main(): |
562 | 564 | ] |
563 | 565 | ) |
564 | 566 | ] |
| 567 | + logger.info( |
| 568 | +        f"Using just a subsample of the data with {int(MAX_ARTICLES)} articles" |
| 569 | + ) |
565 | 570 |
|
566 | 571 | # if max_sentences is not -1 then only use the first max_sentences sentences |
567 | 572 | if MAX_SENTENCES is not None and int(MAX_SENTENCES) != -1: |
568 | | - article_text_data = article_text_data.head(int(MAX_SENTENCES)) |
| 573 | + # get just sentence id's for each gdd up to max_sentences |
| 574 | + article_text_data = article_text_data[ |
| 575 | + article_text_data["sentid"].isin( |
| 576 | + article_text_data["sentid"].unique()[0 : int(MAX_SENTENCES)] |
| 577 | + ) |
| 578 | + ] |
| 579 | + logger.info( |
| 580 | +        f"Using just a subsample of the data with {int(MAX_SENTENCES)} sentences" |
| 581 | + ) |
569 | 582 |
|
570 | 583 | for article_gdd in article_text_data["gddid"].unique(): |
571 | 584 | logger.info(f"Processing GDD ID: {article_gdd}") |
572 | 585 |
|
573 | 586 | article_text = article_text_data[article_text_data["gddid"] == article_gdd] |
574 | 587 |
|
575 | 588 | if USE_NER_MODEL_TYPE == "huggingface": |
576 | | - logger.info(f"Using HuggingFace model {HF_NER_MODEL_NAME}") |
577 | | - model_path = HF_NER_MODEL_NAME |
| 589 | + logger.info(f"Using HuggingFace model {HF_NER_MODEL_PATH}") |
| 590 | + model_path = HF_NER_MODEL_PATH |
578 | 591 | elif USE_NER_MODEL_TYPE == "spacy": |
579 | 592 | logger.info(f"Using Spacy model {SPACY_NER_MODEL_NAME}") |
580 | 593 | model_path = SPACY_NER_MODEL_NAME |
@@ -611,6 +624,13 @@ def main(): |
611 | 624 | ) |
612 | 625 | continue |
613 | 626 |
|
| 627 | + # delete the file if it already exists with the article_gdd name |
| 628 | + if os.path.exists(os.path.join(opt["--output_path"], f"{article_gdd}.json")): |
| 629 | + os.remove(os.path.join(opt["--output_path"], f"{article_gdd}.json")) |
| 630 | + logger.warning( |
| 631 | + f"Deleted existing file {article_gdd}.json in output directory." |
| 632 | + ) |
| 633 | + |
614 | 634 | export_extracted_entities( |
615 | 635 | extracted_entities=pprocessed_entities, |
616 | 636 | output_path=opt["--output_path"], |
|
0 commit comments