|
1 | 1 | # Author: Ty Andrews |
2 | 2 | # Date: 2023-06-05 |
3 | 3 | """ |
4 | | -Usage: entity_extraction.py --article_text_path=<article_text_path> --output_path=<output_path> [--max_sentences=<max_sentences>] [--max_articles=<max_articles>] |
| 4 | +Usage: entity_extraction_pipeline.py --article_text_path=<article_text_path> --output_path=<output_path> [--max_sentences=<max_sentences>] [--max_articles=<max_articles>] |
5 | 5 |
|
6 | 6 | Options: |
7 | 7 | --article_text_path=<article_text_path> The path to the article text data file. |
|
34 | 34 | load_dotenv(find_dotenv()) |
35 | 35 |
|
36 | 36 | # get the MODEL_NAME from environment variables |
37 | | -HF_NER_MODEL_NAME = os.getenv("HF_NER_MODEL_NAME", "roberta-finetuned-v3") |
38 | | -SPACY_NER_MODEL_NAME = os.getenv("SPACY_NER_MODEL_NAME", "spacy-transformer-v3") |
| 37 | +HF_NER_MODEL_PATH = os.getenv("HF_NER_MODEL_PATH", "./models/ner/metaextractor") |
| 38 | +SPACY_NER_MODEL_NAME = os.getenv("SPACY_NER_MODEL_NAME", "en_metaextractor_spacy") |
39 | 39 | USE_NER_MODEL_TYPE = os.getenv("USE_NER_MODEL_TYPE", "huggingface") |
| 40 | +MAX_SENTENCES = os.getenv("MAX_SENTENCES", "-1") |
| 41 | +MAX_ARTICLES = os.getenv("MAX_ARTICLES", "-1") |
40 | 42 |
|
41 | 43 | logger = get_logger(__name__) |
42 | 44 |
|
@@ -286,7 +288,7 @@ def recreate_original_sentences_with_labels(row): |
286 | 288 | def extract_entities( |
287 | 289 | article_text_data: pd.DataFrame, |
288 | 290 | model_type: str = "huggingface", |
289 | | - model_path: str = os.path.join("models", "ner", "roberta-finetuned-v3"), |
| 291 | + model_path: str = "metaextractor", |
290 | 292 | ) -> pd.DataFrame: |
291 | 293 | """ |
292 | 294 | Extracts the entities from the article text data. |
@@ -553,31 +555,42 @@ def main(): |
553 | 555 |
|
554 | 556 | article_text_data = load_article_text_data(file_path) |
555 | 557 |
|
556 | | - if opt["--max_articles"] is not None and int(opt["--max_articles"]) != -1: |
| 558 | + if MAX_ARTICLES is not None and int(MAX_ARTICLES) != -1: |
557 | 559 | article_text_data = article_text_data[ |
558 | 560 | # 7 index used for testing with entities in first couple sentences of article 7 |
559 | 561 | article_text_data["gddid"].isin( |
560 | 562 | article_text_data["gddid"].unique()[ |
561 | | - 0 : 0 + int(opt["--max_articles"]) |
| 563 | + 0 : 0 + int(MAX_ARTICLES) |
562 | 564 | ] |
563 | 565 | ) |
564 | 566 | ] |
| 567 | + logger.info( |
| 568 | +                f"Using just a subsample of the data with {int(MAX_ARTICLES)} articles"
| 569 | + ) |
565 | 570 |
|
566 | 571 | # if max_sentences is not -1 then only use the first max_sentences sentences |
567 | | - if opt["--max_sentences"] is not None and int(opt["--max_sentences"]) != -1: |
568 | | - article_text_data = article_text_data.head(int(opt["--max_sentences"])) |
| 572 | + if MAX_SENTENCES is not None and int(MAX_SENTENCES) != -1: |
| 573 | +        # get just sentence IDs for each gdd up to max_sentences
| 574 | + article_text_data = article_text_data[ |
| 575 | + article_text_data["sentid"].isin( |
| 576 | + article_text_data["sentid"].unique()[0 : int(MAX_SENTENCES)] |
| 577 | + ) |
| 578 | + ] |
| 579 | + logger.info( |
| 580 | +                f"Using just a subsample of the data with {int(MAX_SENTENCES)} sentences"
| 581 | + ) |
569 | 582 |
|
570 | 583 | for article_gdd in article_text_data["gddid"].unique(): |
571 | 584 | logger.info(f"Processing GDD ID: {article_gdd}") |
572 | 585 |
|
573 | 586 | article_text = article_text_data[article_text_data["gddid"] == article_gdd] |
574 | 587 |
|
575 | 588 | if USE_NER_MODEL_TYPE == "huggingface": |
576 | | - logger.info(f"Using HuggingFace model {HF_NER_MODEL_NAME}") |
577 | | - model_path = os.path.join("models", "ner", HF_NER_MODEL_NAME) |
| 589 | + logger.info(f"Using HuggingFace model {HF_NER_MODEL_PATH}") |
| 590 | + model_path = HF_NER_MODEL_PATH |
578 | 591 | elif USE_NER_MODEL_TYPE == "spacy": |
579 | 592 | logger.info(f"Using Spacy model {SPACY_NER_MODEL_NAME}") |
580 | | - model_path = os.path.join("models", "ner", SPACY_NER_MODEL_NAME) |
| 593 | + model_path = SPACY_NER_MODEL_NAME |
581 | 594 | else: |
582 | 595 | raise ValueError( |
583 | 596 | f"Model type {USE_NER_MODEL_TYPE} not supported. Please set MODEL_TYPE to either 'huggingface' or 'spacy'." |
@@ -611,6 +624,13 @@ def main(): |
611 | 624 | ) |
612 | 625 | continue |
613 | 626 |
|
| 627 | + # delete the file if it already exists with the article_gdd name |
| 628 | + if os.path.exists(os.path.join(opt["--output_path"], f"{article_gdd}.json")): |
| 629 | + os.remove(os.path.join(opt["--output_path"], f"{article_gdd}.json")) |
| 630 | + logger.warning( |
| 631 | + f"Deleted existing file {article_gdd}.json in output directory." |
| 632 | + ) |
| 633 | + |
614 | 634 | export_extracted_entities( |
615 | 635 | extracted_entities=pprocessed_entities, |
616 | 636 | output_path=opt["--output_path"], |
|
0 commit comments