# data_preprocessing.py
import os
import re
from pathlib import Path
from typing import List

import pandas as pd  # For tabular data management
from pydantic import BaseModel, DirectoryPath, FilePath
from transformers import AutoTokenizer

# Load a pre-trained tokenizer (e.g., BERT)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


class Config(BaseModel):
    """
    Configuration model to validate input and output directories.
    """
    input_dir: DirectoryPath
    # A plain Path is used for the output directory because it may not exist yet;
    # pydantic's DirectoryPath would reject a missing path at validation time.
    output_dir: Path

    def ensure_output_dir(self):
        """
        Ensures the output directory exists. If not, it creates one.
        """
        os.makedirs(self.output_dir, exist_ok=True)


def clean_text(text: str) -> str:
    """
    Cleans the input text by removing URLs, special characters, and normalizing whitespace.

    Args:
        text (str): The raw text input.

    Returns:
        str: The cleaned text.
    """
    # Remove URLs
    text = re.sub(r"http\S+", "", text)
    # Remove special characters
    text = re.sub(r"[^\w\s]", "", text)
    # Normalize whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text
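
# Illustrative example (hypothetical input): the three substitutions above turn
#   clean_text("Check https://example.com, it's GREAT!!!")
# into
#   "Check its GREAT"
# (the URL and its trailing punctuation are stripped first, then the apostrophe
# and exclamation marks, and finally the leftover double space is collapsed).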


def tokenize_text(text: str) -> List[str]:
    """
    Tokenizes the input text into subword tokens using a pre-trained tokenizer.

    Args:
        text (str): The cleaned text input.

    Returns:
        List[str]: A list of tokens.
    """
    # Tokenize using a pre-trained tokenizer
    tokens = tokenizer.tokenize(text)
    return tokens
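
# Illustrative example: with the "bert-base-uncased" vocabulary, words outside the
# vocabulary are split into WordPiece subwords, e.g.
#   tokenize_text("tokenization") -> ["token", "##ization"]
# (the exact split depends on the tokenizer's vocabulary).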


def process_file(file_path: FilePath, output_dir: str):
    """
    Processes a single text file: cleans and tokenizes its content, then saves the results.

    Args:
        file_path (FilePath): Path to the raw text file.
        output_dir (str): Directory to save the processed file.
    """
    file_name = os.path.basename(file_path)
    output_file_path = os.path.join(output_dir, file_name.replace(".txt", "_processed.csv"))
    # Read raw text
    with open(file_path, "r", encoding="utf-8") as file:
        raw_text = file.read()
    # Clean the text
    cleaned_text = clean_text(raw_text)
    # Tokenize the text
    tokens = tokenize_text(cleaned_text)
    # Save the tokens in a tabular format
    df = pd.DataFrame(tokens, columns=["Tokens"])
    df.to_csv(output_file_path, index=False)
    print(f"Processed and saved: {file_name}")


def process_directory(config: Config):
    """
    Processes all text files in a directory by cleaning and tokenizing their contents.

    Args:
        config (Config): Configuration object containing directory paths.
    """
    # Ensure the output directory exists
    config.ensure_output_dir()
    # Iterate over all files in the input directory
    for file_name in os.listdir(config.input_dir):
        file_path = os.path.join(config.input_dir, file_name)
        # Skip non-text files
        if not file_name.endswith(".txt"):
            print(f"Skipping non-text file: {file_name}")
            continue
        # Process the text file
        process_file(file_path, config.output_dir)


if __name__ == "__main__":
    # Entry point for the script: defines paths and starts processing.
    # Define paths (adjust these paths as needed)
    input_directory = "./raw_data"
    output_directory = "./processed_data"
    # Validate configuration
    config = Config(input_dir=input_directory, output_dir=output_directory)
    # Process the directory
    process_directory(config)

# ---------------------------------------------------------------
# What We Did:
# ---------------------------------------------------------------
# - Used Pydantic to validate and manage directory paths, ensuring clean and robust input handling.
# - Integrated Hugging Face's BERT tokenizer for state-of-the-art subword tokenization.
# - Saved cleaned and tokenized text as CSV files, making them easy to use in other workflows.
# - Broke the functionality into modular functions for better readability and reuse.
# ---------------------------------------------------------------
# What's Next:
# ---------------------------------------------------------------
# - Add parallel processing with tools like Dask to handle larger datasets more efficiently.
# - Expand features with NLP capabilities like entity extraction or sentiment analysis.
# - Build a FastAPI endpoint to turn this into a web-accessible service.
# ---------------------------------------------------------------
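

# ---------------------------------------------------------------
# Illustrative sketch (not part of the original pipeline):
# ---------------------------------------------------------------
# The notes above mention Dask for parallel processing. The function below is a
# minimal sketch of that idea: it wraps process_file in dask.delayed so the
# per-file jobs can run concurrently. It assumes `dask` is installed and is not
# invoked anywhere in this script.
def process_directory_parallel(config: Config):
    """
    Hypothetical parallel variant of process_directory using dask.delayed.
    """
    from dask import compute, delayed  # Imported lazily so the script still runs without dask

    config.ensure_output_dir()
    # Build one delayed task per .txt file in the input directory
    tasks = [
        delayed(process_file)(os.path.join(config.input_dir, name), config.output_dir)
        for name in os.listdir(config.input_dir)
        if name.endswith(".txt")
    ]
    # compute() executes all delayed tasks, letting Dask schedule them in parallel
    compute(*tasks)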