spliter/split_data/train_classifier.py at master · triangular-opensource/spliter

executable file
375 lines (343 loc) · 10.6 KB
#!/usr/bin/env python3
"""
Train sklearn classifier using data from generate_dummy_data.py.

Creates a trained model file that can be used for classification.
"""
import os
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
# Training data extracted from generate_dummy_data.py
# Includes EU/Ireland-specific examples
# Labelled examples as (expense description, category) pairs.
# Categories used: food, travel, shopping, entertainment, utilities, other.
TRAINING_DATA = [
    # Food
    ("McDonald", "food"),
    ("Starbucks", "food"),
    ("Pizza Hut", "food"),
    ("Subway", "food"),
    ("KFC", "food"),
    ("Burger King", "food"),
    ("Taco Bell", "food"),
    ("Domino's Pizza", "food"),
    ("Chipotle", "food"),
    ("Panera Bread", "food"),
    ("Dunkin'", "food"),
    ("Olive Garden", "food"),
    ("Red Lobster", "food"),
    ("Outback Steakhouse", "food"),
    ("Buffalo Wild Wings", "food"),
    ("Restaurant dinner", "food"),
    ("Lunch meeting", "food"),
    ("Breakfast", "food"),
    ("Coffee shop", "food"),
    ("Food truck", "food"),
    ("Supermac's", "food"),
    ("Deliveroo", "food"),
    ("Just Eat", "food"),
    ("UberEats", "food"),
    ("Cafe", "food"),
    ("Café", "food"),
    ("Dinner", "food"),
    ("Lunch", "food"),
    ("Brunch", "food"),
    ("Restaurant", "food"),
    # Travel
    ("Uber ride", "travel"),
    ("Lyft ride", "travel"),
    ("Taxi", "travel"),
    ("Train ticket", "travel"),
    ("Bus fare", "travel"),
    ("Flight ticket", "travel"),
    ("Hotel booking", "travel"),
    ("Airbnb", "travel"),
    ("Car rental", "travel"),
    ("Gas station", "travel"),
    ("Parking fee", "travel"),
    ("Toll fee", "travel"),
    ("Metro card", "travel"),
    ("Bike rental", "travel"),
    ("Scooter rental", "travel"),
    ("Luas ticket", "travel"),
    ("DART fare", "travel"),
    ("Dublin Bus", "travel"),
    ("Bus Eireann", "travel"),
    ("Aer Lingus", "travel"),
    ("Ryanair flight", "travel"),
    ("Flight", "travel"),
    ("Train", "travel"),
    ("Bus", "travel"),
    ("Parking", "travel"),
    ("Petrol", "travel"),
    ("Gas", "travel"),
    # Shopping
    ("Grocery shopping", "shopping"),
    ("Target", "shopping"),
    ("Walmart", "shopping"),
    ("Amazon order", "shopping"),
    ("Clothing store", "shopping"),
    ("Electronics store", "shopping"),
    ("Bookstore", "shopping"),
    ("Pharmacy", "shopping"),
    ("Home Depot", "shopping"),
    ("Costco", "shopping"),
    ("Online purchase", "shopping"),
    ("Gift shop", "shopping"),
    ("Convenience store", "shopping"),
    ("Tesco", "shopping"),
    ("Dunnes Stores", "shopping"),
    ("SuperValu", "shopping"),
    ("Aldi", "shopping"),
    ("Lidl", "shopping"),
    ("Penneys", "shopping"),
    ("Primark", "shopping"),
    ("Boots", "shopping"),
    ("Superdrug", "shopping"),
    ("Argos", "shopping"),
    ("IKEA", "shopping"),
    ("Shopping", "shopping"),
    ("Grocery", "shopping"),
    ("Store", "shopping"),
    # Entertainment
    ("Movie tickets", "entertainment"),
    ("Concert", "entertainment"),
    ("Theater show", "entertainment"),
    ("Museum entry", "entertainment"),
    ("Amusement park", "entertainment"),
    ("Bowling", "entertainment"),
    ("Karaoke", "entertainment"),
    ("Escape room", "entertainment"),
    ("Arcade", "entertainment"),
    ("Sports game", "entertainment"),
    ("Comedy show", "entertainment"),
    ("Music festival", "entertainment"),
    ("Festival tickets", "entertainment"),
    ("Club entry", "entertainment"),
    ("Bar drinks", "entertainment"),
    ("Cinema", "entertainment"),
    ("Movie", "entertainment"),
    ("Concert tickets", "entertainment"),
    ("3Arena", "entertainment"),
    ("Croke Park", "entertainment"),
    ("Aviva Stadium", "entertainment"),
    ("Pub", "entertainment"),
    ("Bar", "entertainment"),
    ("Event", "entertainment"),
    ("Show", "entertainment"),
    # Utilities
    ("Electricity bill", "utilities"),
    ("Water bill", "utilities"),
    ("Internet bill", "utilities"),
    ("Phone bill", "utilities"),
    ("Gas bill", "utilities"),
    ("Cable bill", "utilities"),
    ("Streaming service", "utilities"),
    ("Insurance", "utilities"),
    ("Rent", "utilities"),
    ("Mortgage", "utilities"),
    ("ESB", "utilities"),
    ("Eir", "utilities"),
    ("Vodafone", "utilities"),
    ("Netflix", "utilities"),
    ("Spotify", "utilities"),
    ("Sky TV", "utilities"),
    ("Virgin Media", "utilities"),
    ("Bill", "utilities"),
    ("Subscription", "utilities"),
    ("Service", "utilities"),
    # Other
    ("Gym membership", "other"),
    ("Gym class", "other"),
    ("Yoga class", "other"),
    ("Personal trainer", "other"),
    ("Doctor visit", "other"),
    # NOTE(review): "Pharmacy" is also labelled "shopping" above, so the same
    # text carries two conflicting labels — confirm which one is intended.
    ("Pharmacy", "other"),
    ("Dry cleaning", "other"),
    ("Laundry", "other"),
    ("Haircut", "other"),
    ("Spa", "other"),
    ("Massage", "other"),
    ("Car repair", "other"),
    ("Car wash", "other"),
    ("Pet supplies", "other"),
    ("Veterinary", "other"),
    ("Donation", "other"),
    ("Gym", "other"),
    ("Doctor", "other"),
    # Additional variations for better training
    # Food variations
    ("McDonald's", "food"),
    ("McDonalds", "food"),
    ("Coffee", "food"),
    ("Tea", "food"),
    ("Snacks", "food"),
    ("Fast food", "food"),
    ("Takeaway", "food"),
    ("Food delivery", "food"),
    ("Restaurant meal", "food"),
    ("Dining out", "food"),
    ("Lunch break", "food"),
    ("Dinner date", "food"),
    ("Breakfast meeting", "food"),
    ("Food court", "food"),
    ("Bakery", "food"),
    ("Pizzeria", "food"),
    ("Bistro", "food"),
    # Travel variations
    ("Ride", "travel"),
    ("Transport", "travel"),
    ("Commute", "travel"),
    ("Journey", "travel"),
    ("Trip", "travel"),
    ("Travel", "travel"),
    ("Transportation", "travel"),
    ("Airport", "travel"),
    ("Station", "travel"),
    ("Taxi fare", "travel"),
    ("Uber trip", "travel"),
    ("Train journey", "travel"),
    ("Bus ride", "travel"),
    ("Plane ticket", "travel"),
    ("Hotel stay", "travel"),
    ("Accommodation", "travel"),
    ("Car hire", "travel"),
    ("Vehicle rental", "travel"),
    ("Fuel", "travel"),
    ("Petrol station", "travel"),
    ("Gas pump", "travel"),
    ("Toll road", "travel"),
    ("Parking lot", "travel"),
    # Shopping variations
    ("Shop", "shopping"),
    ("Retail", "shopping"),
    ("Purchase", "shopping"),
    ("Buy", "shopping"),
    ("Shopping trip", "shopping"),
    ("Grocery store", "shopping"),
    ("Supermarket", "shopping"),
    ("Market", "shopping"),
    ("Online shopping", "shopping"),
    ("E-commerce", "shopping"),
    ("Retail store", "shopping"),
    ("Department store", "shopping"),
    ("Clothing", "shopping"),
    ("Electronics", "shopping"),
    ("Books", "shopping"),
    ("Medicine", "shopping"),
    ("Drugstore", "shopping"),
    ("Convenience", "shopping"),
    # Entertainment variations
    ("Entertainment", "entertainment"),
    ("Fun", "entertainment"),
    ("Leisure", "entertainment"),
    ("Recreation", "entertainment"),
    ("Movies", "entertainment"),
    ("Cinema tickets", "entertainment"),
    ("Film", "entertainment"),
    ("Performance", "entertainment"),
    ("Music", "entertainment"),
    ("Gig", "entertainment"),
    ("Event tickets", "entertainment"),
    ("Sports event", "entertainment"),
    ("Match", "entertainment"),
    ("Game tickets", "entertainment"),
    ("Attraction", "entertainment"),
    ("Theme park", "entertainment"),
    ("Activity", "entertainment"),
    ("Night out", "entertainment"),
    ("Drinks", "entertainment"),
    # Utilities variations
    ("Utility", "utilities"),
    ("Utilities", "utilities"),
    ("Bills", "utilities"),
    ("Monthly bill", "utilities"),
    ("Electric", "utilities"),
    ("Power", "utilities"),
    ("Water supply", "utilities"),
    ("Internet service", "utilities"),
    ("WiFi", "utilities"),
    ("Broadband", "utilities"),
    ("Phone service", "utilities"),
    ("Mobile plan", "utilities"),
    ("Cable TV", "utilities"),
    ("TV subscription", "utilities"),
    ("Streaming", "utilities"),
    ("Video streaming", "utilities"),
    ("Music streaming", "utilities"),
    ("Insurance premium", "utilities"),
    ("Rent payment", "utilities"),
    ("Housing", "utilities"),
    ("Mortgage payment", "utilities"),
]
def train_model():
    """Train the expense classifier, report its accuracy, and pickle it to disk.

    Steps, in order:

    1. Split ``TRAINING_DATA`` 80/20 into train and test sets (stratified by
       category, fixed ``random_state`` so the split is reproducible).
    2. Fit a TF-IDF (unigrams + bigrams) -> multinomial Naive Bayes pipeline
       on the training portion.
    3. Print the held-out accuracy and a per-category classification report.
    4. Pickle the fitted pipeline to ``expense_classifier_model.pkl`` in the
       current working directory and print the file size.
    5. Print the predicted category and confidence for a few sample inputs.

    Returns:
        The fitted ``Pipeline``.
    """
    banner = "=" * 70

    print(banner)
    print("Training Expense Classifier with sklearn")
    print(banner)

    # Prepare data: descriptions are the features, categories are the labels.
    X = [text for text, _ in TRAINING_DATA]
    y = [label for _, label in TRAINING_DATA]
    print(f"\nTraining data: {len(X)} examples")
    print(f"Categories: {set(y)}")

    # Split into train/test (80/20). Stratifying keeps each category's share
    # the same in both halves.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"\nTraining set: {len(X_train)} examples")
    print(f"Test set: {len(X_test)} examples")

    # Create pipeline: TF-IDF vectorization + Naive Bayes classifier
    print("\nTraining model...")
    model = Pipeline([
        ('tfidf', TfidfVectorizer(
            lowercase=True,
            ngram_range=(1, 2),  # Use unigrams and bigrams
            max_features=5000,
            min_df=1,
            max_df=0.95,
        )),
        ('clf', MultinomialNB(alpha=1.0)),  # Laplace smoothing
    ])

    # Train
    model.fit(X_train, y_train)

    # Evaluate on the held-out split
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("\n✓ Model trained!")
    print(f"✓ Test accuracy: {accuracy:.2%}")

    # Show detailed results
    print("\n" + banner)
    print("Classification Report:")
    print(banner)
    print(classification_report(y_test, y_pred))

    # Save model
    model_path = 'expense_classifier_model.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    file_size = os.path.getsize(model_path) / (1024 * 1024)  # Size in MB
    print(f"\n✓ Model saved to: {model_path}")
    print(f"✓ Model size: {file_size:.2f} MB")

    # Test some examples
    print("\n" + banner)
    print("Testing Examples:")
    print(banner)
    test_examples = [
        "McDonald",
        "Luas ticket",
        "Tesco shopping",
        "Netflix subscription",
        "Movie tickets",
        "ESB bill",
    ]
    # Score all samples in one batch instead of one pipeline call per sample;
    # each row is scored independently, so the results are unchanged.
    predictions = model.predict(test_examples)
    probabilities = model.predict_proba(test_examples)
    for example, prediction, proba in zip(test_examples, predictions, probabilities):
        max_proba = max(proba)
        print(f"{example:30} → {prediction:15} (confidence: {max_proba:.2%})")

    print("\n" + banner)
    print("Training complete!")
    print(banner)
    return model
# Run training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    train_model()
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

train_classifier.py

Latest commit

History

train_classifier.py

File metadata and controls