spliter/split_data/analysis.py at master · triangular-opensource/spliter

436 lines (377 loc) · 17.4 KB
"""
Advanced analysis functions for spending patterns and trends.

1. Spending pattern analysis
2. Group spending statistics aggregation
3. ML-based expense categorization (in expense_classifier.py)
4. Historical trend analysis
"""
from database import SplitDataDB
from chart_data import ChartDataGenerator
from typing import Dict, List, Optional
from datetime import datetime, timedelta
import statistics
class SpendingAnalysis:
    """Advanced spending pattern and trend analysis.

    Combines the chart series produced by ``ChartDataGenerator`` with direct
    SQL queries issued through the database wrapper, and returns plain
    dictionaries of metrics.

    The monthly series is read as ``{'x': labels, 'y': amounts}``.  The trend
    code treats the last element as the current month, i.e. it assumes the
    series is ordered oldest first -- TODO confirm against ChartDataGenerator.
    """

    # Approximate weeks per month; converts a monthly average to a weekly one.
    WEEKS_PER_MONTH = 4.33
    # Average month-over-month change (in percent) beyond which a trend is
    # reported as "increasing"/"decreasing" instead of "stable".
    TREND_THRESHOLD_PERCENT = 5

    def __init__(self, db: "SplitDataDB"):
        """
        Initialize with database connection.

        Args:
            db: SplitDataDB instance
        """
        self.db = db
        self.chart_gen = ChartDataGenerator(db)

    @staticmethod
    def _label_at(labels, index):
        """Return ``labels[index]``, or None when the label list is too short."""
        return labels[index] if index < len(labels) else None

    @staticmethod
    def _percent_change(new, old):
        """Return the change from ``old`` to ``new`` in percent (0 if ``old`` <= 0)."""
        return ((new - old) / old * 100) if old > 0 else 0

    # ============================================================
    # 1. SPENDING PATTERN ANALYSIS
    # ============================================================
    def analyze_spending_patterns(self, user_id: int, months: int = 12) -> Dict:
        """
        Comprehensive spending pattern analysis for a user.

        Args:
            user_id: User whose expenses are analysed.
            months: Look-back window, forwarded to the chart data generator.

        Returns:
            Dict with various pattern metrics:
            - user_id, analysis_period_months
            - average_weekly_spending (monthly average / 4.33)
            - average_monthly_spending
            - spending_consistency_score (100 minus the coefficient of
              variation in percent, floored at 0; 100 when there are fewer
              than two months of data or no spending)
            - peak_spending_period, low_spending_period (labels or None)
            - category_distribution ({'x': labels, 'y': amounts})
            - total_categories, top_category
        """
        monthly_data = self.chart_gen.get_user_monthly_expenses(user_id, months)
        monthly_amounts = monthly_data['y']
        monthly_labels = monthly_data['x']
        category_data = self.chart_gen.get_user_expenses_by_category(user_id, months)
        category_labels = category_data['x']

        avg_monthly = statistics.mean(monthly_amounts) if monthly_amounts else 0
        avg_weekly = avg_monthly / self.WEEKS_PER_MONTH

        # Coefficient of variation (lower = more consistent).  stdev needs at
        # least two points, and a zero mean would divide by zero.
        if len(monthly_amounts) >= 2 and avg_monthly > 0:
            cv = statistics.stdev(monthly_amounts) / avg_monthly * 100
        else:
            cv = 0

        # Peak and low periods; on ties the first matching month wins.
        if monthly_amounts:
            peak_period = self._label_at(
                monthly_labels, monthly_amounts.index(max(monthly_amounts)))
            low_period = self._label_at(
                monthly_labels, monthly_amounts.index(min(monthly_amounts)))
        else:
            peak_period = None
            low_period = None

        return {
            'user_id': user_id,
            'analysis_period_months': months,
            'average_weekly_spending': round(avg_weekly, 2),
            'average_monthly_spending': round(avg_monthly, 2),
            # Inverted CV (higher = more consistent), clamped to [0, 100].
            'spending_consistency_score': round(100 - min(cv, 100), 2),
            'peak_spending_period': peak_period,
            'low_spending_period': low_period,
            'category_distribution': {
                'x': category_labels,
                'y': category_data['y'],
            },
            'total_categories': len(category_labels),
            # Presumably the category series is sorted by amount, largest
            # first -- verify against ChartDataGenerator.
            'top_category': category_labels[0] if category_labels else None,
        }

    # ============================================================
    # 2. GROUP SPENDING STATISTICS AGGREGATION
    # ============================================================
    def get_group_spending_analysis(self, group_id: int) -> Dict:
        """
        Comprehensive group spending statistics and aggregation.

        Missing or NULL aggregates from the database are treated as 0 and a
        falsy query result as "no rows", so an empty group yields zeros
        instead of raising.

        Args:
            group_id: Group to analyse.

        Returns:
            Dict with detailed group statistics:
            - group_id, total_expenses, total_members
            - total_spending, average_per_member, total_owed
            - unsettled_expenses
            - spending_by_category (list, largest total first)
            - spending_by_member (list, largest total_paid first)
            - most_active_member (user_id with the highest total_paid)
            - most_expensive_category
            - settlement_rate (percent of expenses not marked unsettled)
        """
        basic_stats = self.db.get_group_statistics(group_id) or {}

        # Expenses by category for this group.
        query_category = """
            SELECT 
                COALESCE(e.tag, 'other') as category,
                SUM(ABS(s.amount)) as total_amount,
                COUNT(DISTINCT e.id) as expense_count
            FROM expense e
            JOIN split s ON e.id = s.expense_id
            WHERE e.group_id = %s
            GROUP BY category
            ORDER BY total_amount DESC
        """
        category_results = self.db.execute_query(query_category, (group_id,)) or []

        # Spending by member: positive split amounts count as paid, negative
        # ones as owed.
        query_member = """
            SELECT 
                s.user_id,
                SUM(CASE WHEN s.amount > 0 THEN s.amount ELSE 0 END) as total_paid,
                SUM(CASE WHEN s.amount < 0 THEN ABS(s.amount) ELSE 0 END) as total_owed,
                COUNT(DISTINCT s.expense_id) as expenses_involved
            FROM split s
            JOIN expense e ON s.expense_id = e.id
            WHERE e.group_id = %s
            GROUP BY s.user_id
            ORDER BY total_paid DESC
        """
        member_results = self.db.execute_query(query_member, (group_id,)) or []

        # `or 0` covers both a missing key and an SQL NULL (None) aggregate.
        total_members = basic_stats.get('total_members') or 0
        total_expenses = basic_stats.get('total_expenses') or 0
        unsettled_expenses = basic_stats.get('unsettled_expenses') or 0
        # Convert once so every derived figure is a float.
        total_spending = float(basic_stats.get('total_paid') or 0)
        total_owed = float(basic_stats.get('total_owed') or 0)

        avg_per_member = total_spending / total_members if total_members > 0 else 0
        # max(..., 1) avoids dividing by zero for a group without expenses.
        settlement_rate = (1 - unsettled_expenses / max(total_expenses, 1)) * 100

        # Both result sets are ordered descending, so the first row is the top one.
        most_active_member = member_results[0]['user_id'] if member_results else None
        most_expensive_category = category_results[0]['category'] if category_results else None

        return {
            'group_id': group_id,
            'total_expenses': total_expenses,
            'total_members': total_members,
            'total_spending': round(total_spending, 2),
            'average_per_member': round(avg_per_member, 2),
            'total_owed': round(total_owed, 2),
            'unsettled_expenses': unsettled_expenses,
            'spending_by_category': [
                {'category': row['category'],
                 'amount': round(float(row['total_amount']), 2),
                 'expense_count': row['expense_count']}
                for row in category_results
            ],
            'spending_by_member': [
                {'user_id': row['user_id'],
                 'total_paid': round(float(row['total_paid']), 2),
                 'total_owed': round(float(row['total_owed']), 2),
                 'expenses_involved': row['expenses_involved']}
                for row in member_results
            ],
            'most_active_member': most_active_member,
            'most_expensive_category': most_expensive_category,
            'settlement_rate': round(settlement_rate, 2),
        }

    # ============================================================
    # 4. HISTORICAL TREND ANALYSIS
    # ============================================================
    def analyze_spending_trends(self, user_id: int, months: int = 12) -> Dict:
        """
        Advanced historical trend analysis.

        Args:
            user_id: User whose expenses are analysed.
            months: Look-back window, forwarded to the chart data generator.

        Returns:
            Dict with trend metrics:
            - average_growth_rate (mean month-over-month change in percent;
              months following a non-positive month are skipped)
            - trend_direction (increasing/decreasing/stable, or
              insufficient_data with an extra 'error' key when fewer than
              two months are available)
            - spending_velocity (last three months vs. the start of the
              window, in percent)
            - period_comparison (last month vs. the month before)
            - peak_spending_month
            - trend_data ({'x': labels, 'y': amounts})
        """
        monthly_data = self.chart_gen.get_user_monthly_expenses(user_id, months)
        monthly_amounts = monthly_data['y']
        monthly_labels = monthly_data['x']

        if len(monthly_amounts) < 2:
            return {
                'user_id': user_id,
                'analysis_period_months': months,
                'error': 'Insufficient data for trend analysis (need at least 2 months)',
                'trend_direction': 'insufficient_data',
                'average_growth_rate': 0,
                'spending_velocity': 0,
                'period_comparison': {
                    'current_period': round(monthly_amounts[0] if monthly_amounts else 0, 2),
                    'previous_period': 0,
                    'change_percent': 0,
                },
                'peak_spending_month': monthly_labels[0] if monthly_labels else None,
                'trend_data': {
                    'x': monthly_labels,
                    'y': monthly_amounts,
                },
            }

        # Month-over-month growth; a non-positive base month has no defined
        # percentage change, so that pair is left out of the average.
        growth_rates = [
            (current - previous) / previous * 100
            for previous, current in zip(monthly_amounts, monthly_amounts[1:])
            if previous > 0
        ]
        avg_growth_rate = statistics.mean(growth_rates) if growth_rates else 0

        if avg_growth_rate > self.TREND_THRESHOLD_PERCENT:
            trend_direction = "increasing"
        elif avg_growth_rate < -self.TREND_THRESHOLD_PERCENT:
            trend_direction = "decreasing"
        else:
            trend_direction = "stable"

        # Spending velocity: recent three-month average against the start of
        # the window.  With fewer than six months the first three and last
        # three months would overlap, so the first month alone is the baseline.
        if len(monthly_amounts) >= 3:
            recent_avg = statistics.mean(monthly_amounts[-3:])
            if len(monthly_amounts) >= 6:
                earlier_avg = statistics.mean(monthly_amounts[:3])
            else:
                earlier_avg = monthly_amounts[0]
            velocity = self._percent_change(recent_avg, earlier_avg)
        else:
            velocity = 0

        # Period comparison (current vs previous); at least two months are
        # guaranteed by the early return above.
        current_period = monthly_amounts[-1]
        previous_period = monthly_amounts[-2]
        period_change = self._percent_change(current_period, previous_period)

        peak_month = self._label_at(
            monthly_labels, monthly_amounts.index(max(monthly_amounts)))

        return {
            'user_id': user_id,
            'analysis_period_months': months,
            'average_growth_rate': round(avg_growth_rate, 2),
            'trend_direction': trend_direction,
            'spending_velocity': round(velocity, 2),
            'period_comparison': {
                'current_period': round(current_period, 2),
                'previous_period': round(previous_period, 2),
                'change_percent': round(period_change, 2),
            },
            'peak_spending_month': peak_month,
            'trend_data': {
                'x': monthly_labels,
                'y': monthly_amounts,
            },
        }

    def compare_periods(self, user_id: int, period1_months: int = 3,
                       period2_months: int = 3) -> Dict:
        """
        Compare spending between two adjacent time periods.

        Args:
            user_id: User ID
            period1_months: Length in months of the most recent window
                (from ``period1_months`` ago until now).
            period2_months: Length in months of the window directly before it.

        Returns:
            Dict with 'user_id', 'period1', 'period2' and 'comparison'.

            NOTE: the result keys are numbered chronologically, not like the
            parameters: 'period1' describes the EARLIER window (which is
            ``period2_months`` long) and 'period2' the most recent window
            (``period1_months`` long).  'comparison' reports the change from
            the earlier window to the recent one; 'change_percent' is 0 when
            the earlier window has no spending.
        """
        # Earlier window: [now - (period1 + period2), now - period1)
        query1 = """
            SELECT SUM(ABS(s.amount)) as total
            FROM split s
            WHERE s.user_id = %s
            AND s.created_dt >= DATE_SUB(NOW(), INTERVAL %s MONTH)
            AND s.created_dt < DATE_SUB(NOW(), INTERVAL %s MONTH)
        """
        # Recent window: [now - period1, now]
        query2 = """
            SELECT SUM(ABS(s.amount)) as total
            FROM split s
            WHERE s.user_id = %s
            AND s.created_dt >= DATE_SUB(NOW(), INTERVAL %s MONTH)
        """
        earlier_rows = self.db.execute_query(
            query1, (user_id, period1_months + period2_months, period1_months))
        recent_rows = self.db.execute_query(query2, (user_id, period1_months))

        # SUM() over zero rows is NULL, hence the `or 0`.
        earlier_total = float(earlier_rows[0]['total'] or 0) if earlier_rows else 0
        recent_total = float(recent_rows[0]['total'] or 0) if recent_rows else 0

        change_amount = recent_total - earlier_total
        change_percent = self._percent_change(recent_total, earlier_total)

        if change_amount > 0:
            direction = 'increased'
        elif change_amount < 0:
            direction = 'decreased'
        else:
            direction = 'unchanged'

        return {
            'user_id': user_id,
            'period1': {
                'months': period2_months,
                'total_spending': round(earlier_total, 2),
                'average_per_month': round(earlier_total / period2_months, 2) if period2_months > 0 else 0,
            },
            'period2': {
                'months': period1_months,
                'total_spending': round(recent_total, 2),
                'average_per_month': round(recent_total / period1_months, 2) if period1_months > 0 else 0,
            },
            'comparison': {
                'change_amount': round(change_amount, 2),
                'change_percent': round(change_percent, 2),
                'direction': direction,
            },
        }

    def get_all_analyses(self, user_id: int, group_id: Optional[int] = None,
                        months: int = 12) -> Dict:
        """
        Get all analyses in one call.

        Perfect for dashboard/API response.

        Args:
            user_id: User to analyse.
            group_id: When not None, group statistics are included as well.
            months: Look-back window for the pattern and trend analyses.

        Returns:
            Complete analysis including:
            - Spending patterns
            - Group statistics (if group_id provided)
            - Historical trends
            - Period comparisons (last 3 months vs. the 3 months before)
        """
        result = {
            'user_id': user_id,
            'generated_at': datetime.now().isoformat(),
            'spending_patterns': self.analyze_spending_patterns(user_id, months),
            'historical_trends': self.analyze_spending_trends(user_id, months),
            'period_comparison': self.compare_periods(user_id, period1_months=3, period2_months=3),
        }
        # Compare against None so a falsy id such as 0 is not silently dropped.
        if group_id is not None:
            result['group_statistics'] = self.get_group_spending_analysis(group_id)
        return result
def main():
    """Test all analysis functions.

    Connects to the database, runs every analysis for user 1 / group 1 and
    prints a summary.  Any exception is printed with its traceback, and the
    database handle is disconnected in all cases (including a failed connect).
    """
    db = SplitDataDB()
    try:
        if not db.connect():
            print("Failed to connect to database")
            return
        analyzer = SpendingAnalysis(db)
        user_id = 1
        print("="*70)
        print("COMPREHENSIVE ANALYSIS TEST")
        print("="*70)
        # 1. Spending Pattern Analysis
        print("\n1. SPENDING PATTERN ANALYSIS:")
        print("-" * 70)
        patterns = analyzer.analyze_spending_patterns(user_id, months=12)
        print(f"Average Weekly Spending: €{patterns['average_weekly_spending']:.2f}")
        print(f"Average Monthly Spending: €{patterns['average_monthly_spending']:.2f}")
        print(f"Spending Consistency: {patterns['spending_consistency_score']:.1f}%")
        print(f"Peak Spending Period: {patterns['peak_spending_period']}")
        print(f"Top Category: {patterns['top_category']}")
        # 2. Group Statistics
        print("\n2. GROUP SPENDING STATISTICS:")
        print("-" * 70)
        group_stats = analyzer.get_group_spending_analysis(group_id=1)
        print(f"Total Expenses: {group_stats['total_expenses']}")
        print(f"Total Members: {group_stats['total_members']}")
        print(f"Total Spending: €{group_stats['total_spending']:.2f}")
        print(f"Average per Member: €{group_stats['average_per_member']:.2f}")
        print(f"Settlement Rate: {group_stats['settlement_rate']:.1f}%")
        print(f"Most Expensive Category: {group_stats['most_expensive_category']}")
        # 3. ML Categorization (already implemented)
        print("\n3. ML-BASED EXPENSE CATEGORIZATION:")
        print("-" * 70)
        print("✅ Implemented in expense_classifier.py")
        print("   - Top 3 tag suggestions")
        print("   - Auto-classification")
        print("   - EU/Ireland support")
        # 4. Historical Trend Analysis
        print("\n4. HISTORICAL TREND ANALYSIS:")
        print("-" * 70)
        trends = analyzer.analyze_spending_trends(user_id, months=12)
        print(f"Trend Direction: {trends['trend_direction']}")
        print(f"Average Growth Rate: {trends['average_growth_rate']:.2f}%")
        print(f"Spending Velocity: {trends['spending_velocity']:.2f}%")
        print(f"Current vs Previous: {trends['period_comparison']['change_percent']:.2f}% change")
        print(f"Peak Spending Month: {trends['peak_spending_month']}")
        # Period Comparison
        # In the result, 'period2' is the most recent window and 'period1'
        # the window before it.
        print("\n5. PERIOD COMPARISON:")
        print("-" * 70)
        comparison = analyzer.compare_periods(user_id, period1_months=3, period2_months=3)
        print(f"Last 3 months: €{comparison['period2']['total_spending']:.2f}")
        print(f"Previous 3 months: €{comparison['period1']['total_spending']:.2f}")
        print(f"Change: {comparison['comparison']['change_percent']:.2f}% ({comparison['comparison']['direction']})")
        print("\n" + "="*70)
        print("✅ ALL 4 RESPONSIBILITIES IMPLEMENTED!")
        print("="*70)
    except Exception as e:
        # Top-level boundary of a manual test script: report and fall through
        # to the cleanup below.
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()
    finally:
        db.disconnect()
# Run the manual test routine only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

analysis.py

Latest commit

History

analysis.py

File metadata and controls