Pulse/data_stream.py at main · rizzshi/Pulse

242 lines (183 loc) · 7.62 KB
"""
Algorzen Pulse - Data Stream Module
Author: Rishi Singh | Algorzen Research Division © 2025

Handles live data fetching and stream simulation for real-time monitoring.
"""
import csv
import time
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
class DataStream:
    """
    Manages continuous data stream simulation and live data fetching.

    This class simulates real-time business metrics (sales, engagement, conversions)
    by generating realistic time-series data with trends, seasonality, and noise.
    """

    def __init__(self, stream_interval: int = 5):
        """
        Initialize the data stream.

        Args:
            stream_interval: Seconds between data points (default: 5)
        """
        self.stream_interval = stream_interval
        # Polled by simulate_realtime_stream(); stop_stream() clears it.
        self.is_streaming = False

    def generate_live_feed(
        self,
        output_path: str,
        num_days: int = 30,
        metrics: list = None
    ) -> pd.DataFrame:
        """
        Generate a realistic live data feed with multiple KPIs.

        The feed is hourly, covers the `num_days` days ending now, and is
        written to `output_path` as CSV (parent directories are created).

        Args:
            output_path: Path to save the generated CSV
            num_days: Number of days of historical data to generate
            metrics: List of metric names to generate (default: sales, engagement, conversions)

        Returns:
            DataFrame with a 'timestamp' column plus one column per metric
        """
        if metrics is None:
            metrics = ['sales', 'engagement', 'conversions']

        # Generate timestamps (hourly data). A timedelta frequency is used
        # instead of the 'H' alias, which newer pandas releases deprecate.
        end_time = datetime.now()
        start_time = end_time - timedelta(days=num_days)
        timestamps = pd.date_range(
            start=start_time, end=end_time, freq=timedelta(hours=1)
        )

        data = {'timestamp': timestamps}
        # Generate each metric with realistic patterns
        for metric in metrics:
            data[metric] = self._generate_metric_series(len(timestamps), metric)
        df = pd.DataFrame(data)

        # Save to CSV
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(output_path, index=False)

        print(f"✅ Generated live feed: {len(df)} data points across {num_days} days")
        print(f"   Metrics: {', '.join(metrics)}")
        print(f"   Saved to: {output_path}")
        return df

    def _generate_metric_series(self, length: int, metric_name: str) -> np.ndarray:
        """
        Generate a realistic time-series for a specific metric.

        Includes:
        - Base trend (linear rise of 20% of the baseline over the series)
        - Seasonal patterns (24-hour and 7-day sine cycles)
        - Gaussian noise and occasional multiplicative spikes

        Args:
            length: Number of data points to generate
            metric_name: Name of the metric (selects the baseline,
                case-insensitively; unknown names use 1000)

        Returns:
            NumPy array of `length` non-negative values
        """
        # Set metric-specific baselines
        baselines = {
            'sales': 10000,
            'engagement': 5000,
            'conversions': 500,
            'revenue': 50000,
            'clicks': 8000,
            'impressions': 100000,
        }
        baseline = baselines.get(metric_name.lower(), 1000)

        hours = np.arange(length)
        # Generate base trend (slightly upward)
        trend = np.linspace(0, baseline * 0.2, length)
        # Add daily seasonality (24-hour cycle)
        daily_pattern = baseline * 0.15 * np.sin(2 * np.pi * hours / 24)
        # Add weekly seasonality (7-day cycle)
        weekly_pattern = baseline * 0.1 * np.sin(2 * np.pi * hours / (24 * 7))
        # Add random noise
        noise = np.random.normal(0, baseline * 0.05, length)

        # Combine components
        series = baseline + trend + daily_pattern + weekly_pattern + noise

        # Spike exactly 3% of the points (rounded down), chosen without
        # replacement. Skipped for short series so choice() is never asked
        # to sample from an empty population.
        spike_count = int(length * 0.03)
        if spike_count:
            spike_indices = np.random.choice(length, size=spike_count, replace=False)
            series[spike_indices] *= np.random.uniform(1.3, 1.8, spike_count)

        # Ensure no negative values
        return np.maximum(series, 0)

    def load_stream(self, input_path: str) -> pd.DataFrame:
        """
        Load existing data stream from CSV.

        Args:
            input_path: Path to CSV file

        Returns:
            DataFrame with loaded data; a 'timestamp' column, if present,
            is converted to datetime
        """
        df = pd.read_csv(input_path)
        # Ensure timestamp is datetime
        if 'timestamp' in df.columns:
            df['timestamp'] = pd.to_datetime(df['timestamp'])

        print(f"📊 Loaded stream: {len(df)} data points")
        print(f"   Columns: {', '.join(df.columns)}")
        return df

    def append_data_point(
        self,
        output_path: str,
        new_data: dict
    ) -> None:
        """
        Append a new data point to the live feed (for continuous streaming).

        Parent directories are created if needed, and a header row is written
        when the file is missing or empty.

        Args:
            output_path: Path to CSV file
            new_data: Dictionary with the new data point. If it has no
                'timestamp' key, one is added in place with the current time.
        """
        # Ensure timestamp is included
        if 'timestamp' not in new_data:
            new_data['timestamp'] = datetime.now()

        target = Path(output_path)
        # Match generate_live_feed(): a missing directory must not be fatal.
        target.parent.mkdir(parents=True, exist_ok=True)
        # An existing but empty file still needs its header row.
        needs_header = not target.exists() or target.stat().st_size == 0

        # Append to CSV
        with open(target, 'a', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=list(new_data.keys()))
            if needs_header:
                writer.writeheader()
            writer.writerow(new_data)

    def simulate_realtime_stream(
        self,
        duration_minutes: int = 5,
        output_path: str = 'data/realtime_stream.csv'
    ) -> pd.DataFrame:
        """
        Simulate a real-time data stream for testing purposes.

        Emits one data point every `stream_interval` seconds until the
        duration elapses or stop_stream() is called, appending each point
        to `output_path` as it is produced.

        Args:
            duration_minutes: How long to run the simulation
            output_path: Where to save the stream data

        Returns:
            DataFrame with all generated data points
        """
        print(f"🔴 Starting real-time stream simulation ({duration_minutes} minutes)...")
        self.is_streaming = True
        # A monotonic deadline measures the full elapsed time.
        # timedelta.seconds is only the seconds component and wraps every
        # 24 hours, and wall-clock time can jump.
        deadline = time.monotonic() + duration_minutes * 60
        data_points = []

        try:
            while self.is_streaming and time.monotonic() < deadline:
                # Generate new data point with realistic variation
                data_point = {
                    'timestamp': datetime.now(),
                    'sales': np.random.normal(10000, 500),
                    'engagement': np.random.normal(5000, 300),
                    'conversions': np.random.normal(500, 50),
                }
                data_points.append(data_point)
                self.append_data_point(output_path, data_point)
                print(f"   📍 Data point {len(data_points)}: Sales={data_point['sales']:.0f}")
                time.sleep(self.stream_interval)
        finally:
            # Reset the flag even if writing a point fails.
            self.is_streaming = False

        df = pd.DataFrame(data_points)
        print(f"✅ Stream simulation complete: {len(df)} data points")
        return df

    def stop_stream(self):
        """Stop the real-time stream simulation."""
        self.is_streaming = False
        print("⏹️ Stream stopped")
# Example usage: generate a 30-day sample feed and preview its last rows.
if __name__ == "__main__":
    stream = DataStream()
    # Generate sample live feed
    df = stream.generate_live_feed(
        output_path='data/sample_live_feed.csv',
        num_days=30,
        metrics=['sales', 'engagement', 'conversions', 'revenue'],
    )
    print("\n📈 Sample data preview:")
    print(df.tail(10))
Provide feedback

Saved searches

Use saved searches to filter your results more quickly

FilesExpand file tree

data_stream.py

Latest commit

History

data_stream.py

File metadata and controls