22from typing import Dict , Optional
33import pandas as pd
44import hashlib
5+ from scipy .stats import chi2_contingency
6+ import numpy as np
57
68
79@dataclass
@@ -36,9 +38,25 @@ def analyze(self) -> Dict:
3638 self ._summarize_variable_types ()
3739 self ._add_reproduction_info ()
3840 self ._summarize_variables ()
41+ self ._summarize_interactions ()
42+ self ._summarize_missing_values ()
3943
4044 return self ._generate_summary ()
4145
46+ # =========================================================================
47+ # Sample Section
48+ # =========================================================================
def _get_dataset_preview(self):
    """Store head, tail and random-sample previews of the dataset.

    Adds three top-level keys to ``self.summaries`` -- "head", "tail" and
    "sample" -- each holding a list of row dicts (``orient="records"``).
    The sample holds at most 10 rows and is drawn without a fixed seed.
    """
    frame = self.df
    # Cap the request at the frame length so small frames can be sampled.
    sample_size = min(10, len(frame))
    previews = (
        ("head", frame.head()),
        ("tail", frame.tail()),
        ("sample", frame.sample(sample_size)),
    )
    self.summaries.update(
        {label: rows.to_dict(orient="records") for label, rows in previews}
    )
59+
4260 # =========================================================================
4361 # Overview Section
4462 # =========================================================================
@@ -143,7 +161,7 @@ def _summarize_categorical_column(self, col: str):
143161 stats = {
144162 "count" : int (series .count ()),
145163 "unique" : int (series .nunique ()),
146- "top_values" : series .value_counts ().head (10 ).to_dict (), # top 10 only
164+ "top_values" : series .value_counts ().head (10 ).to_dict (), # top 10 only
147165 "most_frequent" : series .mode ().iloc [0 ] if not series .empty else None ,
148166 "missing" : int (self .df [col ].isna ().sum ()),
149167 }
@@ -164,12 +182,15 @@ def _summarize_text_column(self, col: str):
164182 "avg_length" : float (lengths .mean ()) if not lengths .empty else None ,
165183 "min_length" : float (lengths .min ()) if not lengths .empty else None ,
166184 "max_length" : float (lengths .max ()) if not lengths .empty else None ,
167- "common_lengths" : lengths .value_counts ().head (5 ).to_dict (), # top 5
185+ "common_lengths" : lengths .value_counts ().head (5 ).to_dict (), # top 5
168186 "char_freq" : (
169- pd .Series (list ("" .join (series ))).value_counts ().head (10 ).to_dict () # top 10
187+ pd .Series (list ("" .join (series )))
188+ .value_counts ()
189+ .head (10 )
190+ .to_dict () # top 10
170191 if not series .empty
171192 else None
172- ), # top 10 only
193+ ), # top 10 only
173194 }
174195
175196 if "variables" not in self .summaries :
@@ -201,18 +222,112 @@ def _summarize_datetime_column(self, col: str):
201222 self .summaries ["variables" ][col ] = stats
202223
203224 # =========================================================================
204- # Sample Section
225+ # Interactions and Correlations Section
205226 # =========================================================================
206- def _get_dataset_preview (self ):
207- head = self .df .head ()
208- tail = self .df .tail ()
209- sample = self .df .sample (min (10 , len (self .df )))
210- dataset_preview = {
211- "head" : head .to_dict (orient = "records" ),
212- "tail" : tail .to_dict (orient = "records" ),
213- "sample" : sample .to_dict (orient = "records" ),
def _summarize_interactions(self):
    """Run each variable-interaction analysis in a fixed order."""
    step_names = (
        "_scatter_plots_numeric",
        "_compute_correlation_matrices",
        "_compute_categorical_correlations",
        "_compute_mixed_correlations",
    )
    for step_name in step_names:
        # Look each step up just before calling it, one after another.
        getattr(self, step_name)()
233+
def _scatter_plots_numeric(self):
    """Record every unordered pair of numeric columns for scatter plotting.

    Stores a list of ``(col_a, col_b)`` tuples under
    ``self.summaries["scatter_pairs"]``. Each pair appears once, with the
    columns in the order they occur in the DataFrame. With fewer than two
    numeric columns the list is empty.

    For CLI: just the pairs.
    For Web/Report: plot them.
    """
    from itertools import combinations

    numeric_columns = self.df.select_dtypes(include="number").columns
    # combinations() yields each i < j pair exactly once, in column order,
    # which is what the previous nested index/slice loop produced by hand.
    pairs = list(combinations(numeric_columns, 2))
    self.summaries["scatter_pairs"] = pairs  # TODO: Plot these
247+
def _compute_correlation_matrices(self):
    """Store Pearson, Spearman and Kendall matrices for numeric columns.

    Writes ``self.summaries["numeric_correlations"]``: a dict keyed by
    method name whose values are the ``DataFrame.corr`` result converted
    with ``to_dict()``. The dict stays empty when the numeric selection is
    empty.
    """
    numeric_df = self.df.select_dtypes(include="number")
    if numeric_df.empty:
        matrices = {}
    else:
        matrices = {
            method: numeric_df.corr(method=method).to_dict()
            for method in ("pearson", "spearman", "kendall")
        }
    self.summaries["numeric_correlations"] = matrices
257+
def _compute_categorical_correlations(self):
    """Compute Cramér's V for every pair of categorical columns.

    Columns of dtype ``object`` or ``category`` are paired (each unordered
    pair once) and the result is stored under
    ``self.summaries["categorical_correlations"]`` as
    ``{"<col1>__<col2>": V}`` with ``V`` a plain ``float``.

    Pairs are skipped, leaving no entry, when the statistic is undefined
    or cannot be computed:

    * the contingency table has fewer than two rows or two columns (a
      single-level or all-missing column), since the ``min(k - 1, r - 1)``
      denominator would be zero;
    * building the table or the chi-square test raises ``ValueError`` or
      ``TypeError``.
    """
    categorical = self.df.select_dtypes(include=["object", "category"]).columns
    results = {}
    for i, c1 in enumerate(categorical):
        for c2 in categorical[i + 1:]:
            try:
                table = pd.crosstab(self.df[c1], self.df[c2])
                r, k = table.shape
                if min(r, k) < 2:
                    # Degenerate table: Cramér's V is undefined here.
                    continue
                # Cramér's V is defined on the uncorrected chi-square
                # statistic. scipy applies Yates' continuity correction to
                # 2x2 tables by default, which would shrink V (a perfect
                # association would come out below 1).
                chi2 = chi2_contingency(table, correction=False)[0]
            except (ValueError, TypeError):
                # Best effort: this pair cannot be tabulated or tested.
                continue

            n = table.to_numpy().sum()
            phi2 = chi2 / n
            cramers_v = (phi2 / min(k - 1, r - 1)) ** 0.5
            results[f"{c1}__{c2}"] = float(cramers_v)

    self.summaries["categorical_correlations"] = results
276+
def _compute_mixed_correlations(self):
    """Relate categorical and numeric columns with a one-way ANOVA F-test.

    For every (categorical, numeric) column pair, the numeric values are
    grouped by category level and passed to ``scipy.stats.f_oneway``. The
    F statistic is used as a proxy for association strength.

    Results go to ``self.summaries["mixed_correlations"]`` keyed by
    ``"<cat>__<num>"``. Each value is either
    ``{"f_stat": float, "p_value": float}`` or, when the test itself
    raised, ``{"error": "<message>"}``.

    A pair gets no entry when fewer than two levels have more than one
    non-missing value, or when every usable group has zero variance.
    """
    from scipy.stats import f_oneway

    cat_cols = self.df.select_dtypes(include=["object", "category"]).columns
    # "number" matches the numeric selection of the other correlation
    # methods, so int32/float32/unsigned columns are no longer dropped.
    num_cols = self.df.select_dtypes(include="number").columns
    mixed_corr = {}

    for cat in cat_cols:
        for num in num_cols:
            key = f"{cat}__{num}"

            # Drop rows missing either value once, then split in one pass
            # instead of re-scanning the whole frame for every level.
            pair = self.df[[cat, num]].dropna()
            grouped = pair.groupby(cat, sort=False, observed=True)[num]
            groups = [
                values.to_numpy(dtype=float)
                for _, values in grouped
                if len(values) > 1  # Only groups with more than 1 value
            ]

            if len(groups) < 2:
                continue  # Need at least 2 valid groups for ANOVA

            # Skip if all groups have zero variance
            if all(np.var(g, ddof=1) == 0 for g in groups):
                continue

            try:
                f_stat, p_val = f_oneway(*groups)
            except Exception as e:
                # Deliberate best effort: record the failure for this pair
                # and keep processing the remaining pairs.
                mixed_corr[key] = {"error": str(e)}
            else:
                mixed_corr[key] = {
                    "f_stat": float(f_stat),
                    "p_value": float(p_val),
                }

    self.summaries["mixed_correlations"] = mixed_corr
311+
312+ # =========================================================================
313+ # Missing Value Section
314+ # =========================================================================
def _summarize_missing_values(self):
    """Summarize missing value patterns.

    Writes two entries to ``self.summaries``:

    * ``"missing_values"``: ``{"count": {col: n_missing}, "percentage":
      {col: pct_missing}}`` with percentages rounded to two decimals.
    * ``"missing_patterns"``: ``{col: [index labels of missing rows]}``,
      only for columns that have at least one missing value.
    """
    # Compute the null mask once; every figure below is derived from it.
    na_mask = self.df.isnull()

    self.summaries["missing_values"] = {
        "count": na_mask.sum().to_dict(),
        "percentage": (na_mask.mean() * 100).round(2).to_dict(),
    }

    # Simple missingness heatmap structure (list of missing row indexes).
    # The mask is applied to the index directly, so the whole DataFrame is
    # not row-filtered once per column just to read its index.
    self.summaries["missing_patterns"] = {
        col: self.df.index[na_mask[col].to_numpy()].tolist()
        for col in self.df.columns
        if na_mask[col].any()
    }
215- self .summaries .update (dataset_preview )
216331
217332 # =========================================================================
218333 # Generate Summary
0 commit comments