@@ -41,6 +41,19 @@ def analyze(self) -> Dict:
4141 self ._summarize_interactions ()
4242 self ._summarize_missing_values ()
4343
44+ # ---- Warnings and Critical Issues ----
45+ self ._check_data_leakage ()
46+ self ._check_high_missing_values ()
47+ self ._check_empty_columns ()
48+ self ._check_single_value_columns ()
49+ self ._check_target_leakage_patterns ()
50+ self ._check_class_imbalance ()
51+ self ._check_high_cardinality ()
52+ self ._check_duplicates ()
53+ self ._check_mixed_data_types ()
54+ self ._check_outliers ()
55+ self ._check_feature_correlation ()
56+
4457 return self ._generate_summary ()
4558
4659 # =========================================================================
@@ -320,6 +333,224 @@ def _summarize_missing_values(self):
320333 self .summaries ["missing_values" ] = {
321334 "count" : missing_count ,
322335 "percentage" : missing_percentage ,
336+
337+ }
338+
339+ # Simple missingness heatmap structure (list of missing row indexes)
340+ self .summaries ["missing_patterns" ] = {
341+ col : self .df [self .df [col ].isna ()].index .tolist ()
342+ for col in self .df .columns
343+ if self .df [col ].isna ().any ()
344+ }
345+
346+ # =========================================================================
347+ # Critical Issues & Warning Checks
348+ # =========================================================================
349+
def _check_data_leakage(self, target_col: str = None):
    """Flag feature columns that are exact copies of the target column.

    Does nothing when ``target_col`` is falsy or is not a column of
    ``self.df``. Otherwise one critical ``Issues`` entry is appended to
    ``self.issues`` for every other column that ``Series.equals`` reports
    as equal to the target.
    """
    if not target_col or target_col not in self.df.columns:
        return

    reference = self.df[target_col]
    for name in self.df.columns:
        # The target trivially equals itself; only compare the features.
        if name == target_col or not self.df[name].equals(reference):
            continue
        leak = Issues(
            category="data_leakage",
            severity="critical",
            column=name,
            description=f"Column '{name}' is identical to target '{target_col}'",
            impact_score="high",
            quick_fix="Drop the column before training.",
        )
        self.issues.append(leak)
368+
def _check_high_missing_values(self, threshold: float = 0.4):
    """Warn about columns whose share of missing values exceeds ``threshold``.

    ``threshold`` is a fraction (0.4 means 40%). One warning ``Issues``
    entry is appended to ``self.issues`` per offending column.
    """
    for name in self.df.columns:
        fraction_missing = self.df[name].isna().mean()
        # Written as "not >" so a NaN fraction (zero-row frame) is skipped,
        # exactly as a plain "> threshold" test would skip it.
        if not fraction_missing > threshold:
            continue
        warning = Issues(
            category="missing_values",
            severity="warning",
            column=name,
            description=f"{fraction_missing:.1%} missing values in '{name}'",
            impact_score="medium",
            quick_fix="Consider imputing or dropping this column.",
        )
        self.issues.append(warning)
384+
def _check_empty_columns(self):
    """Report columns that hold no non-missing value at all.

    One critical ``Issues`` entry is appended to ``self.issues`` for each
    column of ``self.df`` in which every entry is missing.
    """
    for name in self.df.columns:
        has_any_value = self.df[name].notna().any()
        if has_any_value:
            continue
        finding = Issues(
            category="empty_column",
            severity="critical",
            column=name,
            description=f"Column '{name}' has no non-missing values",
            impact_score="high",
            quick_fix="Drop the column.",
        )
        self.issues.append(finding)
399+
def _check_single_value_columns(self):
    """Report columns whose non-missing entries all share one value.

    Missing values are ignored when counting distinct values. One warning
    ``Issues`` entry is appended to ``self.issues`` per constant column.
    """
    for name in self.df.columns:
        distinct_values = self.df[name].nunique(dropna=True)
        if distinct_values != 1:
            continue
        finding = Issues(
            category="single_value",
            severity="warning",
            column=name,
            description=f"Column '{name}' contains only one unique value",
            impact_score="low",
            quick_fix="Drop this column (not informative).",
        )
        self.issues.append(finding)
414+
def _check_target_leakage_patterns(self, target_col: str = None):
    """Flag numeric features that correlate almost perfectly with the target.

    Does nothing unless ``target_col`` names a numeric column of
    ``self.df`` and at least one other numeric column has data. A critical
    ``Issues`` entry is appended to ``self.issues`` for each numeric
    feature whose absolute correlation with the target is above 0.95.
    """
    if not target_col or target_col not in self.df.columns:
        return

    target_series = self.df[target_col]
    # errors="ignore" because a non-numeric target is not among these columns.
    candidates = self.df.select_dtypes(include="number").drop(
        columns=[target_col], errors="ignore"
    )
    if candidates.empty or not pd.api.types.is_numeric_dtype(target_series):
        return

    strengths = candidates.corrwith(target_series).abs()
    for name, strength in strengths.items():
        if not strength > 0.95:
            continue
        leak = Issues(
            category="target_leakage",
            severity="critical",
            column=name,
            description=f"Column '{name}' highly correlated with target ({strength:.2f})",
            impact_score="high",
            quick_fix="Remove this column before training.",
        )
        self.issues.append(leak)
439+
def _check_class_imbalance(self, target_col: str = None, threshold: float = 0.9):
    """Warn when a single class dominates the target column.

    Args:
        target_col: Name of the target column. The check is skipped when it
            is falsy or not a column of ``self.df``.
        threshold: Fraction of non-missing rows above which the most
            frequent class is considered dominant.

    A warning ``Issues`` entry is appended to ``self.issues`` when the most
    frequent class exceeds ``threshold``.
    """
    if not target_col or target_col not in self.df.columns:
        return

    class_shares = self.df[target_col].value_counts(normalize=True)
    # value_counts drops missing values, so an all-missing target yields an
    # empty Series; indexing its first element would raise IndexError.
    if class_shares.empty:
        return

    dominant_share = class_shares.max()
    if dominant_share > threshold:
        self.issues.append(
            Issues(
                category="class_imbalance",
                severity="warning",
                column=target_col,
                description=f"Target '{target_col}' is imbalanced ({dominant_share:.1%} in one class)",
                impact_score="medium",
                quick_fix="Consider stratified sampling, resampling, or class-weighted models.",
            )
        )
455+
def _check_high_cardinality(self, threshold: int = 100):
    """Warn about categorical columns with more than ``threshold`` distinct values.

    Both ``object`` columns and columns of pandas ``category`` dtype are
    inspected; selecting only ``object`` would skip columns that were
    explicitly converted to ``category``. One warning ``Issues`` entry is
    appended to ``self.issues`` per offending column.
    """
    categorical_cols = self.df.select_dtypes(include=["object", "category"]).columns
    for col in categorical_cols:
        unique_count = self.df[col].nunique()
        if unique_count <= threshold:
            continue
        self.issues.append(
            Issues(
                category="high_cardinality",
                severity="warning",
                column=col,
                description=f"Column '{col}' has {unique_count} unique values",
                impact_score="medium",
                quick_fix="Consider feature hashing or grouping rare categories.",
            )
        )
472+
def _check_duplicates(self):
    """Warn when ``self.df`` contains fully duplicated rows.

    A single dataset-wide warning ``Issues`` entry (column ``"__all__"``)
    carrying the number of duplicated rows is appended to ``self.issues``.
    """
    repeated_mask = self.df.duplicated()
    if not repeated_mask.any():
        return

    repeated_total = repeated_mask.sum()
    finding = Issues(
        category="duplicates",
        severity="warning",
        column="__all__",
        description=f"Dataset contains {repeated_total} duplicate rows",
        impact_score="medium",
        quick_fix="Drop duplicates if not meaningful.",
    )
    self.issues.append(finding)
487+
def _check_mixed_data_types(self):
    """Warn about columns whose non-missing values have more than one Python type.

    Each column's non-missing values are mapped through ``type`` and the
    distinct types counted. One warning ``Issues`` entry is appended to
    ``self.issues`` per column with more than one type.
    """
    for name in self.df.columns:
        value_types = self.df[name].dropna().map(type)
        if value_types.nunique() <= 1:
            continue
        finding = Issues(
            category="mixed_types",
            severity="warning",
            column=name,
            description=f"Column '{name}' contains mixed data types",
            impact_score="low",
            quick_fix="Clean or cast to a single type.",
        )
        self.issues.append(finding)
503+
def _check_outliers(self, z_threshold: float = 4.0):
    """Warn about numeric columns containing extreme values by Z-score.

    Args:
        z_threshold: Absolute Z-score above which a value counts as an
            outlier.

    Z-scores are computed per column from that column's own non-missing
    values (population standard deviation, ``ddof=0``). Dropping rows
    across all numeric columns at once would discard valid data, and a
    single all-missing column would disable the check for every column.
    One warning ``Issues`` entry is appended to ``self.issues`` for each
    column with at least one outlier.
    """
    numeric_df = self.df.select_dtypes(include="number")
    for col in numeric_df.columns:
        values = numeric_df[col].dropna()
        if values.empty:
            continue

        spread = values.std(ddof=0)
        # A constant column has zero spread (and NaN is possible for
        # degenerate input); no Z-score is defined, so nothing is flagged.
        if not spread > 0:
            continue

        z_scores = (values - values.mean()) / spread
        outlier_count = int((z_scores.abs() > z_threshold).sum())
        if outlier_count == 0:
            continue

        self.issues.append(
            Issues(
                category="outliers",
                severity="warning",
                column=col,
                description=f"Column '{col}' has {outlier_count} potential outliers",
                impact_score="medium",
                quick_fix="Investigate values; consider winsorization or transformations.",
            )
        )
526+
def _check_feature_correlation(self, threshold: float = 0.95):
    """Warn about pairs of numeric features with very high correlation.

    Args:
        threshold: Absolute correlation above which a pair is reported.

    Only the strict upper triangle of the absolute correlation matrix is
    inspected, so each unordered pair is reported once and a column is
    never compared with itself. One warning ``Issues`` entry is appended to
    ``self.issues`` per reported pair.
    """
    numeric_df = self.df.select_dtypes(include="number")
    # At least two numeric columns are needed to form a pair.
    if numeric_df.shape[1] < 2:
        return

    corr_matrix = numeric_df.corr().abs()
    # k=1 excludes the diagonal: a column's correlation with itself is
    # always 1.0 and would otherwise be reported for every column.
    strict_upper = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(strict_upper)
    correlated_pairs = [
        (first, second, value)
        for second in upper.columns
        for first, value in upper[second].dropna().items()
        if value > threshold
    ]

    for col1, col2, corr in correlated_pairs:
        self.issues.append(
            Issues(
                category="feature_correlation",
                severity="warning",
                column=f"{col1},{col2}",
                description=f"Columns '{col1}' and '{col2}' are highly correlated ({corr:.2f})",
                impact_score="medium",
                quick_fix="Consider dropping one of the correlated features.",
            )
        )
553+
323554 }
324555
325556 # Simple missingness heatmap structure (list of missing row indexes)
@@ -329,6 +560,7 @@ def _summarize_missing_values(self):
329560 if self .df [col ].isna ().any ()
330561 }
331562
563+
332564 # =========================================================================
333565 # Generate Summary
334566 # =========================================================================
0 commit comments