Skip to content

Commit ce62d89

Browse files
authored
wip: add warnings and critical issues checks (#9)
* feat(analyzer): add interactions and correlations - add scatter plots pairs for numeric variables - compute correlation matrices (Pearson, Spearman and Kendall) - compute categorical correlations (Cramer's V) - compute mixed correlations (using ANOVA F-test as proxy). * chore(packages): add numpy (#8) * wip: add warnings and critical issues checks It can detect warnings and critical issues, but still skeptical about the quick fixes and impact scores.
1 parent 2b882cd commit ce62d89

File tree

1 file changed

+232
-0
lines changed

1 file changed

+232
-0
lines changed

hashprep/analyzer.py

Lines changed: 232 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,19 @@ def analyze(self) -> Dict:
4141
self._summarize_interactions()
4242
self._summarize_missing_values()
4343

44+
# ---- Warnings and Critical Issues ----
45+
self._check_data_leakage()
46+
self._check_high_missing_values()
47+
self._check_empty_columns()
48+
self._check_single_value_columns()
49+
self._check_target_leakage_patterns()
50+
self._check_class_imbalance()
51+
self._check_high_cardinality()
52+
self._check_duplicates()
53+
self._check_mixed_data_types()
54+
self._check_outliers()
55+
self._check_feature_correlation()
56+
4457
return self._generate_summary()
4558

4659
# =========================================================================
@@ -320,6 +333,224 @@ def _summarize_missing_values(self):
320333
self.summaries["missing_values"] = {
321334
"count": missing_count,
322335
"percentage": missing_percentage,
336+
337+
}
338+
339+
# Simple missingness heatmap structure (list of missing row indexes)
340+
self.summaries["missing_patterns"] = {
341+
col: self.df[self.df[col].isna()].index.tolist()
342+
for col in self.df.columns
343+
if self.df[col].isna().any()
344+
}
345+
346+
# =========================================================================
347+
# Critical Issues & Warning Checks
348+
# =========================================================================
349+
350+
def _check_data_leakage(self, target_col: str = None):
    """Flag every feature column that is an exact copy of the target column."""
    # Without a valid target there is nothing to compare against.
    if not target_col or target_col not in self.df.columns:
        return
    target_series = self.df[target_col]
    for feature in self.df.columns:
        if feature == target_col:
            continue
        if not self.df[feature].equals(target_series):
            continue
        self.issues.append(
            Issues(
                category="data_leakage",
                severity="critical",
                column=feature,
                description=f"Column '{feature}' is identical to target '{target_col}'",
                impact_score="high",
                quick_fix="Drop the column before training.",
            )
        )
368+
369+
def _check_high_missing_values(self, threshold: float = 0.4):
    """Warn about every column whose missing-value ratio exceeds ``threshold``."""
    for column, series in self.df.items():
        missing_ratio = series.isna().mean()
        if missing_ratio <= threshold:
            continue
        self.issues.append(
            Issues(
                category="missing_values",
                severity="warning",
                column=column,
                description=f"{missing_ratio:.1%} missing values in '{column}'",
                impact_score="medium",
                quick_fix="Consider imputing or dropping this column.",
            )
        )
384+
385+
def _check_empty_columns(self):
    """Flag columns that hold no data at all (every single value is missing)."""
    for column in self.df.columns:
        # isna().all() is equivalent to "zero non-missing values".
        if not self.df[column].isna().all():
            continue
        self.issues.append(
            Issues(
                category="empty_column",
                severity="critical",
                column=column,
                description=f"Column '{column}' has no non-missing values",
                impact_score="high",
                quick_fix="Drop the column.",
            )
        )
399+
400+
def _check_single_value_columns(self):
    """Flag constant columns: all non-null values are the same single value."""
    for column in self.df.columns:
        if self.df[column].nunique(dropna=True) != 1:
            continue
        self.issues.append(
            Issues(
                category="single_value",
                severity="warning",
                column=column,
                description=f"Column '{column}' contains only one unique value",
                impact_score="low",
                quick_fix="Drop this column (not informative).",
            )
        )
414+
415+
def _check_target_leakage_patterns(self, target_col: str = None):
    """
    Detect numeric columns whose correlation with the target is suspiciously
    high (> 0.95), which usually indicates leakage.

    Runs only when ``target_col`` is given, exists in the frame, and is numeric.
    """
    if not target_col or target_col not in self.df.columns:
        return
    target_series = self.df[target_col]
    if not pd.api.types.is_numeric_dtype(target_series):
        return
    features = self.df.select_dtypes(include="number").drop(
        columns=[target_col], errors="ignore"
    )
    if features.empty:
        return
    abs_corr = features.corrwith(target_series).abs()
    # Boolean-mask filter replaces the per-item threshold test.
    for feature, strength in abs_corr[abs_corr > 0.95].items():
        self.issues.append(
            Issues(
                category="target_leakage",
                severity="critical",
                column=feature,
                description=f"Column '{feature}' highly correlated with target ({strength:.2f})",
                impact_score="high",
                quick_fix="Remove this column before training.",
            )
        )
439+
440+
def _check_class_imbalance(self, target_col: str = None, threshold: float = 0.9):
    """Warn when the most frequent class of the target dominates the column.

    Args:
        target_col: Name of the target column; the check is skipped when it
            is None or absent from the frame.
        threshold: Maximum allowed share (0-1) of the most frequent class.
    """
    if not (target_col and target_col in self.df.columns):
        return
    counts = self.df[target_col].value_counts(normalize=True)
    # Bug fix: value_counts() is empty for an all-NaN target, and
    # counts.iloc[0] would raise IndexError in that case.
    if counts.empty:
        return
    majority_share = counts.iloc[0]
    if majority_share > threshold:
        self.issues.append(
            Issues(
                category="class_imbalance",
                severity="warning",
                column=target_col,
                description=f"Target '{target_col}' is imbalanced ({majority_share:.1%} in one class)",
                impact_score="medium",
                quick_fix="Consider stratified sampling, resampling, or class-weighted models.",
            )
        )
455+
456+
def _check_high_cardinality(self, threshold: int = 100):
    """Warn about object-dtype columns with more than ``threshold`` unique values."""
    for column in self.df.select_dtypes(include="object").columns:
        n_unique = self.df[column].nunique()
        if n_unique <= threshold:
            continue
        self.issues.append(
            Issues(
                category="high_cardinality",
                severity="warning",
                column=column,
                description=f"Column '{column}' has {n_unique} unique values",
                impact_score="medium",
                quick_fix="Consider feature hashing or grouping rare categories.",
            )
        )
472+
473+
def _check_duplicates(self):
    """Report fully duplicated rows as a single dataset-level warning."""
    n_duplicates = self.df.duplicated().sum()
    if not n_duplicates:
        return
    self.issues.append(
        Issues(
            category="duplicates",
            severity="warning",
            column="__all__",
            description=f"Dataset contains {n_duplicates} duplicate rows",
            impact_score="medium",
            quick_fix="Drop duplicates if not meaningful.",
        )
    )
487+
488+
def _check_mixed_data_types(self):
    """Warn when a column's non-null values span more than one Python type."""
    for column in self.df.columns:
        distinct_types = self.df[column].dropna().map(type).nunique()
        if distinct_types <= 1:
            continue
        self.issues.append(
            Issues(
                category="mixed_types",
                severity="warning",
                column=column,
                description=f"Column '{column}' contains mixed data types",
                impact_score="low",
                quick_fix="Clean or cast to a single type.",
            )
        )
503+
504+
def _check_outliers(self, z_threshold: float = 4.0):
    """Flag numeric columns with extreme outliers based on the Z-score.

    Args:
        z_threshold: Absolute Z-score above which a value counts as an outlier.
    """
    numeric_df = self.df.select_dtypes(include="number")
    if numeric_df.empty:
        return

    for col in numeric_df.columns:
        # Bug fix: the previous version called .dropna() on the whole frame,
        # which discarded every row containing a NaN in *any* numeric column
        # and thereby skewed the statistics of columns with no missing values.
        # (It also imported scipy.stats.zscore without ever using it.)
        values = numeric_df[col].dropna()
        if values.empty:
            continue
        spread = values.std(ddof=0)
        # A constant column has zero spread; Z-scores are undefined there.
        if spread == 0:
            continue
        z_scores = (values - values.mean()) / spread
        outlier_count = int((z_scores.abs() > z_threshold).sum())
        if outlier_count > 0:
            self.issues.append(
                Issues(
                    category="outliers",
                    severity="warning",
                    column=col,
                    description=f"Column '{col}' has {outlier_count} potential outliers",
                    impact_score="medium",
                    quick_fix="Investigate values; consider winsorization or transformations.",
                )
            )
526+
527+
def _check_feature_correlation(self, threshold: float = 0.95):
    """Detect pairs of numeric features whose |correlation| exceeds ``threshold``.

    Args:
        threshold: Absolute Pearson correlation above which a pair is flagged.
    """
    numeric_df = self.df.select_dtypes(include="number")
    if numeric_df.empty:
        return

    corr_matrix = numeric_df.corr().abs()
    # Bug fix: the previous mask, np.tril(np.ones(...)), kept the lower
    # triangle INCLUDING the diagonal. Diagonal entries are 1.0 > threshold,
    # so every numeric column was spuriously reported as correlated with
    # itself. Keep the strict upper triangle (k=1) instead, so each distinct
    # pair is inspected exactly once and the diagonal is excluded.
    mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
    upper = corr_matrix.where(mask)
    correlated_pairs = [
        (col, row, val)
        for row in upper.index
        for col, val in upper[row].dropna().items()
        if val > threshold
    ]

    for col1, col2, corr in correlated_pairs:
        self.issues.append(
            Issues(
                category="feature_correlation",
                severity="warning",
                column=f"{col1},{col2}",
                description=f"Columns '{col1}' and '{col2}' are highly correlated ({corr:.2f})",
                impact_score="medium",
                quick_fix="Consider dropping one of the correlated features.",
            )
        )
553+
323554
}
324555

325556
# Simple missingness heatmap structure (list of missing row indexes)
@@ -329,6 +560,7 @@ def _summarize_missing_values(self):
329560
if self.df[col].isna().any()
330561
}
331562

563+
332564
# =========================================================================
333565
# Generate Summary
334566
# =========================================================================

0 commit comments

Comments
 (0)