Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions hashprep/analyzer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from statistics import correlation
from typing import Dict, List, Optional
import pandas as pd
from .checks.type_inference import infer_types

from .checks import run_checks
from .summaries import (
Expand All @@ -18,12 +20,15 @@ def __init__(
df: pd.DataFrame,
target_col: Optional[str] = None,
selected_checks: Optional[List[str]] = None,
include_plots: bool = False,
):
self.df = df
self.target_col = target_col
self.selected_checks = selected_checks
self.include_plots = include_plots
self.issues = []
self.summaries = {}
self.column_types = infer_types(df)
self.all_checks = [
"data_leakage", "high_missing_values", "empty_columns", "single_value_columns",
"target_leakage_patterns", "class_imbalance", "high_cardinality", "duplicates",
Expand All @@ -32,15 +37,19 @@ def __init__(
"extreme_text_lengths", "datetime_skew", "missing_patterns",
]


def analyze(self) -> Dict:
# """analyze columns first for better results"""
# classifications = self.classify_columns()
# print(classifications)
"""Run all summaries and checks, return summary"""
self.summaries.update(get_dataset_preview(self.df))
self.summaries.update(summarize_dataset_info(self.df))
self.summaries["variable_types"] = summarize_variable_types(self.df)
self.summaries["variable_types"] = summarize_variable_types(self.df, column_types=self.column_types) # Todo: Implement this arg
self.summaries["reproduction_info"] = add_reproduction_info(self.df)
self.summaries["variables"] = summarize_variables(self.df)
self.summaries.update(summarize_interactions(self.df))
self.summaries.update(summarize_missing_values(self.df))
self.summaries["variables"] = summarize_variables(self.df, include_plots=self.include_plots)
self.summaries.update(summarize_interactions(self.df, include_plots=self.include_plots))
self.summaries.update(summarize_missing_values(self.df, include_plots=self.include_plots))

checks_to_run = self.all_checks if self.selected_checks is None else [
check for check in self.selected_checks if check in self.all_checks
Expand All @@ -67,4 +76,5 @@ def _generate_summary(self):
} for issue in self.issues
],
"summaries": self.summaries,
"column_types": self.column_types,
}
27 changes: 20 additions & 7 deletions hashprep/checks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from typing import List, Optional

from .core import Issues
from .leakage import _check_data_leakage, _check_target_leakage_patterns
from .missing_values import _check_high_missing_values, _check_empty_columns, _check_dataset_missingness, _check_missing_patterns
from .missing_values import _check_high_missing_values, _check_empty_columns, _check_dataset_missingness, \
_check_missing_patterns
from .columns import _check_single_value_columns, _check_high_cardinality, _check_duplicates, _check_mixed_data_types
from .outliers import _check_outliers, _check_high_zero_counts, _check_extreme_text_lengths, _check_datetime_skew
from .correlations import _check_feature_correlation, _check_categorical_correlation, _check_mixed_correlation
from .correlations import calculate_correlations
from .imbalance import _check_class_imbalance

CHECKS = {
Expand All @@ -17,18 +20,28 @@
"duplicates": _check_duplicates,
"mixed_data_types": _check_mixed_data_types,
"outliers": _check_outliers,
"feature_correlation": _check_feature_correlation,
"categorical_correlation": _check_categorical_correlation,
"mixed_correlation": _check_mixed_correlation,
"dataset_missingness": _check_dataset_missingness,
"high_zero_counts": _check_high_zero_counts,
"extreme_text_lengths": _check_extreme_text_lengths,
"datetime_skew": _check_datetime_skew,
"missing_patterns": _check_missing_patterns,
}

def run_checks(analyzer, checks_to_run):
CORRELATION_CHECKS = {"feature_correlation", "categorical_correlation", "mixed_correlation"}


def run_checks(analyzer, checks_to_run: List[str]):
    """Run the requested data-quality checks against the analyzer.

    Correlation-style check names are not dispatched individually: if any
    name in ``CORRELATION_CHECKS`` is requested, the consolidated
    ``calculate_correlations`` entry point runs exactly once, covering the
    numeric, categorical, and mixed correlation families together.

    Args:
        analyzer: analyzer object exposing ``df`` and ``column_types``.
        checks_to_run: names of checks to execute; names not present in
            ``CHECKS`` (or in ``CORRELATION_CHECKS``) are silently ignored.

    Returns:
        list: accumulated issue records from every executed check.
    """
    issues = []
    correlation_requested = False

    for check in checks_to_run:
        if check in CORRELATION_CHECKS:
            # Defer to the single consolidated correlation pass below so the
            # pairwise work is not repeated once per requested alias.
            correlation_requested = True
            continue
        if check in CHECKS:
            issues.extend(CHECKS[check](analyzer))

    if correlation_requested:
        issues.extend(calculate_correlations(analyzer))

    return issues
251 changes: 178 additions & 73 deletions hashprep/checks/correlations.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,133 @@
from .core import Issues
import pandas as pd
from scipy.stats import chi2_contingency, f_oneway
import numpy as np
from scipy.stats import f_oneway, spearmanr, pearsonr, kendalltau, chi2_contingency
from itertools import combinations
from .discretizer import Discretizer, DiscretizationType
from .type_inference import infer_types, is_usable_for_corr

def _check_feature_correlation(analyzer, threshold: float = 0.95, critical_threshold: float = 0.98):

# Severity thresholds per correlation family: "warning" flags a pair for
# review, "critical" marks near-duplicate features.
CORR_THRESHOLDS = {
    'numeric': {
        'spearman': {'warning': 0.7, 'critical': 0.95},
        'pearson': {'warning': 0.7, 'critical': 0.95},
        'kendall': {'warning': 0.6, 'critical': 0.85},  # Kendall's tau runs lower than rho/r, so thresholds are reduced
    },
    'categorical': {'warning': 0.5, 'critical': 0.8},
    'mixed': {'warning': 0.5, 'critical': 0.8},  # Cramer's V coefficient thresholds (same scale as categorical)
}
# Categorical columns with more distinct values than this are excluded:
# contingency tables become too sparse to be meaningful.
CAT_MAX_DISTINCT = 50
# Numerics at or below this many distinct values are treated as low-cardinality
# (Kendall's tau is added for them); value mirrors type_inference.py.
LOW_CARD_NUM_THRESHOLD = 10

def _cramers_v_corrected(table: pd.DataFrame) -> float:
if table.empty or (table.shape[0] == 1 or table.shape[1] == 1):
return 0.0
chi2 = chi2_contingency(table, correction=True)[0]
n = table.sum().sum()
phi2 = chi2 / n
r, k = table.shape
with np.errstate(divide='ignore', invalid='ignore'):
phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
rcorr = r - ((r-1)**2)/(n-1)
kcorr = k - ((k-1)**2)/(n-1)
rkcorr = min((kcorr-1), (rcorr-1))
if rkcorr == 0:
return 1.0
return np.sqrt(phi2corr / rkcorr)


def calculate_correlations(analyzer, thresholds=None):
    """Run all correlation checks (numeric, categorical, mixed) once.

    Numeric pairs use Spearman + Pearson by default, with Kendall's tau
    added automatically for low-cardinality numerics. Categorical and
    mixed pairs are scored with bias-corrected Cramer's V.

    Args:
        analyzer: object exposing ``df`` (DataFrame) and ``column_types``
            (mapping of column name -> inferred type string).
        thresholds: optional override with the same nested structure as
            ``CORR_THRESHOLDS``; defaults to the module-level thresholds.

    Returns:
        list: issue records collected from all three correlation families.
    """
    if thresholds is None:
        thresholds = CORR_THRESHOLDS

    inferred_types = analyzer.column_types
    issues = []

    # Only columns with enough usable signal take part in correlation tests.
    numeric_cols = [
        col for col, typ in inferred_types.items()
        if typ == 'Numeric' and is_usable_for_corr(analyzer.df[col])
    ]
    # Very-high-cardinality categoricals make contingency tables too sparse.
    cat_cols = [
        col for col, typ in inferred_types.items()
        if typ == 'Categorical'
        and 1 < analyzer.df[col].nunique() <= CAT_MAX_DISTINCT
        and is_usable_for_corr(analyzer.df[col])
    ]

    # Internal default methods; Kendall is added per-pair inside the check.
    default_methods = ['spearman', 'pearson']
    issues.extend(_check_numeric_correlation(analyzer, numeric_cols, thresholds['numeric'], default_methods))
    issues.extend(_check_categorical_correlation(analyzer, cat_cols, thresholds['categorical']))
    issues.extend(_check_mixed_correlation(analyzer, numeric_cols, cat_cols, thresholds['mixed']))

    return issues


def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict, methods: list):
    """Flag strongly correlated numeric column pairs.

    For every pair, the coefficients named in ``methods`` ('spearman',
    'pearson') are computed; Kendall's tau is added automatically whenever
    either column is low-cardinality (<= LOW_CARD_NUM_THRESHOLD distinct
    values), where its tie handling is more appropriate.

    Fixes: the ``methods`` parameter was previously accepted but ignored
    (Spearman and Pearson always ran), and an unused ``corr_methods``
    lambda table was dead code.

    Args:
        analyzer: object exposing the DataFrame under ``df``.
        numeric_cols: numeric column names to compare pairwise.
        thresholds: mapping method name -> {'warning': x, 'critical': y}.
        methods: coefficient names to always compute.

    Returns:
        list: one issue per (pair, method) whose |coefficient| exceeds the
        warning threshold.
    """
    issues = []
    if len(numeric_cols) < 2:
        return issues

    num_df = analyzer.df[numeric_cols].dropna(how='all')
    estimators = {'spearman': spearmanr, 'pearson': pearsonr}

    for col1, col2 in combinations(numeric_cols, 2):
        series1, series2 = num_df[col1].dropna(), num_df[col2].dropna()
        common_idx = series1.index.intersection(series2.index)
        if len(common_idx) < 2:
            continue  # Not enough paired observations for a correlation.
        series1, series2 = series1.loc[common_idx], series2.loc[common_idx]

        # (display name, |coefficient|, p-value, thresholds) per requested method.
        metrics = []
        for method in methods:
            if method in estimators:
                corr, p_val = estimators[method](series1, series2)
                metrics.append((method.capitalize(), abs(corr), p_val, thresholds[method]))

        # Kendall only for low-cardinality numerics (ties dominate there).
        is_low_card = (series1.nunique() <= LOW_CARD_NUM_THRESHOLD or
                       series2.nunique() <= LOW_CARD_NUM_THRESHOLD)
        if is_low_card:
            kendall_corr, kendall_p = kendalltau(series1, series2)
            metrics.append(('Kendall', abs(kendall_corr), kendall_p, thresholds['kendall']))

        # Flag if any metric exceeds its warning threshold.
        for method, corr, p_val, thresh in metrics:
            if corr > thresh['warning']:
                severity = 'critical' if corr > thresh['critical'] else 'warning'
                impact = 'high' if severity == 'critical' else 'medium'
                quick_fix = (
                    f"Options: \n- Drop one feature (e.g., {col2}): Reduces multicollinearity.\n- PCA/combine: Retains info.\n- Use tree-based models."
                    if severity == 'critical' else
                    f"Options: \n- Monitor in modeling.\n- Drop if redundant."
                )
                issues.append(Issues(
                    category="feature_correlation",
                    severity=severity,
                    column=f"{col1},{col2}",
                    description=f"Numeric columns '{col1}' and '{col2}' highly correlated ({method}: {corr:.3f}, p={p_val:.4f})",
                    impact_score=impact,
                    quick_fix=quick_fix,
                ))

    return issues


def _check_feature_correlation(
analyzer, threshold: float = 0.95, critical_threshold: float = 0.98
):
issues = []
numeric_df = analyzer.df.select_dtypes(include="number")
if numeric_df.empty:
Expand Down Expand Up @@ -36,79 +160,60 @@ def _check_feature_correlation(analyzer, threshold: float = 0.95, critical_thres
)
return issues

def _check_categorical_correlation(analyzer, threshold: float = 0.8, critical_threshold: float = 0.95):

def _check_categorical_correlation(analyzer, cat_cols: list, thresholds: dict):
    """Flag highly associated categorical column pairs.

    Each pair is scored with bias-corrected Cramer's V over its
    contingency table (``pd.crosstab``).

    Note: this region of the diff view interleaved the removed old body
    (which referenced the dropped ``threshold`` parameter) with the added
    one; this is the clean added version.

    Args:
        analyzer: object exposing the DataFrame under ``df``.
        cat_cols: pre-filtered categorical column names (usable cardinality).
        thresholds: {'warning': x, 'critical': y} bounds for Cramer's V.

    Returns:
        list: one issue per pair exceeding the warning threshold.
    """
    issues = []
    if len(cat_cols) < 2:
        return issues

    for col1, col2 in combinations(cat_cols, 2):
        table = pd.crosstab(analyzer.df[col1], analyzer.df[col2])
        cramers_v = _cramers_v_corrected(table)
        if cramers_v > thresholds['warning']:
            severity = 'critical' if cramers_v > thresholds['critical'] else 'warning'
            impact = 'high' if severity == 'critical' else 'medium'
            quick_fix = (
                "Options: \n- Drop one (less predictive). \n- Group categories. \n- Use trees (robust to assoc.)."
                if severity == 'critical' else
                "Options: \n- Monitor redundancy. \n- Re-encode."
            )
            issues.append(Issues(
                category="feature_correlation",
                severity=severity,
                column=f"{col1},{col2}",
                description=f"Categorical columns '{col1}' and '{col2}' highly associated (Cramer's V: {cramers_v:.3f})",
                impact_score=impact,
                quick_fix=quick_fix,
            ))
    return issues

def _check_mixed_correlation(analyzer, p_threshold: float = 0.05, critical_p_threshold: float = 0.001):

def _check_mixed_correlation(analyzer, numeric_cols: list, cat_cols: list, thresholds: dict):
    """Flag associated numeric/categorical column pairs.

    Numeric columns are first discretized into uniform bins so every pair
    can be scored with the same bias-corrected Cramer's V used for pure
    categorical pairs (replacing the old ANOVA/F-test approach).

    Note: this region of the diff view interleaved the removed old
    (F-test) body with the added one; this is the clean added version.

    Args:
        analyzer: object exposing the DataFrame under ``df``.
        numeric_cols: numeric column names to pair with categoricals.
        cat_cols: categorical column names to pair with numerics.
        thresholds: {'warning': x, 'critical': y} bounds for Cramer's V.

    Returns:
        list: one issue per pair exceeding the warning threshold.
    """
    issues = []
    if not numeric_cols or not cat_cols:
        return issues

    # Bin numerics so a contingency table (and Cramer's V) is well-defined.
    discretizer = Discretizer(DiscretizationType.UNIFORM, n_bins=10)
    df_disc = discretizer.discretize_dataframe(analyzer.df[numeric_cols + cat_cols])

    for num_col, cat_col in [(n, c) for n in numeric_cols for c in cat_cols]:
        table = pd.crosstab(df_disc[cat_col], df_disc[num_col])
        cramers_v = _cramers_v_corrected(table)
        if cramers_v > thresholds['warning']:
            severity = 'critical' if cramers_v > thresholds['critical'] else 'warning'
            impact = 'high' if severity == 'critical' else 'medium'
            quick_fix = (
                "Options: \n- Drop one. \n- Discretize/encode differently. \n- Use robust models."
                if severity == 'critical' else
                "Options: \n- Monitor in modeling."
            )
            issues.append(Issues(
                category="feature_correlation",
                severity=severity,
                column=f"{cat_col},{num_col}",
                description=f"Mixed columns '{cat_col}' (cat) and '{num_col}' (num) associated (Discretized Cramer's V: {cramers_v:.3f})",
                impact_score=impact,
                quick_fix=quick_fix,
            ))

    return issues
Loading