Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 14 additions & 4 deletions hashprep/analyzer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from statistics import correlation
from typing import Dict, List, Optional
import pandas as pd
from .checks.type_inference import infer_types

from .checks import run_checks
from .summaries import (
Expand All @@ -18,12 +20,15 @@ def __init__(
df: pd.DataFrame,
target_col: Optional[str] = None,
selected_checks: Optional[List[str]] = None,
include_plots: bool = False,
):
self.df = df
self.target_col = target_col
self.selected_checks = selected_checks
self.include_plots = include_plots
self.issues = []
self.summaries = {}
self.column_types = infer_types(df)
self.all_checks = [
"data_leakage", "high_missing_values", "empty_columns", "single_value_columns",
"target_leakage_patterns", "class_imbalance", "high_cardinality", "duplicates",
Expand All @@ -32,15 +37,19 @@ def __init__(
"extreme_text_lengths", "datetime_skew", "missing_patterns",
]


def analyze(self) -> Dict:
# """analyze columns first for better results"""
# classifications = self.classify_columns()
# print(classifications)
"""Run all summaries and checks, return summary"""
self.summaries.update(get_dataset_preview(self.df))
self.summaries.update(summarize_dataset_info(self.df))
self.summaries["variable_types"] = summarize_variable_types(self.df)
self.summaries["variable_types"] = summarize_variable_types(self.df, column_types=self.column_types) # Todo: Implement this arg
self.summaries["reproduction_info"] = add_reproduction_info(self.df)
self.summaries["variables"] = summarize_variables(self.df)
self.summaries.update(summarize_interactions(self.df))
self.summaries.update(summarize_missing_values(self.df))
self.summaries["variables"] = summarize_variables(self.df, include_plots=self.include_plots)
self.summaries.update(summarize_interactions(self.df, include_plots=self.include_plots))
self.summaries.update(summarize_missing_values(self.df, include_plots=self.include_plots))

checks_to_run = self.all_checks if self.selected_checks is None else [
check for check in self.selected_checks if check in self.all_checks
Expand All @@ -67,4 +76,5 @@ def _generate_summary(self):
} for issue in self.issues
],
"summaries": self.summaries,
"column_types": self.column_types,
}
27 changes: 20 additions & 7 deletions hashprep/checks/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
from typing import List, Optional

from .core import Issues
from .leakage import _check_data_leakage, _check_target_leakage_patterns
from .missing_values import _check_high_missing_values, _check_empty_columns, _check_dataset_missingness, _check_missing_patterns
from .missing_values import _check_high_missing_values, _check_empty_columns, _check_dataset_missingness, \
_check_missing_patterns
from .columns import _check_single_value_columns, _check_high_cardinality, _check_duplicates, _check_mixed_data_types
from .outliers import _check_outliers, _check_high_zero_counts, _check_extreme_text_lengths, _check_datetime_skew
from .correlations import _check_feature_correlation, _check_categorical_correlation, _check_mixed_correlation
from .correlations import calculate_correlations
from .imbalance import _check_class_imbalance

CHECKS = {
Expand All @@ -17,18 +20,28 @@
"duplicates": _check_duplicates,
"mixed_data_types": _check_mixed_data_types,
"outliers": _check_outliers,
"feature_correlation": _check_feature_correlation,
"categorical_correlation": _check_categorical_correlation,
"mixed_correlation": _check_mixed_correlation,
"dataset_missingness": _check_dataset_missingness,
"high_zero_counts": _check_high_zero_counts,
"extreme_text_lengths": _check_extreme_text_lengths,
"datetime_skew": _check_datetime_skew,
"missing_patterns": _check_missing_patterns,
}

def run_checks(analyzer, checks_to_run):
CORRELATION_CHECKS = {"feature_correlation", "categorical_correlation", "mixed_correlation"}


def run_checks(analyzer, checks_to_run: List[str]):
    """Run the requested data-quality checks against the analyzer.

    Correlation-style check names are not dispatched individually: if any
    name in ``CORRELATION_CHECKS`` is requested, the consolidated
    ``calculate_correlations`` entry point runs exactly once, covering the
    numeric, categorical, and mixed correlation families together.

    Args:
        analyzer: analyzer object exposing ``df`` and ``column_types``.
        checks_to_run: names of checks to execute; names not present in
            ``CHECKS`` (or in ``CORRELATION_CHECKS``) are silently ignored.

    Returns:
        list: accumulated issue records from every executed check.
    """
    issues = []
    correlation_requested = False

    for check in checks_to_run:
        if check in CORRELATION_CHECKS:
            # Defer to the single consolidated correlation pass below so the
            # pairwise work is not repeated once per requested alias.
            correlation_requested = True
            continue
        if check in CHECKS:
            issues.extend(CHECKS[check](analyzer))

    if correlation_requested:
        issues.extend(calculate_correlations(analyzer))

    return issues
251 changes: 178 additions & 73 deletions hashprep/checks/correlations.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,133 @@
from .core import Issues
import pandas as pd
from scipy.stats import chi2_contingency, f_oneway
import numpy as np
from scipy.stats import f_oneway, spearmanr, pearsonr, kendalltau, chi2_contingency
from itertools import combinations
from .discretizer import Discretizer, DiscretizationType
from .type_inference import infer_types, is_usable_for_corr

def _check_feature_correlation(analyzer, threshold: float = 0.95, critical_threshold: float = 0.98):

# Severity thresholds per correlation family: "warning" flags a pair for
# review, "critical" marks near-duplicate features.
CORR_THRESHOLDS = {
    'numeric': {
        'spearman': {'warning': 0.7, 'critical': 0.95},
        'pearson': {'warning': 0.7, 'critical': 0.95},
        'kendall': {'warning': 0.6, 'critical': 0.85},  # Kendall's tau runs lower than rho/r, so thresholds are reduced
    },
    'categorical': {'warning': 0.5, 'critical': 0.8},
    'mixed': {'warning': 0.5, 'critical': 0.8},  # Cramer's V coefficient thresholds (same scale as categorical)
}
# Categorical columns with more distinct values than this are excluded:
# contingency tables become too sparse to be meaningful.
CAT_MAX_DISTINCT = 50
# Numerics at or below this many distinct values are treated as low-cardinality
# (Kendall's tau is added for them); value mirrors type_inference.py.
LOW_CARD_NUM_THRESHOLD = 10

def _cramers_v_corrected(table: pd.DataFrame) -> float:
if table.empty or (table.shape[0] == 1 or table.shape[1] == 1):
return 0.0
chi2 = chi2_contingency(table, correction=True)[0]
n = table.sum().sum()
phi2 = chi2 / n
r, k = table.shape
with np.errstate(divide='ignore', invalid='ignore'):
phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))
rcorr = r - ((r-1)**2)/(n-1)
kcorr = k - ((k-1)**2)/(n-1)
rkcorr = min((kcorr-1), (rcorr-1))
if rkcorr == 0:
return 1.0
return np.sqrt(phi2corr / rkcorr)


def calculate_correlations(analyzer, thresholds=None):
    """Run all correlation checks (numeric, categorical, mixed) once.

    Numeric pairs use Spearman + Pearson by default, with Kendall's tau
    added automatically for low-cardinality numerics. Categorical and
    mixed pairs are scored with bias-corrected Cramer's V.

    Args:
        analyzer: object exposing ``df`` (DataFrame) and ``column_types``
            (mapping of column name -> inferred type string).
        thresholds: optional override with the same nested structure as
            ``CORR_THRESHOLDS``; defaults to the module-level thresholds.

    Returns:
        list: issue records collected from all three correlation families.
    """
    if thresholds is None:
        thresholds = CORR_THRESHOLDS

    inferred_types = analyzer.column_types
    issues = []

    # Only columns with enough usable signal take part in correlation tests.
    numeric_cols = [
        col for col, typ in inferred_types.items()
        if typ == 'Numeric' and is_usable_for_corr(analyzer.df[col])
    ]
    # Very-high-cardinality categoricals make contingency tables too sparse.
    cat_cols = [
        col for col, typ in inferred_types.items()
        if typ == 'Categorical'
        and 1 < analyzer.df[col].nunique() <= CAT_MAX_DISTINCT
        and is_usable_for_corr(analyzer.df[col])
    ]

    # Internal default methods; Kendall is added per-pair inside the check.
    default_methods = ['spearman', 'pearson']
    issues.extend(_check_numeric_correlation(analyzer, numeric_cols, thresholds['numeric'], default_methods))
    issues.extend(_check_categorical_correlation(analyzer, cat_cols, thresholds['categorical']))
    issues.extend(_check_mixed_correlation(analyzer, numeric_cols, cat_cols, thresholds['mixed']))

    return issues


def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict, methods: list):
    """Flag strongly correlated numeric column pairs.

    For every pair, the coefficients named in ``methods`` ('spearman',
    'pearson') are computed; Kendall's tau is added automatically whenever
    either column is low-cardinality (<= LOW_CARD_NUM_THRESHOLD distinct
    values), where its tie handling is more appropriate.

    Fixes: the ``methods`` parameter was previously accepted but ignored
    (Spearman and Pearson always ran), and an unused ``corr_methods``
    lambda table was dead code.

    Args:
        analyzer: object exposing the DataFrame under ``df``.
        numeric_cols: numeric column names to compare pairwise.
        thresholds: mapping method name -> {'warning': x, 'critical': y}.
        methods: coefficient names to always compute.

    Returns:
        list: one issue per (pair, method) whose |coefficient| exceeds the
        warning threshold.
    """
    issues = []
    if len(numeric_cols) < 2:
        return issues

    num_df = analyzer.df[numeric_cols].dropna(how='all')
    estimators = {'spearman': spearmanr, 'pearson': pearsonr}

    for col1, col2 in combinations(numeric_cols, 2):
        series1, series2 = num_df[col1].dropna(), num_df[col2].dropna()
        common_idx = series1.index.intersection(series2.index)
        if len(common_idx) < 2:
            continue  # Not enough paired observations for a correlation.
        series1, series2 = series1.loc[common_idx], series2.loc[common_idx]

        # (display name, |coefficient|, p-value, thresholds) per requested method.
        metrics = []
        for method in methods:
            if method in estimators:
                corr, p_val = estimators[method](series1, series2)
                metrics.append((method.capitalize(), abs(corr), p_val, thresholds[method]))

        # Kendall only for low-cardinality numerics (ties dominate there).
        is_low_card = (series1.nunique() <= LOW_CARD_NUM_THRESHOLD or
                       series2.nunique() <= LOW_CARD_NUM_THRESHOLD)
        if is_low_card:
            kendall_corr, kendall_p = kendalltau(series1, series2)
            metrics.append(('Kendall', abs(kendall_corr), kendall_p, thresholds['kendall']))

        # Flag if any metric exceeds its warning threshold.
        for method, corr, p_val, thresh in metrics:
            if corr > thresh['warning']:
                severity = 'critical' if corr > thresh['critical'] else 'warning'
                impact = 'high' if severity == 'critical' else 'medium'
                quick_fix = (
                    f"Options: \n- Drop one feature (e.g., {col2}): Reduces multicollinearity.\n- PCA/combine: Retains info.\n- Use tree-based models."
                    if severity == 'critical' else
                    f"Options: \n- Monitor in modeling.\n- Drop if redundant."
                )
                issues.append(Issues(
                    category="feature_correlation",
                    severity=severity,
                    column=f"{col1},{col2}",
                    description=f"Numeric columns '{col1}' and '{col2}' highly correlated ({method}: {corr:.3f}, p={p_val:.4f})",
                    impact_score=impact,
                    quick_fix=quick_fix,
                ))

    return issues


def _check_feature_correlation(
analyzer, threshold: float = 0.95, critical_threshold: float = 0.98
):
issues = []
numeric_df = analyzer.df.select_dtypes(include="number")
if numeric_df.empty:
Expand Down Expand Up @@ -36,79 +160,60 @@ def _check_feature_correlation(analyzer, threshold: float = 0.95, critical_thres
)
return issues

def _check_categorical_correlation(analyzer, threshold: float = 0.8, critical_threshold: float = 0.95):

def _check_categorical_correlation(analyzer, cat_cols: list, thresholds: dict):
    """Flag highly associated categorical column pairs.

    Each pair is scored with bias-corrected Cramer's V over its
    contingency table (``pd.crosstab``).

    Note: this region of the diff view interleaved the removed old body
    (which referenced the dropped ``threshold`` parameter) with the added
    one; this is the clean added version.

    Args:
        analyzer: object exposing the DataFrame under ``df``.
        cat_cols: pre-filtered categorical column names (usable cardinality).
        thresholds: {'warning': x, 'critical': y} bounds for Cramer's V.

    Returns:
        list: one issue per pair exceeding the warning threshold.
    """
    issues = []
    if len(cat_cols) < 2:
        return issues

    for col1, col2 in combinations(cat_cols, 2):
        table = pd.crosstab(analyzer.df[col1], analyzer.df[col2])
        cramers_v = _cramers_v_corrected(table)
        if cramers_v > thresholds['warning']:
            severity = 'critical' if cramers_v > thresholds['critical'] else 'warning'
            impact = 'high' if severity == 'critical' else 'medium'
            quick_fix = (
                "Options: \n- Drop one (less predictive). \n- Group categories. \n- Use trees (robust to assoc.)."
                if severity == 'critical' else
                "Options: \n- Monitor redundancy. \n- Re-encode."
            )
            issues.append(Issues(
                category="feature_correlation",
                severity=severity,
                column=f"{col1},{col2}",
                description=f"Categorical columns '{col1}' and '{col2}' highly associated (Cramer's V: {cramers_v:.3f})",
                impact_score=impact,
                quick_fix=quick_fix,
            ))
    return issues

def _check_mixed_correlation(analyzer, p_threshold: float = 0.05, critical_p_threshold: float = 0.001):

def _check_mixed_correlation(analyzer, numeric_cols: list, cat_cols: list, thresholds: dict):
    """Flag associated numeric/categorical column pairs.

    Numeric columns are first discretized into uniform bins so every pair
    can be scored with the same bias-corrected Cramer's V used for pure
    categorical pairs (replacing the old ANOVA/F-test approach).

    Note: this region of the diff view interleaved the removed old
    (F-test) body with the added one; this is the clean added version.

    Args:
        analyzer: object exposing the DataFrame under ``df``.
        numeric_cols: numeric column names to pair with categoricals.
        cat_cols: categorical column names to pair with numerics.
        thresholds: {'warning': x, 'critical': y} bounds for Cramer's V.

    Returns:
        list: one issue per pair exceeding the warning threshold.
    """
    issues = []
    if not numeric_cols or not cat_cols:
        return issues

    # Bin numerics so a contingency table (and Cramer's V) is well-defined.
    discretizer = Discretizer(DiscretizationType.UNIFORM, n_bins=10)
    df_disc = discretizer.discretize_dataframe(analyzer.df[numeric_cols + cat_cols])

    for num_col, cat_col in [(n, c) for n in numeric_cols for c in cat_cols]:
        table = pd.crosstab(df_disc[cat_col], df_disc[num_col])
        cramers_v = _cramers_v_corrected(table)
        if cramers_v > thresholds['warning']:
            severity = 'critical' if cramers_v > thresholds['critical'] else 'warning'
            impact = 'high' if severity == 'critical' else 'medium'
            quick_fix = (
                "Options: \n- Drop one. \n- Discretize/encode differently. \n- Use robust models."
                if severity == 'critical' else
                "Options: \n- Monitor in modeling."
            )
            issues.append(Issues(
                category="feature_correlation",
                severity=severity,
                column=f"{cat_col},{num_col}",
                description=f"Mixed columns '{cat_col}' (cat) and '{num_col}' (num) associated (Discretized Cramer's V: {cramers_v:.3f})",
                impact_score=impact,
                quick_fix=quick_fix,
            ))

    return issues
Loading