Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 5 additions & 2 deletions hashprep/checks/columns.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from .core import Issue
from ..config import DEFAULT_CONFIG

_COL_THRESHOLDS = DEFAULT_CONFIG.columns

def _check_single_value_columns(analyzer):
issues = []
Expand All @@ -23,7 +26,7 @@ def _check_single_value_columns(analyzer):
)
return issues

def _check_high_cardinality(analyzer, threshold: int = 100, critical_threshold: float = 0.9):
def _check_high_cardinality(analyzer, threshold: int = _COL_THRESHOLDS.high_cardinality_count, critical_threshold: float = _COL_THRESHOLDS.high_cardinality_ratio_critical):
issues = []
categorical_cols = analyzer.df.select_dtypes(include="object").columns.tolist()
for col in categorical_cols:
Expand Down Expand Up @@ -54,7 +57,7 @@ def _check_duplicates(analyzer):
duplicate_rows = int(analyzer.df.duplicated().sum())
if duplicate_rows > 0:
duplicate_ratio = float(duplicate_rows / len(analyzer.df))
severity = "critical" if duplicate_ratio > 0.1 else "warning"
severity = "critical" if duplicate_ratio > _COL_THRESHOLDS.duplicate_ratio_critical else "warning"
impact = "high" if severity == "critical" else "medium"
quick_fix = (
"Options: \n- Drop duplicates: Ensures data integrity (Pros: Cleaner data; Cons: May lose valid repeats).\n- Verify duplicates: Check if intentional (e.g., time-series) (Pros: Validates data; Cons: Time-consuming)."
Expand Down
72 changes: 10 additions & 62 deletions hashprep/checks/correlations.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,16 @@
from .core import Issue
import pandas as pd
import numpy as np
from scipy.stats import f_oneway, spearmanr, pearsonr, kendalltau, chi2_contingency
from scipy.stats import spearmanr, pearsonr, kendalltau, chi2_contingency
from itertools import combinations
from .discretizer import Discretizer, DiscretizationType
from ..utils.type_inference import infer_types, is_usable_for_corr


# Thresholds
CORR_THRESHOLDS = {
'numeric': {
'spearman': {'warning': 0.7, 'critical': 0.95},
'pearson': {'warning': 0.7, 'critical': 0.95},
'kendall': {'warning': 0.6, 'critical': 0.85}, # Lower for Kendall (typically smaller values)
},
'categorical': {'warning': 0.5, 'critical': 0.8},
'mixed': {'warning': 0.5, 'critical': 0.8}, # Updated to coefficient thresholds (matching categorical) for Cramer's V
}
CAT_MAX_DISTINCT = 50
LOW_CARD_NUM_THRESHOLD = 10 # From type_inference.py
from ..utils.type_inference import is_usable_for_corr
from ..config import DEFAULT_CONFIG

_CORR = DEFAULT_CONFIG.correlations
CORR_THRESHOLDS = _CORR.as_nested_dict()
CAT_MAX_DISTINCT = _CORR.max_distinct_categories
LOW_CARD_NUM_THRESHOLD = _CORR.low_cardinality_numeric

def _cramers_v_corrected(table: pd.DataFrame) -> float:
if table.empty or (table.shape[0] == 1 or table.shape[1] == 1):
Expand Down Expand Up @@ -52,28 +44,20 @@ def calculate_correlations(analyzer, thresholds=None):
typ == 'Numeric' and is_usable_for_corr(analyzer.df[col])]
cat_cols = [col for col, typ in inferred_types.items() if typ == 'Categorical' and
1 < analyzer.df[col].nunique() <= CAT_MAX_DISTINCT and is_usable_for_corr(analyzer.df[col])]
text_cols = [col for col, typ in inferred_types.items() if typ == 'Text']

# Internal default methods
default_methods = ['spearman', 'pearson']
issues.extend(_check_numeric_correlation(analyzer, numeric_cols, thresholds['numeric'], default_methods))
issues.extend(_check_numeric_correlation(analyzer, numeric_cols, thresholds['numeric']))
issues.extend(_check_categorical_correlation(analyzer, cat_cols, thresholds['categorical']))
issues.extend(_check_mixed_correlation(analyzer, numeric_cols, cat_cols, thresholds['mixed']))

return issues


def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict, methods: list):
def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict):
issues = []
if len(numeric_cols) < 2:
return issues

num_df = analyzer.df[numeric_cols].dropna(how='all')
corr_methods = {
'spearman': lambda x, y: spearmanr(x, y),
'pearson': lambda x, y: pearsonr(x, y),
'kendall': lambda x, y: kendalltau(x, y)
}

for col1, col2 in combinations(numeric_cols, 2):
series1, series2 = num_df[col1].dropna(), num_df[col2].dropna()
Expand Down Expand Up @@ -125,42 +109,6 @@ def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict, m
return issues


def _check_feature_correlation(
    analyzer, threshold: float = 0.95, critical_threshold: float = 0.98
):
    """Report pairs of numeric columns whose absolute correlation is high.

    The numeric columns of ``analyzer.df`` are correlated with
    ``DataFrame.corr()`` and the absolute values are compared against the
    thresholds.

    Args:
        analyzer: Object exposing the dataset as ``analyzer.df``.
        threshold: A pair is reported when its absolute correlation is
            strictly greater than this value.
        critical_threshold: A reported pair is marked ``"critical"`` when its
            absolute correlation is strictly greater than this value,
            otherwise ``"warning"``.

    Returns:
        A list of ``Issue`` objects with category ``"feature_correlation"``,
        one per reported pair; empty when there are no numeric columns or no
        pair exceeds ``threshold``.
    """
    numeric_df = analyzer.df.select_dtypes(include="number")
    if numeric_df.empty:
        return []

    abs_corr = numeric_df.corr().abs()
    # Keep only the lower triangle (diagonal included) so each unordered pair
    # is visited once; the diagonal itself is rejected by the name check below.
    lower_mask = np.tril(np.ones(abs_corr.shape)).astype(bool)
    lower = abs_corr.where(lower_mask)

    critical_fix = "Options: \n- Drop one feature: Reduces multicollinearity (Pros: Simplifies model; Cons: Loses info).\n- Combine features: Create composite feature (e.g., PCA) (Pros: Retains info; Cons: Less interpretable).\n- Retain and test: Use robust models (e.g., trees) (Pros: Keeps info; Cons: May affect sensitive models)."
    warning_fix = "Options: \n- Drop one feature: If less predictive (Pros: Simplifies model; Cons: Loses info).\n- Retain and test: Evaluate with robust models (Pros: Keeps info; Cons: Risk of multicollinearity).\n- Engineer feature: Combine or transform features (Pros: Reduces redundancy; Cons: Adds complexity)."

    issues = []
    for anchor in lower.index:
        # Masked-out cells and undefined correlations are NaN and are dropped.
        for other, value in lower[anchor].dropna().items():
            if not (value > threshold) or other == anchor:
                continue
            corr = float(value)
            if corr > critical_threshold:
                severity, impact, quick_fix = "critical", "high", critical_fix
            else:
                severity, impact, quick_fix = "warning", "medium", warning_fix
            issues.append(
                Issue(
                    category="feature_correlation",
                    severity=severity,
                    column=f"{other},{anchor}",
                    description=f"Columns '{other}' and '{anchor}' are highly correlated ({corr:.2f})",
                    impact_score=impact,
                    quick_fix=quick_fix,
                )
            )
    return issues


def _check_categorical_correlation(analyzer, cat_cols: list, thresholds: dict):
issues = []
if len(cat_cols) < 2:
Expand Down
10 changes: 6 additions & 4 deletions hashprep/checks/distribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,11 @@
from scipy.stats import kstest

from .core import Issue
from ..config import DEFAULT_CONFIG

_DIST = DEFAULT_CONFIG.distribution

def _check_uniform_distribution(analyzer, p_threshold: float = 0.1) -> List[Issue]:
def _check_uniform_distribution(analyzer, p_threshold: float = _DIST.uniform_p_value) -> List[Issue]:
"""
Detect uniformly distributed numeric columns using Kolmogorov-Smirnov test.
Uniform distributions often indicate synthetic IDs or sequential data.
Expand All @@ -14,7 +16,7 @@ def _check_uniform_distribution(analyzer, p_threshold: float = 0.1) -> List[Issu

for col in analyzer.df.select_dtypes(include="number").columns:
series = analyzer.df[col].dropna()
if len(series) < 20:
if len(series) < _DIST.uniform_min_samples:
continue

min_val, max_val = series.min(), series.max()
Expand Down Expand Up @@ -46,7 +48,7 @@ def _check_uniform_distribution(analyzer, p_threshold: float = 0.1) -> List[Issu
return issues


def _check_unique_values(analyzer, threshold: float = 0.95) -> List[Issue]:
def _check_unique_values(analyzer, threshold: float = _DIST.unique_value_ratio) -> List[Issue]:
"""
Detect columns where nearly all values are unique.
High uniqueness often indicates identifiers, names, or free-text fields.
Expand All @@ -55,7 +57,7 @@ def _check_unique_values(analyzer, threshold: float = 0.95) -> List[Issue]:

for col in analyzer.df.columns:
series = analyzer.df[col].dropna()
if len(series) < 10:
if len(series) < _DIST.unique_min_samples:
continue

unique_count = series.nunique()
Expand Down
12 changes: 7 additions & 5 deletions hashprep/checks/drift.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,15 +3,17 @@
from scipy.stats import chisquare, ks_2samp

from .core import Issue
from ..config import DEFAULT_CONFIG

CRITICAL_P_VALUE = 0.001
MAX_CATEGORIES_FOR_CHI2 = 50
_DRIFT = DEFAULT_CONFIG.drift
CRITICAL_P_VALUE = _DRIFT.critical_p_value
MAX_CATEGORIES_FOR_CHI2 = _DRIFT.max_categories_for_chi2


def check_drift(
df_train: pd.DataFrame,
df_test: pd.DataFrame,
threshold: float = 0.05,
threshold: float = _DRIFT.p_value,
) -> list[Issue]:
"""
Check for distribution shift between two datasets.
Expand Down Expand Up @@ -80,13 +82,13 @@ def _check_categorical_drift(

new_categories = set(test_counts.index) - set(train_counts.index)
if new_categories:
sample_new = list(new_categories)[:5]
sample_new = list(new_categories)[:_DRIFT.max_new_category_samples]
issues.append(
Issue(
category="dataset_drift",
severity="warning",
column=col,
description=f"New categories in test set for '{col}': {sample_new}{'...' if len(new_categories) > 5 else ''}",
description=f"New categories in test set for '{col}': {sample_new}{'...' if len(new_categories) > _DRIFT.max_new_category_samples else ''}",
impact_score="medium",
quick_fix="Handle unseen categories in preprocessing pipeline (e.g., OrdinalEncoder with unknown_value).",
)
Expand Down
3 changes: 2 additions & 1 deletion hashprep/checks/imbalance.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from .core import Issue
from ..config import DEFAULT_CONFIG

def _check_class_imbalance(analyzer, threshold: float = 0.9):
def _check_class_imbalance(analyzer, threshold: float = DEFAULT_CONFIG.imbalance.majority_class_ratio):
issues = []
if analyzer.target_col and analyzer.target_col in analyzer.df.columns:
counts = analyzer.df[analyzer.target_col].value_counts(normalize=True)
Expand Down
11 changes: 7 additions & 4 deletions hashprep/checks/leakage.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@
import pandas as pd
from scipy.stats import chi2_contingency, f_oneway
import numpy as np
from ..config import DEFAULT_CONFIG

_LEAK = DEFAULT_CONFIG.leakage

def _check_data_leakage(analyzer):
issues = []
Expand Down Expand Up @@ -36,7 +39,7 @@ def _check_target_leakage_patterns(analyzer):
corrs = numeric_cols.corrwith(target).abs()
for col, corr in corrs.items():
severity = (
"critical" if corr > 0.98 else "warning" if corr > 0.95 else None
"critical" if corr > _LEAK.numeric_critical else "warning" if corr > _LEAK.numeric_warning else None
)
if severity:
impact = "high" if severity == "critical" else "medium"
Expand Down Expand Up @@ -69,7 +72,7 @@ def _check_target_leakage_patterns(analyzer):
r, k = table.shape
cramers_v = np.sqrt(phi2 / min(k - 1, r - 1))
severity = (
"critical" if cramers_v > 0.95 else "warning" if cramers_v > 0.8 else None
"critical" if cramers_v > _LEAK.categorical_critical else "warning" if cramers_v > _LEAK.categorical_warning else None
)
if severity:
impact = "high" if severity == "critical" else "medium"
Expand Down Expand Up @@ -104,8 +107,8 @@ def _check_target_leakage_patterns(analyzer):
try:
f_stat, p_val = f_oneway(*groups)
severity = (
"critical" if f_stat > 20.0 and p_val < 0.001
else "warning" if f_stat > 10.0 and p_val < 0.001 else None
"critical" if f_stat > _LEAK.f_stat_critical and p_val < _LEAK.f_stat_p_value
else "warning" if f_stat > _LEAK.f_stat_warning and p_val < _LEAK.f_stat_p_value else None
)
if severity:
impact = "high" if severity == "critical" else "medium"
Expand Down
25 changes: 14 additions & 11 deletions hashprep/checks/missing_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
import pandas as pd
from collections import defaultdict
import numpy as np
from ..config import DEFAULT_CONFIG

def _check_high_missing_values(analyzer, threshold: float = 0.4, critical_threshold: float = 0.7):
_THRESHOLDS = DEFAULT_CONFIG.missing_values

def _check_high_missing_values(analyzer, threshold: float = _THRESHOLDS.warning, critical_threshold: float = _THRESHOLDS.critical):
issues = []
for col in analyzer.df.columns:
missing_pct = float(analyzer.df[col].isna().mean())
Expand Down Expand Up @@ -44,7 +47,7 @@ def _check_empty_columns(analyzer):
)
return issues

def _check_dataset_missingness(analyzer, threshold: float = 20.0, critical_threshold: float = 50.0):
def _check_dataset_missingness(analyzer, threshold: float = _THRESHOLDS.dataset_warning_pct, critical_threshold: float = _THRESHOLDS.dataset_critical_pct):
issues = []
missing_pct = float(
(analyzer.df.isnull().sum().sum() / (analyzer.df.shape[0] * analyzer.df.shape[1])) * 100
Expand All @@ -70,11 +73,11 @@ def _check_dataset_missingness(analyzer, threshold: float = 20.0, critical_thres
return issues


def _check_missing_patterns(analyzer, threshold: float = 0.01,
critical_p_threshold: float = 0.001):
def _check_missing_patterns(analyzer, threshold: float = _THRESHOLDS.pattern_p_value,
critical_p_threshold: float = _THRESHOLDS.pattern_critical_p_value):
issues = []
missing_cols = [
col for col in analyzer.df.columns if int(analyzer.df[col].isna().sum()) >= 10
col for col in analyzer.df.columns if int(analyzer.df[col].isna().sum()) >= _THRESHOLDS.pattern_min_missing_count
]

# grouping logic
Expand All @@ -89,7 +92,7 @@ def _check_missing_patterns(analyzer, threshold: float = 0.01,
continue
try:
value_counts = analyzer.df[other_col].value_counts()
rare_cats = value_counts[value_counts < 5].index
rare_cats = value_counts[value_counts < _THRESHOLDS.pattern_rare_category_count].index
temp_col = analyzer.df[other_col].copy()
if not rare_cats.empty:
temp_col = temp_col.where(~temp_col.isin(rare_cats), "Other")
Expand All @@ -112,7 +115,7 @@ def cramers_v(table):
return np.sqrt(phi2corr / rkcorr)

cramers = cramers_v(table)
if p_val < threshold and cramers > 0.1:
if p_val < threshold and cramers > _THRESHOLDS.pattern_cramers_v_min:
cat_patterns[col].append((other_col, p_val, cramers))
except Exception:
continue
Expand All @@ -125,7 +128,7 @@ def cramers_v(table):
try:
missing = analyzer.df[analyzer.df[col].isna()][other_col].dropna()
non_missing = analyzer.df[analyzer.df[col].notna()][other_col].dropna()
if len(missing) < 10 or len(non_missing) < 10:
if len(missing) < _THRESHOLDS.pattern_min_group_size or len(non_missing) < _THRESHOLDS.pattern_min_group_size:
continue

# Replaced f_oneway with mannwhitneyu
Expand All @@ -135,7 +138,7 @@ def cramers_v(table):
pooled_std = np.sqrt((np.std(missing) ** 2 + np.std(non_missing) ** 2) / 2)
cohens_d = abs(np.mean(missing) - np.mean(non_missing)) / pooled_std if pooled_std > 0 else 0

if p_val < threshold and cohens_d > 0.2:
if p_val < threshold and cohens_d > _THRESHOLDS.pattern_cohens_d_min:
num_patterns[col].append((other_col, p_val, cohens_d))
except Exception:
continue
Expand All @@ -151,7 +154,7 @@ def cramers_v(table):
if all_patterns:
# Sort by effect size (descending) and take top 3
all_patterns.sort(key=lambda x: x[2], reverse=True) # x[2] is effect size
top_corrs = [pat[0] for pat in all_patterns[:3]]
top_corrs = [pat[0] for pat in all_patterns[:_THRESHOLDS.pattern_top_correlations]]
total_count = len(all_patterns)

desc = f"Missingness in '{col}' correlates with {total_count} columns ({', '.join(top_corrs)})"
Expand All @@ -161,7 +164,7 @@ def cramers_v(table):
is_target_correlated = any(pat[0] == analyzer.target_col for pat in all_patterns)
severity = (
"critical"
if p_val < critical_p_threshold and is_target_correlated and max_effect > 0.3 # medium effect threshold
if p_val < critical_p_threshold and is_target_correlated and max_effect > _THRESHOLDS.pattern_effect_critical
else "warning"
)
impact = "high" if severity == "critical" else "medium"
Expand Down
Loading