
Commit 2705769

refactor: centralize config and remove dead code (#61)
- Create hashprep/config.py with all thresholds as typed dataclasses
- Replace 80+ magic numbers across check files with config references
- Remove ~160 lines of commented-out code in summaries/variables.py
- Remove dead _check_feature_correlation() function in correlations.py
- Remove unused imports (f_oneway, infer_types, numpy) in correlations.py and type_inference.py
- Fix redundant infer_types() call: pass column_types from analyzer to summarize_variables()
- Remove unused dependencies: fastapi, starlette, brotli from pyproject.toml
- Clean up unused variables (text_cols, corr_methods, default_methods) in correlations.py
1 parent f15d5f3 commit 2705769

15 files changed

Lines changed: 258 additions & 481 deletions
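The new hashprep/config.py itself is not part of the hunks shown below. Based on the attribute references in the diffs (DEFAULT_CONFIG.columns, DEFAULT_CONFIG.missing_values, and so on) and the literals they replace, a minimal sketch of its shape might look like this; field names and default values are taken from the diffs, but the exact dataclass layout is an assumption:

```python
# Hypothetical sketch of hashprep/config.py -- field names and defaults
# come from the diffs below; the structure itself is assumed.
from dataclasses import dataclass, field


@dataclass(frozen=True)
class ColumnThresholds:
    high_cardinality_count: int = 100             # old inline default in _check_high_cardinality
    high_cardinality_ratio_critical: float = 0.9
    duplicate_ratio_critical: float = 0.1         # old inline default in _check_duplicates


@dataclass(frozen=True)
class MissingValueThresholds:
    warning: float = 0.4                          # old default in _check_high_missing_values
    critical: float = 0.7


@dataclass(frozen=True)
class HashPrepConfig:
    columns: ColumnThresholds = field(default_factory=ColumnThresholds)
    missing_values: MissingValueThresholds = field(default_factory=MissingValueThresholds)
    # ... correlations, distribution, drift, imbalance, leakage sections
    # presumably follow the same pattern


DEFAULT_CONFIG = HashPrepConfig()
```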

hashprep/checks/columns.py

Lines changed: 5 additions & 2 deletions
@@ -1,4 +1,7 @@
 from .core import Issue
+from ..config import DEFAULT_CONFIG
+
+_COL_THRESHOLDS = DEFAULT_CONFIG.columns
 
 def _check_single_value_columns(analyzer):
     issues = []
@@ -23,7 +26,7 @@ def _check_single_value_columns(analyzer):
         )
     return issues
 
-def _check_high_cardinality(analyzer, threshold: int = 100, critical_threshold: float = 0.9):
+def _check_high_cardinality(analyzer, threshold: int = _COL_THRESHOLDS.high_cardinality_count, critical_threshold: float = _COL_THRESHOLDS.high_cardinality_ratio_critical):
     issues = []
     categorical_cols = analyzer.df.select_dtypes(include="object").columns.tolist()
     for col in categorical_cols:
@@ -54,7 +57,7 @@ def _check_duplicates(analyzer):
     duplicate_rows = int(analyzer.df.duplicated().sum())
     if duplicate_rows > 0:
         duplicate_ratio = float(duplicate_rows / len(analyzer.df))
-        severity = "critical" if duplicate_ratio > 0.1 else "warning"
+        severity = "critical" if duplicate_ratio > _COL_THRESHOLDS.duplicate_ratio_critical else "warning"
         impact = "high" if severity == "critical" else "medium"
         quick_fix = (
             "Options: \n- Drop duplicates: Ensures data integrity (Pros: Cleaner data; Cons: May lose valid repeats).\n- Verify duplicates: Check if intentional (e.g., time-series) (Pros: Validates data; Cons: Time-consuming)."

hashprep/checks/correlations.py

Lines changed: 10 additions & 62 deletions
@@ -1,24 +1,16 @@
 from .core import Issue
 import pandas as pd
 import numpy as np
-from scipy.stats import f_oneway, spearmanr, pearsonr, kendalltau, chi2_contingency
+from scipy.stats import spearmanr, pearsonr, kendalltau, chi2_contingency
 from itertools import combinations
 from .discretizer import Discretizer, DiscretizationType
-from ..utils.type_inference import infer_types, is_usable_for_corr
-
-
-# Thresholds
-CORR_THRESHOLDS = {
-    'numeric': {
-        'spearman': {'warning': 0.7, 'critical': 0.95},
-        'pearson': {'warning': 0.7, 'critical': 0.95},
-        'kendall': {'warning': 0.6, 'critical': 0.85},  # Lower for Kendall (typically smaller values)
-    },
-    'categorical': {'warning': 0.5, 'critical': 0.8},
-    'mixed': {'warning': 0.5, 'critical': 0.8},  # Updated to coefficient thresholds (matching categorical) for Cramer's V
-}
-CAT_MAX_DISTINCT = 50
-LOW_CARD_NUM_THRESHOLD = 10  # From type_inference.py
+from ..utils.type_inference import is_usable_for_corr
+from ..config import DEFAULT_CONFIG
+
+_CORR = DEFAULT_CONFIG.correlations
+CORR_THRESHOLDS = _CORR.as_nested_dict()
+CAT_MAX_DISTINCT = _CORR.max_distinct_categories
+LOW_CARD_NUM_THRESHOLD = _CORR.low_cardinality_numeric
 
 def _cramers_v_corrected(table: pd.DataFrame) -> float:
     if table.empty or (table.shape[0] == 1 or table.shape[1] == 1):
@@ -52,28 +44,20 @@ def calculate_correlations(analyzer, thresholds=None):
                     typ == 'Numeric' and is_usable_for_corr(analyzer.df[col])]
     cat_cols = [col for col, typ in inferred_types.items() if typ == 'Categorical' and
                 1 < analyzer.df[col].nunique() <= CAT_MAX_DISTINCT and is_usable_for_corr(analyzer.df[col])]
-    text_cols = [col for col, typ in inferred_types.items() if typ == 'Text']
 
-    # Internal default methods
-    default_methods = ['spearman', 'pearson']
-    issues.extend(_check_numeric_correlation(analyzer, numeric_cols, thresholds['numeric'], default_methods))
+    issues.extend(_check_numeric_correlation(analyzer, numeric_cols, thresholds['numeric']))
     issues.extend(_check_categorical_correlation(analyzer, cat_cols, thresholds['categorical']))
     issues.extend(_check_mixed_correlation(analyzer, numeric_cols, cat_cols, thresholds['mixed']))
 
     return issues
 
 
-def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict, methods: list):
+def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict):
     issues = []
     if len(numeric_cols) < 2:
         return issues
 
     num_df = analyzer.df[numeric_cols].dropna(how='all')
-    corr_methods = {
-        'spearman': lambda x, y: spearmanr(x, y),
-        'pearson': lambda x, y: pearsonr(x, y),
-        'kendall': lambda x, y: kendalltau(x, y)
-    }
 
     for col1, col2 in combinations(numeric_cols, 2):
         series1, series2 = num_df[col1].dropna(), num_df[col2].dropna()
@@ -125,42 +109,6 @@ def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict, methods: list):
     return issues
 
 
-def _check_feature_correlation(
-    analyzer, threshold: float = 0.95, critical_threshold: float = 0.98
-):
-    issues = []
-    numeric_df = analyzer.df.select_dtypes(include="number")
-    if numeric_df.empty:
-        return issues
-    corr_matrix = numeric_df.corr().abs()
-    upper = corr_matrix.where(np.tril(np.ones(corr_matrix.shape)).astype(bool))
-    correlated_pairs = [
-        (col, row, float(val))
-        for row in upper.index
-        for col, val in upper[row].dropna().items()
-        if val > threshold and col != row
-    ]
-    for col1, col2, corr in correlated_pairs:
-        severity = "critical" if corr > critical_threshold else "warning"
-        impact = "high" if severity == "critical" else "medium"
-        quick_fix = (
-            "Options: \n- Drop one feature: Reduces multicollinearity (Pros: Simplifies model; Cons: Loses info).\n- Combine features: Create composite feature (e.g., PCA) (Pros: Retains info; Cons: Less interpretable).\n- Retain and test: Use robust models (e.g., trees) (Pros: Keeps info; Cons: May affect sensitive models)."
-            if severity == "critical"
-            else "Options: \n- Drop one feature: If less predictive (Pros: Simplifies model; Cons: Loses info).\n- Retain and test: Evaluate with robust models (Pros: Keeps info; Cons: Risk of multicollinearity).\n- Engineer feature: Combine or transform features (Pros: Reduces redundancy; Cons: Adds complexity)."
-        )
-        issues.append(
-            Issue(
-                category="feature_correlation",
-                severity=severity,
-                column=f"{col1},{col2}",
-                description=f"Columns '{col1}' and '{col2}' are highly correlated ({corr:.2f})",
-                impact_score=impact,
-                quick_fix=quick_fix,
-            )
-        )
-    return issues
-
-
 def _check_categorical_correlation(analyzer, cat_cols: list, thresholds: dict):
     issues = []
     if len(cat_cols) < 2:
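Note that the removed CORR_THRESHOLDS literal survives behind _CORR.as_nested_dict(). A plausible sketch of that method, assuming the correlations config section simply re-emits the old values (the method's actual home in config.py is not shown in this diff):

```python
# Hypothetical sketch: the correlations config section rebuilding the
# nested dict that was previously hardcoded at module level.
from dataclasses import dataclass


@dataclass(frozen=True)
class CorrelationThresholds:
    max_distinct_categories: int = 50   # was CAT_MAX_DISTINCT
    low_cardinality_numeric: int = 10   # was LOW_CARD_NUM_THRESHOLD

    def as_nested_dict(self) -> dict:
        # Same values as the deleted CORR_THRESHOLDS literal.
        return {
            'numeric': {
                'spearman': {'warning': 0.7, 'critical': 0.95},
                'pearson': {'warning': 0.7, 'critical': 0.95},
                'kendall': {'warning': 0.6, 'critical': 0.85},
            },
            'categorical': {'warning': 0.5, 'critical': 0.8},
            'mixed': {'warning': 0.5, 'critical': 0.8},
        }
```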

hashprep/checks/distribution.py

Lines changed: 6 additions & 4 deletions
@@ -3,9 +3,11 @@
 from scipy.stats import kstest
 
 from .core import Issue
+from ..config import DEFAULT_CONFIG
 
+_DIST = DEFAULT_CONFIG.distribution
 
-def _check_uniform_distribution(analyzer, p_threshold: float = 0.1) -> List[Issue]:
+def _check_uniform_distribution(analyzer, p_threshold: float = _DIST.uniform_p_value) -> List[Issue]:
     """
     Detect uniformly distributed numeric columns using Kolmogorov-Smirnov test.
     Uniform distributions often indicate synthetic IDs or sequential data.
@@ -14,7 +16,7 @@ def _check_uniform_distribution(analyzer, p_threshold: float = 0.1) -> List[Issue]:
 
     for col in analyzer.df.select_dtypes(include="number").columns:
         series = analyzer.df[col].dropna()
-        if len(series) < 20:
+        if len(series) < _DIST.uniform_min_samples:
             continue
 
         min_val, max_val = series.min(), series.max()
@@ -46,7 +48,7 @@ def _check_uniform_distribution(analyzer, p_threshold: float = 0.1) -> List[Issue]:
     return issues
 
 
-def _check_unique_values(analyzer, threshold: float = 0.95) -> List[Issue]:
+def _check_unique_values(analyzer, threshold: float = _DIST.unique_value_ratio) -> List[Issue]:
     """
     Detect columns where nearly all values are unique.
     High uniqueness often indicates identifiers, names, or free-text fields.
@@ -55,7 +57,7 @@ def _check_unique_values(analyzer, threshold: float = 0.95) -> List[Issue]:
 
     for col in analyzer.df.columns:
         series = analyzer.df[col].dropna()
-        if len(series) < 10:
+        if len(series) < _DIST.unique_min_samples:
             continue
 
         unique_count = series.nunique()
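For reference, the uniformity check is a one-sample Kolmogorov-Smirnov test against the uniform CDF. A self-contained sketch of the core idea follows; the min-max rescaling is inferred from the min_val/max_val context lines, not confirmed by this diff:

```python
import numpy as np
import pandas as pd
from scipy.stats import kstest


def is_roughly_uniform(series: pd.Series, p_threshold: float = 0.1, min_samples: int = 20) -> bool:
    # Rescale to [0, 1] and test against the uniform CDF.
    # A HIGH p-value means uniformity cannot be rejected.
    s = series.dropna()
    if len(s) < min_samples or s.max() == s.min():
        return False
    scaled = (s - s.min()) / (s.max() - s.min())
    return kstest(scaled, 'uniform').pvalue > p_threshold


print(is_roughly_uniform(pd.Series(np.arange(1000))))  # True: sequential IDs look uniform
```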

hashprep/checks/drift.py

Lines changed: 7 additions & 5 deletions
@@ -3,15 +3,17 @@
 from scipy.stats import chisquare, ks_2samp
 
 from .core import Issue
+from ..config import DEFAULT_CONFIG
 
-CRITICAL_P_VALUE = 0.001
-MAX_CATEGORIES_FOR_CHI2 = 50
+_DRIFT = DEFAULT_CONFIG.drift
+CRITICAL_P_VALUE = _DRIFT.critical_p_value
+MAX_CATEGORIES_FOR_CHI2 = _DRIFT.max_categories_for_chi2
 
 
 def check_drift(
     df_train: pd.DataFrame,
     df_test: pd.DataFrame,
-    threshold: float = 0.05,
+    threshold: float = _DRIFT.p_value,
 ) -> list[Issue]:
     """
     Check for distribution shift between two datasets.
@@ -80,13 +82,13 @@ def _check_categorical_drift(
 
     new_categories = set(test_counts.index) - set(train_counts.index)
     if new_categories:
-        sample_new = list(new_categories)[:5]
+        sample_new = list(new_categories)[:_DRIFT.max_new_category_samples]
         issues.append(
             Issue(
                 category="dataset_drift",
                 severity="warning",
                 column=col,
-                description=f"New categories in test set for '{col}': {sample_new}{'...' if len(new_categories) > 5 else ''}",
+                description=f"New categories in test set for '{col}': {sample_new}{'...' if len(new_categories) > _DRIFT.max_new_category_samples else ''}",
                 impact_score="medium",
                 quick_fix="Handle unseen categories in preprocessing pipeline (e.g., OrdinalEncoder with unknown_value).",
             )
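The numeric side of check_drift boils down to a two-sample KS test on the same column in both frames. A sketch of that idea, not the module's exact code; the severity mapping is assumed from the _DRIFT.p_value and _DRIFT.critical_p_value defaults:

```python
import pandas as pd
from scipy.stats import ks_2samp


def numeric_drift_severity(train: pd.Series, test: pd.Series,
                           threshold: float = 0.05, critical: float = 0.001):
    # Two-sample KS test: a small p-value suggests the train and test
    # distributions differ (drift).
    p_value = ks_2samp(train.dropna(), test.dropna()).pvalue
    if p_value < critical:
        return 'critical'
    if p_value < threshold:
        return 'warning'
    return None
```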

hashprep/checks/imbalance.py

Lines changed: 2 additions & 1 deletion
@@ -1,6 +1,7 @@
 from .core import Issue
+from ..config import DEFAULT_CONFIG
 
-def _check_class_imbalance(analyzer, threshold: float = 0.9):
+def _check_class_imbalance(analyzer, threshold: float = DEFAULT_CONFIG.imbalance.majority_class_ratio):
     issues = []
     if analyzer.target_col and analyzer.target_col in analyzer.df.columns:
         counts = analyzer.df[analyzer.target_col].value_counts(normalize=True)
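The check itself is small: the target's normalized value_counts is compared against the configured majority-class ratio. An illustrative run with made-up data:

```python
import pandas as pd

labels = pd.Series(['ok'] * 95 + ['fraud'] * 5)
counts = labels.value_counts(normalize=True)
majority_ratio = float(counts.iloc[0])  # 0.95
# 0.9 was the old inline default, now DEFAULT_CONFIG.imbalance.majority_class_ratio
print(majority_ratio > 0.9)  # True -> class imbalance flagged
```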

hashprep/checks/leakage.py

Lines changed: 7 additions & 4 deletions
@@ -2,6 +2,9 @@
 import pandas as pd
 from scipy.stats import chi2_contingency, f_oneway
 import numpy as np
+from ..config import DEFAULT_CONFIG
+
+_LEAK = DEFAULT_CONFIG.leakage
 
 def _check_data_leakage(analyzer):
     issues = []
@@ -36,7 +39,7 @@ def _check_target_leakage_patterns(analyzer):
     corrs = numeric_cols.corrwith(target).abs()
     for col, corr in corrs.items():
         severity = (
-            "critical" if corr > 0.98 else "warning" if corr > 0.95 else None
+            "critical" if corr > _LEAK.numeric_critical else "warning" if corr > _LEAK.numeric_warning else None
         )
         if severity:
             impact = "high" if severity == "critical" else "medium"
@@ -69,7 +72,7 @@ def _check_target_leakage_patterns(analyzer):
             r, k = table.shape
             cramers_v = np.sqrt(phi2 / min(k - 1, r - 1))
             severity = (
-                "critical" if cramers_v > 0.95 else "warning" if cramers_v > 0.8 else None
+                "critical" if cramers_v > _LEAK.categorical_critical else "warning" if cramers_v > _LEAK.categorical_warning else None
             )
             if severity:
                 impact = "high" if severity == "critical" else "medium"
@@ -104,8 +107,8 @@ def _check_target_leakage_patterns(analyzer):
             try:
                 f_stat, p_val = f_oneway(*groups)
                 severity = (
-                    "critical" if f_stat > 20.0 and p_val < 0.001
-                    else "warning" if f_stat > 10.0 and p_val < 0.001 else None
+                    "critical" if f_stat > _LEAK.f_stat_critical and p_val < _LEAK.f_stat_p_value
+                    else "warning" if f_stat > _LEAK.f_stat_warning and p_val < _LEAK.f_stat_p_value else None
                 )
                 if severity:
                     impact = "high" if severity == "critical" else "medium"
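The Cramér's V used by the categorical branch is visible in the context lines above. Pulled out on its own it is approximately the following; the crosstab construction is assumed, the formula itself comes from the diff:

```python
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency


def cramers_v(x: pd.Series, y: pd.Series) -> float:
    # Uncorrected Cramer's V, matching the formula in the context lines:
    # sqrt(phi^2 / min(k - 1, r - 1)) for an r x k contingency table.
    table = pd.crosstab(x, y)
    chi2 = chi2_contingency(table)[0]
    phi2 = chi2 / table.to_numpy().sum()
    r, k = table.shape
    return float(np.sqrt(phi2 / min(k - 1, r - 1)))
```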

hashprep/checks/missing_values.py

Lines changed: 14 additions & 11 deletions
@@ -3,8 +3,11 @@
 import pandas as pd
 from collections import defaultdict
 import numpy as np
+from ..config import DEFAULT_CONFIG
 
-def _check_high_missing_values(analyzer, threshold: float = 0.4, critical_threshold: float = 0.7):
+_THRESHOLDS = DEFAULT_CONFIG.missing_values
+
+def _check_high_missing_values(analyzer, threshold: float = _THRESHOLDS.warning, critical_threshold: float = _THRESHOLDS.critical):
     issues = []
     for col in analyzer.df.columns:
         missing_pct = float(analyzer.df[col].isna().mean())
@@ -44,7 +47,7 @@ def _check_empty_columns(analyzer):
         )
     return issues
 
-def _check_dataset_missingness(analyzer, threshold: float = 20.0, critical_threshold: float = 50.0):
+def _check_dataset_missingness(analyzer, threshold: float = _THRESHOLDS.dataset_warning_pct, critical_threshold: float = _THRESHOLDS.dataset_critical_pct):
     issues = []
     missing_pct = float(
         (analyzer.df.isnull().sum().sum() / (analyzer.df.shape[0] * analyzer.df.shape[1])) * 100
@@ -70,11 +73,11 @@ def _check_dataset_missingness(analyzer, threshold: float = 20.0, critical_threshold: float = 50.0):
     return issues
 
 
-def _check_missing_patterns(analyzer, threshold: float = 0.01,
-                            critical_p_threshold: float = 0.001):
+def _check_missing_patterns(analyzer, threshold: float = _THRESHOLDS.pattern_p_value,
+                            critical_p_threshold: float = _THRESHOLDS.pattern_critical_p_value):
     issues = []
     missing_cols = [
-        col for col in analyzer.df.columns if int(analyzer.df[col].isna().sum()) >= 10
+        col for col in analyzer.df.columns if int(analyzer.df[col].isna().sum()) >= _THRESHOLDS.pattern_min_missing_count
    ]
 
     # grouping logic
@@ -89,7 +92,7 @@ def _check_missing_patterns(analyzer, threshold: float = 0.01,
                 continue
             try:
                 value_counts = analyzer.df[other_col].value_counts()
-                rare_cats = value_counts[value_counts < 5].index
+                rare_cats = value_counts[value_counts < _THRESHOLDS.pattern_rare_category_count].index
                 temp_col = analyzer.df[other_col].copy()
                 if not rare_cats.empty:
                     temp_col = temp_col.where(~temp_col.isin(rare_cats), "Other")
@@ -112,7 +115,7 @@ def cramers_v(table):
                     return np.sqrt(phi2corr / rkcorr)
 
                 cramers = cramers_v(table)
-                if p_val < threshold and cramers > 0.1:
+                if p_val < threshold and cramers > _THRESHOLDS.pattern_cramers_v_min:
                     cat_patterns[col].append((other_col, p_val, cramers))
             except Exception:
                 continue
@@ -125,7 +128,7 @@ def cramers_v(table):
             try:
                 missing = analyzer.df[analyzer.df[col].isna()][other_col].dropna()
                 non_missing = analyzer.df[analyzer.df[col].notna()][other_col].dropna()
-                if len(missing) < 10 or len(non_missing) < 10:
+                if len(missing) < _THRESHOLDS.pattern_min_group_size or len(non_missing) < _THRESHOLDS.pattern_min_group_size:
                     continue
 
                 # Replaced f_oneway with mannwhitneyu
@@ -135,7 +138,7 @@ def cramers_v(table):
                 pooled_std = np.sqrt((np.std(missing) ** 2 + np.std(non_missing) ** 2) / 2)
                 cohens_d = abs(np.mean(missing) - np.mean(non_missing)) / pooled_std if pooled_std > 0 else 0
 
-                if p_val < threshold and cohens_d > 0.2:
+                if p_val < threshold and cohens_d > _THRESHOLDS.pattern_cohens_d_min:
                     num_patterns[col].append((other_col, p_val, cohens_d))
             except Exception:
                 continue
@@ -151,7 +154,7 @@ def cramers_v(table):
     if all_patterns:
         # Sort by effect size (descending) and take top 3
         all_patterns.sort(key=lambda x: x[2], reverse=True)  # x[2] is effect size
-        top_corrs = [pat[0] for pat in all_patterns[:3]]
+        top_corrs = [pat[0] for pat in all_patterns[:_THRESHOLDS.pattern_top_correlations]]
         total_count = len(all_patterns)
 
         desc = f"Missingness in '{col}' correlates with {total_count} columns ({', '.join(top_corrs)})"
@@ -161,7 +164,7 @@ def cramers_v(table):
         is_target_correlated = any(pat[0] == analyzer.target_col for pat in all_patterns)
         severity = (
             "critical"
-            if p_val < critical_p_threshold and is_target_correlated and max_effect > 0.3  # medium effect threshold
+            if p_val < critical_p_threshold and is_target_correlated and max_effect > _THRESHOLDS.pattern_effect_critical
             else "warning"
         )
         impact = "high" if severity == "critical" else "medium"
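The numeric branch of _check_missing_patterns pairs a Mann-Whitney U test with a Cohen's d effect size, as the pooled-std context lines show. Isolated into a sketch (the wrapper function and its name are illustrative, the computation comes from the diff):

```python
import numpy as np
from scipy.stats import mannwhitneyu


def missingness_effect(group_a: np.ndarray, group_b: np.ndarray):
    # Significance via Mann-Whitney U (robust to non-normal data),
    # effect size via Cohen's d with a pooled standard deviation.
    _, p_val = mannwhitneyu(group_a, group_b)
    pooled_std = np.sqrt((np.std(group_a) ** 2 + np.std(group_b) ** 2) / 2)
    cohens_d = abs(np.mean(group_a) - np.mean(group_b)) / pooled_std if pooled_std > 0 else 0.0
    # The check flags a pattern when p_val < pattern_p_value (0.01)
    # and cohens_d > pattern_cohens_d_min (0.2).
    return p_val, cohens_d
```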
