cachevector · maskedsyntax · Feb 19, 2026 · Feb 19, 2026
diff --git a/hashprep/checks/columns.py b/hashprep/checks/columns.py
@@ -1,4 +1,7 @@
 from .core import Issue
+from ..config import DEFAULT_CONFIG
+
+_COL_THRESHOLDS = DEFAULT_CONFIG.columns
 
 def _check_single_value_columns(analyzer):
     issues = []
@@ -23,7 +26,7 @@ def _check_single_value_columns(analyzer):
             )
     return issues
 
-def _check_high_cardinality(analyzer, threshold: int = 100, critical_threshold: float = 0.9):
+def _check_high_cardinality(analyzer, threshold: int = _COL_THRESHOLDS.high_cardinality_count, critical_threshold: float = _COL_THRESHOLDS.high_cardinality_ratio_critical):
     issues = []
     categorical_cols = analyzer.df.select_dtypes(include="object").columns.tolist()
     for col in categorical_cols:
@@ -54,7 +57,7 @@ def _check_duplicates(analyzer):
     duplicate_rows = int(analyzer.df.duplicated().sum())
     if duplicate_rows > 0:
         duplicate_ratio = float(duplicate_rows / len(analyzer.df))
-        severity = "critical" if duplicate_ratio > 0.1 else "warning"
+        severity = "critical" if duplicate_ratio > _COL_THRESHOLDS.duplicate_ratio_critical else "warning"
         impact = "high" if severity == "critical" else "medium"
         quick_fix = (
             "Options: \n- Drop duplicates: Ensures data integrity (Pros: Cleaner data; Cons: May lose valid repeats).\n- Verify duplicates: Check if intentional (e.g., time-series) (Pros: Validates data; Cons: Time-consuming)."

diff --git a/hashprep/checks/correlations.py b/hashprep/checks/correlations.py
@@ -1,24 +1,16 @@
 from .core import Issue
 import pandas as pd
 import numpy as np
-from scipy.stats import f_oneway, spearmanr, pearsonr, kendalltau, chi2_contingency
+from scipy.stats import spearmanr, pearsonr, kendalltau, chi2_contingency
 from itertools import combinations
 from .discretizer import Discretizer, DiscretizationType
-from ..utils.type_inference import infer_types, is_usable_for_corr
-
-
-# Thresholds
-CORR_THRESHOLDS = {
-    'numeric': {
-        'spearman': {'warning': 0.7, 'critical': 0.95},
-        'pearson': {'warning': 0.7, 'critical': 0.95},
-        'kendall': {'warning': 0.6, 'critical': 0.85},  # Lower for Kendall (typically smaller values)
-    },
-    'categorical': {'warning': 0.5, 'critical': 0.8},
-    'mixed': {'warning': 0.5, 'critical': 0.8},  # Updated to coefficient thresholds (matching categorical) for Cramer's V
-}
-CAT_MAX_DISTINCT = 50
-LOW_CARD_NUM_THRESHOLD = 10  # From type_inference.py
+from ..utils.type_inference import is_usable_for_corr
+from ..config import DEFAULT_CONFIG
+
+_CORR = DEFAULT_CONFIG.correlations
+CORR_THRESHOLDS = _CORR.as_nested_dict()
+CAT_MAX_DISTINCT = _CORR.max_distinct_categories
+LOW_CARD_NUM_THRESHOLD = _CORR.low_cardinality_numeric
 
 def _cramers_v_corrected(table: pd.DataFrame) -> float:
     if table.empty or (table.shape[0] == 1 or table.shape[1] == 1):
@@ -52,28 +44,20 @@ def calculate_correlations(analyzer, thresholds=None):
                     typ == 'Numeric' and is_usable_for_corr(analyzer.df[col])]
     cat_cols = [col for col, typ in inferred_types.items() if typ == 'Categorical' and
                 1 < analyzer.df[col].nunique() <= CAT_MAX_DISTINCT and is_usable_for_corr(analyzer.df[col])]
-    text_cols = [col for col, typ in inferred_types.items() if typ == 'Text']
 
-    # Internal default methods
-    default_methods = ['spearman', 'pearson']
-    issues.extend(_check_numeric_correlation(analyzer, numeric_cols, thresholds['numeric'], default_methods))
+    issues.extend(_check_numeric_correlation(analyzer, numeric_cols, thresholds['numeric']))
     issues.extend(_check_categorical_correlation(analyzer, cat_cols, thresholds['categorical']))
     issues.extend(_check_mixed_correlation(analyzer, numeric_cols, cat_cols, thresholds['mixed']))
 
     return issues
 
 
-def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict, methods: list):
+def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict):
     issues = []
     if len(numeric_cols) < 2:
         return issues
 
     num_df = analyzer.df[numeric_cols].dropna(how='all')
-    corr_methods = {
-        'spearman': lambda x, y: spearmanr(x, y),
-        'pearson': lambda x, y: pearsonr(x, y),
-        'kendall': lambda x, y: kendalltau(x, y)
-    }
 
     for col1, col2 in combinations(numeric_cols, 2):
         series1, series2 = num_df[col1].dropna(), num_df[col2].dropna()
@@ -125,42 +109,6 @@ def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict, m
     return issues
 
 
-def _check_feature_correlation(
-    analyzer, threshold: float = 0.95, critical_threshold: float = 0.98
-):
-    issues = []
-    numeric_df = analyzer.df.select_dtypes(include="number")
-    if numeric_df.empty:
-        return issues
-    corr_matrix = numeric_df.corr().abs()
-    upper = corr_matrix.where(np.tril(np.ones(corr_matrix.shape)).astype(bool))
-    correlated_pairs = [
-        (col, row, float(val))
-        for row in upper.index
-        for col, val in upper[row].dropna().items()
-        if val > threshold and col != row
-    ]
-    for col1, col2, corr in correlated_pairs:
-        severity = "critical" if corr > critical_threshold else "warning"
-        impact = "high" if severity == "critical" else "medium"
-        quick_fix = (
-            "Options: \n- Drop one feature: Reduces multicollinearity (Pros: Simplifies model; Cons: Loses info).\n- Combine features: Create composite feature (e.g., PCA) (Pros: Retains info; Cons: Less interpretable).\n- Retain and test: Use robust models (e.g., trees) (Pros: Keeps info; Cons: May affect sensitive models)."
-            if severity == "critical"
-            else "Options: \n- Drop one feature: If less predictive (Pros: Simplifies model; Cons: Loses info).\n- Retain and test: Evaluate with robust models (Pros: Keeps info; Cons: Risk of multicollinearity).\n- Engineer feature: Combine or transform features (Pros: Reduces redundancy; Cons: Adds complexity)."
-        )
-        issues.append(
-            Issue(
-                category="feature_correlation",
-                severity=severity,
-                column=f"{col1},{col2}",
-                description=f"Columns '{col1}' and '{col2}' are highly correlated ({corr:.2f})",
-                impact_score=impact,
-                quick_fix=quick_fix,
-            )
-        )
-    return issues
-
-
 def _check_categorical_correlation(analyzer, cat_cols: list, thresholds: dict):
     issues = []
     if len(cat_cols) < 2:

diff --git a/hashprep/checks/distribution.py b/hashprep/checks/distribution.py
@@ -3,9 +3,11 @@
 from scipy.stats import kstest
 
 from .core import Issue
+from ..config import DEFAULT_CONFIG
 
+_DIST = DEFAULT_CONFIG.distribution
 
-def _check_uniform_distribution(analyzer, p_threshold: float = 0.1) -> List[Issue]:
+def _check_uniform_distribution(analyzer, p_threshold: float = _DIST.uniform_p_value) -> List[Issue]:
     """
     Detect uniformly distributed numeric columns using Kolmogorov-Smirnov test.
     Uniform distributions often indicate synthetic IDs or sequential data.
@@ -14,7 +16,7 @@ def _check_uniform_distribution(analyzer, p_threshold: float = 0.1) -> List[Issu
 
     for col in analyzer.df.select_dtypes(include="number").columns:
         series = analyzer.df[col].dropna()
-        if len(series) < 20:
+        if len(series) < _DIST.uniform_min_samples:
             continue
 
         min_val, max_val = series.min(), series.max()
@@ -46,7 +48,7 @@ def _check_uniform_distribution(analyzer, p_threshold: float = 0.1) -> List[Issu
     return issues
 
 
-def _check_unique_values(analyzer, threshold: float = 0.95) -> List[Issue]:
+def _check_unique_values(analyzer, threshold: float = _DIST.unique_value_ratio) -> List[Issue]:
     """
     Detect columns where nearly all values are unique.
     High uniqueness often indicates identifiers, names, or free-text fields.
@@ -55,7 +57,7 @@ def _check_unique_values(analyzer, threshold: float = 0.95) -> List[Issue]:
 
     for col in analyzer.df.columns:
         series = analyzer.df[col].dropna()
-        if len(series) < 10:
+        if len(series) < _DIST.unique_min_samples:
             continue
 
         unique_count = series.nunique()

diff --git a/hashprep/checks/drift.py b/hashprep/checks/drift.py
@@ -3,15 +3,17 @@
 from scipy.stats import chisquare, ks_2samp
 
 from .core import Issue
+from ..config import DEFAULT_CONFIG
 
-CRITICAL_P_VALUE = 0.001
-MAX_CATEGORIES_FOR_CHI2 = 50
+_DRIFT = DEFAULT_CONFIG.drift
+CRITICAL_P_VALUE = _DRIFT.critical_p_value
+MAX_CATEGORIES_FOR_CHI2 = _DRIFT.max_categories_for_chi2
 
 
 def check_drift(
     df_train: pd.DataFrame,
     df_test: pd.DataFrame,
-    threshold: float = 0.05,
+    threshold: float = _DRIFT.p_value,
 ) -> list[Issue]:
     """
     Check for distribution shift between two datasets.
@@ -80,13 +82,13 @@ def _check_categorical_drift(
 
         new_categories = set(test_counts.index) - set(train_counts.index)
         if new_categories:
-            sample_new = list(new_categories)[:5]
+            sample_new = list(new_categories)[:_DRIFT.max_new_category_samples]
             issues.append(
                 Issue(
                     category="dataset_drift",
                     severity="warning",
                     column=col,
-                    description=f"New categories in test set for '{col}': {sample_new}{'...' if len(new_categories) > 5 else ''}",
+                    description=f"New categories in test set for '{col}': {sample_new}{'...' if len(new_categories) > _DRIFT.max_new_category_samples else ''}",
                     impact_score="medium",
                     quick_fix="Handle unseen categories in preprocessing pipeline (e.g., OrdinalEncoder with unknown_value).",
                 )

diff --git a/hashprep/checks/imbalance.py b/hashprep/checks/imbalance.py
@@ -1,6 +1,7 @@
 from .core import Issue
+from ..config import DEFAULT_CONFIG
 
-def _check_class_imbalance(analyzer, threshold: float = 0.9):
+def _check_class_imbalance(analyzer, threshold: float = DEFAULT_CONFIG.imbalance.majority_class_ratio):
     issues = []
     if analyzer.target_col and analyzer.target_col in analyzer.df.columns:
         counts = analyzer.df[analyzer.target_col].value_counts(normalize=True)

diff --git a/hashprep/checks/leakage.py b/hashprep/checks/leakage.py
@@ -2,6 +2,9 @@
 import pandas as pd
 from scipy.stats import chi2_contingency, f_oneway
 import numpy as np
+from ..config import DEFAULT_CONFIG
+
+_LEAK = DEFAULT_CONFIG.leakage
 
 def _check_data_leakage(analyzer):
     issues = []
@@ -36,7 +39,7 @@ def _check_target_leakage_patterns(analyzer):
                 corrs = numeric_cols.corrwith(target).abs()
                 for col, corr in corrs.items():
                     severity = (
-                        "critical" if corr > 0.98 else "warning" if corr > 0.95 else None
+                        "critical" if corr > _LEAK.numeric_critical else "warning" if corr > _LEAK.numeric_warning else None
                     )
                     if severity:
                         impact = "high" if severity == "critical" else "medium"
@@ -69,7 +72,7 @@ def _check_target_leakage_patterns(analyzer):
                     r, k = table.shape
                     cramers_v = np.sqrt(phi2 / min(k - 1, r - 1))
                     severity = (
-                        "critical" if cramers_v > 0.95 else "warning" if cramers_v > 0.8 else None
+                        "critical" if cramers_v > _LEAK.categorical_critical else "warning" if cramers_v > _LEAK.categorical_warning else None
                     )
                     if severity:
                         impact = "high" if severity == "critical" else "medium"
@@ -104,8 +107,8 @@ def _check_target_leakage_patterns(analyzer):
                 try:
                     f_stat, p_val = f_oneway(*groups)
                     severity = (
-                        "critical" if f_stat > 20.0 and p_val < 0.001
-                        else "warning" if f_stat > 10.0 and p_val < 0.001 else None
+                        "critical" if f_stat > _LEAK.f_stat_critical and p_val < _LEAK.f_stat_p_value
+                        else "warning" if f_stat > _LEAK.f_stat_warning and p_val < _LEAK.f_stat_p_value else None
                     )
                     if severity:
                         impact = "high" if severity == "critical" else "medium"

diff --git a/hashprep/checks/missing_values.py b/hashprep/checks/missing_values.py
@@ -3,8 +3,11 @@
 import pandas as pd
 from collections import defaultdict
 import numpy as np
+from ..config import DEFAULT_CONFIG
 
-def _check_high_missing_values(analyzer, threshold: float = 0.4, critical_threshold: float = 0.7):
+_THRESHOLDS = DEFAULT_CONFIG.missing_values
+
+def _check_high_missing_values(analyzer, threshold: float = _THRESHOLDS.warning, critical_threshold: float = _THRESHOLDS.critical):
     issues = []
     for col in analyzer.df.columns:
         missing_pct = float(analyzer.df[col].isna().mean())
@@ -44,7 +47,7 @@ def _check_empty_columns(analyzer):
             )
     return issues
 
-def _check_dataset_missingness(analyzer, threshold: float = 20.0, critical_threshold: float = 50.0):
+def _check_dataset_missingness(analyzer, threshold: float = _THRESHOLDS.dataset_warning_pct, critical_threshold: float = _THRESHOLDS.dataset_critical_pct):
     issues = []
     missing_pct = float(
         (analyzer.df.isnull().sum().sum() / (analyzer.df.shape[0] * analyzer.df.shape[1])) * 100
@@ -70,11 +73,11 @@ def _check_dataset_missingness(analyzer, threshold: float = 20.0, critical_thres
     return issues
 
 
-def _check_missing_patterns(analyzer, threshold: float = 0.01,
-                            critical_p_threshold: float = 0.001):
+def _check_missing_patterns(analyzer, threshold: float = _THRESHOLDS.pattern_p_value,
+                            critical_p_threshold: float = _THRESHOLDS.pattern_critical_p_value):
     issues = []
     missing_cols = [
-        col for col in analyzer.df.columns if int(analyzer.df[col].isna().sum()) >= 10
+        col for col in analyzer.df.columns if int(analyzer.df[col].isna().sum()) >= _THRESHOLDS.pattern_min_missing_count
     ]
 
     # grouping logic
@@ -89,7 +92,7 @@ def _check_missing_patterns(analyzer, threshold: float = 0.01,
                 continue
             try:
                 value_counts = analyzer.df[other_col].value_counts()
-                rare_cats = value_counts[value_counts < 5].index
+                rare_cats = value_counts[value_counts < _THRESHOLDS.pattern_rare_category_count].index
                 temp_col = analyzer.df[other_col].copy()
                 if not rare_cats.empty:
                     temp_col = temp_col.where(~temp_col.isin(rare_cats), "Other")
@@ -112,7 +115,7 @@ def cramers_v(table):
                     return np.sqrt(phi2corr / rkcorr)
 
                 cramers = cramers_v(table)
-                if p_val < threshold and cramers > 0.1:
+                if p_val < threshold and cramers > _THRESHOLDS.pattern_cramers_v_min:
                     cat_patterns[col].append((other_col, p_val, cramers))
             except Exception:
                 continue
@@ -125,7 +128,7 @@ def cramers_v(table):
             try:
                 missing = analyzer.df[analyzer.df[col].isna()][other_col].dropna()
                 non_missing = analyzer.df[analyzer.df[col].notna()][other_col].dropna()
-                if len(missing) < 10 or len(non_missing) < 10:
+                if len(missing) < _THRESHOLDS.pattern_min_group_size or len(non_missing) < _THRESHOLDS.pattern_min_group_size:
                     continue
 
                 # Replaced f_oneway with mannwhitneyu
@@ -135,7 +138,7 @@ def cramers_v(table):
                 pooled_std = np.sqrt((np.std(missing) ** 2 + np.std(non_missing) ** 2) / 2)
                 cohens_d = abs(np.mean(missing) - np.mean(non_missing)) / pooled_std if pooled_std > 0 else 0
 
-                if p_val < threshold and cohens_d > 0.2:
+                if p_val < threshold and cohens_d > _THRESHOLDS.pattern_cohens_d_min:
                     num_patterns[col].append((other_col, p_val, cohens_d))
             except Exception:
                 continue
@@ -151,7 +154,7 @@ def cramers_v(table):
         if all_patterns:
-            # Sort by effect size (descending) and take top 3
+            # Sort by effect size (descending) and keep the configured number of top correlations
             all_patterns.sort(key=lambda x: x[2], reverse=True)  # x[2] is effect size
-            top_corrs = [pat[0] for pat in all_patterns[:3]]
+            top_corrs = [pat[0] for pat in all_patterns[:_THRESHOLDS.pattern_top_correlations]]
             total_count = len(all_patterns)
 
             desc = f"Missingness in '{col}' correlates with {total_count} columns ({', '.join(top_corrs)})"
@@ -161,7 +164,7 @@ def cramers_v(table):
             is_target_correlated = any(pat[0] == analyzer.target_col for pat in all_patterns)
             severity = (
                 "critical"
-                if p_val < critical_p_threshold and is_target_correlated and max_effect > 0.3  # medium effect threshold
+                if p_val < critical_p_threshold and is_target_correlated and max_effect > _THRESHOLDS.pattern_effect_critical
                 else "warning"
             )
             impact = "high" if severity == "critical" else "medium"