@@ -1,24 +1,16 @@
 from .core import Issue
 import pandas as pd
 import numpy as np
-from scipy.stats import f_oneway, spearmanr, pearsonr, kendalltau, chi2_contingency
+from scipy.stats import spearmanr, pearsonr, kendalltau, chi2_contingency
 from itertools import combinations
 from .discretizer import Discretizer, DiscretizationType
-from ..utils.type_inference import infer_types, is_usable_for_corr
-
-
-# Thresholds
-CORR_THRESHOLDS = {
-    'numeric': {
-        'spearman': {'warning': 0.7, 'critical': 0.95},
-        'pearson': {'warning': 0.7, 'critical': 0.95},
-        'kendall': {'warning': 0.6, 'critical': 0.85},  # Lower for Kendall (typically smaller values)
-    },
-    'categorical': {'warning': 0.5, 'critical': 0.8},
-    'mixed': {'warning': 0.5, 'critical': 0.8},  # Updated to coefficient thresholds (matching categorical) for Cramer's V
-}
-CAT_MAX_DISTINCT = 50
-LOW_CARD_NUM_THRESHOLD = 10  # From type_inference.py
+from ..utils.type_inference import is_usable_for_corr
+from ..config import DEFAULT_CONFIG
+
+_CORR = DEFAULT_CONFIG.correlations
+CORR_THRESHOLDS = _CORR.as_nested_dict()
+CAT_MAX_DISTINCT = _CORR.max_distinct_categories
+LOW_CARD_NUM_THRESHOLD = _CORR.low_cardinality_numeric
 
 def _cramers_v_corrected(table: pd.DataFrame) -> float:
     if table.empty or (table.shape[0] == 1 or table.shape[1] == 1):
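
Note: the ..config module itself is not part of this diff. Below is a minimal sketch of the correlations section that DEFAULT_CONFIG is assumed to expose, with defaults mirroring the constants removed above; everything beyond the three attributes and as_nested_dict() referenced in the diff is a guess, not the project's actual config.

from dataclasses import dataclass, field

@dataclass
class CorrelationsConfig:
    # Defaults copied from the removed CORR_THRESHOLDS constant.
    numeric: dict = field(default_factory=lambda: {
        'spearman': {'warning': 0.7, 'critical': 0.95},
        'pearson': {'warning': 0.7, 'critical': 0.95},
        'kendall': {'warning': 0.6, 'critical': 0.85},
    })
    categorical: dict = field(default_factory=lambda: {'warning': 0.5, 'critical': 0.8})
    mixed: dict = field(default_factory=lambda: {'warning': 0.5, 'critical': 0.8})
    max_distinct_categories: int = 50
    low_cardinality_numeric: int = 10

    def as_nested_dict(self) -> dict:
        # Same shape as the old module-level CORR_THRESHOLDS dict.
        return {'numeric': self.numeric, 'categorical': self.categorical, 'mixed': self.mixed}
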
@@ -52,28 +44,20 @@ def calculate_correlations(analyzer, thresholds=None): |
                    typ == 'Numeric' and is_usable_for_corr(analyzer.df[col])]
     cat_cols = [col for col, typ in inferred_types.items() if typ == 'Categorical' and
                 1 < analyzer.df[col].nunique() <= CAT_MAX_DISTINCT and is_usable_for_corr(analyzer.df[col])]
-    text_cols = [col for col, typ in inferred_types.items() if typ == 'Text']
 
-    # Internal default methods
-    default_methods = ['spearman', 'pearson']
-    issues.extend(_check_numeric_correlation(analyzer, numeric_cols, thresholds['numeric'], default_methods))
+    issues.extend(_check_numeric_correlation(analyzer, numeric_cols, thresholds['numeric']))
     issues.extend(_check_categorical_correlation(analyzer, cat_cols, thresholds['categorical']))
     issues.extend(_check_mixed_correlation(analyzer, numeric_cols, cat_cols, thresholds['mixed']))
 
     return issues
 
 
-def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict, methods: list):
+def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict):
     issues = []
     if len(numeric_cols) < 2:
         return issues
 
     num_df = analyzer.df[numeric_cols].dropna(how='all')
-    corr_methods = {
-        'spearman': lambda x, y: spearmanr(x, y),
-        'pearson': lambda x, y: pearsonr(x, y),
-        'kendall': lambda x, y: kendalltau(x, y)
-    }
 
     for col1, col2 in combinations(numeric_cols, 2):
         series1, series2 = num_df[col1].dropna(), num_df[col2].dropna()
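
The per-method lambda table is dropped here; the pairwise loop that follows (outside this hunk) still relies on the three estimators kept in the scipy import. For reference, a self-contained example of how spearmanr, pearsonr and kendalltau are called on a pair of aligned series (illustrative data only, not taken from the project):

import pandas as pd
from scipy.stats import spearmanr, pearsonr, kendalltau

# Two aligned, non-null series standing in for a column pair.
s1 = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])
s2 = pd.Series([2.1, 3.9, 6.2, 8.1, 9.8])

rho, rho_p = spearmanr(s1, s2)   # monotonic (rank-based) association
r, r_p = pearsonr(s1, s2)        # linear association
tau, tau_p = kendalltau(s1, s2)  # ordinal concordance
print(f"spearman={rho:.2f} pearson={r:.2f} kendall={tau:.2f}")
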
@@ -125,42 +109,6 @@ def _check_numeric_correlation(analyzer, numeric_cols: list, thresholds: dict, methods: list):
     return issues
 
 
-def _check_feature_correlation(
-    analyzer, threshold: float = 0.95, critical_threshold: float = 0.98
-):
-    issues = []
-    numeric_df = analyzer.df.select_dtypes(include="number")
-    if numeric_df.empty:
-        return issues
-    corr_matrix = numeric_df.corr().abs()
-    upper = corr_matrix.where(np.tril(np.ones(corr_matrix.shape)).astype(bool))
-    correlated_pairs = [
-        (col, row, float(val))
-        for row in upper.index
-        for col, val in upper[row].dropna().items()
-        if val > threshold and col != row
-    ]
-    for col1, col2, corr in correlated_pairs:
-        severity = "critical" if corr > critical_threshold else "warning"
-        impact = "high" if severity == "critical" else "medium"
-        quick_fix = (
-            "Options: \n- Drop one feature: Reduces multicollinearity (Pros: Simplifies model; Cons: Loses info).\n- Combine features: Create composite feature (e.g., PCA) (Pros: Retains info; Cons: Less interpretable).\n- Retain and test: Use robust models (e.g., trees) (Pros: Keeps info; Cons: May affect sensitive models)."
-            if severity == "critical"
-            else "Options: \n- Drop one feature: If less predictive (Pros: Simplifies model; Cons: Loses info).\n- Retain and test: Evaluate with robust models (Pros: Keeps info; Cons: Risk of multicollinearity).\n- Engineer feature: Combine or transform features (Pros: Reduces redundancy; Cons: Adds complexity)."
-        )
-        issues.append(
-            Issue(
-                category="feature_correlation",
-                severity=severity,
-                column=f"{col1},{col2}",
-                description=f"Columns '{col1}' and '{col2}' are highly correlated ({corr:.2f})",
-                impact_score=impact,
-                quick_fix=quick_fix,
-            )
-        )
-    return issues
-
-
 def _check_categorical_correlation(analyzer, cat_cols: list, thresholds: dict):
     issues = []
     if len(cat_cols) < 2:
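
Only the guard clause of _cramers_v_corrected appears in this diff. Assuming the name means what it usually does, here is a sketch of the bias-corrected (Bergsma) Cramér's V such a helper typically computes from a contingency table via chi2_contingency; the guard's return value and the exact body are assumptions, not shown in the PR.

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

def cramers_v_corrected_sketch(table: pd.DataFrame) -> float:
    # Degenerate tables (empty, single row, or single column) carry no signal.
    if table.empty or (table.shape[0] == 1 or table.shape[1] == 1):
        return 0.0  # assumed fallback; the real return value is not in this diff
    chi2 = chi2_contingency(table)[0]
    n = table.to_numpy().sum()
    r, k = table.shape
    phi2 = chi2 / n
    # Bergsma's bias correction shrinks phi^2 and the effective table dimensions.
    phi2_corr = max(0.0, phi2 - (k - 1) * (r - 1) / (n - 1))
    r_corr = r - (r - 1) ** 2 / (n - 1)
    k_corr = k - (k - 1) ** 2 / (n - 1)
    denom = min(r_corr - 1, k_corr - 1)
    return float(np.sqrt(phi2_corr / denom)) if denom > 0 else 0.0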