Skip to content

Commit 352d38f

Browse files
authored
feat(analyzer): add interactions and correlations (#7)
- add scatter plot pairs for numeric variables - compute correlation matrices (Pearson, Spearman and Kendall) - compute categorical correlations (Cramer's V) - compute mixed correlations (using ANOVA F-test as proxy).
1 parent 384b44a commit 352d38f

5 files changed

Lines changed: 198 additions & 57 deletions

File tree

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -96,4 +96,5 @@ build/
9696
dist/
9797

9898
hashprep-dev.json
99-
todo.txt
99+
todo.md
100+
test.json

hashprep/analyzer.py

Lines changed: 129 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
from typing import Dict, Optional
33
import pandas as pd
44
import hashlib
5+
from scipy.stats import chi2_contingency
6+
import numpy as np
57

68

79
@dataclass
@@ -36,9 +38,25 @@ def analyze(self) -> Dict:
3638
self._summarize_variable_types()
3739
self._add_reproduction_info()
3840
self._summarize_variables()
41+
self._summarize_interactions()
42+
self._summarize_missing_values()
3943

4044
return self._generate_summary()
4145

46+
# =========================================================================
47+
# Sample Section
48+
# =========================================================================
49+
def _get_dataset_preview(self):
50+
head = self.df.head()
51+
tail = self.df.tail()
52+
sample = self.df.sample(min(10, len(self.df)))
53+
dataset_preview = {
54+
"head": head.to_dict(orient="records"),
55+
"tail": tail.to_dict(orient="records"),
56+
"sample": sample.to_dict(orient="records"),
57+
}
58+
self.summaries.update(dataset_preview)
59+
4260
# =========================================================================
4361
# Overview Section
4462
# =========================================================================
@@ -143,7 +161,7 @@ def _summarize_categorical_column(self, col: str):
143161
stats = {
144162
"count": int(series.count()),
145163
"unique": int(series.nunique()),
146-
"top_values": series.value_counts().head(10).to_dict(), # top 10 only
164+
"top_values": series.value_counts().head(10).to_dict(), # top 10 only
147165
"most_frequent": series.mode().iloc[0] if not series.empty else None,
148166
"missing": int(self.df[col].isna().sum()),
149167
}
@@ -164,12 +182,15 @@ def _summarize_text_column(self, col: str):
164182
"avg_length": float(lengths.mean()) if not lengths.empty else None,
165183
"min_length": float(lengths.min()) if not lengths.empty else None,
166184
"max_length": float(lengths.max()) if not lengths.empty else None,
167-
"common_lengths": lengths.value_counts().head(5).to_dict(), # top 5
185+
"common_lengths": lengths.value_counts().head(5).to_dict(), # top 5
168186
"char_freq": (
169-
pd.Series(list("".join(series))).value_counts().head(10).to_dict() # top 10
187+
pd.Series(list("".join(series)))
188+
.value_counts()
189+
.head(10)
190+
.to_dict() # top 10
170191
if not series.empty
171192
else None
172-
), # top 10 only
193+
), # top 10 only
173194
}
174195

175196
if "variables" not in self.summaries:
@@ -201,18 +222,112 @@ def _summarize_datetime_column(self, col: str):
201222
self.summaries["variables"][col] = stats
202223

203224
# =========================================================================
204-
# Sample Section
225+
# Interactions and Correlations Section
205226
# =========================================================================
206-
def _get_dataset_preview(self):
207-
head = self.df.head()
208-
tail = self.df.tail()
209-
sample = self.df.sample(min(10, len(self.df)))
210-
dataset_preview = {
211-
"head": head.to_dict(orient="records"),
212-
"tail": tail.to_dict(orient="records"),
213-
"sample": sample.to_dict(orient="records"),
227+
def _summarize_interactions(self):
228+
"""Run interactions between variables"""
229+
self._scatter_plots_numeric()
230+
self._compute_correlation_matrices()
231+
self._compute_categorical_correlations()
232+
self._compute_mixed_correlations()
233+
234+
def _scatter_plots_numeric(self):
235+
"""
236+
Generate scatter plots between numeric variables
237+
for CLI: just the pairs
238+
for Web/Report: Plot them
239+
"""
240+
numeric_columns = self.df.select_dtypes(include="number").columns
241+
pairs = [
242+
(c1, c2)
243+
for i, c1 in enumerate(numeric_columns)
244+
for c2 in numeric_columns[i + 1 :]
245+
]
246+
self.summaries["scatter_pairs"] = pairs # TODO: Plot these
247+
248+
def _compute_correlation_matrices(self):
249+
"""Compute Pearson/Spearman/Kendall correlations"""
250+
numeric_df = self.df.select_dtypes(include="number")
251+
corrs = {}
252+
if not numeric_df.empty:
253+
corrs["pearson"] = numeric_df.corr(method="pearson").to_dict()
254+
corrs["spearman"] = numeric_df.corr(method="spearman").to_dict()
255+
corrs["kendall"] = numeric_df.corr(method="kendall").to_dict()
256+
self.summaries["numeric_correlations"] = corrs
257+
258+
def _compute_categorical_correlations(self):
259+
"""Compute Cramer's V for categorical pairs"""
260+
categorical = self.df.select_dtypes(include="object").columns
261+
results = {}
262+
for i, c1 in enumerate(categorical):
263+
for c2 in categorical[i + 1 :]:
264+
try:
265+
table = pd.crosstab(self.df[c1], self.df[c2])
266+
chi2, _, _, _ = chi2_contingency(table)
267+
n = table.sum().sum()
268+
phi2 = chi2 / n
269+
r, k = table.shape
270+
cramers_v = (phi2 / min(k - 1, r - 1)) ** 0.5
271+
results[f"{c1}__{c2}"] = cramers_v
272+
except Exception:
273+
continue
274+
275+
self.summaries["categorical_correlations"] = results
276+
277+
def _compute_mixed_correlations(self):
278+
"""
279+
Compute correlation between categorical and numeric using ANOVA F-test as proxy
280+
"""
281+
from scipy.stats import f_oneway
282+
import numpy as np
283+
284+
cat_cols = self.df.select_dtypes(include=["object", "category"]).columns
285+
num_cols = self.df.select_dtypes(include=["int64", "float64"]).columns
286+
mixed_corr = {}
287+
288+
for cat in cat_cols:
289+
for num in num_cols:
290+
# Build groups for each level of categorical variable
291+
groups = []
292+
for level in self.df[cat].dropna().unique():
293+
vals = self.df.loc[self.df[cat] == level, num].dropna().to_numpy()
294+
if len(vals) > 1: # Only include groups with more than 1 value
295+
groups.append(vals)
296+
297+
if len(groups) < 2:
298+
continue # Need at least 2 valid groups for ANOVA
299+
300+
# Skip if all groups have zero variance
301+
if all(np.var(g, ddof=1) == 0 for g in groups):
302+
continue
303+
304+
try:
305+
f_stat, p_val = f_oneway(*groups)
306+
mixed_corr[f"{cat}__{num}"] = {"f_stat": f_stat, "p_value": p_val}
307+
except Exception as e:
308+
mixed_corr[f"{cat}__{num}"] = {"error": str(e)}
309+
310+
self.summaries["mixed_correlations"] = mixed_corr
311+
312+
# =========================================================================
313+
# Missing Value Section
314+
# =========================================================================
315+
def _summarize_missing_values(self):
316+
"""Summarize missing value patterns"""
317+
missing_count = self.df.isnull().sum().to_dict()
318+
missing_percentage = (self.df.isnull().mean() * 100).round(2).to_dict()
319+
320+
self.summaries["missing_values"] = {
321+
"count": missing_count,
322+
"percentage": missing_percentage,
323+
}
324+
325+
# Simple missingness heatmap structure (list of missing row indexes)
326+
self.summaries["missing_patterns"] = {
327+
col: self.df[self.df[col].isna()].index.tolist()
328+
for col in self.df.columns
329+
if self.df[col].isna().any()
214330
}
215-
self.summaries.update(dataset_preview)
216331

217332
# =========================================================================
218333
# Generate Summary

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,5 +7,6 @@ requires-python = ">=3.10"
77
dependencies = [
88
"fastapi>=0.116.1",
99
"pandas>=2.3.2",
10+
"scikit-learn>=1.7.2",
1011
"scipy>=1.15.3",
1112
]

todo.md

Lines changed: 0 additions & 42 deletions
This file was deleted.

0 commit comments

Comments
 (0)