-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdataset_cache.py
More file actions
199 lines (160 loc) · 7.11 KB
/
dataset_cache.py
File metadata and controls
199 lines (160 loc) · 7.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
"""
Dataset caching and smart splitting for SOFAM experiments.
Strategy:
- Download a UCI dataset ONCE via ucimlrepo, save to disk as .npz
- Create splits: small training subsets + large test set
- Train FAM on small subsets (fast), evaluate on large test set (representative)
- Reuse cached data across all experiments
To use a different UCI dataset:
1. Change DATASET_ID below (browse https://archive.ics.uci.edu/ for IDs)
2. Delete cached_data/ to force re-download
3. Also update gqfam.py Config.DATASET_ID to match
The preprocessing pipeline (median imputation, MinMax scaling, complement
coding, label encoding) is dataset-agnostic and works with any UCI tabular
classification dataset that has numeric/categorical features and a single
target column.
"""
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
# --- DATASET CONFIGURATION ---
# Change this ID to use any UCI ML Repository dataset.
# Browse available datasets at: https://archive.ics.uci.edu/
# Examples: 891 (CDC Diabetes), 53 (Iris), 17 (Breast Cancer), 2 (Adult)
DATASET_ID = 891
DATASET_NAME = "CDC Diabetes Health Indicators"
CACHE_DIR = Path(__file__).parent / "cached_data"
CACHE_FILE = CACHE_DIR / f"uci{DATASET_ID}_full.npz"
CACHE_META = CACHE_DIR / f"uci{DATASET_ID}_meta.json"
def download_and_cache(force=False):
    """Download the full UCI dataset once, preprocess it, and cache to disk.

    The cache is a compressed ``.npz`` (arrays) plus a JSON metadata file.
    Subsequent calls return the cached data unless ``force`` is True.

    Parameters
    ----------
    force : bool
        When True, re-download and overwrite any existing cache.

    Returns
    -------
    tuple
        ``(X_scaled, X_complement, y_encoded, le)`` — MinMax-scaled features,
        complement-coded features ``[x, 1-x]``, integer-encoded labels, and
        the fitted ``LabelEncoder``.
    """
    # parents=True: do not fail when intermediate directories are missing.
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    if CACHE_FILE.exists() and not force:
        print(f"Dataset already cached at {CACHE_FILE}")
        return load_cached()
    print(f"Downloading UCI {DATASET_ID} ({DATASET_NAME})...")
    # Lazy imports: cached runs never need ucimlrepo (network dependency).
    import json
    from ucimlrepo import fetch_ucirepo
    dataset = fetch_ucirepo(id=DATASET_ID)
    X = dataset.data.features.copy()
    y = dataset.data.targets.copy()
    # Normalize to DataFrames regardless of what ucimlrepo returns.
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X)
    if not isinstance(y, pd.DataFrame):
        y = pd.DataFrame(y)
    y_array = y.values.ravel()
    # Preprocessing: median imputation + MinMax normalization.
    # NOTE(review): X.median() assumes every feature column is numeric;
    # recent pandas raises on non-numeric columns — verify for new DATASET_IDs.
    X_filled = X.fillna(X.median())
    scaler = MinMaxScaler()
    X_scaled = scaler.fit_transform(X_filled)
    # Complement coding: [x] -> [x, 1-x] (doubles the feature count; the
    # representation fuzzy-ART-family models like FAM expect).
    X_complement = np.hstack([X_scaled, 1 - X_scaled])
    # Encode labels to contiguous integers 0..n_classes-1.
    le = LabelEncoder()
    y_encoded = le.fit_transform(y_array)
    # Persist arrays and shape info in one compressed archive.
    np.savez_compressed(
        CACHE_FILE,
        X_scaled=X_scaled,
        X_complement=X_complement,
        y=y_encoded,
        feature_names=np.array(X.columns.tolist()),
        class_names=np.array(le.classes_.tolist()),
        n_samples=len(y_encoded),
        n_features_raw=X_scaled.shape[1],
        n_features_complement=X_complement.shape[1],
        n_classes=len(le.classes_),
    )
    # Human-readable metadata alongside the binary cache.
    meta = {
        "source": "UCI ML Repository",
        "dataset_id": DATASET_ID,
        "name": DATASET_NAME,
        "n_samples": int(len(y_encoded)),
        "n_features_raw": int(X_scaled.shape[1]),
        "n_features_complement": int(X_complement.shape[1]),
        "n_classes": int(len(le.classes_)),
        "class_distribution": {str(k): int(v) for k, v in zip(*np.unique(y_encoded, return_counts=True))},
        "preprocessing": "median_imputation + minmax_normalization + complement_coding",
    }
    with open(CACHE_META, 'w') as f:
        json.dump(meta, f, indent=2)
    print(f"Cached {len(y_encoded)} samples to {CACHE_FILE}")
    print(f"  Raw features: {X_scaled.shape[1]}, Complement coded: {X_complement.shape[1]}")
    print(f"  Classes: {dict(zip(le.classes_, np.bincount(y_encoded)))}")
    return X_scaled, X_complement, y_encoded, le
def load_cached():
    """Load the preprocessed dataset from the on-disk cache.

    Falls back to downloading (and caching) when no cache file exists.

    Returns:
        Tuple of (X_scaled, X_complement, y, label_encoder).
    """
    if not CACHE_FILE.exists():
        print("Cache not found. Downloading...")
        return download_and_cache()
    archive = np.load(CACHE_FILE, allow_pickle=True)
    X_scaled = archive['X_scaled']
    X_complement = archive['X_complement']
    labels = archive['y']
    # Rebuild the label encoder from the stored class names so callers can
    # map integer predictions back to original class labels.
    encoder = LabelEncoder()
    encoder.classes_ = archive['class_names']
    print(f"Loaded cached dataset: {len(labels)} samples, {X_complement.shape[1]} features (complement coded)")
    return X_scaled, X_complement, labels, encoder
def create_splits(X_complement, y, train_size=3000, random_seed=42):
    """
    Create smart splits for SOFAM experiments.
    Strategy: Use `train_size` rows for training, rest for evaluation.
    This means HA/GA train fast on small data but get tested on large data.

    Args:
        X_complement: complement-coded feature matrix, shape (n_samples, n_features).
        y: integer class labels, shape (n_samples,).
        train_size: number of training rows to subsample (stratified), or
            None to use all trainval data with a conventional split.
        random_seed: seed for both sklearn splitting and numpy subsampling.

    Returns dict with:
      X_train, y_train: small training set (complement coded, for FAM)
      X_val, y_val: validation set (for hyperparam tuning during HA/GA/QA)
      X_test, y_test: held-out test set (for final evaluation)
    """
    n_total = len(y)
    # First: separate a large, stratified test set (15% of full data).
    X_trainval, X_test, y_trainval, y_test = train_test_split(
        X_complement, y, test_size=0.15, random_state=random_seed, stratify=y
    )
    # From trainval, take train_size for training, rest becomes validation.
    if train_size is not None and train_size < len(X_trainval):
        np.random.seed(random_seed)
        # Stratified subsample: take each class in proportion to its
        # frequency so the small training set keeps the class ratios.
        train_idx = []
        for cls in np.unique(y_trainval):
            cls_idx = np.where(y_trainval == cls)[0]
            cls_ratio = len(cls_idx) / len(y_trainval)
            n_cls = max(1, int(train_size * cls_ratio))  # at least 1 per class
            chosen = np.random.choice(cls_idx, size=min(n_cls, len(cls_idx)), replace=False)
            train_idx.extend(chosen)
        # Truncate in case per-class rounding overshot train_size.
        train_idx = np.array(train_idx[:train_size])
        # Validation = all trainval rows not used for training. setdiff1d is
        # O(n log n); the previous list comprehension rebuilt set(train_idx)
        # for every candidate index, making this step O(n^2).
        val_idx = np.setdiff1d(np.arange(len(X_trainval)), train_idx)
        # Subsample validation to keep it manageable (max 10K rows).
        if len(val_idx) > 10000:
            val_idx = np.random.choice(val_idx, size=10000, replace=False)
        X_train = X_trainval[train_idx]
        y_train = y_trainval[train_idx]
        X_val = X_trainval[val_idx]
        y_val = y_trainval[val_idx]
    else:
        # Use all trainval data, split 82/18 to get ~70/15 overall.
        X_train, X_val, y_train, y_val = train_test_split(
            X_trainval, y_trainval, test_size=0.176, random_state=random_seed, stratify=y_trainval
        )
    splits = {
        'X_train': X_train, 'y_train': y_train,
        'X_val': X_val, 'y_val': y_val,
        'X_test': X_test, 'y_test': y_test,
    }
    print(f"Splits created (train_size={train_size}):")
    print(f"  Train: {len(y_train)} samples, class dist: {dict(zip(*np.unique(y_train, return_counts=True)))}")
    print(f"  Val: {len(y_val)} samples, class dist: {dict(zip(*np.unique(y_val, return_counts=True)))}")
    print(f"  Test: {len(y_test)} samples, class dist: {dict(zip(*np.unique(y_test, return_counts=True)))}")
    return splits
if __name__ == "__main__":
    # Ensure the dataset is cached locally (downloads on the first run only).
    X_scaled, X_complement, y, le = download_and_cache()
    # Demonstrate the split behavior across several training-set sizes.
    for size in [100, 3000, 30000, None]:
        print(f"\n{'='*50}")
        splits = create_splits(X_complement, y, train_size=size)