Skip to content

Commit 97a16fe

Browse files
committed
init
1 parent 35e8d72 commit 97a16fe

16 files changed

Lines changed: 72505 additions & 0 deletions
6 KB
Binary file not shown.
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
"""
2+
This is a feature importance tutorial for the CCEM group meeting.
3+
"""
4+
import pandas as pd
5+
import matplotlib.pyplot as plt
6+
import random
7+
from sklearn.preprocessing import StandardScaler
8+
import seaborn as sns
9+
import numpy as np
10+
from sklearn.datasets import load_diabetes
11+
from pathlib import Path
12+
# Pandas display settings: show full, wide frames in console output.
for _option, _value in {
    'display.max_rows': None,
    'display.max_columns': None,
    'display.width': 1000,
    'display.colheader_justify': 'center',
}.items():
    pd.set_option(_option, _value)
16+
17+
18+
def get_data(path: str = None, features: list = None, target: str = None):
    """Load the tutorial dataset.

    Parameters
    ----------
    path : str or None, optional
        Path to a CSV file. If None (default), the sklearn diabetes demo
        dataset is loaded and `features`/`target` are taken from it
        (any passed-in values are ignored in that case).
    features : list of str or None, optional
        Feature column names. Required when `path` is given.
    target : str or None, optional
        Target column name. Required when `path` is given.

    Returns
    -------
    tuple
        (df, features, target) — the dataframe, the feature names, and the
        target column name.

    Raises
    ------
    ValueError
        If `path` is given but `features` or `target` is missing — previously
        this silently returned None for them, which crashed downstream code.
    """
    if path is None:
        # Built-in demo dataset; `scaled=True` returns pre-standardized features.
        data_bunch = load_diabetes(as_frame=True, scaled=True)
        df = data_bunch.frame
        features = data_bunch.feature_names
        target = data_bunch.target.name
    else:
        df = pd.read_csv(path)
        if features is None or target is None:
            raise ValueError('When loading from a CSV path, `features` and `target` must be provided!')

    return df, features, target
28+
29+
def get_df_with_all_correlations_of_features_with_target(df, features, target):
    """Compute linear, monotonic, and non-linear feature importances w.r.t. `target`.

    Methods used: Pearson/Spearman correlation, Random Forest impurity
    importances, permutation importance, first-PC PCA loadings, normalized
    Lasso and Linear Regression coefficients, and XGBoost importances.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data containing the feature and target columns.
    features : list of str
        Candidate feature column names; non-numerical ones are dropped.
    target : str
        Name of the target column; must be numerical.

    Returns
    -------
    pandas.DataFrame
        One row per feature, one column per method, with a
        ('linear' | 'nonlinear' | 'monotonic', method) MultiIndex on the
        columns, rows sorted by the XGBoost importances (descending).

    Raises
    ------
    ValueError
        If `target` is not a numerical column of `df`.
    """
    # Model imports hoisted here from mid-function; kept function-local so
    # importing this module stays cheap.
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.inspection import permutation_importance
    from sklearn.decomposition import PCA
    from sklearn.linear_model import Lasso, LinearRegression
    import xgboost as xgb

    # Fix random seed for reproducibility (models below also get explicit random_state)
    seed = 0
    random.seed(seed)
    np.random.seed(seed)

    # Category labels for the column MultiIndex of the result
    lin = 'linear'
    nonlin = 'nonlinear'
    mono = 'monotonic'
    corr_types = {}  # maps method name -> category label

    # %% Preprocess data
    df = df.select_dtypes(include='number')  # Select only numerical columns
    features = [col for col in df.columns if col in features]  # drop non-numerical features, keep column order
    if target not in df.columns:
        raise ValueError(f'Could not find a numerical target "{target}" in dataframe!')

    # Scale numerical data to have mean 0 and variance 1.
    # NOTE(review): the target is scaled too, so linear coefficients below are
    # in standardized units — fine for comparing importances, not for prediction.
    scaler = StandardScaler()
    df[df.columns] = scaler.fit_transform(df)  # single pass instead of fit() + transform()

    # %% ============== Correlations of features with target ==================
    # 'pearson' catches linear correlations, 'spearman' catches monotonic correlations
    correlations = []
    for method in ['pearson', 'spearman']:
        corrs = df.corr(method=method)[target][features]
        correlations.append(corrs.rename(method))
        corr_types[method] = lin if method == 'pearson' else mono
    correlations = pd.concat(correlations, axis=1)

    # %% ============== Non-linear feature importances with Random Forests ==================
    RF_model = RandomForestRegressor(n_estimators=20, random_state=0)
    RF_model.fit(df[features], df[target])
    RF_importances = pd.Series(RF_model.feature_importances_, index=features)
    RF_importances = RF_importances.rename('Random Forest')
    corr_types['Random Forest'] = nonlin

    # %% ============== Non-linear feature importances with permutation importance ==================
    result = permutation_importance(RF_model, df[features], df[target], n_repeats=10,
                                    random_state=0)  # uses the Random Forest model from above
    perm_sorted_idx = result.importances_mean.argsort()
    # Row order here is irrelevant for the final frame: pd.concat aligns on index.
    perm_importances = pd.Series(result.importances_mean[perm_sorted_idx],
                                 index=np.array(features)[perm_sorted_idx].tolist())
    perm_importances = perm_importances.rename('Permutation')
    corr_types['Permutation'] = nonlin

    # %% ============== Linear feature importances with PCA ==================
    # Loadings of the first principal component (direction of maximum variance).
    pca = PCA(n_components=1)
    pca.fit(df[features])
    PCA_importances = pd.Series(pca.components_[0], index=features)
    PCA_importances = PCA_importances.rename('PCA')
    corr_types['PCA'] = lin

    # %% ============== Linear feature importances with Lasso ==================
    # Coefficients normalized so their absolute values sum to 1.
    # NOTE(review): if alpha=0.1 shrinks ALL coefficients to zero this divides
    # by zero and yields NaNs — acceptable for a tutorial, but worth confirming.
    lasso = Lasso(alpha=0.1)
    lasso.fit(df[features], df[target])
    Lasso_importances = pd.Series(lasso.coef_ / np.abs(lasso.coef_).sum(), index=features)
    Lasso_importances = Lasso_importances.rename('Lasso')
    corr_types['Lasso'] = lin

    # %% ============== Linear feature importances with Linear Regression ==================
    linreg = LinearRegression()
    linreg.fit(df[features], df[target])
    linreg_importances = pd.Series(linreg.coef_ / np.abs(linreg.coef_).sum(), index=features)
    linreg_importances = linreg_importances.rename('Lin. Reg.')
    corr_types['Lin. Reg.'] = lin

    # %% ============== Non-linear feature importances with XGBoost ==================
    xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=0)
    xgb_model.fit(df[features], df[target])
    xgb_importances = pd.Series(xgb_model.feature_importances_, index=features)
    xgb_importances = xgb_importances.rename('XGBoost')
    corr_types['XGBoost'] = nonlin

    # %% ============== Combine and save all feature importances ==================
    df_importances = pd.concat(
        [RF_importances, perm_importances, xgb_importances, Lasso_importances, linreg_importances, PCA_importances,
         correlations], axis=1)
    # Sort dataframe columns by linear, non-linear, and monotonic correlations and add this as a multiindex
    df_importances.columns = pd.MultiIndex.from_tuples([(corr_types[col], col) for col in df_importances.columns])
    df_importances = df_importances.sort_index(axis=1, level=0, ascending=False)
    # Sort dataframe by XGBoost feature importances
    df_importances = df_importances.sort_values(by=('nonlinear', 'XGBoost'), ascending=False)

    return df_importances
130+
131+
132+
133+
if __name__ == '__main__':

    output_dir = 'output'

    # %% ============== Load data ==================
    df, features, target = get_data()

    # %% ============== Calculate feature importances ==================
    df_importances = get_df_with_all_correlations_of_features_with_target(df, features, target)

    # %% ============== Print feature importances ==================
    precision = 2
    pd.set_option('display.precision', precision)
    print(df_importances.round(precision))
    print('Other methods to look into for feature importances: SHAP, permutation importance, partial dependence plots, LIME, ELI5, ...')

    # %% ============== Save and print results ==================
    print('Saving output...')
    # Create the output directory if missing instead of raising — consistent
    # with how `plotdir` is created below.
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    df_importances.to_csv(f'{output_dir}/feature_importances.csv')
    print(f'- Feature importances saved to "{output_dir}/feature_importances.csv".')

    # Plot scatter plots of features vs target (one regression plot per feature)
    plotdir = Path(output_dir, 'correlation_plots')
    plotdir.mkdir(parents=True, exist_ok=True)
    for feature in features:
        plt.figure()
        sns.regplot(x=feature, y=target, data=df, scatter_kws={'alpha': 0.5})
        plt.savefig(f'{plotdir}/{feature}_vs_{target}.png', dpi=300, bbox_inches='tight')
        plt.show()
        plt.close()
    print(f'- Correlation plot of each feature & target saved to "{plotdir}".')

    print('Done!')
168+
169+
170+
283 KB
Loading
337 KB
Loading
311 KB
Loading
325 KB
Loading
324 KB
Loading
343 KB
Loading
280 KB
Loading
333 KB
Loading

0 commit comments

Comments
 (0)