-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathICIS_Output_Features.py
More file actions
74 lines (63 loc) · 3.2 KB
/
ICIS_Output_Features.py
File metadata and controls
74 lines (63 loc) · 3.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pandas as pd
%matplotlib inline
%run Pipeline//upload_and_vizualize
%run Pipeline//classify_and_evaluate
%run Pipeline//aux_1
%run Pipeline//ULAB_ML_Pipeline
%run Pipeline//processing
%run Pipeline//cleaning
%run Pipeline//magicloops.py
# Candidate years 2000 through 2015 (range() stops before 2016).
# Original author's note: the last year will be the ultimate test year and is
# not included in model generation.
years = list(range(2000, 2016))

# Build the train/test date structure from those years.
year_list = train_test_dates(years)

# Keys of the models to evaluate. Each key must exist in both `clfs` and
# `small_grid`; 'SGD' is defined in both but is intentionally not listed here.
models_to_run = [
    'RF',
    'LR',
    'ET',
    'AB',
    'GB',
    'NB',
    'DT',
    'SVM',
    'KNN',
]
# Baseline estimator for every model key, each constructed with its base
# parameters. This mapping is handed to run_loops together with small_grid.
clfs = {
    'RF': RandomForestClassifier(n_estimators=50, n_jobs=-1),
    'ET': ExtraTreesClassifier(n_estimators=10, n_jobs=-1, criterion='entropy'),
    'AB': AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME", n_estimators=200),
    # The solver is pinned explicitly: liblinear accepts both the 'l1' and
    # 'l2' penalties listed for 'LR' in small_grid. Relying on the library
    # default breaks on scikit-learn >= 0.22, where the default solver became
    # lbfgs and penalty='l1' is rejected. liblinear was the default before
    # that, so older environments behave exactly as they did.
    'LR': LogisticRegression(penalty='l1', C=1e5, solver='liblinear'),
    # probability=True so the SVM exposes predict_proba like the other models.
    'SVM': svm.SVC(kernel='linear', probability=True, random_state=0),
    'GB': GradientBoostingClassifier(learning_rate=0.05, subsample=0.5, max_depth=6, n_estimators=10),
    'NB': GaussianNB(),
    'DT': DecisionTreeClassifier(),
    'SGD': SGDClassifier(loss="hinge", penalty="l2"),
    'KNN': KNeighborsClassifier(n_neighbors=3),
}
# Hyperparameter search space, keyed by the same model keys as `clfs`.
# Each inner dict maps a parameter name to the list of values to try.
# Original author's rationale: the small grid seemed most appropriate because
# the data ends up with only about 4000 rows.
small_grid = {
    # Random forest
    'RF': {
        'n_estimators': [10, 100],
        'max_depth': [5, 50],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 10],
    },
    # Logistic regression
    'LR': {
        'penalty': ['l1', 'l2'],
        'C': [0.00001, 0.001, 0.1, 1, 10],
    },
    # Stochastic gradient descent
    'SGD': {
        'loss': ['hinge', 'log'],
        'penalty': ['l2', 'l1', 'elasticnet'],
    },
    # Extra trees
    'ET': {
        'n_estimators': [10, 100],
        'criterion': ['gini', 'entropy'],
        'max_depth': [5, 50],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 10],
    },
    # AdaBoost
    'AB': {
        'algorithm': ['SAMME', 'SAMME.R'],
        'n_estimators': [1, 10, 100, 1000, 10000],
    },
    # Gradient boosting
    'GB': {
        'n_estimators': [10, 100],
        'learning_rate': [0.001, 0.1, 0.5],
        'subsample': [0.1, 0.5, 1.0],
        'max_depth': [5, 50],
    },
    # Gaussian naive Bayes: nothing to tune
    'NB': {},
    # Decision tree
    'DT': {
        'criterion': ['gini', 'entropy'],
        'max_depth': [1, 5, 10, 20, 50, 100],
        'max_features': ['sqrt', 'log2'],
        'min_samples_split': [2, 5, 10],
    },
    # Support vector machine (linear kernel only)
    'SVM': {
        'C': [0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10],
        'kernel': ['linear'],
    },
    # k-nearest neighbours
    'KNN': {
        'n_neighbors': [1, 5, 10, 25, 50, 100],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree'],
    },
}
# Run every selected model over the train/test splits in year_list, using the
# base estimators in clfs and the search space in small_grid.
result_df = run_loops(
    year_list,
    models_to_run,
    clfs,
    small_grid,
)

# Save the results dataframe as a CSV file so it can be inspected.
result_df.to_csv('results.csv')
# Final evaluation split: 2016 is the test year, and there is one training
# window per start year from 2000 to 2015, each ending in 2015. Years are kept
# as strings, exactly as in the hand-written version.
year_list = [
    {
        'test': '2016',
        'train': [
            (str(start_year), '2015') for start_year in range(2000, 2016)
        ],
    },
]
# Author's note: Gradient Boosted Decision Trees was the best-performing
# model, so it is rerun on its own with the most complete set of train and
# test years (the year_list defined just above).
result_df = run_loops(
    year_list,
    ['GB'],
    clfs,
    small_grid,
)

# Save the final results dataframe as a CSV file so it can be inspected.
result_df.to_csv('results_final.csv')
##fin!