forked from PPPLDeepLearning/plasma-python
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathextract_best_overtime.py
More file actions
171 lines (140 loc) · 5.56 KB
/
extract_best_overtime.py
File metadata and controls
171 lines (140 loc) · 5.56 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
import matplotlib
# The backend must be selected BEFORE pylab/pyplot is imported, otherwise
# use('Agg') is too late to take effect (needed for headless plotting).
matplotlib.use('Agg')
import matplotlib.pylab as plt

import glob
import multiprocessing
from random import shuffle

import numpy as np
import pandas as pd
from joblib import Parallel, delayed
def arrangeTrialsAtRandom(filenames, scale=1.0):
    """Concatenate per-trial CSV logs into one sequential timeline.

    The trials are visited in a random order; each trial's ``times`` column
    (seconds) is converted to minutes, divided by ``scale``, and shifted so
    that it starts where the previous trial ended — emulating one worker
    running the trials back-to-back.

    Fix: the original shuffled ``filenames`` in place, mutating the
    caller's list; we now shuffle a copy.

    :param filenames: paths of trial CSV logs (each with a ``times`` column).
    :param scale: extra divisor applied to the minute-converted times.
    :return: a single concatenated ``pandas.DataFrame``.
    """
    order = list(filenames)  # copy so the caller's list is left untouched
    shuffle(order)
    dataframes = []
    shift = 0.0  # end time of the previously appended trial, in scaled minutes
    for filename in order:
        current = pd.read_csv(filename)
        current['times'] = current['times'].apply(
            lambda x: x / 60.0 / scale + shift)
        dataframes.append(current)
        shift = max(current['times'].values)
    return pd.concat(dataframes)
def getOneBestValidationAUC(T_of_test, dataset):
    """Return the best ``val_roc`` seen at or before time ``T_of_test``.

    :param T_of_test: time cutoff (same units as the ``times`` column).
    :param dataset: DataFrame with ``times`` and ``val_roc`` columns.
    :return: the maximum validation AUC within the window, or 0.0 if the
        window contains no rows.
    """
    within_window = dataset[dataset.times <= T_of_test]
    candidate_aucs = within_window['val_roc'].values
    if candidate_aucs.size == 0:
        return 0.0
    return max(candidate_aucs)
def _plot_auc_curves(edges, parallel_values, serial_values, outfile,
                     logx=False, xlim=None):
    """Plot distributed vs. sequential best-AUC curves and save to a PNG.

    :param edges: x-axis values (time in minutes).
    :param parallel_values: best-AUC curve for the distributed search.
    :param serial_values: best-AUC curve for the sequential search.
    :param outfile: filename for ``plt.savefig``.
    :param logx: use a logarithmic x axis.
    :param xlim: optional ``[lo, hi]`` x-axis limits.
    """
    plt.figure()
    plt.plot(edges, parallel_values, label="Distributed search")
    plt.plot(edges, serial_values, label="Sequential search")
    plt.legend(loc=(0.6, 0.7))
    plt.xlabel("Time [minutes]", fontsize=20)
    if logx:
        plt.xscale('log')
    if xlim is not None:
        plt.xlim(xlim)
    plt.ylabel('Best validation AUC', fontsize=20)
    plt.savefig(outfile)


def doPlot(parallel_aucs, serial_aucs, times, errors):
    """Save three comparison plots (linear, log-x start, log-x tail).

    Fixes over the original: a dead ``np.histogram`` call, a dead
    ``edges = times`` assignment, and debug prints were removed; the
    triplicated plotting code is factored into ``_plot_auc_curves``.

    :param parallel_aucs: best-AUC-over-time for the distributed search.
    :param serial_aucs: best-AUC-over-time for the sequential search.
    :param times: time grid in minutes (one longer than the plotted curves).
    :param errors: per-time stddev across sequential replicas (currently
        only used by the commented-out error band).
    """
    times = list(times)
    parallel_values = parallel_aucs[1:]
    serial_values = np.array(serial_aucs[1:])
    # errors kept for the (disabled) fill_between error band below.
    errors = np.array(errors[1:])
    edges = np.array(times[:-1])
    # plt.fill_between(edges, serial_values-errors, serial_values+errors)
    _plot_auc_curves(edges, parallel_values, serial_values, "times.png")
    _plot_auc_curves(edges, parallel_values, serial_values,
                     "times_logx_start.png", logx=True, xlim=[0, 100])
    _plot_auc_curves(edges, parallel_values, serial_values,
                     "times_logx.png", logx=True, xlim=[100, 10000])
def getReplica(filenames, times):
    """Build one sequential-search replica: the best AUC reachable by each time.

    Arranges the trials back-to-back in a random order (time scaled by 100)
    and samples the best validation AUC at every cutoff in ``times``.

    :param filenames: trial CSV log paths.
    :param times: iterable of time cutoffs.
    :return: list of best AUCs, one per cutoff (0 when nothing positive yet).
    """
    replica = arrangeTrialsAtRandom(filenames, 100.0)
    best_aucs = (getOneBestValidationAUC(deadline, replica)
                 for deadline in times)
    return [auc if auc > 0 else 0 for auc in best_aucs]
def getTimeReplica(filenames, T):
    """Best validation AUC achievable by time ``T`` across all parallel trials.

    Each trial's log is read independently (seconds converted to minutes)
    and the maximum AUC observed before the cutoff is taken over all trials,
    emulating a fully distributed search where every trial starts at t=0.

    Fix: the original caught ``BaseException``, which also swallows
    ``KeyboardInterrupt``/``SystemExit``; narrowed to ``Exception``.

    :param filenames: trial CSV log paths.
    :param T: time cutoff in minutes.
    :return: best AUC seen before ``T``, or 0 if none.
    """
    current_best = 0
    for filename in filenames:
        try:
            dataset = pd.read_csv(filename)
            # convert the 'times' column from seconds to minutes
            dataset['times'] = dataset['times'].apply(lambda x: x / 60.0)
        except Exception:
            # Missing/empty/malformed log: skip this trial, best-effort.
            print("No data in {}".format(filename))
            continue
        auc = getOneBestValidationAUC(T, dataset)
        if auc > current_best:
            current_best = auc
    return current_best
def getTimeReplicaSerial(serial_auc_replica, T):
    """Best validation AUC reached by time ``T`` in one sequential replica.

    :param serial_auc_replica: DataFrame produced by ``arrangeTrialsAtRandom``.
    :param T: time cutoff (same units as the replica's ``times`` column).
    :return: the best AUC at or before ``T``, or 0 when nothing positive yet.
    """
    auc = getOneBestValidationAUC(T, serial_auc_replica)
    return auc if auc > 0 else 0
if __name__ == '__main__':
    from statistics import mean, stdev

    # One CSV log per hyperparameter trial on the cluster filesystem.
    filenames = glob.glob(
        "/tigress/FRNN/JET_Titan_hyperparameter_run/*/temporal_csv_log.csv")
    # Time grid (minutes) at which the best-so-far AUC is sampled.
    times = np.linspace(0, 310 * 30, 186 * 30)

    num_cores = multiprocessing.cpu_count()
    print("Running on ", num_cores, " CPU cores")

    # Distributed search: every trial starts at t=0; take the best AUC
    # reachable by each cutoff across all trials, one cutoff per worker.
    best_parallel_aucs_over_time = Parallel(n_jobs=num_cores)(
        delayed(getTimeReplica)(filenames, T) for T in times)

    # Sequential-search baseline: average over random trial orderings.
    Nreplicas = 20
    replicas = []
    for _ in range(Nreplicas):
        serial_auc_replica = arrangeTrialsAtRandom(filenames, 100.0)
        best_serial_aucs_over_time = Parallel(n_jobs=num_cores)(
            delayed(getTimeReplicaSerial)(serial_auc_replica, T)
            for T in times)
        replicas.append(best_serial_aucs_over_time)

    # Mean curve and per-cutoff stddev across the sequential replicas.
    best_serial_aucs_over_time = list(map(mean, zip(*replicas)))
    errors = list(map(stdev, zip(*replicas)))
    doPlot(best_parallel_aucs_over_time, best_serial_aucs_over_time, times,
           errors)