'''
Created by Zhaoyu Lou on 12/31/17.
'''
import numpy as np
import itertools
'''
This file supplies the optimizers for the neural network. Each optimizer class
has a verify_params method which checks that all necessary additional hyperparameters
are in the opt_params dictionary with the proper keys, and an update method which
updates the network parameters.
'''
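
'''
Illustrative sketch (an assumption, not part of the original training code): how an
optimizer from this file is expected to be driven. The parameter names ('W1', 'b1'),
shapes, and hyperparameter values below are hypothetical; the network supplies its own
params/grads dictionaries keyed by parameter name, plus the 'epoch'/'iteration' entries
in opt_params where an optimizer needs them.

    params = {'W1': np.random.randn(4, 3), 'b1': np.zeros(3)}
    grads = {'W1': np.random.randn(4, 3), 'b1': np.random.randn(3)}

    opt = Momentum(w=['W1'], b=['b1'])
    opt_params = {'mu': 0.5, 'anneal': 0.9, 'epoch': 0}
    opt.verify_params(opt_params)

    # update() mutates params in place (typically one call per minibatch).
    opt.update(params, learning_rate=1e-3, reg_strength=1e-4,
               grads=grads, opt_params=opt_params)
'''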

# Stochastic gradient descent: just follows the gradient down.
class SGD:

    # Just some bookkeeping for readability later on.
    def __init__(self, w, b):
        self.w = w
        self.b = b

    # No additional hyperparameters are needed for stochastic gradient descent.
    def verify_params(self, opt_params):
        return

    # Updates by simply taking a step opposite the gradient and adding L2 regularization.
    def update(self, params, learning_rate, reg_strength, grads, opt_params):
        for weight in self.w:
            params[weight] -= learning_rate * (grads[weight] + 2 * reg_strength * params[weight])
        for bias in self.b:
            params[bias] -= learning_rate * grads[bias]
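
'''
Worked one-parameter example (illustrative, hypothetical values): the SGD step above
implements w <- w - learning_rate * (dw + 2 * reg_strength * w).

    params = {'w': np.array(1.0)}
    grads = {'w': np.array(0.5)}
    SGD(w=['w'], b=[]).update(params, learning_rate=0.1, reg_strength=0.01,
                              grads=grads, opt_params={})
    # params['w'] is now 1.0 - 0.1 * (0.5 + 2 * 0.01 * 1.0) = 0.948
'''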

'''
Gradient Descent with Momentum: Treats parameters as an actual position on the 'landscape'
defined by the loss function. The gradient is then proportional to the acceleration,
and is integrated to get changes in the 'velocity' and the parameters themselves.
'''
class Momentum:
    # Store the velocity, since it will be updated later
    velocities = {}

    # Initializes the velocity dictionary so that all the required keys are present.
    # Also sets a few variables for readability purposes.
    def __init__(self, w, b):
        self.w = w
        self.b = b
        for param in itertools.chain(w, b):
            self.velocities[param] = 0

    # Momentum optimization requires 2 additional hyperparameters: the initial
    # momentum (mu) and the rate at which the momentum is annealed towards 1 (anneal).
    def verify_params(self, opt_params):
        assert ('mu' in opt_params), 'Initial momentum must be specified with key \'mu\''
        assert ('anneal' in opt_params), 'Momentum annealing rate must be specified with key \'anneal\''
        return

    # Updates parameters by first integrating the acceleration (the gradient) to update the velocity,
    # then integrating the velocity to update the position (parameters). The parameters are updated in place.
    def update(self, params, learning_rate, reg_strength, grads, opt_params):
        # Anneal the momentum based on the epoch
        momentum = 1 - ((1 - opt_params['mu']) * (opt_params['anneal'] ** opt_params['epoch']))
        for weight in self.w:
            # Integrate the gradient (with L2 regularization) to get the change
            # in velocity, then update the velocity
            gradient_update = learning_rate * (grads[weight] + 2 * reg_strength * params[weight])
            self.velocities[weight] = momentum * self.velocities[weight] - gradient_update
            # Integrate the velocity to update the parameters
            params[weight] += self.velocities[weight]
        for bias in self.b:
            # Integrate gradient to get change in velocity, then update velocity
            gradient_update = learning_rate * grads[bias]
            self.velocities[bias] = momentum * self.velocities[bias] - gradient_update
            # Integrate velocity to update parameters
            params[bias] += self.velocities[bias]
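
'''
Illustrative look at the annealing schedule used above (hypothetical values): with
mu = 0.5 and anneal = 0.9, the effective momentum 1 - (1 - mu) * anneal ** epoch
rises towards 1 as training progresses.

    mu, anneal = 0.5, 0.9
    for epoch in (0, 10, 50):
        print(epoch, 1 - (1 - mu) * anneal ** epoch)
    # epoch 0  -> 0.5
    # epoch 10 -> ~0.826
    # epoch 50 -> ~0.997
'''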

'''
Nesterov Accelerated Gradient: NAG improves on standard momentum optimization by noting
that we know a priori part of where the parameters will end up on the next step: the
update will include the vector (momentum * velocity), and both of those quantities are
already known. We can therefore 'look ahead' at where we are going and evaluate the
gradient there instead of at our current position, giving a better estimate of the true
acceleration we would feel in the real world. Note that in NAG, the parameters we store
are the 'look-ahead' parameters, not the true parameters.
Among non-adaptive (and even many adaptive) approaches, NAG is one of the strongest
performers, and it is often suggested alongside the Adam optimizer as a good default choice.
'''
class NAG:
    # Store the velocity, since it will be updated later
    velocities = {}

    # Initializes the velocity dictionary so that all the required keys are present.
    # Also sets a few variables for readability purposes.
    def __init__(self, w, b):
        self.w = w
        self.b = b
        for param in itertools.chain(w, b):
            self.velocities[param] = 0

    # Momentum optimization requires 2 additional hyperparameters: the initial
    # momentum (mu) and the rate at which the momentum is annealed towards 1 (anneal).
    def verify_params(self, opt_params):
        assert ('mu' in opt_params), 'Initial momentum must be specified with key \'mu\''
        assert ('anneal' in opt_params), 'Momentum annealing rate must be specified with key \'anneal\''

    # Updates parameters by integrating the acceleration (gradient) to update the velocity,
    # correcting the 'look-ahead' parameters back to the true parameters, and then making
    # both the new parameter update and the new look-ahead update using the new velocity.
    def update(self, params, learning_rate, reg_strength, grads, opt_params):
        # Anneal the momentum based on the epoch
        momentum = 1 - ((1 - opt_params['mu']) * (opt_params['anneal'] ** opt_params['epoch']))
        for weight in self.w:
            # Store the current velocity before updating to the future velocity
            curr_velocity = self.velocities[weight]
            # Integrate the gradient (with L2 regularization) to get the change
            # in velocity, then update the velocity
            gradient_update = learning_rate * (grads[weight] + 2 * reg_strength * params[weight])
            self.velocities[weight] = momentum * self.velocities[weight] - gradient_update
            # Subtract off the current velocity update to get the actual position,
            # then update parameters with the new velocity, and finally do the
            # look-ahead step forwards
            params[weight] += (1 + momentum) * self.velocities[weight] - momentum * curr_velocity
        for bias in self.b:
            # Store the current velocity before updating to the future velocity
            curr_velocity = self.velocities[bias]
            # Integrate gradient to get change in velocity, then update velocity
            gradient_update = learning_rate * grads[bias]
            self.velocities[bias] = momentum * self.velocities[bias] - gradient_update
            # Integrate velocity to update parameters
            params[bias] += (1 + momentum) * self.velocities[bias] - momentum * curr_velocity

    # Corrects all the 'look-ahead' parameters back to their true values or the true
    # values back to the look-ahead parameters, depending on the direction argument.
    # (1 changes to true values, -1 changes to look-ahead values)
    def correct(self, params, opt_params, direction):
        for param in itertools.chain(self.w, self.b):
            params[param] -= direction * opt_params['mu'] * self.velocities[param]
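
'''
Illustrative sketch (an assumption about the calling code, not taken from this file):
because NAG stores the look-ahead parameters, anything that needs the true parameter
values - e.g. evaluating the loss - would presumably bracket that work with correct():

    opt = NAG(w=['W1'], b=['b1'])          # hypothetical parameter keys
    opt.correct(params, opt_params, 1)     # look-ahead -> true parameters
    loss = evaluate(params)                # hypothetical evaluation function
    opt.correct(params, opt_params, -1)    # true parameters -> look-ahead
'''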
######## Optimizers below here are adaptive per-parameter methods. ########

'''
Adaptive Gradient Descent: Dynamically adapts the learning rate of each parameter by
normalizing by the square root of the sum of squared previous gradients. This means that
parameters that have seen large gradients end up with lower learning rates, while
parameters with small gradients end up with higher learning rates.
'''
class Adagrad:
    # Store the sum, since it's updated as training progresses
    histories = {}

    # Sets a few variables for readability purposes and initializes the histories
    # so that all the necessary keys are present.
    def __init__(self, w, b):
        self.w = w
        self.b = b
        for param in itertools.chain(w, b):
            self.histories[param] = 0

    # No additional hyperparameters are needed for adagrad.
    def verify_params(self, opt_params):
        return

    # Updates parameters by adding the square of the current gradient to the history,
    # then stepping down the gradient with the normalized learning rate.
    def update(self, params, learning_rate, reg_strength, grads, opt_params):
        for weight in self.w:
            # Regularize the gradient with L2 regularization
            reg_gradient = grads[weight] + 2 * reg_strength * params[weight]
            # Update the history with the squared regularized gradient
            self.histories[weight] += np.square(reg_gradient)
            params[weight] -= learning_rate * reg_gradient / (np.sqrt(self.histories[weight]) + 1e-8)
        for bias in self.b:
            # Update the history with the squared gradient
            self.histories[bias] += np.square(grads[bias])
            # Update the parameters using the normalized learning rate
            params[bias] -= learning_rate * grads[bias] / (np.sqrt(self.histories[bias]) + 1e-8)
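
'''
Illustrative numeric check (hypothetical values): because the history above only grows,
the effective step size for a parameter shrinks over time. For a constant gradient of
1.0 and learning_rate = 0.1, successive steps are roughly 0.1 / sqrt(1) = 0.1,
then 0.1 / sqrt(2) ~ 0.071, then 0.1 / sqrt(3) ~ 0.058, and so on.
'''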

'''
Root Mean Square Propagation: A problem that comes up with Adagrad is that the normalization
factors monotonically increase, causing the learning rates to monotonically decrease. This
aggressive decay often stops learning too early. RMSprop seeks to rectify this by using a
moving average of previous squared gradients instead of the sum. Indeed, with this approach
the normalization factor is an estimate of the second moment, or the uncentered variance.
If the gradient has high variance then intuitively we are unsure of which direction to move
in, so we should take a small step, and vice versa for low variance, which is realized by
the choice of normalization.
'''
class RMSprop:
    # Store the estimated variance, since it's updated as training progresses
    variance = {}

    # Sets a few variables for readability purposes and initializes the histories so that
    # the necessary keys are present.
    def __init__(self, w, b):
        self.w = w
        self.b = b
        for param in itertools.chain(w, b):
            self.variance[param] = 0

    # RMSprop requires one additional hyperparameter, the historical decay rate ('decay').
    def verify_params(self, opt_params):
        assert ('decay' in opt_params), 'Average decay rate must be specified with key \'decay\''
        return

    # Updates parameters by first taking a convex combination of the history and the
    # new squared gradient, then updating the parameters with the normalized learning rate.
    def update(self, params, learning_rate, reg_strength, grads, opt_params):
        decay = opt_params['decay']
        for weight in self.w:
            # Regularize the gradient with L2 regularization
            reg_gradient = grads[weight] + 2 * reg_strength * params[weight]
            # Update the variance estimate by taking a convex combination of the previous history
            # and the new squared gradient
            self.variance[weight] = decay * self.variance[weight] + (1 - decay) * np.square(reg_gradient)
            # Update parameters with normalized learning rate
            params[weight] -= learning_rate * reg_gradient / (np.sqrt(self.variance[weight]) + 1e-8)
        for bias in self.b:
            # Take convex combination of previous history and new squared gradient
            self.variance[bias] = decay * self.variance[bias] + (1 - decay) * np.square(grads[bias])
            # Update parameters with normalized learning rate
            params[bias] -= learning_rate * grads[bias] / (np.sqrt(self.variance[bias]) + 1e-8)
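
'''
Illustrative contrast with Adagrad (hypothetical values): with decay = 0.9 and a constant
squared gradient of 1.0, the moving average above converges to 1.0 instead of growing
without bound, so the effective learning rate levels off rather than decaying to zero.

    v = 0.0
    for _ in range(50):
        v = 0.9 * v + 0.1 * 1.0
    # v ~ 0.995 after 50 steps, and stays ~1.0 thereafter
'''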

'''
Adadelta: The adadelta algorithm makes a further refinement to the RMSprop algorithm by
scaling the regularized gradients by a moving average of previous updates in the update
rule. Intuitively, if the gradient has pointed in a certain direction for a long time, the
parameters should probably keep moving in that direction. This scaling prevents sudden
gradient changes (from saddle points or chance variations in the loss) from causing the
algorithm to drastically change direction; the parameters have 'momentum', so to speak.
'''
class Adadelta:
    # Store the variance estimate and the average update for later updates
    variance = {}
    deltas = {}

    # Sets a few variables for readability purposes and initializes the estimates so that
    # the necessary keys are present.
    def __init__(self, w, b):
        self.w = w
        self.b = b
        for param in itertools.chain(w, b):
            self.variance[param] = 0
            self.deltas[param] = 0

    # Adadelta requires one additional hyperparameter, the decay rate for the average estimators.
    def verify_params(self, opt_params):
        assert ('decay' in opt_params), 'Average decay rates must be specified with key \'decay\''
        return

    # Updates the parameters by updating the variance estimate and the average of updates,
    # then performing the update by scaling by the average update divided by the estimated
    # variance.
    def update(self, params, learning_rate, reg_strength, grads, opt_params):
        decay = opt_params['decay']
        for weight in self.w:
            # Regularize the gradient with L2 regularization
            reg_gradient = grads[weight] + 2 * reg_strength * params[weight]
            # Update the variance estimate by taking a convex combination of the previous history
            # and the new squared gradient
            self.variance[weight] = decay * self.variance[weight] + (1 - decay) * np.square(reg_gradient)
            # Compute the update using the normalized learning rate, and add it into the moving
            # average of deltas
            delta = learning_rate * reg_gradient / (np.sqrt(self.variance[weight]) + 1e-8)
            self.deltas[weight] = decay * self.deltas[weight] + (1 - decay) * np.square(delta)
            # Update the parameters, scaling the gradient by the average recent update divided
            # by the estimated variance
            params[weight] -= np.sqrt(self.deltas[weight] / (self.variance[weight] + 1e-16)) * reg_gradient
        for bias in self.b:
            # Take convex combination of previous history and new squared gradient
            self.variance[bias] = decay * self.variance[bias] + (1 - decay) * np.square(grads[bias])
            # Compute update and add to moving average
            delta = learning_rate * grads[bias] / (np.sqrt(self.variance[bias]) + 1e-8)
            self.deltas[bias] = decay * self.deltas[bias] + (1 - decay) * np.square(delta)
            # Update parameters with proper scaling
            params[bias] -= np.sqrt(self.deltas[bias] / (self.variance[bias] + 1e-16)) * grads[bias]
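
'''
Illustrative note on the scaling above (hypothetical values): the factor
sqrt(deltas / variance) makes the step track the ratio of recent update magnitudes to
recent gradient magnitudes. For example, if the moving average of squared updates is
1e-4 and the moving average of squared gradients is 1e-2, the regularized gradient is
scaled by sqrt(1e-4 / 1e-2) = 0.1.
'''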

'''
Adaptive Moment Estimation: The Adam optimizer makes two significant changes to the RMSprop
algorithm. First, the gradient in the update rule is replaced by a moving average of previous
gradients. This is to reduce noise; unless the batch size is close to the entire dataset, the
minibatch gradient does not in general match the true gradient. By taking an average of gradients,
we get a better estimate of the true gradient (and also avoid the sudden-change problem discussed
for Adadelta above). The second change is that both the estimate of the first moment (the mean of
the gradient) and the second moment (the uncentered variance of the gradient) are divided by an
additional term, called the 'bias correction' term. This is because the zero initialization of the
estimators makes them biased; these correction terms fix that problem.
Generally, the Adam optimizer is regarded as one of the strongest optimizers and is a good
default choice, along with NAG.
'''
class Adam:
    # Store the biased estimates for the first and second moments so they can be continuously updated
    mean = {}
    variance = {}

    # Sets a few variables for readability purposes and initializes the estimates so that
    # the necessary keys are present.
    def __init__(self, w, b):
        self.w = w
        self.b = b
        for param in itertools.chain(w, b):
            self.mean[param] = 0
            self.variance[param] = 0

    # The adam optimizer requires two additional hyperparameters: the decay rate for the moving average
    # in the mean estimator, and the decay rate for the variance estimator.
    def verify_params(self, opt_params):
        assert ('decay1' in opt_params), 'First order moment decay must be specified with key \'decay1\''
        assert ('decay2' in opt_params), 'Second order moment decay must be specified with key \'decay2\''
        return

    # Updates parameters by first updating the biased estimates for the moments, then adding
    # bias correction terms, then performing the update with the mean as the gradient
    # and normalizing with the variance
    def update(self, params, learning_rate, reg_strength, grads, opt_params):
        b1 = opt_params['decay1']
        b2 = opt_params['decay2']
        t = opt_params['iteration']
        for weight in self.w:
            # Regularize the gradient with L2 regularization
            reg_gradient = grads[weight] + 2 * reg_strength * params[weight]
            # Update the biased mean estimate, then calculate the unbiased estimate
            self.mean[weight] = b1 * self.mean[weight] + (1 - b1) * reg_gradient
            m_hat = self.mean[weight] / (1 - (b1 ** t))
            # Update the biased variance estimate, then calculate the unbiased estimate
            self.variance[weight] = b2 * self.variance[weight] + (1 - b2) * np.square(reg_gradient)
            v_hat = self.variance[weight] / (1 - (b2 ** t))
            # Update parameters using the mean as the gradient and normalizing
            # by the variance
            params[weight] -= learning_rate * m_hat / (np.sqrt(v_hat) + 1e-8)
        for bias in self.b:
            # Update the biased mean estimate, then calculate the unbiased estimate
            self.mean[bias] = b1 * self.mean[bias] + (1 - b1) * grads[bias]
            m_hat = self.mean[bias] / (1 - (b1 ** t))
            # Update the biased variance estimate, then calculate the unbiased estimate
            self.variance[bias] = b2 * self.variance[bias] + (1 - b2) * np.square(grads[bias])
            v_hat = self.variance[bias] / (1 - (b2 ** t))
            # Update parameters using the mean as the gradient and normalizing
            # by the variance
            params[bias] -= learning_rate * m_hat / (np.sqrt(v_hat) + 1e-8)
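

# Illustrative smoke test (an addition, not part of the original pipeline): runs a few
# Adam steps on a made-up problem to show the expected calling convention, including the
# 'iteration' counter that drives the bias correction above. All names and values here
# are hypothetical.
if __name__ == '__main__':
    params = {'W1': np.ones((2, 2)), 'b1': np.zeros(2)}
    opt = Adam(w=['W1'], b=['b1'])
    opt_params = {'decay1': 0.9, 'decay2': 0.999}
    opt.verify_params(opt_params)
    for t in range(1, 6):
        # Pretend the loss is 0.5 * ||param||^2, so the gradient is the parameter itself.
        grads = {'W1': params['W1'].copy(), 'b1': params['b1'].copy()}
        opt_params['iteration'] = t
        opt.update(params, learning_rate=0.1, reg_strength=0.0,
                   grads=grads, opt_params=opt_params)
    # Entries of W1 should have moved from 1.0 towards 0.0.
    print(params['W1'])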