-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathneuralNet.py
More file actions
274 lines (222 loc) · 11.9 KB
/
neuralNet.py
File metadata and controls
274 lines (222 loc) · 11.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
'''
Created by Zhaoyu Lou on 12/20/17.
'''
import matplotlib.pyplot as plt
import numpy as np
import activations
import loss_functions
import optimizers
# This class provides a python implementation of a fully connected neural network.
class NeuralNet:
    '''
    A fully connected neural network classifier.

    The chosen activation is applied after every hidden layer; the output layer
    produces raw logits which are handed to the chosen loss. All weights and
    biases live in a plain dictionary keyed 'W1', 'b1', 'W2', 'b2', ... that is
    created by init_params() and returned by train().
    '''

    # Each table maps a user-facing option name to the attribute that is looked
    # up on the matching helper module (activations / loss_functions /
    # optimizers). The lookup is done lazily with getattr so that only the
    # selected class is ever touched.
    _ACTIVATIONS = {'relu': 'ReLU', 'sigmoid': 'Sigmoid', 'tanh': 'Tanh'}
    _LOSSES = {'cross entropy': 'CrossEntropy', 'hinge': 'Hinge'}
    _OPTIMIZERS = {
        'sgd': 'SGD',
        'momentum': 'Momentum',
        'nesterov': 'NAG',
        'adagrad': 'Adagrad',
        'rmsprop': 'RMSprop',
        'adadelta': 'Adadelta',
        'adam': 'Adam',
    }

    def __init__(self, num_hidden_layers, layer_sizes, input_dimension, output_dimension,
                 activation='relu', loss='cross entropy', optimizer='sgd'):
        '''
        Configure the architecture and select the activation, loss and optimizer.

        num_hidden_layers: number of hidden layers.
        layer_sizes: list with the number of neurons in each hidden layer; its
            length must equal num_hidden_layers.
        input_dimension: dimension of the input data (number of features).
        output_dimension: number of classes to classify into.
        activation: 'relu', 'sigmoid' or 'tanh'.
        loss: 'cross entropy' or 'hinge'.
        optimizer: 'sgd', 'momentum', 'nesterov', 'adagrad', 'rmsprop',
            'adadelta' or 'adam'.

        Raises AssertionError if an option is not recognised or if the number
        of layer sizes does not match num_hidden_layers.
        '''
        # Some basic error checking
        assert (activation in self._ACTIVATIONS), 'Activation must be \'relu\', \'sigmoid\' or \'tanh\'.'
        assert (loss in self._LOSSES), 'Loss must be either \'cross entropy\' or \'hinge\'.'
        assert (len(layer_sizes) == num_hidden_layers), 'Too many or too few layer sizes given.'
        assert (optimizer in self._OPTIMIZERS), \
            'Optimizer must be \'sgd\', \'momentum\', \'nesterov\', \'adagrad\', \'rmsprop\', ' \
            '\'adadelta\', or \'adam\'.'

        # Set hyperparameters
        self.num_layers = num_hidden_layers
        self.layer_sizes = layer_sizes
        self.input_dimension = input_dimension
        self.num_classes = output_dimension

        # Parameter names, one pair per weight matrix / bias vector:
        # num_layers hidden layers plus the output layer.
        self.w = ['W' + str(index) for index in range(1, self.num_layers + 2)]
        self.b = ['b' + str(index) for index in range(1, self.num_layers + 2)]

        self.activation = getattr(activations, self._ACTIVATIONS[activation])()
        self.loss = getattr(loss_functions, self._LOSSES[loss])()

        # Keep the option name as well as the optimizer object: the object
        # itself can never compare equal to a string such as 'nesterov'.
        self.optimizer_name = optimizer
        self.optimizer = getattr(optimizers, self._OPTIMIZERS[optimizer])(self.w, self.b)

    def init_params(self):
        '''
        Create the weights and biases of every layer.

        Every entry is drawn from a normal distribution with standard deviation
        0.01. Weight matrices have shape (layer outputs, layer inputs) and
        biases have shape (layer outputs, 1).

        Returns a dictionary holding all weights and biases.
        '''
        # Widths of every layer boundary: input features, hidden sizes, classes.
        widths = [self.input_dimension] + list(self.layer_sizes) + [self.num_classes]
        params = {}
        for layer in range(self.num_layers + 1):
            fan_in, fan_out = widths[layer], widths[layer + 1]
            params[self.w[layer]] = np.random.normal(scale=0.01, size=(fan_out, fan_in))
            params[self.b[layer]] = np.random.normal(scale=0.01, size=(fan_out, 1))
        return params

    def input_data(self, trainData, trainLabels, devData, devLabels):
        '''
        Store the training and dev (validation) datasets.

        The arguments follow the convention of one example per row. They are
        transposed here because the rest of the class works with one example
        per column. Labels are additionally cast to int.
        '''
        self.trainData = trainData.T
        self.trainLabels = (trainLabels.T).astype(int)
        self.devData = devData.T
        self.devLabels = (devLabels.T).astype(int)

    def forward_prop(self, data, labels, params):
        '''
        Run the forward pass and cache the layer outputs for backprop.

        data and labels hold one example per column. Returns a tuple
        (cache, output, cost): cache maps '1', '2', ... to the activated output
        of each hidden layer, and output / cost are whatever the loss object
        computes from the logits and the labels.
        '''
        cache = {}
        layer_input = data
        for layer in range(self.num_layers):
            z = np.matmul(params[self.w[layer]], layer_input) + params[self.b[layer]]
            layer_input = self.activation.activate(z)
            cache[str(layer + 1)] = layer_input

        # Output layer; no activation function here
        last = self.num_layers
        logits = np.matmul(params[self.w[last]], layer_input) + params[self.b[last]]

        # Calculate the loss from logits
        cost, output = self.loss.calculate_loss(logits, labels)
        return cache, output, cost

    def backward_prop(self, data, cache, labels, params):
        '''
        Run the backward pass.

        Expects the network output of the matching forward pass under
        params['o'] (train() stores it there). Returns a dictionary with the
        gradient of every weight matrix and bias vector, keyed like params.
        '''
        output = params['o']

        # Deltas are gradients with respect to layer outputs; they are
        # propagated from the output layer back towards the input.
        deltas = {}
        deltas[str(self.num_layers + 1)] = self.loss.grad_loss(labels, output)
        for layer in range(self.num_layers, 0, -1):
            gradient = self.activation.gradient(cache[str(layer)])
            deltas[str(layer)] = np.matmul(params[self.w[layer]].T, deltas[str(layer + 1)]) * gradient

        # From the deltas, the parameter gradients follow directly: each weight
        # gradient pairs a layer's delta with the input that layer received.
        grads = {}
        for layer in range(self.num_layers + 1):
            delta = deltas[str(layer + 1)]
            layer_input = data if layer == 0 else cache[str(layer)]
            grads[self.w[layer]] = np.matmul(delta, layer_input.T)
            grads[self.b[layer]] = np.sum(delta, axis=1, keepdims=True)
        return grads

    def train(self, epochs, batch_size, learning_rate, opt_params, reg_strength=0, decay_rate=1, verbose=False):
        '''
        Train the neural network and return the trained parameters.

        Training is done in minibatches with the optimizer chosen at
        construction time. Minibatches are taken sequentially, so the training
        data must be randomized prior to training. The learning rate is
        multiplied by decay_rate once per epoch. The dev set is only used for
        reporting and has no effect on the training.

        opt_params is the optimizer's parameter dictionary; it is updated in
        place with the current 'epoch' and 'iteration'. If verbose is true,
        progress is reported every epoch and a plot of the performance history
        is shown at the end.

        Raises ValueError if batch_size is smaller than 1 or larger than the
        number of training examples, since no complete minibatch could be
        formed and no training would take place.
        '''
        m = self.trainData.shape[1]
        if batch_size < 1 or batch_size > m:
            raise ValueError('batch_size must be between 1 and the number of training examples (%d), got %r.'
                             % (m, batch_size))
        steps_per_epoch = m // batch_size
        iterations = steps_per_epoch * epochs

        training_losses, dev_losses = [], []
        training_accs, dev_accs = [], []
        epoch = 0
        opt_params['epoch'] = 0

        # Initialize architecture
        params = self.init_params()
        # Make sure the optimization parameters are valid and present
        self.optimizer.verify_params(opt_params)

        for it in range(iterations):
            # Select minibatches sequentially.
            start = (it * batch_size) % m
            end = start + batch_size
            data = self.trainData[:, start:end]
            labels = self.trainLabels[:, start:end]

            # Forward and backward passes; the output is stashed in params['o']
            # because backward_prop reads it from there.
            cache, params['o'], _ = self.forward_prop(data, labels, params)
            grads = self.backward_prop(data, cache, labels, params)

            # Pass current iteration to optimizer in case some parameters need to be corrected
            opt_params['iteration'] = it + 1
            # Update parameters with both gradients and regularization.
            self.optimizer.update(params, learning_rate, reg_strength, grads, opt_params)

            # This fires on the first iteration of every epoch (including
            # it == 0), so the first decay and the first progress report happen
            # right after the very first update.
            if it % steps_per_epoch == 0:
                epoch += 1
                opt_params['epoch'] = epoch
                learning_rate *= decay_rate
                if verbose:
                    # Evaluate model on both the entire training set and the entire dev set.
                    self.report_progress(params, epoch, opt_params,
                                         training_losses, dev_losses, training_accs, dev_accs)

        # If verbose, plot model performance history.
        if verbose:
            self.plot(training_accs, dev_accs, training_losses, dev_losses, epochs)
            plt.show()
        return params

    def report_progress(self, params, epoch, opt_params, training_losses, dev_losses, training_accs, dev_accs):
        '''
        Evaluate the model on the training and dev sets, print the accuracies
        and append the losses and accuracies to the given history lists.
        '''
        # In the case of NAG, the parameters stored are the 'look-ahead'
        # parameters, so they must be corrected to get the true parameters.
        uses_lookahead = self.optimizer_name == 'nesterov'
        if uses_lookahead:
            self.optimizer.correct(params, opt_params, 1)
        try:
            # One forward pass per dataset yields both the loss and the output
            # needed for the accuracy.
            _, train_output, train_loss = self.forward_prop(self.trainData, self.trainLabels, params)
            _, dev_output, val_loss = self.forward_prop(self.devData, self.devLabels, params)
            train_accuracy = self.compute_accuracy(train_output, self.trainLabels)
            dev_accuracy = self.compute_accuracy(dev_output, self.devLabels)
            print('Epoch %d, Train Accuracy %f, Validation Accuracy %f' % (epoch, train_accuracy, dev_accuracy))
        finally:
            # Set the parameters back if we changed them, even when the
            # evaluation failed, so training never continues on corrected values.
            if uses_lookahead:
                self.optimizer.correct(params, opt_params, -1)

        # Store a history of losses and accuracies to plot at the end.
        training_losses.append(train_loss)
        dev_losses.append(val_loss)
        training_accs.append(train_accuracy)
        dev_accs.append(dev_accuracy)

    def test(self, data, labels, params):
        '''
        Run the network on a dataset (one example per column) and return the
        accuracy.
        '''
        _, output, _ = self.forward_prop(data, labels, params)
        return self.compute_accuracy(output, labels)

    def compute_accuracy(self, output, labels):
        '''
        Return the fraction of columns whose largest output entry is at the
        same row as the largest label entry.
        '''
        predicted = np.argmax(output, axis=0)
        expected = np.argmax(labels, axis=0)
        return (predicted == expected).sum() * 1. / labels.shape[1]

    def plot(self, training_accs, dev_accs, training_losses, dev_losses, epochs):
        '''
        Plot the accuracy and loss histories, one figure each.

        The first recorded entry of every history is skipped, so the x axis
        runs from 1 to epochs - 1. The accuracy plot is drawn on the current
        figure and the loss plot on a new one; showing them is left to the
        caller.
        '''
        self._plot_history(training_accs, dev_accs, epochs, 'Accuracy', 'Neural Network Accuracy',
                           ('Training Accuracy', 'Validation Accuracy'))
        plt.figure()
        self._plot_history(training_losses, dev_losses, epochs, 'Loss', 'Neural Network Loss',
                           ('Training Loss', 'Validation Loss'))

    def _plot_history(self, training_values, dev_values, epochs, ylabel, title, legend):
        '''Draw one training/validation curve pair on the current figure.'''
        x_values = range(1, epochs)
        plt.plot(x_values, training_values[1:])
        plt.plot(x_values, dev_values[1:])
        plt.xlabel('# of Epochs')
        plt.ylabel(ylabel)
        plt.title(title)
        plt.legend(legend)