import numpy as np
np.random.seed(0)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def dsigmoid(y):
    # derivative of the sigmoid, written in terms of its output y = sigmoid(x)
    return y * (1 - y)

def tanh(x):
    return np.tanh(x)

def dtanh(y):
    # derivative of tanh, written in terms of its output y = tanh(x)
    return 1 - y * y

def stable_softmax(X):
    # subtract the max before exponentiating to avoid overflow in np.exp
    exps = np.exp(X - np.max(X))
    return exps / np.sum(exps)
def initialise_parameters(P, num_cells, features):
    # input weights, shape (num_cells, features); gate weights are centred at 0.5
    P['Wi'], P['Wf'], P['Wo'] = (np.random.randn(3, num_cells, features) * 0.1) + 0.5
    P['Wz'] = np.random.randn(num_cells, features) * 0.1
    # recurrent weights, shape (num_cells, num_cells)
    P['Ri'], P['Rf'], P['Ro'] = (np.random.randn(3, num_cells, num_cells) * 0.1) + 0.5
    P['Rz'] = np.random.randn(num_cells, num_cells) * 0.1
    # gate biases, shape (num_cells, 1)
    P['bz'], P['bi'], P['bf'], P['bo'] = np.random.randn(4, num_cells, 1) * 0.1
    # dense (output) layer weights, shape (features, num_cells)
    P['Wv'] = np.random.rand(features, num_cells) * 0.1
    # dense (output) layer bias, shape (features, 1)
    P['bv'] = np.random.rand(features, 1) * 0.1
    return P
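# A small sanity check, not part of the original script: a minimal sketch that
# asserts the shapes produced by initialise_parameters. The helper name
# check_parameter_shapes is hypothetical, introduced here for illustration.
def check_parameter_shapes(P, num_cells, features):
    for key in ('Wz', 'Wi', 'Wf', 'Wo'):
        assert P[key].shape == (num_cells, features)
    for key in ('Rz', 'Ri', 'Rf', 'Ro'):
        assert P[key].shape == (num_cells, num_cells)
    for key in ('bz', 'bi', 'bf', 'bo'):
        assert P[key].shape == (num_cells, 1)
    assert P['Wv'].shape == (features, num_cells)
    assert P['bv'].shape == (features, 1)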
def initialise_derivatives(num_cells, features):
    dP = {}
    # input weight gradients, shape (num_cells, features)
    dP['Wz'], dP['Wi'], dP['Wf'], dP['Wo'] = np.zeros((4, num_cells, features))
    # recurrent weight gradients, shape (num_cells, num_cells)
    dP['Rz'], dP['Ri'], dP['Rf'], dP['Ro'] = np.zeros((4, num_cells, num_cells))
    # gate bias gradients, shape (num_cells, 1)
    dP['bz'], dP['bi'], dP['bf'], dP['bo'] = np.zeros((4, num_cells, 1))
    # dense layer weight gradients, shape (features, num_cells)
    dP['Wv'] = np.zeros((features, num_cells))
    # dense layer bias gradients, shape (features, 1)
    dP['bv'] = np.zeros((features, 1))
    return dP
# FOR USE WITH ADAM OPTIMIZER
def initialise_momentum(P):
    # first- and second-moment accumulators, one per parameter
    # (the original took dP as its argument but iterated over the global P)
    M1, M2 = {}, {}
    for key in P:
        M1[key] = np.zeros_like(P[key])
        M2[key] = np.zeros_like(P[key])
    return M1, M2
def forward_pass_through_LSTM(x, state, P):
    h_prev, c_prev = state
    # forget gate
    f = sigmoid(np.dot(P['Wf'], x) + np.dot(P['Rf'], h_prev) + P['bf'])
    # input gate
    i = sigmoid(np.dot(P['Wi'], x) + np.dot(P['Ri'], h_prev) + P['bi'])
    # z neuron
    z = tanh(np.dot(P['Wz'], x) + np.dot(P['Rz'], h_prev) + P['bz'])
    # cell state
    c = f * c_prev + i * z
    # output gate
    o = sigmoid(np.dot(P['Wo'], x) + np.dot(P['Ro'], h_prev) + P['bo'])
    # hidden state
    h = o * tanh(c)
    cache = f, i, z, o
    state = h, c
    return state, cache
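# For reference, the equations computed above, with x the current input and
# h_prev, c_prev the previous hidden and cell states:
#   f = sigmoid(Wf @ x + Rf @ h_prev + bf)   forget gate
#   i = sigmoid(Wi @ x + Ri @ h_prev + bi)   input gate
#   z = tanh(Wz @ x + Rz @ h_prev + bz)      candidate cell update
#   c = f * c_prev + i * z                   new cell state
#   o = sigmoid(Wo @ x + Ro @ h_prev + bo)   output gate
#   h = o * tanh(c)                          new hidden state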
def forward_pass_through_dense(x, P):
    # Dense layer with softmax activation
    v = np.dot(P['Wv'], x) + P['bv']
    y = stable_softmax(v)
    return y
def backprop_through_dense(x, y, target, dP):
    # compute the error: for a one-hot target, the gradient of softmax plus
    # cross-entropy with respect to the logits is simply y - target
    dv = np.copy(y)
    dv[np.argmax(target)] -= 1
    dP['Wv'] += np.dot(dv, x.T)
    dP['bv'] += dv
    return dv, dP
def backward_pass_through_LSTM(deriv, passing_state, x, state_prev, state, cache, P, dP):
    # unpack cache and state
    f, i, z, o = cache
    h, c = state
    h_prev, c_prev = state_prev
    dh_next, dc_next = passing_state
    # hidden state
    dh = np.dot(P['Wv'].T, deriv)
    dh += dh_next
    # output gate
    do = dsigmoid(o) * dh * tanh(c)
    dP['Wo'] += np.dot(do, x.T)
    dP['Ro'] += np.dot(do, h_prev.T)
    dP['bo'] += do
    # cell state
    dc = np.copy(dc_next)
    dc += dh * o * dtanh(tanh(c))
    # z neuron
    dz = dtanh(z) * dc * i
    dP['Wz'] += np.dot(dz, x.T)
    dP['Rz'] += np.dot(dz, h_prev.T)
    dP['bz'] += dz
    # input gate
    di = dsigmoid(i) * dc * z
    dP['Wi'] += np.dot(di, x.T)
    dP['Ri'] += np.dot(di, h_prev.T)
    dP['bi'] += di
    # forget gate
    df = dsigmoid(f) * dc * c_prev
    dP['Wf'] += np.dot(df, x.T)
    dP['Rf'] += np.dot(df, h_prev.T)
    dP['bf'] += df
    # if passing to a lower layer:
    # dx = (np.dot(P['Wf'].T, df) + np.dot(P['Wi'].T, di) + np.dot(P['Wz'].T, dz) + np.dot(P['Wo'].T, do))
    # pass hidden state
    dh_prev = (np.dot(P['Rf'].T, df) + np.dot(P['Ri'].T, di) + np.dot(P['Rz'].T, dz) + np.dot(P['Ro'].T, do))
    # pass cell state
    dc_prev = f * dc
    passing_state = dh_prev, dc_prev
    return passing_state, dP
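# A finite-difference gradient check, not part of the original script: a
# minimal sketch assuming the hypothetical helpers seq_loss and gradient_check,
# with eps an illustrative choice. Compare against dP accumulated over the same
# (inputs, targets) chunk *before* update_parameters clips it.
def seq_loss(inputs, targets, P, num_cells):
    # forward pass only, accumulating the cross-entropy loss over the sequence
    state = (np.zeros((num_cells, 1)), np.zeros((num_cells, 1)))
    loss = 0
    for xt, yt in zip(inputs, targets):
        state, _ = forward_pass_through_LSTM(np.array(xt).reshape(-1, 1), state, P)
        y = forward_pass_through_dense(state[0], P)
        loss += -np.log(y[np.argmax(yt), 0])
    return loss

def gradient_check(inputs, targets, P, dP, key, idx, num_cells, eps=1e-5):
    # central difference on a single parameter entry, e.g. key='Wf', idx=(0, 0)
    original = P[key][idx]
    P[key][idx] = original + eps
    loss_plus = seq_loss(inputs, targets, P, num_cells)
    P[key][idx] = original - eps
    loss_minus = seq_loss(inputs, targets, P, num_cells)
    P[key][idx] = original  # restore the parameter
    numeric = (loss_plus - loss_minus) / (2 * eps)
    return numeric, dP[key][idx]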
def forward_backward_pass(inputs, targets, input_state, P, M1, M2, optimizer, learning_rate, t):
    y, cache, state = {}, {}, {}
    state[-1] = (np.copy(input_state[0]), np.copy(input_state[1]))
    loss = 0
    # Forward pass through time steps; the loop index is named ts so it does
    # not clobber the Adam step counter t passed into this function
    for ts in range(len(inputs)):
        state[ts], cache[ts] = forward_pass_through_LSTM(np.array(inputs[ts]).reshape(-1, 1), state[ts - 1], P)
        y[ts] = forward_pass_through_dense(state[ts][0], P)
        # cross-entropy loss against the one-hot target
        loss += -np.log(y[ts][np.argmax(targets[ts]), 0])
    # Backward pass through time steps
    passing_state = (np.zeros_like(state[-1][0]), np.zeros_like(state[-1][1]))
    dP = initialise_derivatives(num_cells, features)  # clear gradients (num_cells and features are globals)
    for ts in reversed(range(len(inputs))):
        dv, dP = backprop_through_dense(state[ts][0], y[ts], targets[ts], dP)
        passing_state, dP = backward_pass_through_LSTM(dv, passing_state, np.array(inputs[ts]).reshape(-1, 1), state[ts - 1], state[ts], cache[ts], P, dP)
    P, M1, M2, t = update_parameters(optimizer, P, dP, M1, M2, t, learning_rate=learning_rate)
    return loss, state[len(inputs) - 1], P, M1, M2, t
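# Note: forward_backward_pass performs truncated backpropagation through time;
# gradients flow only within the current chunk of inputs, while the hidden and
# cell states (but not their gradients) carry over between chunks.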
def prediction(state, x, length, P, alphabet):
    x = np.array(x).reshape(-1, 1)
    pred = ''
    for _ in range(length):
        state, _ = forward_pass_through_LSTM(x, state, P)
        p = forward_pass_through_dense(state[0], P)
        # sample the next character from the predicted distribution
        # (sampling, rather than argmax, keeps the generated text stochastic)
        idx = np.random.choice(range(features), p=p.ravel())
        x = np.zeros((features, 1))
        x[idx] = 1
        pred += alphabet[idx]
    return pred
def update_parameters(optimizer, P, dP, M1, M2, t, learning_rate=0.001, decay_rate_1=0.9, decay_rate_2=0.999):
    if optimizer == 'adam':
        t = t + 1  # one Adam step per parameter update, not per key
        for key in P:
            dP[key] = np.clip(dP[key], -5, 5, out=dP[key])
            M1[key] = decay_rate_1 * M1[key] + (1 - decay_rate_1) * dP[key]
            M2[key] = decay_rate_2 * M2[key] + (1 - decay_rate_2) * (dP[key] ** 2)
            m1 = M1[key] / (1 - decay_rate_1 ** t)  # bias-corrected first moment
            m2 = M2[key] / (1 - decay_rate_2 ** t)  # bias-corrected second moment
            P[key] = P[key] - learning_rate * (m1 / (np.sqrt(m2) + 1e-8))
    else:
        for key in P:
            dP[key] = np.clip(dP[key], -1, 1, out=dP[key])
            P[key] -= learning_rate * dP[key]
    return P, M1, M2, t
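# For reference, the Adam update implemented above (Kingma & Ba, 2015),
# with g the clipped gradient and beta1, beta2 the decay rates:
#   M1 = beta1 * M1 + (1 - beta1) * g
#   M2 = beta2 * M2 + (1 - beta2) * g**2
#   m1 = M1 / (1 - beta1**t),  m2 = M2 / (1 - beta2**t)   (bias correction)
#   P  = P - learning_rate * m1 / (sqrt(m2) + 1e-8)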
################################################################################
data = 'stevenstevenstevenstevenstevenstevenstevenstevensteven'
optimizer = 'adam' # 'adam' or 'GD'
num_cells = 100
T_steps = 4
num_iterations = 100
# One-hot encode the characters of the data
alphabet = sorted(list(set(data)))
features = len(alphabet)
encoded_data = [[0 if char != letter else 1 for char in alphabet] for letter in data]
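# Example: the data above yields alphabet ['e', 'n', 's', 't', 'v'], so the
# letter 's' is encoded as [0, 0, 1, 0, 0]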
state = (np.zeros((num_cells, 1)), np.zeros((num_cells, 1)))
# Initialise parameters
P = {}
P = initialise_parameters(P, num_cells, features)
M1, M2 = initialise_momentum(P)
iteration, location = 0, 0
Loss = []
reset = False
if optimizer == 'adam':
    learning_rate = 0.001
    t = 0
elif optimizer == 'GD':
    learning_rate = 2
    t = 0
else:
    print('Unrecognised optimizer, defaulting to adam')
    optimizer = 'adam'
    learning_rate = 0.001
    t = 0
while iteration < num_iterations:
    if location + T_steps >= len(data):
        reset = True
        # last chunk of data
        inputs = encoded_data[location:-1]
        targets = encoded_data[location + 1:]
    else:
        # define inputs and outputs
        inputs = encoded_data[location: location + T_steps]
        targets = encoded_data[location + 1: location + T_steps + 1]
    loss, state, P, M1, M2, t = forward_backward_pass(inputs, targets, state, P, M1, M2, optimizer, learning_rate, t)
    Loss.append(loss)
    # Every 500 iterations, decay the GD learning rate and print an example prediction
    if (iteration % 500 == 0) and (iteration != 0) and reset:
        if optimizer == 'GD':
            learning_rate /= 2
        print(prediction(state, encoded_data[0], 40, P, alphabet))
    if reset:
        reset = False
        state = (np.zeros((num_cells, 1)), np.zeros((num_cells, 1)))
        location = 0
        iteration += 1
    else:
        location += T_steps
# Pass through with no backprop (also passes through the last element of the training set, unlike during training)
state = (np.zeros((num_cells, 1)), np.zeros((num_cells, 1)))
for ts in range(len(encoded_data)):
    state, _ = forward_pass_through_LSTM(np.array(encoded_data[ts]).reshape(-1, 1), state, P)
    _ = forward_pass_through_dense(state[0], P)
print(prediction(state, encoded_data[0], 40, P, alphabet))