# gradientdescent.py
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Note: eta = learning rate
#
# Gradient of the linear-regression cost: (1/N) * Xbar^T (Xbar w - y)
def grad(w):
    print("w=", w)  # debug output: show the point at which the gradient is evaluated
    N = Xbar.shape[0]
    return 1/N * Xbar.T.dot(Xbar.dot(w) - y)

# Mean squared error cost: (1/2N) * ||y - Xbar w||^2
def cost(w):
    N = Xbar.shape[0]
    return .5/N*np.linalg.norm(y - Xbar.dot(w), 2)**2
# Numerical gradient by central differences:
# g_i ~ (cost(w + eps*e_i) - cost(w - eps*e_i)) / (2*eps)
def numerical_grad(w, cost):
    eps = 1e-4
    g = np.zeros_like(w)
    for i in range(len(w)):
        w_p = w.copy()
        w_n = w.copy()
        w_p[i] += eps
        w_n[i] -= eps
        g[i] = (cost(w_p) - cost(w_n))/(2*eps)
    return g
# Compare the analytical gradient with the numerical one at a random point
def check_grad(w, cost, grad):
    w = np.random.rand(w.shape[0], w.shape[1])  # w is only used for its shape
    grad1 = grad(w)
    grad2 = numerical_grad(w, cost)
    return np.linalg.norm(grad1 - grad2) < 1e-6
# Batch gradient descent: stop when the average gradient norm is small enough
def myGD(w_init, grad, eta):
    w = [w_init]
    for it in range(100):
        w_new = w[-1] - eta*grad(w[-1])
        if np.linalg.norm(grad(w_new))/len(w_new) < 1e-3:
            break
        w.append(w_new)
    return (w, it)

# check convergence
def has_converged(theta_new, grad):
    return np.linalg.norm(grad(theta_new)) / len(theta_new) < 1e-3
# Gradient descent with momentum: v_new = gamma*v_old + eta*grad(theta)
def GD_momentum(theta_init, grad, eta, gamma):
    # Suppose we want to store the history of theta
    theta = [theta_init]
    v_old = np.zeros_like(theta_init)
    for it in range(100):
        v_new = gamma*v_old + eta*grad(theta[-1])
        theta_new = theta[-1] - v_new
        if has_converged(theta_new, grad):
            break
        theta.append(theta_new)
        v_old = v_new
    # theta includes all points on the path;
    # if you just want the final answer, use `return theta[-1]`
    # (a usage sketch is shown at the end of __main__)
    return theta
# single point gradient: gradient of the cost at one randomly chosen sample
def sgrad(w, i, rd_id):
    true_i = rd_id[i]
    xi = Xbar[true_i, :]
    yi = y[true_i]
    a = np.dot(xi, w) - yi
    return (xi*a).reshape(2, 1)
# Stochastic gradient descent: one randomly ordered sample per update,
# checking convergence every iter_check_w updates
def SGD(w_init, grad, eta):
    w = [w_init]
    w_last_check = w_init
    iter_check_w = 10
    N = X.shape[0]
    count = 0
    for it in range(10):  # 10 epochs over the data
        # shuffle data
        rd_id = np.random.permutation(N)
        for i in range(N):
            count += 1
            g = sgrad(w[-1], i, rd_id)
            w_new = w[-1] - eta*g
            w.append(w_new)
            if count % iter_check_w == 0:
                w_this_check = w_new
                if np.linalg.norm(w_this_check - w_last_check)/len(w_init) < 1e-3:
                    return (w, it)  # return a tuple so the caller can unpack (w, it)
                w_last_check = w_this_check
    return (w, it)
if __name__ == "__main__":
    np.random.seed(2)
    # Synthetic data: y = 4 + 3x plus Gaussian noise
    X = np.random.rand(1000, 1)
    y = 4 + 3 * X + .2*np.random.randn(1000, 1)  # noise added
    # Building Xbar: prepend a column of ones for the bias term
    one = np.ones((X.shape[0], 1))
    Xbar = np.concatenate((one, X), axis=1)
    # Closed-form least-squares solution via the pseudo-inverse: w = (Xbar^T Xbar)^+ Xbar^T y
    A = np.dot(Xbar.T, Xbar)
    b = np.dot(Xbar.T, y)
    w_lr = np.dot(np.linalg.pinv(A), b)
    print('Solution found by formula: w = ', w_lr.T)
    # Display result
    w = w_lr
    w_0 = w[0][0]
    w_1 = w[1][0]
    x0 = np.linspace(0, 1, 2, endpoint=True)
    y0 = w_0 + w_1*x0
    print('Checking gradient...', check_grad(np.random.rand(2, 1), cost, grad))
    # Draw the fitting line
    plt.plot(X.T, y.T, 'b.')  # data
    plt.plot(x0, y0, 'y', linewidth=2)  # the fitting line
    plt.axis([0, 1, 0, 10])
    plt.show()
    # GD (learning rate eta = 1; eta = 100 makes the iteration diverge for this problem)
    w_init = np.array([[2], [1]])
    (w1, it1) = myGD(w_init, grad, 1)
    print('Solution found by GD: w = ', w1[-1].T, ',\nafter %d iterations.' %(it1+1))
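    # A minimal sketch (not in the original script): inspect how the cost decreases
    # along the GD path stored in w1, using the cost() defined above.
    gd_costs = [cost(wi) for wi in w1]
    print('GD cost: start = %.4f, end = %.4f' % (gd_costs[0], gd_costs[-1]))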
    # SGD
    (w, it) = SGD(w_init, grad, 1)
    print('Solution found by SGD: w = ', w[-1].T, ',\nafter %d iterations.' %(it+1))
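    # A minimal usage sketch for GD_momentum, which the original __main__ never calls.
    # gamma = 0.9 is an assumed (typical) momentum value, not taken from the original.
    theta_m = GD_momentum(w_init, grad, eta=1, gamma=0.9)
    print('Solution found by GD with momentum: w = ', theta_m[-1].T,
          '(path length %d)' % len(theta_m))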