1+ #-*- coding: utf-8 -*-
2+ import numpy as np
3+ import matplotlib .pyplot as plt
4+ import scipy .io as spio
5+ from scipy import optimize
6+ from matplotlib .font_manager import FontProperties
7+ font = FontProperties (fname = r"c:\windows\fonts\simsun.ttc" , size = 14 ) # 解决windows环境下画图汉字乱码问题
8+
9+
def logisticRegression_OneVsAll():
    """Train a one-vs-all regularized logistic regression classifier on the
    handwritten-digit dataset and print the training-set accuracy.

    Side effects: reads "data_digits.mat" from the working directory, shows a
    Matplotlib window with 100 sample digits, and prints the accuracy.
    """
    data = loadmat_data("data_digits.mat")
    X = data['X']  # one 20x20px digit per row, flattened to 400 features
    y = data['y']  # label per row
    m, n = X.shape
    num_labels = 10  # digits 0-9

    # Draw 100 random row indices in [0, m) to visualize.  The original
    # `np.random.randint(x - x, m)` inside a double comprehension reduced to
    # exactly this single vectorized call.
    rand_indices = np.random.randint(0, m, 100)
    display_data(X[rand_indices, :])  # show a 10x10 grid of sample digits

    Lambda = 0.1  # regularization strength

    all_theta = oneVsAll(X, y, num_labels, Lambda)  # one parameter row per class

    p = predict_oneVsAll(all_theta, X)  # predicted label for every example

    # print(...) works as a statement in Python 2 and a call in Python 3.
    print(u"预测准确度为:%f%%" % np.mean(np.float64(p == y.reshape(-1, 1)) * 100))
30+
31+ # 加载mat文件
def loadmat_data(fileName):
    """Read a MATLAB .mat file and return its variables as a dict."""
    contents = spio.loadmat(fileName)
    return contents
34+
35+ # 显示10个数字
def display_data(imgData):
    """Show the first 100 rows of imgData as a 10x10 grid of 20x20px digits.

    imgData: array whose rows are flattened 20x20 images; assumes at least
    100 rows.  Blocks until the Matplotlib window is closed.
    """
    tile = 20  # each digit image is tile x tile pixels
    display_array = np.ones((10 * tile, 10 * tile))
    idx = 0  # renamed from `sum`, which shadowed the builtin
    for i in range(10):
        for j in range(10):
            display_array[i * tile:(i + 1) * tile, j * tile:(j + 1) * tile] = \
                imgData[idx, :].reshape(tile, tile)
            idx += 1

    plt.imshow(display_array, cmap='gray')
    plt.axis('off')
    plt.show()
47+
48+ # 求每个分类的theta
def oneVsAll(X, y, num_labels, Lambda):
    """Fit one regularized logistic-regression classifier per class.

    X:          (m, n) feature matrix without intercept column.
    y:          (m, 1) integer labels in [0, num_labels).
    num_labels: number of classes.
    Lambda:     regularization strength passed to costFunction/gradient.
    Returns all_theta of shape (num_labels, n + 1), one parameter row per class.
    """
    m, n = X.shape
    all_theta = np.zeros((n + 1, num_labels))
    X = np.hstack((np.ones((m, 1)), X))  # prepend the intercept column
    class_y = np.zeros((m, num_labels))
    initial_theta = np.zeros((n + 1, 1))

    # Build one binary label vector per class: class_y[:, i] is 1 where y == i.
    # Use ravel() so the assigned value is 1-D; the original reshape(1, -1)
    # produced a (1, m) array, which cannot broadcast into an (m,) column and
    # raises ValueError in NumPy.
    for i in range(num_labels):
        class_y[:, i] = np.int32(y == i).ravel()

    for i in range(num_labels):
        # fmin_bfgs flattens the initial guess and returns a flat (n+1,)
        # vector, which assigns directly into the column (the original
        # reshape(1, -1) again made an unbroadcastable (1, n+1) array).
        result = optimize.fmin_bfgs(costFunction, initial_theta, fprime=gradient,
                                    args=(X, class_y[:, i], Lambda))
        all_theta[:, i] = result

    all_theta = np.transpose(all_theta)  # -> (num_labels, n + 1)
    return all_theta
68+
69+ # 代价函数
def costFunction(initial_theta, X, y, inital_lambda):
    """Regularized logistic-regression cost for parameters initial_theta.

    The intercept parameter (index 0) is excluded from the penalty term,
    following the usual convention.
    """
    m = len(y)
    h = sigmoid(np.dot(X, initial_theta))  # hypothesis h(z)

    # Copy theta and zero out the bias so it carries no regularization cost.
    reg_theta = initial_theta.copy()
    reg_theta[0] = 0
    penalty = np.dot(np.transpose(reg_theta), reg_theta) * inital_lambda / 2

    cross_entropy = (-np.dot(np.transpose(y), np.log(h))
                     - np.dot(np.transpose(1 - y), np.log(1 - h)))
    J = (cross_entropy + penalty) / m
    return J
81+
82+ # 计算梯度
def gradient(initial_theta, X, y, inital_lambda):
    """Gradient of the regularized logistic-regression cost w.r.t. theta."""
    m = len(y)
    h = sigmoid(np.dot(X, initial_theta))  # hypothesis h(z)

    # Bias term is not regularized, so zero it in the penalty copy.
    reg_theta = initial_theta.copy()
    reg_theta[0] = 0

    grad = np.dot(np.transpose(X), h - y) / m + inital_lambda / m * reg_theta
    return grad
93+
94+ # S型函数
def sigmoid(z):
    """Logistic function 1 / (1 + e^-z), applied elementwise.

    Accepts scalars or arrays.  The original pre-allocated an output with
    np.zeros((len(z), 1)): dead code that was immediately overwritten and
    that additionally rejected scalar inputs (len() of a scalar raises).
    """
    return 1.0 / (1.0 + np.exp(-z))
100+
101+ # 预测
def predict_oneVsAll(all_theta, X):
    """Predict a class label in [0, num_labels) for every row of X.

    all_theta: (num_labels, n + 1) trained parameters, one row per class.
    X:         (m, n) feature matrix without the intercept column.
    Returns an (m, 1) column vector of predicted labels.
    """
    m = X.shape[0]
    X = np.hstack((np.ones((m, 1)), X))  # prepend the intercept column

    # Per-class scores.  sigmoid is strictly increasing, so applying it
    # cannot change which column is largest — argmax over the raw scores
    # gives the same prediction without the extra pass.
    scores = np.dot(X, np.transpose(all_theta))

    # argmax replaces the original per-row np.where/np.max loop, which
    # recomputed the full row-max array on every iteration (accidental
    # O(m^2)) and crashed in np.vstack whenever a row had tied maxima.
    # Ties now resolve to the lowest class index.
    p = np.argmax(scores, axis=1).reshape(-1, 1)
    return p
120+
121+
122+ if __name__ == "__main__" :
123+ logisticRegression_OneVsAll ()