-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathmlearn.py
More file actions
337 lines (310 loc) · 12.8 KB
/
mlearn.py
File metadata and controls
337 lines (310 loc) · 12.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
# This is a module containing the following classifiers:
# - Naive (Gauss) Bayes Classifier
# - Binary Logistic Regression
# - AdaBoosted decision stump
#
# by Shih-Ho Cheng ([email protected])
from pylab import *
import sys
import scipy.stats as st
class Classifier:
    """
    Base class for all the classifiers.

    Args:
        trainingData: 2-D array of training samples (one row per sample)
        target: 1-D array with the class of each training sample

    Attributes:
        tData: the training data itself
        tTarget: the target of tData
        classes: an array containing the distinct classes
        nTData: number of training examples
        nFeatures: number of features (dimensions) in the data
        nClasses: number of (distinct) classes or classifications
        hasTrained: flag indicating whether the object has been trained
        isCrossValidated: flag indicating whether the object has been cross-validated

    Methods:
        train: template method for the daughter classes (specific learners)
        classify: template method for the daughter classes (specific learners)
        crossValidate: cross validates the training given a test data set
    """
    def __init__(self, trainingData, target):
        """
        Initializes the Classifier object and populates its attributes.
        Exits with status 1 when the data and target lengths disagree.
        """
        if trainingData.shape[0] != len(target):
            print(" <!> ERROR: length of training sample doesn't match the length "
                  "of target array!")
            sys.exit(1)
        self.tData = trainingData                   # training data array
        self.tTarget = target                       # target class array
        self.classes = array(list(set(target)))     # (non-repeated) class array
        self.nTData = trainingData.shape[0]         # N of training samples
        self.nFeatures = trainingData.shape[1]      # N of features per sample
        self.nClasses = len(self.classes)           # N of (non-repeated) classes
        self.hasTrained = False
        self.isCrossValidated = False

    def train(self):
        """
        Dummy method for the inherited classes.
        (Bug fix: the stub previously lacked `self`, so calling it on an
        instance raised TypeError.)
        """
        pass

    def classify(self, data):
        """
        Dummy method for the inherited classes.
        (Bug fix: the stub previously lacked `self`.)
        """
        pass

    def crossValidate(self, testData, testTarget):
        """
        Creates a cross validation (confusion) table with a test data set.

        Returns a dict of dicts, crossValMatrix[trueClass][predictedClass]
        -> count, or None when the preconditions are not met.
        """
        if not self.hasTrained:
            print(" <!> WARNING: The learner has not been trained yet.")
            return
        if len(testData) == len(testTarget) and set(testTarget) == set(self.classes):
            # Create and initialize the crossValMatrix dictionary (of a dictionary)
            crossValMatrix = {}
            for ci in self.classes:
                crossValMatrix[ci] = {}
                for cj in self.classes:
                    crossValMatrix[ci][cj] = 0
            testClassification = self.classify(testData)
            for i in range(len(testTarget)):
                crossValMatrix[testTarget[i]][testClassification[i]] += 1
            # Bug fix: this previously set self.crossValidated, an attribute
            # __init__ never declared; the declared flag is isCrossValidated.
            self.isCrossValidated = True
            return crossValMatrix
        else:
            print(" <!> WARNING: the test data and test target don't have the same dimensions.")
class Ngbayes(Classifier):
    """
    Naive gaussian bayes classifier for
    continuous features and discrete targets.
    """
    def train(self):
        """
        Trains the Ngbayes object.

        Computes, per class and per feature, the ML sample mean and the
        unbiased (n-1) sample standard deviation.
        """
        self.mean = zeros((len(self.classes), self.nFeatures))
        self.sigma = zeros((len(self.classes), self.nFeatures))
        for c in range(len(self.classes)):
            # Select the samples belonging to class c
            sampleStatus = (self.tTarget == self.classes[c])
            buffer_tData = self.tData[sampleStatus, 0:]
            num_c_Samples = len(buffer_tData)   # number of samples with classification c
            self.mean[c, 0:] = buffer_tData.sum(axis=0) / (num_c_Samples * 1.)
            # Bug fix: st.norm.pdf takes the standard deviation as its scale
            # argument, but the original stored the *variance* here; take sqrt.
            self.sigma[c, 0:] = sqrt(((buffer_tData - self.mean[c, 0:])**2).sum(axis=0)
                                     / (num_c_Samples - 1.))
        self.hasTrained = True

    def classify(self, data):
        """
        Runs the input data forward through the trained classifier and
        returns the predicted classification (one entry per row of data).
        """
        if not self.hasTrained:
            print("<!> WARNING: the Ngbayes classifier hasn't been trained yet.")
            return
        classification = zeros(len(data))
        for i in range(len(data)):
            # One copy of the sample per class, so pdf broadcasts over classes
            dataLine = repeat(data[i, 0:].reshape(1, len(data[i, 0:])),
                              len(self.classes), axis=0)
            condProb = st.norm.pdf(dataLine, self.mean, self.sigma)
            TTcondProb = condProb.prod(axis=1)   # naive (independent) likelihood
            # Pick the class with the largest unnormalized posterior.
            bestPosterior = 0   # renamed from 'max' (shadowed the builtin)
            for c in range(len(self.classes)):
                priorProb = list(self.tTarget).count(self.classes[c]) / (len(self.tTarget) * 1.)
                posteriorProb = TTcondProb[c] * priorProb
                if posteriorProb >= bestPosterior:
                    bestPosterior = posteriorProb
                    classification[i] = self.classes[c]
        return classification
class BinLogReg(Classifier):
    """
    Implementation of a binary logistic regression classifier
    (the classification of training samples must be 0 or 1).
    """
    def train(self, eta=0.001, numIter=1000):
        """
        Trains the logistic regression by batch gradient ascent on the
        log-likelihood.

        Args:
            eta: learning rate.
            numIter: number of gradient steps.
        """
        if set(self.classes) != set([0, 1]):
            print("<!> WARNING: Unable to train with logit reg! "
                  "The binary classes needs to be (0,1).")
            return
        # Prepend a column of ones so w[0] acts as the bias term.
        buffer_data = concatenate((ones((self.nTData, 1)), self.tData), axis=1)
        w = zeros(self.nFeatures + 1)
        for i in range(numIter):
            expArg = repeat(w[0], self.nTData) + sum(w[1:] * buffer_data[:, 1:], axis=1)
            estimatedP = exp(expArg) / (1. + exp(expArg))   # sigmoid(w . x)
            # Gradient of the log-likelihood: X^T (t - p)
            gradientDir = sum(transpose(buffer_data) * (self.tTarget - estimatedP), axis=1)
            w = w + eta * gradientDir
        self.hasTrained = True
        self.w = w

    def classify(self, data):
        """
        Classifies with the trained classifier: returns 1 where the linear
        score w0 + w . x is >= 0 and 0 otherwise.
        """
        if not self.hasTrained:
            print("<!> WARNING: the LogReg classifier hasn't been trained yet.")
            return
        inferredClass = self.w[0] + sum(self.w[1:] * data, axis=1)
        # score < 0 -> class 0, score >= 0 -> class 1 (replaces the original
        # boolean-list round-trip with the vectorized equivalent)
        return where(inferredClass < 0, 0, 1)
class Stump(Classifier):
    """
    A decision stump.

    This is a weak classifier by itself. It should be used in conjunction
    with the adaBoost meta-algorithm.

    NOTE: all misclassification weights are considered to be the same,
    i.e., misclassifying c1 as c2 carries the same penalty as misclassifying
    c1 as c3.
    """
    def __mostCommon(self, classes):
        """
        Finds the most common element in an array of classes.

        (The original seeded the result with a random element via the
        deprecated random_integers; for non-empty input that seed was always
        overwritten on the first iteration, so it is dropped. Ties keep the
        earliest element with the maximal count, exactly as before.)
        """
        mostCommon = classes[0]
        maxCount = 0
        for c in classes:
            count = list(classes).count(c)
            if count > maxCount:
                mostCommon = c
                maxCount = count
        return mostCommon

    def train(self, weights):
        """
        Trains the stump: scans every feature for the split boundary that
        minimizes the weighted misclassification error.

        Args:
            weights: per-sample weights (same length as the training data).

        Sets rootFeature, rootBoundary, leftClass, rightClass and classError.
        """
        candBoundary = zeros(self.nFeatures)    # cand. split boundary per feature
        candError = zeros(self.nFeatures)       # error at the candidate split
        candLeftClass = zeros(self.nFeatures)   # cand. class satisfying the split condition (<)
        candRightClass = zeros(self.nFeatures)  # cand. class NOT satisfying the split condition
        for feature in range(self.nFeatures):
            # sorted tData indices along 'feature' (target breaks ties)
            idx_sorted = lexsort((self.tTarget, self.tData[:, feature]))
            buffer_tData = self.tData[idx_sorted, feature]   # sorted copy of tData
            buffer_target = self.tTarget[idx_sorted]         # target sorted accordingly
            buffer_weights = weights[idx_sorted]             # weights sorted accordingly
            # candidate splits sit between consecutive samples of different class
            candStatus = diff(buffer_target) != 0   # one element shorter than tData
            critIdx = where(candStatus)[0]
            errAtBoundaries = []
            boundaries = []
            classesToTheLeft = []    # majority class satisfying the split condition (< boundary)
            classesToTheRight = []   # majority class NOT satisfying it (> boundary)
            for i in critIdx:
                critLimit = (buffer_tData[i+1] + buffer_tData[i]) / 2.
                boundaries.append(critLimit)
                leftClass = self.__mostCommon(buffer_target[:i+1])
                # rightClass is simply the complementary class (binary stump)
                rightClass = self.classes[self.classes != leftClass][0]
                classesToTheLeft.append(leftClass)
                classesToTheRight.append(rightClass)
                classError = sum(buffer_weights[:i+1] * (buffer_target[:i+1] != leftClass))
                classError += sum(buffer_weights[i+1:] * (buffer_target[i+1:] != rightClass))
                # NOTE(review): adaBoost keeps the weights normalized to sum 1,
                # so this extra division by N shrinks the reported error; kept
                # as-is for backward compatibility with the alpha computation
                # in adaBoostStump — confirm before removing.
                classError /= self.nTData * 1.
                errAtBoundaries.append(classError)
            candError[feature] = min(errAtBoundaries)
            candBoundary[feature] = boundaries[argmin(errAtBoundaries)]
            candLeftClass[feature] = classesToTheLeft[argmin(errAtBoundaries)]
            candRightClass[feature] = classesToTheRight[argmin(errAtBoundaries)]
        best = argmin(candError)
        # Bug fix: classError previously took min(errAtBoundaries) of the LAST
        # feature scanned, which need not belong to the chosen split; report
        # the error of the winning feature instead.
        self.classError = candError[best]
        self.rootBoundary = candBoundary[best]
        self.rootFeature = best
        self.leftClass = candLeftClass[best]
        self.rightClass = candRightClass[best]
        self.hasTrained = True

    def classify(self, data):
        """
        Classifies each row of data against the trained threshold:
        leftClass when data[:, rootFeature] <= rootBoundary, else rightClass.
        """
        if not self.hasTrained:
            print("<!> WARNING: the Stump classifier hasn't been trained yet.")
            return
        classification = zeros(len(data))
        for i in range(len(data)):
            if data[i, self.rootFeature] <= self.rootBoundary:
                classification[i] = self.leftClass
            else:
                classification[i] = self.rightClass
        return classification
class adaBoostStump(Stump):
    """
    Binary adaptive boosting implementation of the decision stump.

    Note: The classification needs to be -1 or 1.
    """
    def bTrain(self, maxNumIter=500, errorTolerance=1e-6, verbose=True):
        """
        Trains the decision stump ensemble with adaboost.

        Args:
            maxNumIter: maximum number of boosting rounds.
            errorTolerance: stop early once the stump error drops below this.
            verbose: print per-iteration diagnostics.
        """
        # Initialize the weights and the ensemble weight (alpha) holders
        weights = ones(self.nTData) / (self.nTData * 1.)
        alphaErr = []
        classFeature = []
        classCritLim = []
        classOnLeft = []
        classOnRight = []
        for i in range(maxNumIter):
            if verbose:
                print(">> Iter %d" % i)
            self.train(weights)
            inferredClass = self.classify(self.tData)
            # Bug fix: a perfect stump (classError == 0) previously produced
            # alpha = inf and NaN weights; clamp the error away from 0 and 1.
            boundedError = self.classError
            if boundedError < 1e-12:
                boundedError = 1e-12
            elif boundedError > 1. - 1e-12:
                boundedError = 1. - 1e-12
            alpha = 0.5 * log((1. - boundedError) / boundedError)
            alphaErr.append(alpha)                 # collect the ensemble weights
            classFeature.append(self.rootFeature)
            classCritLim.append(self.rootBoundary)
            classOnLeft.append(self.leftClass)
            classOnRight.append(self.rightClass)
            if verbose:
                print("Nmisclassified= %d" % sum(inferredClass != self.tTarget))
                print("err= %s alpha= %s" % (self.classError, alpha))
                print("left= %s right= %s rootBound= %s rootFeat= %s"
                      % (self.leftClass, self.rightClass,
                         self.rootBoundary, self.rootFeature))
                print("--")
                print("")
            # Update the weights: boost the misclassified, damp the rest
            idx = where(inferredClass != self.tTarget)[0]
            weights[idx] = weights[idx] * exp(alpha)
            idx = where(inferredClass == self.tTarget)[0]
            weights[idx] = weights[idx] * exp(-alpha)
            weights = weights / sum(weights)   # renormalize to sum 1
            if self.classError < errorTolerance:
                break
        self.alphaErr = array(alphaErr)
        self.adaBFeature = array(classFeature)
        self.adaBCritLim = array(classCritLim)
        self.adaBOnLeft = array(classOnLeft)
        self.adaBOnRight = array(classOnRight)
        self.hasTrained = True

    def bClassify(self, data):
        """
        Returns the ensemble decision per row: the sign of the alpha-weighted
        vote of all trained stumps (-1 or 1).
        """
        if not self.hasTrained:
            print(" <!> WARNING: adaboost has not been trained yet")
            return
        accSum = zeros(len(data))
        for i in range(len(data)):
            for f in range(len(self.adaBFeature)):
                if data[i, self.adaBFeature[f]] <= self.adaBCritLim[f]:
                    accSum[i] += self.adaBOnLeft[f] * self.alphaErr[f]
                else:
                    accSum[i] += self.adaBOnRight[f] * self.alphaErr[f]
        return sign(accSum)

    def bCrossValidate(self, testData, testTarget):
        """
        Creates a cross validation (confusion) table for the adaBoost
        ensemble with a test data set and their true classifications.

        Returns bCrossValMatrix[trueClass][predictedClass] -> count,
        or None when the preconditions are not met.
        """
        if not self.hasTrained:
            print(" <!> WARNING: adaboost has not been trained yet")
            return
        if len(testData) == len(testTarget) and set(testTarget) == set(self.classes):
            # Create and initialize the crossValMatrix dictionary (of a dictionary)
            bCrossValMatrix = {}
            for ci in self.classes:
                bCrossValMatrix[ci] = {}
                for cj in self.classes:
                    bCrossValMatrix[ci][cj] = 0
            testClassification = self.bClassify(testData)
            for i in range(len(testTarget)):
                bCrossValMatrix[testTarget[i]][testClassification[i]] += 1
            return bCrossValMatrix
        else:
            print(" <!> WARNING: the test data and test target don't have the same dimensions.")