
Commit b27fe45

Author: xuming06
Commit message: update active learning.
Parent: 7bec188

15 files changed

Lines changed: 4487 additions & 12 deletions

scikit-learn/active_learning/Active_Learning_Tutorial.ipynb

Lines changed: 2373 additions & 0 deletions
Large diffs are not rendered by default.

scikit-learn/active_learning/active_learning_mnist.ipynb

Lines changed: 508 additions & 0 deletions
Large diffs are not rendered by default.

scikit-learn/active_learning/active_learning_risk.ipynb

Lines changed: 385 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 256 additions & 0 deletions
@@ -0,0 +1,256 @@
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: active learning for text classification with scikit-learn
"""
from sklearn.linear_model import LogisticRegression

'''
Created on Jul 4, 2014
based on http://scikit-learn.org/stable/auto_examples/document_classification_20newsgroups.html

This program implements active learning (http://en.wikipedia.org/wiki/Active_learning_(machine_learning))
for text classification tasks with a scikit-learn linear classifier (originally LinearSVC; this
version uses LogisticRegression). With some caveats, this could also be called incremental training.
Instead of Stochastic Gradient Descent we use batch mode, because the data is not that big
and accuracy was more of a concern here than efficiency.

The algorithm trains the model on a train dataset and evaluates it on a test dataset.
After each evaluation, the algorithm selects 2*NUM_QUESTIONS samples from the unlabeled dataset
to be labeled by a user/expert. Each labeled sample is then moved to the corresponding directory
in the train dataset, and the model is trained again on the improved training set.

The selection of unlabeled samples is based on the decision_function of the classifier, which
gives the signed distance of each sample X to the separating hyperplane. Because we need
confidence levels, we use absolute values. If there are more than two classes, the decision
function returns a confidence level for each class and each sample, so in that case we average
the absolute confidences over all classes.

We use the top NUM_QUESTIONS samples with the highest average absolute confidence and also the
top NUM_QUESTIONS samples with the lowest average absolute confidence for expert labeling. This
procedure can easily be changed by modifying the code in the benchmark function.

This program requires a directory structure similar to what is shown below:
mainDirectory
    train
        pos
            1.txt
            2.txt
        neg
            3.txt
            4.txt
    test
        pos
            5.txt
            6.txt
        neg
            7.txt
            8.txt
    unlabeled
        unlabeled
            9.txt
            10.txt
            11.txt
The filenames in unlabeled should not duplicate filenames in the train directory, because every
time we label a file we move it into the corresponding class directory under train.

The pos and neg categories are arbitrary; both the number of classes and their names can differ
from what is shown here. The classifier can also be changed to any other scikit-learn classifier
that provides decision_function.

@author: afshin rahimi
'''
# matplotlib.use('Agg')
import os
import shutil
from time import time

import numpy as np
import pylab as pl
from sklearn import metrics
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.utils.extmath import density

NUM_QUESTIONS = 3
PLOT_RESULTS = False
ACTIVE = True
DATA_FOLDER = "./data"
TRAIN_FOLDER = os.path.join(DATA_FOLDER, "train")
TEST_FOLDER = os.path.join(DATA_FOLDER, "test")
UNLABELED_FOLDER = os.path.join(DATA_FOLDER, "unlabeled")
ENCODING = 'utf-8'
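
# Illustrative sketch (added for clarity, not part of the original pipeline and
# never called): the margin-based selection rule described in the module
# docstring, on a toy array of signed decision values. Assumes the binary case,
# where decision_function returns one signed margin per sample.
def _selection_rule_sketch(num_questions=2):
    decision = np.array([0.1, 2.3, -0.05, 1.7, -0.9])  # toy signed margins
    confidence = np.abs(decision)       # distance from the separating hyperplane
    order = np.argsort(confidence)      # least confident first
    low = order[:num_questions]         # most uncertain -> worth asking the expert
    high = order[-num_questions:]       # most confident -> cheap to verify
    return low.tolist() + high.tolist()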

while True:
    data_train = load_files(TRAIN_FOLDER, encoding=ENCODING)
    data_test = load_files(TEST_FOLDER, encoding=ENCODING)
    data_unlabeled = load_files(UNLABELED_FOLDER, encoding=ENCODING)

    categories = data_train.target_names


    def size_mb(docs):
        return sum(len(s.encode('utf-8')) for s in docs) / 1e6


    data_train_size_mb = size_mb(data_train.data)
    data_test_size_mb = size_mb(data_test.data)
    data_unlabeled_size_mb = size_mb(data_unlabeled.data)

    print("%d documents - %0.3fMB (training set)" % (
        len(data_train.data), data_train_size_mb))
    print("%d documents - %0.3fMB (test set)" % (
        len(data_test.data), data_test_size_mb))
    print("%d documents - %0.3fMB (unlabeled set)" % (
        len(data_unlabeled.data), data_unlabeled_size_mb))
    print("%d categories" % len(categories))
    print()

    # target labels for the training and test sets
    y_train = data_train.target
    y_test = data_test.target

    print("Extracting features from the training dataset using a sparse vectorizer")
    t0 = time()
    vectorizer = TfidfVectorizer(encoding=ENCODING, use_idf=True, norm='l2', binary=False, sublinear_tf=True,
                                 min_df=0.001, max_df=1.0, ngram_range=(1, 2), analyzer='word', stop_words=None)

    # the output of fit_transform (X_train) is a sparse matrix
    X_train = vectorizer.fit_transform(data_train.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_train_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_train.shape)
    print()

    print("Extracting features from the test dataset using the same vectorizer")
    t0 = time()
    X_test = vectorizer.transform(data_test.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_test_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_test.shape)
    print()

    print("Extracting features from the unlabeled dataset using the same vectorizer")
    t0 = time()
    X_unlabeled = vectorizer.transform(data_unlabeled.data)
    duration = time() - t0
    print("done in %fs at %0.3fMB/s" % (duration, data_unlabeled_size_mb / duration))
    print("n_samples: %d, n_features: %d" % X_unlabeled.shape)
    print()
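
    # Sanity check (added, not in the original script): because the test and
    # unlabeled data are transformed with the vectorizer fitted on the training
    # set, all three matrices share the same feature space.
    assert X_train.shape[1] == X_test.shape[1] == X_unlabeled.shape[1]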


    def trim(s):
        """Trim string to fit on terminal (assuming 80-column display)"""
        return s if len(s) <= 80 else s[:77] + "..."

    ###########################################################################
    # Benchmark classifiers
    def benchmark(clf):
        print('_' * 80)
        print("Training: ")
        print(clf)
        t0 = time()
        clf.fit(X_train, y_train)
        train_time = time() - t0
        print("train time: %0.3fs" % train_time)

        t0 = time()
        pred = clf.predict(X_test)
        test_time = time() - t0
        print("test time: %0.3fs" % test_time)

        # f1_score needs an explicit averaging mode for more than two classes
        score = metrics.f1_score(y_test, pred,
                                 average='binary' if len(categories) == 2 else 'macro')
        accscore = metrics.accuracy_score(y_test, pred)
        print("pred count is %d" % len(pred))
        print('accuracy score: %0.3f' % accscore)
        print("f1-score: %0.3f" % score)

        if hasattr(clf, 'coef_'):
            print("dimensionality: %d" % clf.coef_.shape[1])
            print("density: %f" % density(clf.coef_))

        print("classification report:")
        print(metrics.classification_report(y_test, pred,
                                            target_names=categories))

        print("confusion matrix:")
        print(metrics.confusion_matrix(y_test, pred))

        print("confidence for unlabeled data:")
        # compute absolute confidence for each unlabeled sample in each class
        confidences = np.abs(clf.decision_function(X_unlabeled))
        # average abs(confidence) over all classes for each unlabeled sample
        # (if there are more than 2 classes)
        if len(categories) > 2:
            confidences = np.average(confidences, axis=1)

        print(confidences)
        sorted_confidences = np.argsort(confidences)
        question_samples = []
        # select the NUM_QUESTIONS lowest-confidence unlabeled samples
        low_confidence_samples = sorted_confidences[0:NUM_QUESTIONS]
        # select the NUM_QUESTIONS highest-confidence unlabeled samples
        high_confidence_samples = sorted_confidences[-NUM_QUESTIONS:]

        question_samples.extend(low_confidence_samples.tolist())
        question_samples.extend(high_confidence_samples.tolist())

        print()
        clf_descr = str(clf).split('(')[0]
        return clf_descr, score, train_time, test_time, question_samples
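
    # The docstring says the selection procedure can be changed by modifying
    # benchmark. A hedged sketch of one alternative, entropy-based uncertainty
    # sampling; it assumes the classifier exposes predict_proba (true for
    # LogisticRegression, not for an uncalibrated LinearSVC). Illustrative
    # only, never called in this script.
    def entropy_question_samples(clf, X_pool, k=NUM_QUESTIONS):
        proba = clf.predict_proba(X_pool)  # shape (n_samples, n_classes)
        entropy = -np.sum(proba * np.log(proba + 1e-12), axis=1)
        return np.argsort(entropy)[-k:].tolist()  # most uncertain samples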


    results = []
    # clf = LinearSVC(loss='l2', penalty='l2', dual=False, tol=1e-3, class_weight='auto')
    clf = LogisticRegression()
    results.append(benchmark(clf))

    # make some plots

    indices = np.arange(len(results))

    # transpose results: one list per returned field (same as zip(*results))
    results = [[x[i] for x in results] for i in range(5)]

    clf_names, score, training_time, test_time, question_samples = results
    training_time = np.array(training_time) / np.max(training_time)
    test_time = np.array(test_time) / np.max(test_time)
    if PLOT_RESULTS:
        pl.figure(figsize=(12, 8))
        pl.title("Score")
        pl.barh(indices, score, .2, label="score", color='r')
        pl.barh(indices + .3, training_time, .2, label="training time", color='g')
        pl.barh(indices + .6, test_time, .2, label="test time", color='b')
        pl.yticks(())
        pl.legend(loc='best')
        pl.subplots_adjust(left=.25)
        pl.subplots_adjust(top=.95)
        pl.subplots_adjust(bottom=.05)

        for i, c in zip(indices, clf_names):
            pl.text(-.3, i, c)
        pl.savefig('ngramoptimize.png')
        pl.show()

    if ACTIVE:
        # question samples for the first (and only) benchmarked classifier
        for i in question_samples[0]:
            filename = data_unlabeled.filenames[i]
            print(filename)
            print('**************************content***************************')
            print(data_unlabeled.data[i])
            print('**************************content end***********************')
            print("Annotate this text (select one label):")
            # separate loop variable so the sample index i is not shadowed
            for j in range(len(categories)):
                print("%d = %s" % (j + 1, categories[j]))
            labelNumber = input("Enter the correct label number:")
            # re-prompt until the input is a valid label index
            while not (labelNumber.isdigit() and 1 <= int(labelNumber) <= len(categories)):
                labelNumber = input("Enter the correct label number (a number please):")
            labelNumber = int(labelNumber)
            category = categories[labelNumber - 1]
            dstDir = os.path.join(TRAIN_FOLDER, category)
            shutil.move(filename, dstDir)
    else:
        break
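
# Hedged, illustrative helper (added, not part of the original script): the
# docstring warns that filenames under unlabeled must not duplicate filenames
# already in train, since labeled files are moved there. A guard like this
# fails with a clear message instead of relying on shutil.move's behavior:
def safe_move(src, dst_dir):
    dst = os.path.join(dst_dir, os.path.basename(src))
    if os.path.exists(dst):
        raise FileExistsError("duplicate filename in train set: %s" % dst)
    shutil.move(src, dst_dir)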

# leftover snippet: opens a GBK-encoded file, ignoring undecodable bytes;
# the returned handle is never used
import codecs
codecs.open("a.txt", 'r', encoding='gbk', errors='ignore')
