
Commit a63bc16

committed
update train model with noisy labels.
1 parent 6893d17 commit a63bc16

2 files changed

Lines changed: 178 additions & 58 deletions


Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description:
"""

# # Simplified Confident Learning Tutorial
# *Author: Curtis G. Northcutt, [email protected]*
#
# In this tutorial, we show how to implement confident learning without using cleanlab (for the most part).
# This tutorial is to confident learning what this tutorial https://pytorch.org/tutorials/beginner/examples_tensor/two_layer_net_numpy.html
# is to deep learning.
#
# The actual implementations in cleanlab are complex because they support parallel processing, numerous type and input checks, lots of hyper-parameter settings, lots of utilities to make things work smoothly for all types of inputs, and ancillary functions.
#
# I ignore all of that here and provide you a bare-bones implementation using mostly for-loops and some numpy.
# Here we'll do two simple things (a bare-bones sketch of step 1 appears right after the imports below):
# 1. Compute the confident joint, which fully characterizes all label noise.
# 2. Find the indices of all label errors, ordered by likelihood of being an error.
#
# ## INPUT (stuff we need beforehand):
# 1. s - These are the noisy labels. This is an np.array of noisy labels, shape (n,1)
# 2. psx - These are the out-of-sample holdout predicted probabilities for every example in your dataset. This is an np.array (2d) of probabilities, shape (n, m)
#
# ## OUTPUT (what this returns):
# 1. confident_joint - an (m, m) np.array matrix characterizing all the label error counts for every pair of labels.
# 2. label_errors_idx - a numpy array comprised of the indices of every label error, ordered by likelihood of being a label error.
#
# In this tutorial we use the iris dataset as an example.

# In[1]:


from __future__ import print_function, absolute_import, division, with_statement

# To silence convergence warnings caused by using a weak,
# non-optimized logistic regression classifier
import warnings

import cleanlab
import numpy as np
from cleanlab.classification import LearningWithNoisyLabels
from cleanlab.pruning import get_noise_indices
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

warnings.simplefilter("ignore")
np.random.seed(477)
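
# Below is a minimal sketch of step 1, the confident joint computation
# described in the intro. It is our own illustration of the idea, not
# cleanlab's actual implementation; the function name and the simple
# per-class mean thresholds are assumptions made for clarity. It can be
# applied to the (s, psx) pair computed later in this script.
def confident_joint_sketch(s, psx):
    """Count examples under (noisy label, confidently predicted label)."""
    n, m = psx.shape
    # Threshold for class j: average predicted probability of class j over
    # the examples whose noisy label is j (assumes every class occurs in s).
    thresholds = np.array([psx[s == j, j].mean() for j in range(m)])
    cj = np.zeros((m, m), dtype=int)
    for i in range(n):
        # Classes whose predicted probability clears their own threshold.
        above = [j for j in range(m) if psx[i, j] >= thresholds[j]]
        if above:
            # Credit example i to its most confident qualifying class.
            j_star = max(above, key=lambda j: psx[i, j])
            cj[s[i], j_star] += 1
    return cj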

# In[2]:


# STEP 0 - Get some real data. Add a bunch of label errors. Get probs.
# Load the iris dataset
iris = datasets.load_iris()
X = iris.data  # all four features
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print("X:", X[:10])
print("y_train:", y_train[:200])
print('dataset number of classes:', len(np.unique(y)))
print('dataset number of examples:', len(y))

# Add lots of errors to labels
s = np.array(y_train)
for i in range(10):
    # Flip to class 2 (a wrong label whenever the true class isn't 2)
    s[i] = 2

# Confirm how many label errors we actually added
actual_label_errors = np.arange(len(y_train))[s != y_train]
print('\nIndices of actual label errors:\n', actual_label_errors)
print('noisy labels s[:20]:', s[:20])
print("number of errors:", len(actual_label_errors))
actual_num_errors = len(actual_label_errors)
# To keep the tutorial short, we use cleanlab to get the
# out-of-sample predicted probabilities using cross-validation
# with a very simple, non-optimized logistic regression classifier
clf = LogisticRegression()
psx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
    X_train, s, clf=clf)

# Now we have our noisy labels s and predicted probabilities psx.
# That's all we need for confident learning.
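
# Aside: plain sklearn can produce the same kind of out-of-sample
# probabilities. A minimal sketch, assuming 5-fold CV is acceptable;
# psx_alt is our own variable name and is not used below.
from sklearn.model_selection import cross_val_predict

psx_alt = cross_val_predict(LogisticRegression(), X_train, s, cv=5,
                            method='predict_proba')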

# STEP 1 - Compute confident joint

# Verify inputs
s = np.asarray(s)
psx = np.asarray(psx)

ordered_label_errors = get_noise_indices(
    s=s,
    psx=psx,
    sorted_index_method='normalized_margin',  # Orders label errors
)
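
# For intuition, 'normalized_margin' ranks an example by how much its
# predicted probability for the given (noisy) label trails the best
# competing class. A rough reimplementation of that ranking over all
# examples (our own sketch; cleanlab applies it to the flagged subset):
margins = psx[np.arange(len(s)), s] - np.array(
    [np.delete(row, label).max() for row, label in zip(psx, s)])
suspicion_order = np.argsort(margins)  # smallest margin = most suspicious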

print('ordered_label_errors:')
print(np.array(sorted(ordered_label_errors)))
idx_errors = ordered_label_errors

label_errors_idx = np.array(sorted(ordered_label_errors))
score = sum([e in label_errors_idx for e in actual_label_errors]) / actual_num_errors
print('% actual errors that confident learning found: {:.0%}'.format(score))
score = sum([e in actual_label_errors for e in label_errors_idx]) / len(label_errors_idx)
print('% confident learning errors that are actual errors: {:.0%}'.format(score))
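
# The two percentages above are the error detector's recall and precision.
# The same numbers, vectorized (np.isin is just a faster equivalent):
recall = np.isin(actual_label_errors, label_errors_idx).mean()
precision = np.isin(label_errors_idx, actual_label_errors).mean()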

# Baseline: logistic regression trained directly on the noisy labels
print('WITHOUT confident learning,', end=" ")
clf.fit(X_train, s)
pred = clf.predict(X_test)
print("dataset test accuracy:", round(accuracy_score(y_test, pred), 4))

print("\nNow we show improvement using cleanlab to characterize the noise")
print("and learn on the data that is (with high confidence) labeled correctly.")
print()
print('WITH confident learning (psx not given),', end=" ")
rp = LearningWithNoisyLabels(clf=clf)
rp.fit(X_train, s)
pred = rp.predict(X_test)
print("dataset test accuracy:", round(accuracy_score(y_test, pred), 4))

print('WITH confident learning (psx given),', end=" ")
rp.fit(X=X_train, s=s, psx=psx)
pred = rp.predict(X_test)
print("dataset test accuracy:", round(accuracy_score(y_test, pred), 4))

print('WITH all labels right,', end=" ")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
print("dataset test accuracy:", round(accuracy_score(y_test, pred), 4))

print("-------------------")
rp_score = accuracy_score(y_test, rp.fit(X_train, s, psx=psx).predict(X_test))
print("Logistic regression (+rankpruning):", rp_score)

# idx_errors holds integer indices, so build a boolean mask instead of
# negating the index array with ~ (which would compute -(idx + 1)).
clean_mask = np.ones(len(s), dtype=bool)
clean_mask[idx_errors] = False
clf.fit(X_train[clean_mask], s[clean_mask])
pred = clf.predict(X_test)
print('Fit on denoised data without re-weighting:', accuracy_score(y_test, pred))

37confident-learning/03cleanlib_retrain_model_demo.py renamed to 37confident-learning/04cleanlib_retrain_model_digit_demo.py

Lines changed: 33 additions & 58 deletions
@@ -39,8 +39,11 @@
 
 import cleanlab
 import numpy as np
+from cleanlab.classification import LearningWithNoisyLabels
 from sklearn.datasets import load_digits
 from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import f1_score
+from sklearn.model_selection import train_test_split
 
 warnings.simplefilter("ignore")
 np.random.seed(477)
@@ -57,24 +60,25 @@
 print("y:", y[:100])
 print('Handwritten digits datasets number of classes:', len(np.unique(y)))
 print('Handwritten digits datasets number of examples:', len(y))
-
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
 # Add lots of errors to labels
-s = np.array(y)
-for i in range(50):
+s = np.array(y_train)
+for i in range(100):
     # Switch to some wrong label that's a different class
     s[i] = 0
 
 # Confirm that we indeed added NUM_ERRORS label errors
-actual_label_errors = np.arange(len(y))[s != y]
+actual_label_errors = np.arange(len(y_train))[s != y_train]
 print('\nIndices of actual label errors:\n', actual_label_errors)
 print('error with y, y[:20]:', s[:20])
 print("len of errors:", len(actual_label_errors))
 actual_num_errors = len(actual_label_errors)
 # To keep the tutorial short, we use cleanlab to get the
 # out-of-sample predicted probabilities using cross-validation
 # with a very simple, non-optimized logistic regression classifier
+clf = LogisticRegression()
 psx = cleanlab.latent_estimation.estimate_cv_predicted_probabilities(
-    X, s, clf=LogisticRegression(solver='lbfgs'))
+    X_train, s, clf=clf)
 
 # Now we have our noisy labels s and predicted probabilities psx.
 # That's all we need for confident learning.
@@ -94,7 +98,6 @@
 ordered_label_errors = get_noise_indices(
     s=s,
     psx=psx,
-    frac_noise=2.1,
     sorted_index_method='normalized_margin',  # Orders label errors
 )
 
@@ -109,64 +112,36 @@
 score = sum([e in actual_label_errors for e in label_errors_idx]) / len(label_errors_idx)
 print('% confident learning errors that are actual errors: {:.0%}'.format(score))
 
-
-from sklearn.metrics import roc_auc_score,accuracy_score,f1_score
 # original lr f1
-m = LogisticRegression()
-m.fit(X,y=s)
-m_pred = m.predict(X)
-f1_origin = f1_score(s,m_pred,average='micro')
-print('f1_origin_compare_error:',f1_origin)
-
-f1_origin_true = f1_score(y,m_pred,average='micro')
-print('f1_origin_compare_truth:',f1_origin_true)
-from cleanlab.classification import LearningWithNoisyLabels
 
-# Wrap around any classifier. Yup, you can use sklearn/pyTorch/Tensorflow/FastText/etc.
-lnl = LearningWithNoisyLabels(clf=LogisticRegression())
-lnl.fit(X=X, s=s)
-# Estimate the predictions you would have gotten by training with *no* label errors.
-predicted_test_labels = lnl.predict(X)
-f1_origin = f1_score(s,predicted_test_labels,average='micro')
-print('f1_new_compare_error:',f1_origin)
+print('WITHOUT confident learning,', end=" ")
 
-f1_origin_true = f1_score(y,predicted_test_labels,average='micro')
-print('f1_new_compare_truth:',f1_origin_true)
+clf.fit(X_train, s)
+pred = clf.predict(X_test)
+print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))
 
-lnl = LearningWithNoisyLabels(clf=LogisticRegression())
-lnl.fit(X=X, s=s,psx=psx)
-# Estimate the predictions you would have gotten by training with *no* label errors.
-predicted_test_labels = lnl.predict(X)
-f1_origin = f1_score(s,predicted_test_labels,average='micro')
-print('f1_psx_compare_error:',f1_origin)
+print("\nNow we show improvement using cleanlab to characterize the noise")
+print("and learn on the data that is (with high confidence) labeled correctly.")
+print()
+print('WITH confident learning (psx not given),', end=" ")
+rp = LearningWithNoisyLabels(clf=clf)
+rp.fit(X_train, s)
+pred = rp.predict(X_test)
+print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))
 
-f1_origin_true = f1_score(y,predicted_test_labels,average='micro')
-print('f1_psx_compare_truth:',f1_origin_true)
+print('WITH confident learning (psx given),', end=" ")
+rp.fit(X=X_train, s=s, psx=psx)
+pred = rp.predict(X_test)
+print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))
 
-f1 = f1_score(y,predicted_test_labels,average='micro')
-print("f1:",f1)
-score = lnl.score(X,s)
-print('score_compare_error:',score)
+print('WITH all labels right,', end=" ")
+clf.fit(X_train, y_train)
+pred = clf.predict(X_test)
+print("dataset test f1:", round(f1_score(pred, y_test, average='micro'), 4))
 
-score = lnl.score(X,y)
-print('score_compare_truth:',score)
-
-m = LogisticRegression()
-m.fit(X,y)
-m_pred = m.predict(X)
-
-f1_origin_true = f1_score(y,m_pred,average='micro')
-print('f1_all_right:',f1_origin_true)
-
-print("f1 Comparison")
 print("-------------------")
-clf = LogisticRegression(solver = 'lbfgs', multi_class = 'auto')
-baseline_score = f1_score(y, clf.fit(X, s).predict(X),average='micro')
-print("Logistic regression baseline_score:", baseline_score)
-rp = LearningWithNoisyLabels()
-rp_score = f1_score(y, rp.fit(X, s, psx=psx).predict(X),average='micro')
+rp_score = f1_score(y_test, rp.fit(X_train, s, psx=psx).predict(X_test), average='micro')
 print("Logistic regression (+rankpruning):", rp_score)
-diff = rp_score - baseline_score
-clf = LogisticRegression(solver = 'lbfgs', multi_class = 'auto')
-print('Fit on denoised data without re-weighting:', f1_score(y, clf.fit(X[~idx_errors], s[~idx_errors]).predict(X),average='micro'))
-
+clf = LogisticRegression(solver='lbfgs', multi_class='auto')
+print('Fit on denoised data without re-weighting:',
+      f1_score(y_test, clf.fit(X_train[~idx_errors], s[~idx_errors]).predict(X_test), average='micro'))
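
Note on the retrain step in the last added lines above: because get_noise_indices is called with a sorted_index_method, idx_errors holds integer indices, so X_train[~idx_errors] applies bitwise NOT to those integers rather than inverting a boolean mask. A minimal sketch of the safer masking, using this script's variables (the name clean_mask is our own):

clean_mask = np.ones(len(s), dtype=bool)
clean_mask[idx_errors] = False
clf.fit(X_train[clean_mask], s[clean_mask])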

0 commit comments