Skip to content

Commit 7bec188

Browse files
author
xuming06
committed
update with mini batch kmeans.
1 parent cbfd810 commit 7bec188

1 file changed

Lines changed: 22 additions & 13 deletions

File tree

scikit-learn/text_cluster/sklearn_cluster.py

Lines changed: 22 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99
from sklearn.feature_extraction.text import TfidfVectorizer
1010
import numpy as np
1111

12-
1312
tfidf_filepath = 'tfidf.pkl'
1413
file_path = 'yl_10.txt'
1514
titles = []
@@ -64,19 +63,19 @@ def feature():
6463
doc = trim_stopwords(words, stopwords)
6564
docs.append(" ".join(doc))
6665
word_set |= set(doc)
67-
print('word set size:%s'%len(word_set))
66+
print('word set size:%s' % len(word_set))
6867
tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.1, analyzer='word', ngram_range=(1, 2),
6968
vocabulary=list(word_set))
70-
return tfidf_vectorizer,docs
69+
return tfidf_vectorizer, docs
7170

7271

7372
if not os.path.exists(tfidf_filepath):
74-
tfidf_vectorizer,docs = feature()
73+
tfidf_vectorizer, docs = feature()
7574
tfidf_matrix = tfidf_vectorizer.fit_transform(docs) # fit the vectorizer to synopses
7675
# terms is just a 集合 of the features used in the tf-idf matrix. This is a vocabulary
7776
terms = tfidf_vectorizer.get_feature_names() # 长度258
7877
print(terms)
79-
print('feature name size:%s'%len(terms))
78+
print('feature name size:%s' % len(terms))
8079

8180
with open(tfidf_filepath, 'wb') as f:
8281
pickle.dump(tfidf_matrix, f)
@@ -105,6 +104,8 @@ def feature():
105104
# Z[i] will tell us which clusters were merged, let's take a look at the first two points that were merged
106105
# We can see that ach row of the resulting array has the format [idx1, idx2, dist, sample_count]
107106
print(linkage_matrix)
107+
108+
108109
def show_link():
109110
plt.figure(figsize=(25, 10))
110111
plt.title('中文文本层次聚类树状图', fontproperties=zh_font)
@@ -118,18 +119,26 @@ def show_link():
118119
)
119120
plt.show()
120121
plt.close()
121-
show_link()
122122

123-
from sklearn.cluster import KMeans
123+
124+
# show_link()
125+
126+
from sklearn.cluster import MiniBatchKMeans
124127
from sklearn.metrics.pairwise import pairwise_distances_argmin
125-
n_clusters=4
126-
X=tfidf_matrix
127-
kmeans = KMeans(n_clusters=4,random_state=10).fit(X)
128-
print(kmeans.labels_)
128+
129+
n_clusters = 4
130+
X = tfidf_matrix
131+
kmeans = MiniBatchKMeans(init='k-means++', n_clusters=n_clusters, batch_size=100,
132+
n_init=10, max_no_improvement=10, verbose=0)
133+
kmeans.fit(X)
129134
print(kmeans.cluster_centers_)
135+
print(kmeans.labels_)
136+
from collections import Counter
137+
138+
print(Counter(kmeans.labels_))
130139

131-
print("-"*30)
140+
print("-" * 30)
132141
k_means_cluster_centers = np.sort(kmeans.cluster_centers_, axis=0)
133142
k_means_labels = pairwise_distances_argmin(X, k_means_cluster_centers)
134143
print(k_means_cluster_centers)
135-
print(k_means_labels)
144+
print(k_means_labels)

0 commit comments

Comments (0)