99from sklearn .feature_extraction .text import TfidfVectorizer
1010import numpy as np
1111
12-
1312tfidf_filepath = 'tfidf.pkl'
1413file_path = 'yl_10.txt'
1514titles = []
@@ -64,19 +63,19 @@ def feature():
6463 doc = trim_stopwords (words , stopwords )
6564 docs .append (" " .join (doc ))
6665 word_set |= set (doc )
67- print ('word set size:%s' % len (word_set ))
66+ print ('word set size:%s' % len (word_set ))
6867 tfidf_vectorizer = TfidfVectorizer (max_df = 0.9 , min_df = 0.1 , analyzer = 'word' , ngram_range = (1 , 2 ),
6968 vocabulary = list (word_set ))
70- return tfidf_vectorizer ,docs
69+ return tfidf_vectorizer , docs
7170
7271
7372if not os .path .exists (tfidf_filepath ):
74- tfidf_vectorizer ,docs = feature ()
73+ tfidf_vectorizer , docs = feature ()
7574 tfidf_matrix = tfidf_vectorizer .fit_transform (docs ) # fit the vectorizer to synopses
7675 # terms is just a set of the features used in the tf-idf matrix. This is a vocabulary
7776 terms = tfidf_vectorizer .get_feature_names () # length 258
7877 print (terms )
79- print ('feature name size:%s' % len (terms ))
78+ print ('feature name size:%s' % len (terms ))
8079
8180 with open (tfidf_filepath , 'wb' ) as f :
8281 pickle .dump (tfidf_matrix , f )
@@ -105,6 +104,8 @@ def feature():
105104# Z[i] will tell us which clusters were merged, let's take a look at the first two points that were merged
106105# We can see that each row of the resulting array has the format [idx1, idx2, dist, sample_count]
107106print (linkage_matrix )
107+
108+
108109def show_link ():
109110 plt .figure (figsize = (25 , 10 ))
110111 plt .title ('中文文本层次聚类树状图' , fontproperties = zh_font )
@@ -118,18 +119,26 @@ def show_link():
118119 )
119120 plt .show ()
120121 plt .close ()
121- show_link ()
122122
123- from sklearn .cluster import KMeans
123+
124+ # show_link()
125+
126+ from sklearn .cluster import MiniBatchKMeans
124127from sklearn .metrics .pairwise import pairwise_distances_argmin
125- n_clusters = 4
126- X = tfidf_matrix
127- kmeans = KMeans (n_clusters = 4 ,random_state = 10 ).fit (X )
128- print (kmeans .labels_ )
128+
129+ n_clusters = 4
130+ X = tfidf_matrix
131+ kmeans = MiniBatchKMeans (init = 'k-means++' , n_clusters = n_clusters , batch_size = 100 ,
132+ n_init = 10 , max_no_improvement = 10 , verbose = 0 )
133+ kmeans .fit (X )
129134print (kmeans .cluster_centers_ )
135+ print (kmeans .labels_ )
136+ from collections import Counter
137+
138+ print (Counter (kmeans .labels_ ))
130139
131- print ("-" * 30 )
140+ print ("-" * 30 )
132141k_means_cluster_centers = np .sort (kmeans .cluster_centers_ , axis = 0 )
133142k_means_labels = pairwise_distances_argmin (X , k_means_cluster_centers )
134143print (k_means_cluster_centers )
135- print (k_means_labels )
144+ print (k_means_labels )
0 commit comments