Skip to content

Commit feb6ae7

Browse files
author
xuming06
committed
add idf
1 parent b232d08 commit feb6ae7

1 file changed

Lines changed: 27 additions & 0 deletions

File tree

11scikit-learn/idf.py

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
# -*- coding: utf-8 -*-
# Author: XuMing <[email protected]>
# Brief: Fit a TfidfVectorizer on a small toy corpus and print, for every
#        document, each vocabulary term's TF-IDF weight followed by the five
#        highest-weighted terms.

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ["I come to China to travel",
          "This is a car polupar in China",
          "I love tea and Apple ",
          "The work is to write some papers in science"]

vectorizer = TfidfVectorizer()

# Learn the vocabulary and IDF from the corpus and return the
# document-term matrix.
tfidf = vectorizer.fit_transform(corpus)
print(tfidf)
print('vocab:')
print(vectorizer.vocabulary_)

# All terms of the fitted bag-of-words vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    word = list(vectorizer.get_feature_names_out())
else:
    word = vectorizer.get_feature_names()

# Dense TF-IDF matrix: weight[i][j] is the weight of term j in document i.
weight = tfidf.toarray()

# Outer loop walks the documents; inner loop walks that document's term weights.
for i, row in enumerate(weight):
    print(u"-------这里输出第", i, u"类文本的词语tf-idf权重------")
    k_v = dict()
    for term, score in zip(word, row):
        print(term, score)
        k_v[term] = score
    # Rank this document's terms by weight, highest first, and show the top five.
    sorts = sorted(k_v.items(), key=lambda d: d[1], reverse=True)
    print(sorts[:5])

0 commit comments

Comments
 (0)