Skip to content

Commit fc1b88e

Browse files
committed
2021.9.3.19.05
1 parent b5562d8 commit fc1b88e

65 files changed

Lines changed: 2987 additions & 39 deletions

File tree

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

chinese_stop_words.txt

Lines changed: 1470 additions & 0 deletions
Large diffs are not rendered by default.

classify_file.txt

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
./test_segments/人工智能算法/为什么国家将加快人工智能研究生培养?又为什么很多研究生评论人工智能是个大坑呢?.txt 实际类别:人工智能算法 -->预测类别:统计学
2+
./test_segments/人工智能算法/人工智能可以产生自主意识吗?.txt 实际类别:人工智能算法 -->预测类别:统计学
3+
./test_segments/人工智能算法/如果人工智能迎来下一个寒冬,你认为会是卡在什么问题上?.txt 实际类别:人工智能算法 -->预测类别:统计学
4+
./test_segments/人工智能算法/浅析 Hinton 最近提出的 Capsule 计划.txt 实际类别:人工智能算法 -->预测类别:统计学
5+
./test_segments/人工智能算法/面对大数据杀熟、算法困住骑手,民主促进会中央建议推行算法开发主体责任制,你怎么看?.txt 实际类别:人工智能算法 -->预测类别:人工智能算法
6+
./test_segments/统计学/Kaggle如何入门?.txt 实际类别:统计学 -->预测类别:统计学
7+
./test_segments/统计学/为什么全网都在说生育率下降,而我看周围的人都结婚生子有条不紊地进行呢?.txt 实际类别:统计学 -->预测类别:人工智能算法
8+
./test_segments/统计学/为什么样本方差(sample variance)的分母是 n-1?.txt 实际类别:统计学 -->预测类别:统计学
9+
./test_segments/统计学/什么是幸存者偏差?.txt 实际类别:统计学 -->预测类别:统计学
10+
./test_segments/统计学/神经网络为什么可以(理论上)拟合任何函数?.txt 实际类别:统计学 -->预测类别:统计学

cut_words.py

Lines changed: 29 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -47,63 +47,63 @@ def cast_words(origin_path, save_path, theme_tag):
4747
'''
4848
file_lists = os.listdir(origin_path) #原文档所在路径
4949

50-
print('\n'+'file_lists:')
51-
print(file_lists)
52-
print('\n'+'origin_path:')
53-
print(origin_path)
50+
# print('\n'+'file_lists:')
51+
# print(file_lists)
52+
# print('\n'+'origin_path:')
53+
# print(origin_path)
5454

5555
for dir_1 in file_lists: #找到文件夹
5656
file_path = origin_path + dir_1 + "/" #原始文件路径
5757

58-
print('\n' + 'dir_1:')
59-
print(dir_1)
60-
61-
print('\n' + 'file_path:')
62-
print(file_path)
58+
# print('\n' + 'dir_1:')
59+
# print(dir_1)
60+
#
61+
# print('\n' + 'file_path:')
62+
# print(file_path)
6363

6464
seg_path = save_path + dir_1 + "/" #切词后文件路径
6565

66-
print('\n' + 'save_path:')
67-
print(save_path)
68-
69-
print('\n' + 'seg_path:')
70-
print(seg_path)
66+
# print('\n' + 'save_path:')
67+
# print(save_path)
68+
#
69+
# print('\n' + 'seg_path:')
70+
# print(seg_path)
7171

7272
if not os.path.exists(seg_path):
7373
os.makedirs(seg_path)
7474
detail_paths = os.listdir(file_path)
7575

76-
print('\n' + 'detail_paths:')
77-
print(detail_paths)
76+
# print('\n' + 'detail_paths:')
77+
# print(detail_paths)
7878

7979
for detail_path in detail_paths: #找到文件夹下具体文件路径
8080
full_path = file_path + detail_path #原始文件下每个文档路径
8181

82-
print('\n' + 'detail_path:')
83-
print(detail_path)
84-
85-
print('\n' + 'full_path:')
86-
print(full_path)
82+
# print('\n' + 'detail_path:')
83+
# print(detail_path)
84+
#
85+
# print('\n' + 'full_path:')
86+
# print(full_path)
8787

8888
file_content = read_file(full_path)
8989

90-
print('\n' + 'file_content:')
91-
print(file_content)
90+
# print('\n' + 'file_content:')
91+
# print(file_content)
9292

9393
file_content = file_content.strip() # replace("\r\n", " ")
9494
# 删除换行
95-
print('\n' + 'file_content.strip():')
96-
print(file_content)
95+
# print('\n' + 'file_content.strip():')
96+
# print(file_content)
9797

9898
file_content = file_content.replace("\'", "")
9999

100-
print('\n' + 'file_content.replace("\'", ""):')
101-
print(file_content)
100+
# print('\n' + 'file_content.replace("\'", ""):')
101+
# print(file_content)
102102

103103
file_content = file_content.replace("\\n", "")
104104

105-
print('\n' + 'file_content.replace("\\n", ""):')
106-
print(file_content)
105+
# print('\n' + 'file_content.replace("\\n", ""):')
106+
# print(file_content)
107107

108108
content_seg1 = jieba.cut(file_content) # 为文件内容分词
109109
content_seg2 = jieba.cut(file_content) # 为文件内容分词

nbayes.py

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
#!/usr/bin/env python
2+
# -*- coding: UTF-8 -*-
3+
import pickle
4+
from sklearn.naive_bayes import MultinomialNB
5+
import warnings
6+
from sklearn import metrics
7+
warnings.filterwarnings("ignore")
8+
# 读取bunch对象
9+
def read_bunch(path):
10+
with open(path, "rb") as fp:
11+
bunch = pickle.load(fp) # joblib 同样可用于存储模型文件
12+
return bunch
13+
# 分类结果,保存至文件
14+
def save_file(save_path, content):
15+
with open(save_path, "a",encoding= 'utf-8',errors='ignore') as fp:
16+
fp.write(content)
17+
# 朴素贝叶斯分类
18+
def nbayes_classify(train_set, test_set):
19+
'''
20+
train_set: 训练集样本数据
21+
test_set: 测试集样本数据
22+
:return: 测试集样本分类
23+
'''
24+
clf = MultinomialNB(alpha=0.5)
25+
clf.fit(train_set.tdm, train_set.label) # 训练模型
26+
predict = clf.predict(test_set.tdm)
27+
return predict
28+
def classification_result(actual, predict):
29+
print('精度:{0:.3f}'.format(metrics.precision_score(actual,predict,average='weighted')))
30+
print('召回:{0:0.3f}'.format(metrics.recall_score(actual,predict,average='weighted')))
31+
print('f1-score:{0:.3f}'.format(metrics.f1_score(actual,predict,average='weighted')))
32+
if __name__ == '__main__':
33+
# 导入训练集
34+
train_path = './train_tfdifspace.dat'
35+
train_set = read_bunch(train_path)
36+
# 导入测试集
37+
test_path = "./test_tfidfspace.dat"
38+
test_set = read_bunch(test_path)
39+
predict = nbayes_classify(train_set, test_set) #
40+
classification_result(test_set.label, predict)
41+
print('-' * 100)
42+
#保存结果路径
43+
save_path = './classify_file.txt'
44+
for label, filename, predict in zip(test_set.label, test_set.filepath ,predict): #test_set
45+
print(filename, "\t实际类别:",label,"\t-->预测类别:", predict)
46+
save_content = filename + "\t实际类别:" + label + "\t-->预测类别:" + predict+ '\n'
47+
save_file(save_path, save_content) # 将分类结果写入txt

test_bunch_bag.dat

161 KB
Binary file not shown.

test_segments/kong_test/网络.txt

Lines changed: 0 additions & 1 deletion
This file was deleted.

0 commit comments

Comments
 (0)