-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathword_to_bunch.py
More file actions
37 lines (37 loc) · 1.31 KB
/
word_to_bunch.py
File metadata and controls
37 lines (37 loc) · 1.31 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import os
import pickle
import time
from sklearn.datasets._base import Bunch
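'''
Fields of the Bunch built by this script:
label: category of the article
filepath: path to the article file
contents: text of the article after word segmentation
'''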
def read_file(file_path):
    """Return the full text of the file at ``file_path``.

    The file is opened in text mode and decoded as UTF-8; bytes that cannot
    be decoded are dropped (``errors='ignore'``) instead of raising.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file content as a single ``str``, line breaks included.
    """
    with open(file_path, "r", encoding='utf-8', errors='ignore') as fp:
        # read() yields the text itself. Wrapping readlines() in str() would
        # instead yield the repr of a list, i.e. the text polluted with
        # brackets, quotes and escaped newline sequences.
        return fp.read()
def word_to_bunch(train_save_path, train_bunch_path):
    """Collect the article files under a directory into a Bunch and pickle it.

    Every sub-directory of ``train_save_path`` is treated as one label, and
    every entry inside it as one article file. For each article, the label
    (sub-directory name), the file path and the file text (as returned by
    ``read_file``) are appended to the ``label``, ``filepath`` and
    ``contents`` lists of the Bunch. The Bunch is then pickled to
    ``train_bunch_path`` and a completion message is printed.

    Args:
        train_save_path: Root directory holding one sub-directory per label.
            A trailing path separator is optional.
        train_bunch_path: Path of the output file the pickled Bunch is
            written to; an existing file is overwritten.
    """
    bunch = Bunch(label=[], filepath=[], contents=[])
    # os.listdir returns entries in arbitrary order; sorting makes the
    # pickled bunch reproducible across runs and machines.
    for label in sorted(os.listdir(train_save_path)):
        detail_path = os.path.join(train_save_path, label)
        if not os.path.isdir(detail_path):
            # A stray file at the label level (e.g. .DS_Store) is not a
            # label directory and cannot be listed; skip it.
            continue
        for file_name in sorted(os.listdir(detail_path)):
            # Full path of this article file.
            file_detail_path = os.path.join(detail_path, file_name)
            bunch.label.append(label)
            bunch.filepath.append(file_detail_path)
            bunch.contents.append(read_file(file_detail_path))
    # Write-only binary mode is sufficient: the file is never read back here.
    with open(train_bunch_path, "wb") as fp:
        pickle.dump(bunch, fp)
    print("创建完成")
if __name__ == "__main__":
    # Build one pickled bunch per data split: the training set first, then
    # the test set. Each pair is (segmented-articles dir, output bunch file).
    for segments_dir, bunch_file in (
        ('./train_segments/', "train_bunch_bag.dat"),
        ('./test_segments/', "test_bunch_bag.dat"),
    ):
        word_to_bunch(segments_dir, bunch_file)