Skip to content

Commit b232d08

Browse files
author
xuming06
committed
add thread
1 parent a7c5159 commit b232d08

12 files changed

Lines changed: 256 additions & 63 deletions

File tree

07keras/11lstm_text_generation.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,10 @@ def get_corpus(data_path):
6262
model.add(Activation('softmax'))
6363

6464
model.compile(optimizer=RMSprop(lr=0.01), loss='categorical_crossentropy')
65+
model.summary()
6566

66-
67+
print("*"*40)
68+
print(model.summary())
6769
def sample(preds, temperature=1.0):
6870
preds = np.asarray(preds).astype('float64')
6971
preds = np.log(preds) / temperature

12gensim/04.doc2vec.py

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@
2222
from sklearn.linear_model import LogisticRegression
2323

2424
# 获取训练与测试数据及其类别标注
25-
neg_file = 'neg.txt'
26-
pos_file = 'douban_imdb_data/aclImdb/train/pos'
27-
unsup_file = 'douban_imdb_data/aclImdb/train/unsup'
25+
neg_file = '../data/douban_imdb_data/neg.txt'
26+
pos_file = '../data/douban_imdb_data/pos.txt'
27+
unsup_file = '../data/douban_imdb_data/unsup.txt'
2828
sentences = gensim.models.doc2vec.TaggedLineDocument(neg_file)
2929
model = gensim.models.doc2vec.Doc2Vec(sentences)
3030
model.save('neg.d2v.model')
@@ -39,7 +39,6 @@
3939
print(model.doesnt_match("I'm sure I missed some plot points".split()))
4040

4141

42-
print(model.most_similar(positive=['but', 'what'], negative=['fact']))
4342
print(model.most_similar(positive=['blue', 'shirt'], negative=['blue']))
4443

4544

12gensim/04.doc2vec_demo.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,10 @@ def get_data(pos_file, neg_file, unsup_file):
2020
:return:
2121
"""
2222

23-
def get_folder_txt(path):
    """Read the whole text file at *path* and return its content as a one-element list."""
    with open(path, 'r', encoding='utf-8') as f:
        return [f.read()]
3128

3229
pos_reviews = get_folder_txt(pos_file)
@@ -185,9 +182,9 @@ def ROC_curve(lr, y_test):
185182
# 设置向量维度和训练次数
186183
size, epoch_num = 400, 10
187184
# 获取训练与测试数据及其类别标注
188-
neg_file = 'douban_imdb_data/aclImdb/train/neg'
189-
pos_file = 'douban_imdb_data/aclImdb/train/pos'
190-
unsup_file = 'douban_imdb_data/aclImdb/train/unsup'
185+
neg_file = '../data/douban_imdb_data/neg.txt'
186+
pos_file = '../data/douban_imdb_data/pos.txt'
187+
unsup_file = '../data/douban_imdb_data/unsup.txt'
191188
x_train, x_test, unsup_reviews, y_train, y_test = get_data(neg_file, pos_file, unsup_file)
192189
# 对数据进行训练,获得模型
193190
model_dm, model_dbow = train(x_train, x_test, unsup_reviews, size, epoch_num)

12gensim/doc2v.py

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# -*- coding: utf-8 -*-
2+
# Author: XuMing <[email protected]>
3+
# Brief:
4+
# coding:utf-8
5+
6+
import sys
7+
import gensim
8+
import sklearn
9+
import numpy as np
10+
11+
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
12+
13+
TaggededDocument = gensim.models.doc2vec.TaggedDocument
14+
15+
16+
def get_datasest(data_path="../data/ngram_wordseg/gold.txt"):
    """Load a pre-segmented corpus and wrap each line as a TaggedDocument.

    :param data_path: path to the whitespace-segmented text file, one document
        per line (default keeps the original hard-coded location).
    :return: list of TaggededDocument, tagged with the 0-based line index.
    """
    with open(data_path, 'r', encoding='utf8') as cf:
        docs = cf.readlines()
    print(len(docs))

    x_train = []
    for i, text in enumerate(docs):
        # str.split() already discards the trailing newline, so the old
        # explicit strip of the last token was a no-op — and it raised
        # IndexError on blank lines; splitting alone handles both cases.
        word_list = text.split()
        x_train.append(TaggededDocument(word_list, tags=[i]))
    return x_train
31+
32+
33+
def getVecs(model, corpus, size):
    """Collect the trained document vectors for *corpus* as an (n, size) ndarray."""
    rows = []
    for doc in corpus:
        vec = model.docvecs[doc.tags[0]]
        rows.append(np.array(vec).reshape(1, size))
    return np.concatenate(rows)
36+
37+
38+
def train(x_train, size=200, epoch_num=70):
    """Train a distributed-memory Doc2Vec model and save it to 'model_dm'.

    :param x_train: list of TaggedDocument training examples.
    :param size: dimensionality of the document vectors.
    :param epoch_num: number of training epochs. The original code ignored
        this parameter and hard-coded epochs=70; the default is now 70 so
        existing default callers get identical behavior, while explicit
        values are finally honored.
    :return: the trained Doc2Vec model.
    """
    model_dm = Doc2Vec(x_train, min_count=1, window=3, size=size, sample=1e-3, negative=5, workers=4)
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=epoch_num)
    model_dm.save('model_dm')

    return model_dm
44+
45+
46+
def test():
    """Infer a vector for a sample segmented sentence and return its 10 most similar docs."""
    model_dm = Doc2Vec.load("model_dm")
    # Fix: the original list was missing two commas, so adjacent literals were
    # implicitly concatenated ('争霸' '》' -> '争霸》', '十强' '出炉' -> '十强出炉'),
    # silently producing 8 fused tokens instead of the intended 10.
    test_text = ['《', '舞林', '争霸', '》', '十强', '出炉', '复活', '舞者', '澳门', '踢馆']
    inferred_vector_dm = model_dm.infer_vector(test_text)
    print(inferred_vector_dm)
    sims = model_dm.docvecs.most_similar([inferred_vector_dm], topn=10)

    return sims
54+
55+
56+
if __name__ == '__main__':
    x_train = get_datasest()
    model_dm = train(x_train)

    # Print each similar document's tokens, its similarity score, and length.
    sims = test()
    for count, sim in sims:
        sentence = x_train[count]
        # The original built the string with a quadratic `words += word + ' '`
        # loop; join is the idiomatic O(n) form. The trailing space the loop
        # produced is kept so printed output is byte-identical.
        words = ' '.join(sentence[0]) + ' ' if sentence[0] else ''
        print(words, sim, len(sentence[0]))
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
# -*- coding: utf-8 -*-
# Author: XuMing <[email protected]>
# Brief: plot decision regions for three base classifiers and a soft-voting ensemble
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import EnsembleVoteClassifier
from mlxtend.data import iris_data
from mlxtend.plotting import plot_decision_regions

# Base classifiers; the ensemble soft-votes with logistic regression weighted 2x.
clf1 = LogisticRegression(random_state=0)
clf2 = RandomForestClassifier(random_state=0)
clf3 = SVC(random_state=0, probability=True)
eclf = EnsembleVoteClassifier(clfs=[clf1, clf2, clf3], weights=[2, 1, 1], voting='soft')

# Iris data restricted to two features so regions can be drawn in 2-D.
X, y = iris_data()
X = X[:, [0, 2]]

# One subplot per classifier on a 2x2 grid.
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10, 8))

models = [clf1, clf2, clf3, eclf]
titles = ['Logistic Regression', 'Random Forest', 'RBF kernel SVM', 'Ensemble']
for clf, lab, grd in zip(models, titles, itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(lab)
plt.savefig('1.png')
plt.show()

22data-mining/frequent_patterns/dataSet/dblpDataAll.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ Michael Ley
22
E. F. Codd
33
E. F. Codd,C. J. Date
44
Patrick A. V. Hall
5-
E. F. Codd
5+
E. F. Codd Codd
66
Markus Tresch
77
E. F. Codd
88
E. F. Codd

22data-mining/frequent_patterns/freq_utils.py

Lines changed: 22 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -58,55 +58,35 @@ def loadUnixData(fileRead, fileWrite):
5858
return dataSet
5959

6060

61-
def load_title_data(file_path, flag, row_num=1):
    '''
    Load title data: each input line becomes the list of its characters.
    :param file_path: path of the input text file
    :param flag: separator flag (kept for interface compatibility; currently unused)
    :param row_num: maximum number of rows to read
    :return: (dataSetDict, dataSet) — dataSetDict maps frozenset(chars) to its
             occurrence count; dataSet is the list of per-line character lists
    '''
    dataSetDict = {}
    dataSet = []
    count = 0
    print(file_path)
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            # Fix: was `count > row_num`, which read row_num + 1 lines (off-by-one).
            if count >= row_num:
                break
            chars = list(line.strip())
            dataSet.append(chars)
            # The old code rebuilt the list ([word for word in line]) before
            # freezing it — an unnecessary copy; frozenset takes any iterable.
            key = frozenset(chars)
            dataSetDict[key] = dataSetDict.get(key, 0) + 1
            count += 1
    return dataSetDict, dataSet
10781

10882

10983
def printDataSet(dataSet):
    """Print every item of the nested dataset, one item per line."""
    for row in dataSet:
        for item in row:
            print(item)
87+
88+
89+
def save_freqItems(freqItems_fp, save_path):
    """Persist frequent itemsets, one per line: items joined by spaces, TAB, count."""
    lines = ['{}\t{}\n'.format(' '.join(entry[0]), entry[1]) for entry in freqItems_fp]
    with open(save_path, 'w', encoding='utf-8') as f:
        f.writelines(lines)

22data-mining/frequent_patterns/main.py

Lines changed: 26 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,11 @@
33
# Brief:
44
import time
55
from eclat import eclat_zc
6-
from freq_utils import loadDblpData, loadData, loadUnixData
6+
from freq_utils import loadDblpData, load_title_data, printDataSet, save_freqItems
77
from apriori import apriori_zc
88
from fp_growth import fp_growth
99

10+
1011
def test_fp_growth(minSup, dataSetDict, dataSet):
1112
freqItems = fp_growth(dataSetDict, minSup)
1213
freqItems = sorted(freqItems.items(), key=lambda item: item[1])
@@ -32,6 +33,7 @@ def print_freqItems(logo, freqItems):
3233
print(len(freqItems))
3334
print("-------------------", logo, " end ---------------")
3435

36+
3537
def do_experiment_data_size():
3638
data_name = 'unixData8_pro.txt'
3739
x_name = "Data_Size"
@@ -151,7 +153,7 @@ def do_test():
151153
def do_dblp_data():
152154
data_name = 'dblpDataAll.txt'
153155
x_name = "Min_Support"
154-
data_num = 2715700
156+
data_num = 980
155157
minSup = 100
156158
dataSetDict, dataSet = loadDblpData(("dataSet/" + data_name), ',', data_num)
157159

@@ -165,9 +167,27 @@ def do_dblp_data():
165167
print(item)
166168

167169

168-
if __name__ == '__main__':
169-
x_value, y_value = do_experiment_min_support()
170-
x_value, y_value = do_experiment_data_size()
171-
do_test()
170+
def do_title_data():
171+
data_name = 'title.txt'
172+
x_name = "Min_Support"
173+
data_num = 22846
174+
minSup = data_num / 100
175+
dataSetDict, dataSet = load_title_data(("dataSet/" + data_name), ',', data_num)
176+
printDataSet(dataSet[:10])
177+
time_fp = 0
178+
ticks0 = time.time()
179+
freqItems_fp = test_eclat(minSup, dataSetDict, dataSet)
180+
time_fp += time.time() - ticks0
181+
print(time_fp)
182+
183+
print(freqItems_fp[:10])
184+
save_freqItems(freqItems_fp, "dataSet/title_out.txt")
172185

186+
187+
if __name__ == '__main__':
    # Earlier experiments, kept disabled for reference:
    # x_value, y_value = do_experiment_min_support()
    # x_value, y_value = do_experiment_data_size()
    # do_test()
    do_dblp_data()
    # do_title_data()

22data-mining/prob_stack.py

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
# -*- coding: utf-8 -*-
# Author: XuMing <[email protected]>
# Brief: compare stacked classifiers via cross-validation and plot decision regions
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import itertools
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from mlxtend.classifier import EnsembleVoteClassifier, StackingClassifier
from mlxtend.data import iris_data
from mlxtend.plotting import plot_decision_regions

# Iris data restricted to two features so regions can be drawn in 2-D.
X, y = iris_data()
X = X[:, [0, 2]]

# Three base learners with a logistic-regression meta-classifier stacked on top.
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3],
                          use_probas=True,
                          average_probas=False,
                          meta_classifier=lr)

print('3-fold cross validation:\n')

# One subplot per classifier on a 2x2 grid.
gs = gridspec.GridSpec(2, 2)
fig = plt.figure(figsize=(10, 8))

titles = ['KNN', 'Random Forest', 'Naive Bayes', 'StackingClassifier']
for clf, lab, grd in zip([clf1, clf2, clf3, sclf], titles,
                         itertools.product([0, 1], repeat=2)):
    clf.fit(X, y)
    ax = plt.subplot(gs[grd[0], grd[1]])
    fig = plot_decision_regions(X=X, y=y, clf=clf, legend=2)
    plt.title(lab)

    # Report mean/std accuracy over 3 folds for each model.
    scores = cross_val_score(clf, X, y, cv=3, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]"
          % (scores.mean(), scores.std(), lab))
plt.savefig('2.png')
plt.show()
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# -*- coding: utf-8 -*-
# Author: XuMing <[email protected]>
# Brief: get special words by sentence

# Sample ad/spam sentences (medical promotion text with obfuscated characters).
sentences = [
    "{甲状腺节结治疗哪家医院好}?郑州市第二中医院",
    "{检查前列腺囊仲对身体有害吗}前列腺囊仲的病因之一:久站久坐,性生活过频,性生活被迫中断.过多的手淫等.临床表现有排尿疫常等!",
    "{龟头长小红点},在阴茎的龟头上有红斑点,红肿等切不可怠慢,可能是泌尿发炎症或性疾病所致.如龟头疾病,泡诊等泌尿疾病",
    "现在处女模修复手术多少钱}<<沧州华美妇产医院>>韩式处女膜修复术,落红高",
    "乳房乳腺节结手术吗}?南京京科医院,知名乳腺医师,美国先进检查设备,精准确诊乳腺节结",
    "青岛市里医院{关键词}{能洗眼线吗}?选华韩整形洗眼线",
    "男性朋友需警惕,冠状沟疾病是由于真菌感蒅,不洁性行为引起!",
    "乐清哪个看妇科焱症的医院好}同济妇科医院,老百姓的妇科医院",
    "信阳妇科医院} 专业开展无疼流产手术,技术治疗妇科炎症,玑瘤肿囊,女性不孕",
    ",常见的臯丸仲胀,臯丸疼痛不适等.都可能是臯丸焱所引起的",
    "中医治疗鱼鳞痔的方法},西安北方中医皮肤病医院,西安鱼鳞痔跈疗基地",
]

# Dump each sample sentence to stdout.
for sentence in sentences:
    print(sentence)

0 commit comments

Comments (0)