forked from shibing624/python-tutorial
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathdoc2v.py
More file actions
66 lines (48 loc) · 1.76 KB
/
doc2v.py
File metadata and controls
66 lines (48 loc) · 1.76 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
# -*- coding: utf-8 -*-
# Author: XuMing <[email protected]>
# Brief: Train a gensim Doc2Vec model on a word-segmented corpus and query it for similar documents.
# coding:utf-8
import sys
import gensim
import sklearn
import numpy as np
from gensim.models.doc2vec import Doc2Vec, LabeledSentence
TaggededDocument = gensim.models.doc2vec.TaggedDocument
def get_datasest(path="../data/ngram_wordseg/gold.txt"):
    """Load a whitespace-segmented corpus as a list of tagged documents.

    Every non-blank line of the file becomes one ``TaggededDocument`` whose
    words are the whitespace-separated tokens of that line. Each document
    carries a single integer tag equal to its position in the returned list,
    so ``result[tag]`` is always the document with that tag.

    Args:
        path: Location of the UTF-8 corpus file, one document per line.
            Defaults to the path that was previously hard-coded.

    Returns:
        A list of ``TaggededDocument`` objects in file order.
    """
    with open(path, 'r', encoding='utf8') as cf:
        docs = cf.readlines()
    print(len(docs))

    # str.split() with no arguments already discards leading/trailing
    # whitespace (including the newline), so no extra strip is needed.
    # It returns [] for a blank line; such lines are skipped because indexing
    # the last token of an empty list used to raise IndexError.
    token_lists = [tokens for tokens in (line.split() for line in docs) if tokens]

    # Tags are assigned after filtering so they stay aligned with list indices.
    return [
        TaggededDocument(tokens, tags=[i])
        for i, tokens in enumerate(token_lists)
    ]
def getVecs(model, corpus, size):
    """Stack the stored vector of every document in ``corpus`` into one array.

    Args:
        model: Object whose ``docvecs`` attribute can be indexed by a
            document tag and yields an array with ``size`` elements.
        corpus: Iterable of documents exposing a ``tags`` sequence; only the
            first tag of each document is used for the lookup.
        size: Number of elements in each document vector.

    Returns:
        A new array with one row of length ``size`` per document, in corpus
        order.
    """
    rows = []
    for doc in corpus:
        first_tag = doc.tags[0]
        # Shape each vector as a single row so concatenation stacks them
        # along the first axis.
        row = model.docvecs[first_tag].reshape(1, size)
        rows.append(np.array(row))
    return np.concatenate(rows)
def train(x_train, size=200, epoch_num=70):
    """Fit a Doc2Vec model on ``x_train`` and save it to the file ``model_dm``.

    Args:
        x_train: Tagged documents to train on.
        size: Forwarded to ``Doc2Vec`` as its ``size`` argument.
        epoch_num: Number of epochs for the explicit ``train`` call. This
            argument used to be ignored (70 epochs were always run); it is
            now honoured, and its default is 70 so that a call without
            ``epoch_num`` trains exactly as before.

    Returns:
        The trained ``Doc2Vec`` model, which has also been written to
        ``model_dm`` in the current working directory.
    """
    # NOTE(review): per the gensim documentation, handing the corpus to the
    # constructor already builds the vocabulary and runs an initial training
    # pass; the explicit train() below adds further epochs on top of that.
    # Confirm that this double training is intended.
    model_dm = Doc2Vec(x_train, min_count=1, window=3, size=size, sample=1e-3, negative=5, workers=4)
    model_dm.train(x_train, total_examples=model_dm.corpus_count, epochs=epoch_num)
    model_dm.save('model_dm')
    return model_dm
def test():
    """Infer a vector for a fixed sample sentence and find its nearest documents.

    Loads the model previously saved as ``model_dm``, infers a vector for the
    hard-coded token list, prints that vector, and queries the model's
    document vectors for the 10 most similar entries.

    Returns:
        The result of ``docvecs.most_similar``; the caller in this file
        unpacks each item as a ``(tag, similarity)`` pair.
    """
    model_dm = Doc2Vec.load("model_dm")
    # Every token must be its own list element. Two commas were missing
    # before, so adjacent string literals were silently concatenated into
    # '争霸》' and '十强出炉' and the query contained the wrong tokens.
    test_text = ['《', '舞林', '争霸', '》', '十强', '出炉', '复活', '舞者', '澳门', '踢馆']
    inferred_vector_dm = model_dm.infer_vector(test_text)
    print(inferred_vector_dm)
    sims = model_dm.docvecs.most_similar([inferred_vector_dm], topn=10)
    return sims
if __name__ == '__main__':
    # Pipeline: load the corpus, fit and save the model, then list the
    # training documents closest to the sample query.
    x_train = get_datasest()
    model_dm = train(x_train)
    sims = test()
    for count, sim in sims:
        # ``count`` is the tag of a matching document; it is used here as an
        # index into the training list.
        sentence = x_train[count]
        tokens = sentence[0]
        # Every token is followed by a space, including the last one.
        words = ''.join(token + ' ' for token in tokens)
        print(words, sim, len(tokens))