Skip to content

Commit b1b9eac

Browse files
author
xuming06
committed
add ngram segword.
1 parent 20555b9 commit b1b9eac

16 files changed

Lines changed: 524398 additions & 1 deletion

15Ngram/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# -*- coding: utf-8 -*-
2+
# Author: XuMing <[email protected]>
3+
# Date: 17/8/29
4+
# Brief:
5+
6+
7+

15Ngram/config.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
# -*- coding: utf-8 -*-
2+
# Author: XuMing <[email protected]>
3+
# Date: 17/8/29
4+
# Brief:
5+
6+
# Corpus locations used by the n-gram word segmenter and its evaluator.
train_data_path = '../data/ngram_wordseg/TrainData.txt'
test_data_path = '../data/ngram_wordseg/test.txt'
test_result_path = '../data/ngram_wordseg/result.txt'
test_gold_path = '../data/ngram_wordseg/gold.txt'

# Characters treated as sentence delimiters; each one is emitted as its own token.
Punctuation = [u'、', u'”', u'“', u'。', u'(', u')', u':', u'《', u'》', u';', u'!', u',', u'、']

# Longest candidate word (in characters) tried by maximum matching.
span = 16

# Characters that make up numeric tokens (digits, percent sign, decimal point).
Number = list(u'0123456789%.')

# ASCII letters, lower case first, then upper case.
English = list(u'abcdefghijklmnopqrstuvwxyz'
               u'ABCDEFGHIJKLMNOPQRSTUVWXYZ')

15Ngram/evaluate.py

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
from config import test_result_path
2+
from config import test_gold_path
3+
4+
5+
class Evaluate():
    """Score a word-segmentation result file against a gold-standard file."""

    def __init__(self):
        pass

    def evaluate(self, result_path=None, gold_path=None):
        """Print and return precision, recall and F-measure of a segmentation.

        The two files are read in lockstep, one sentence per line, with words
        separated by spaces (runs of spaces are tolerated). For every line
        pair, a result word counts as correct when the same word is still
        unmatched in the gold line (bag-of-words matching; positions are not
        compared). Reading stops at the end of the shorter file.

        :param result_path: segmented output to score; defaults to
            ``test_result_path`` from the config module.
        :param gold_path: reference segmentation; defaults to
            ``test_gold_path`` from the config module.
        :return: tuple ``(p, r, F)``; each value is 0.0 when its denominator
            would be zero.
        """
        # Function-scope import keeps this block self-contained.
        from collections import Counter

        if result_path is None:
            result_path = test_result_path
        if gold_path is None:
            gold_path = test_gold_path

        result_cnt = 0.0
        gold_cnt = 0.0
        right_cnt = 0.0

        with open(result_path, encoding='utf-8') as test_result_file, \
                open(gold_path, encoding='utf-8') as test_gold_file:
            for line1, line2 in zip(test_result_file, test_gold_file):
                # Drop the empty strings produced by leading or repeated spaces.
                result_list = [w for w in line1.strip().split(' ') if w != '']
                gold_list = [w for w in line2.strip().split(' ') if w != '']

                result_cnt += len(result_list)
                gold_cnt += len(gold_list)
                # Multiset intersection: each gold word can be matched at most once.
                matched = Counter(result_list) & Counter(gold_list)
                right_cnt += sum(matched.values())

        p = right_cnt / result_cnt if result_cnt else 0.0
        r = right_cnt / gold_cnt if gold_cnt else 0.0
        # Harmonic mean of precision and recall.
        F = 2.0 * p * r / (p + r) if (p + r) else 0.0

        print('right_cnt: \t\t', right_cnt)
        print('result_cnt: \t', result_cnt)
        print('gold_cnt: \t\t', gold_cnt)

        print('P: \t\t', p)
        print('R: \t\t', r)
        print('F: \t\t', F)
        return p, r, F
45+
46+
47+
if __name__ == '__main__':
    # Run as a script: score the configured result file against the gold file.
    E = Evaluate()
    E.evaluate()

15Ngram/seg_ngram.py

Lines changed: 315 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,315 @@
1+
# -*- coding: utf-8 -*-
2+
# Author: XuMing <[email protected]>
3+
# Date: 17/8/29
4+
# Brief:
5+
6+
7+
import math
8+
from config import train_data_path
9+
from config import test_data_path
10+
from config import test_result_path
11+
from config import span
12+
from config import Punctuation
13+
from config import Number
14+
from config import English
15+
from evaluate import Evaluate
16+
17+
18+
class PrePostNgram():
    """Dictionary-based word segmenter with a bigram tie-breaker.

    Text is segmented by forward and/or backward maximum matching against the
    vocabulary seen in training. In the combined mode, spans where the two
    matchings disagree are resolved by comparing smoothed bigram
    log-probabilities of the competing word sequences.
    """

    def __init__(self):
        # word -> number of occurrences in the training corpus
        self._WordDict = {}
        # word -> {following word -> count}; '<BEG>' / '<END>' mark sentence edges
        self._NextCount = {}
        # total number of training tokens; used as the smoothing mass
        self._NextSize = 0
        # number of distinct training words
        self._WordSize = 0

    def Training(self):
        """Read the training corpus and collect unigram and bigram counts.

        Each line of ``train_data_path`` is a sentence of space-separated
        words. Punctuation tokens and empty strings are discarded before
        counting. Fills ``self._WordDict`` and ``self._NextCount`` and sets
        ``self._NextSize`` / ``self._WordSize``.

        :return: None
        """
        print('start training...')
        self._NextCount['<BEG>'] = {}
        traing_cnt = 0
        with open(train_data_path, encoding='utf-8') as traing_file:
            for line in traing_file:
                line_list = [words for words in line.strip().split(' ')
                             if words != '' and words not in Punctuation]
                traing_cnt += len(line_list)
                self._count_sentence(line_list)

        self._NextSize = traing_cnt
        print('total training words length is: ', traing_cnt)
        print('training done...')
        self._WordSize = len(self._WordDict)
        print("len _WordDict: ", len(self._WordDict))
        print("len _NextCount: ", len(self._NextCount))

    def _count_sentence(self, words):
        """Add one sentence (a list of words) to the unigram and bigram counts."""
        if not words:
            return
        for word in words:
            self._WordDict[word] = self._WordDict.get(word, 0) + 1
        # Count every adjacent pair, including <BEG> -> first and last -> <END>,
        # so the first-to-second word transition is not lost.
        previous = '<BEG>'
        for word in list(words) + ['<END>']:
            followers = self._NextCount.setdefault(previous, {})
            followers[word] = followers.get(word, 0) + 1
            previous = word

    def SeparWords(self, mode):
        """Segment every line of the test file and write the result file.

        Each input line is split into punctuation marks, digit/letter runs
        and remaining text. Only the remaining text is segmented; punctuation
        and digit/letter runs are copied through as single tokens. Every
        output token is preceded by one space, one sentence per line.

        :param mode: ``'Pre'`` for forward maximum matching, ``'Post'`` for
            backward maximum matching, any other value for the bigram-based
            combination of both.
        :return: None
        """
        print('start SeparWords...')

        SenListCnt = 0
        with open(test_data_path, encoding='utf-8') as test_file, \
                open(test_result_path, mode='w', encoding='utf-8') as test_result_file:
            for line in test_file:
                out_words = []
                for chunk, kind in self._split_line(line.strip()):
                    if kind != 'punct':
                        SenListCnt += 1
                    if kind != 'text':
                        # Punctuation and digit/letter runs are never segmented.
                        out_words.append(chunk)
                    elif mode == 'Pre':
                        out_words.extend(self.PreMax(chunk))
                    elif mode == 'Post':
                        out_words.extend(self.PosMax(chunk))
                    else:
                        out_words.extend(
                            self._merge_by_bigram(self.PreMax(chunk), self.PosMax(chunk)))
                test_result_file.write(''.join(' ' + words for words in out_words))
                test_result_file.write('\n')

        print('SenList length: ', SenListCnt)

    def _split_line(self, line):
        """Split a line into ``(chunk, kind)`` pairs.

        ``kind`` is ``'punct'`` for a single punctuation character,
        ``'special'`` for a run of digit/letter characters, and ``'text'``
        for a run of any other characters.
        """
        chunks = []
        buf = ''
        buf_kind = 'text'
        for char in line:
            if char in Number or char in English:
                kind = 'special'
            elif char in Punctuation:
                kind = 'punct'
            else:
                kind = 'text'
            # Close the pending run whenever the character class changes, so a
            # digit/letter run is never glued onto the text before it.
            if buf and kind != buf_kind:
                chunks.append((buf, buf_kind))
                buf = ''
            if kind == 'punct':
                chunks.append((char, kind))
            else:
                buf += char
                buf_kind = kind
        if buf:
            chunks.append((buf, buf_kind))
        return chunks

    def _merge_by_bigram(self, forward, backward):
        """Combine two segmentations of the same text into one.

        Words on which both segmentations agree are kept. For each span where
        they disagree, both candidates are scored with CalSegProbability
        (with the preceding agreed word and, unless it is the sentence end,
        the following agreed word as context) and the higher-scoring one is
        kept; the backward candidate wins ties.
        """
        seq1 = ['<BEG>'] + list(forward) + ['<END>']
        seq2 = ['<BEG>'] + list(backward) + ['<END>']

        merged = []
        # Words of the current disputed span, one list per segmentation.
        diff1, diff2 = [], []
        # pos*: character offset of the current word; cur*: index of the current word.
        pos1 = pos2 = 0
        cur1 = cur2 = 0
        while cur1 < len(seq1) or cur2 < len(seq2):
            word1, word2 = seq1[cur1], seq2[cur2]
            end1, end2 = pos1 + len(word1), pos2 + len(word2)
            if pos1 == pos2 and end1 == end2:
                # Same start and same end: both segmentations agree on this word.
                if diff1:
                    context_left = [merged[-1]]
                    context_right = [word1] if cur1 < len(seq1) - 1 else []
                    p1 = self.CalSegProbability(context_left + diff1 + context_right)
                    p2 = self.CalSegProbability(context_left + diff2 + context_right)
                    # Only the disputed words are kept; the context words are
                    # never mixed into the output list.
                    merged.extend(diff1 if p1 > p2 else diff2)
                    diff1, diff2 = [], []
                merged.append(word1)
                pos1, pos2 = end1, end2
                cur1 += 1
                cur2 += 1
            elif end1 == end2:
                # Different starts, same end: the disputed span closes here.
                diff1.append(word1)
                diff2.append(word2)
                pos1, pos2 = end1, end2
                cur1 += 1
                cur2 += 1
            elif end1 > end2:
                # The second segmentation is behind: advance it.
                diff2.append(word2)
                pos2 = end2
                cur2 += 1
            else:
                diff1.append(word1)
                pos1 = end1
                cur1 += 1
        # Strip the <BEG> / <END> sentinels.
        return merged[1:-1]

    def CalSegProbability(self, ParseList):
        """Return the smoothed bigram log-probability of a word sequence.

        Sums the log of each word-to-next-word probability, plus the log of
        the unigram probability of the first real word (the word after
        ``'<BEG>'`` if the list starts with it, otherwise the first word).
        Counts are add-one smoothed, with ``self._NextSize`` added to the
        bigram denominator. The counts must already be populated (for
        example by Training) so that the denominators are non-zero.

        :param ParseList: list of words, optionally wrapped in '<BEG>'/'<END>'.
        :return: log-probability (a non-positive float for non-empty input).
        """
        p = 0
        last = len(ParseList) - 1
        for pos, words in enumerate(ParseList):
            if pos < last:
                # Conditional probability of the next word given this one.
                followers = self._NextCount.get(words)
                if followers is None:
                    p += math.log(1.0 / self._NextSize)
                else:
                    fenzi = 1.0 + followers.get(ParseList[pos + 1], 0)
                    fenmu = self._NextSize + sum(followers.values())
                    p += math.log(fenzi / fenmu)
            # Unigram probability of the first real word.
            if (pos == 0 and words != '<BEG>') or (pos == 1 and ParseList[0] == '<BEG>'):
                count = self._WordDict.get(words, 0)
                p += math.log((count + 1.0) / (self._WordSize + self._NextSize))
        return p

    def PreMax(self, sentence, max_len=None):
        """Segment ``sentence`` by forward maximum matching.

        Starting from the left, repeatedly takes the longest prefix of at
        most ``max_len`` characters found in ``self._WordDict``; a single
        character is always accepted when nothing longer matches.

        :param sentence: text to segment.
        :param max_len: longest candidate word; defaults to ``span`` from config.
        :return: list of words in order.
        """
        if max_len is None:
            max_len = span
        ParseList = []
        if max_len < 1:
            return ParseList
        cur = 0
        while cur < len(sentence):
            for length in range(min(max_len, len(sentence) - cur), 0, -1):
                piece = sentence[cur:cur + length]
                if length == 1 or piece in self._WordDict:
                    ParseList.append(piece)
                    cur += length
                    break
        return ParseList

    def PosMax(self, sentence, max_len=None):
        """Segment ``sentence`` by backward maximum matching.

        Starting from the right, repeatedly takes the longest suffix of at
        most ``max_len`` characters found in ``self._WordDict``; a single
        character is always accepted when nothing longer matches.

        :param sentence: text to segment.
        :param max_len: longest candidate word; defaults to ``span`` from config.
        :return: list of words in left-to-right order.
        """
        if max_len is None:
            max_len = span
        ParseList = []
        if max_len < 1:
            return ParseList
        tail = len(sentence)
        while tail > 0:
            for cur in range(max(0, tail - max_len), tail):
                piece = sentence[cur:tail]
                if cur == tail - 1 or piece in self._WordDict:
                    ParseList.append(piece)
                    tail = cur
                    break
        # Words were collected right-to-left.
        ParseList.reverse()
        return ParseList
296+
297+
298+
if __name__ == '__main__':
    # Train once, then segment and score the test set with each strategy.
    E = Evaluate()
    p = PrePostNgram()
    p.Training()
    runs = (
        ('Pre', 'Pre Max'),
        ('Post', 'Post Max'),
        ('prepostBigram', 'PrePostSegBigram Max'),
    )
    for index, (mode, title) in enumerate(runs):
        if index:
            # Closing separator of the previous run.
            print('*****')
        p.SeparWords(mode)
        print('*****')
        print(title)
        E.evaluate()

0 commit comments

Comments
 (0)