Skip to content

Commit 20555b9

Browse files
author
xuming06
committed
add average perceptron.
1 parent f06ec97 commit 20555b9

6 files changed

Lines changed: 4268 additions & 1 deletion

File tree

13wordcloud/chinese-wordcloud.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
with open('../data/stopword.txt', encoding='utf-8') as f:
1515
for line in f:
1616
STOPWORDS.add(line.strip())
17-
print("stopwrod size:" + len(STOPWORDS))
17+
print("stopwrod size:", len(STOPWORDS))
1818
backgroud_Image = plt.imread('../data/cloud/girl.jpg')
1919
wc = WordCloud(background_color='white', # 设置背景颜色
2020
# mask=backgroud_Image, # 设置背景图片

14BP/AveragePerceptron.py

Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# -*- coding: utf-8 -*-
2+
# Author: XuMing <[email protected]>
3+
# Data: 17/8/10
4+
# Brief: 平均感知机
5+
6+
from collections import defaultdict
7+
import pickle
8+
import random
9+
10+
11+
class AveragePerceptron:
    """Averaged perceptron classifier over sparse feature dicts.

    Weights are stored as ``{feature: {label: weight}}``.  The averaging
    trick keeps running totals per (feature, label) pair so the final
    weights are the mean over all update steps, which reduces overfitting.
    """

    def __init__(self):
        # feature -> {label: weight}
        self.weights = {}
        # set of all output labels; must be populated before predict()
        self.classes = set()
        # accumulated (weight * steps-in-effect) per (feature, label) pair
        self._totals = defaultdict(int)
        # update step at which each (feature, label) weight last changed
        self._tstamps = defaultdict(int)
        # number of update() calls seen so far
        self.i = 0

    def predict(self, features):
        """Return the best-scoring label for a ``{feature: value}`` dict.

        Ties are broken deterministically by label name.  Raises
        ValueError if ``self.classes`` is empty.
        """
        scores = defaultdict(float)
        for feat, value in features.items():
            # unseen features and zero-valued features contribute nothing
            if feat not in self.weights or value == 0:
                continue
            for label, weight in self.weights[feat].items():
                scores[label] += value * weight
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        """Apply one perceptron update: reward ``truth``, penalize ``guess``.

        No-op (beyond advancing the step counter) when the guess was correct.
        """

        def update_feat(c, f, w, v):
            param = (f, c)
            # credit the old weight for every step it was in effect
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            update_feat(truth, f, weights.get(truth, 0.0), 1.0)
            update_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        """Replace each weight with its average over all update steps.

        Weights that average to zero are dropped to keep the model sparse.
        """
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            # BUG FIX: the original contained a stray duplicated copy of this
            # loop nested inside itself, doing the same work redundantly;
            # a single pass produces the same result.
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                # account for the steps since the weight last changed
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None

    def save(self, path):
        """Pickle the weight table to ``path``.

        BUG FIX: pickle requires a binary file; the original opened the
        file in text mode ('w'), which raises TypeError on Python 3, and
        never closed the handle.
        """
        with open(path, 'wb') as f:
            return pickle.dump(dict(self.weights), f)

    def load(self, path):
        """Load a pickled weight table from ``path``.

        BUG FIX: open in binary mode and close the handle (the original
        used the default text mode and leaked the file object).
        """
        with open(path, 'rb') as f:
            self.weights = pickle.load(f)
        return None
69+
70+
71+
def train(nr_iter, examples):
    """Train an AveragePerceptron on ``examples`` for ``nr_iter`` epochs.

    Args:
        nr_iter: number of passes over the data.
        examples: list of ``(features, label)`` pairs; shuffled in place
            between epochs.

    Returns:
        The trained model, with averaged weights.
    """
    model = AveragePerceptron()
    # BUG FIX: the original never populated model.classes, so the first
    # predict() raised ValueError (max() over an empty set).
    model.classes = {label for _, label in examples}
    for _ in range(nr_iter):
        random.shuffle(examples)
        for features, true_label in examples:
            # BUG FIX: predict() returns the winning label directly; the
            # original treated it as a score dict and called .items() on
            # it, which raises AttributeError on a string.
            guess = model.predict(features)
            if guess != true_label:
                model.update(true_label, guess, features)
    model.average_weights()
    return model

14BP/PerceptronTagger.py

Lines changed: 186 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,186 @@
1+
# -*- coding: utf-8 -*-
2+
# Author: XuMing <[email protected]>
3+
# Data: 17/8/10
4+
# Brief: 平均感知机:词性标注测试
5+
6+
import os
7+
import random
8+
from collections import defaultdict
9+
import pickle
10+
import logging
11+
12+
from AveragePerceptron import AveragePerceptron
13+
14+
# Default locations for the serialized model and corpora.
# NOTE(review): paths are relative to the working directory — confirm
# they resolve when the script is launched from elsewhere.
PICKLE = "../data/bp/trontagger-0.1.pkg"   # pickled (weights, tagdict, classes)
TRAIN_FILE_PATH = "../data/bp/train.txt"   # tab-separated "word\ttag" lines
TEST_FILE_PATH = "../data/bp/test.txt"     # whitespace-separated "word tag" lines
17+
18+
19+
class PerceptronTagger():
    """Greedy averaged-perceptron part-of-speech tagger.

    Tags left to right: words that are frequent and unambiguous in the
    training corpus are looked up in ``self.tagdict``; everything else
    is scored by the perceptron model over local context features.
    """

    # padding pseudo-tokens so context features exist at sentence edges
    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    def __init__(self, load=True):
        self.model = AveragePerceptron()
        self.tagdict = {}     # word -> unambiguous tag shortcut
        self.classes = set()  # all tags seen during training
        if load:
            self.load(self.AP_MODEL_LOC)

    def tag(self, corpus):
        """Tag ``corpus`` (newline-separated sentences of space-separated
        words) and return a flat list of ``(word, tag)`` tuples."""
        s_split = lambda t: t.split('\n')
        w_split = lambda s: s.split()

        def split_sents(corpus):
            for s in s_split(corpus):
                yield w_split(s)

        # NOTE(review): prev/prev2 deliberately carry over across sentence
        # boundaries here (they are only reset once) — confirm intended.
        prev, prev2 = self.START
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w) for w in words] + self.END
            for i, word in enumerate(words):
                # fast path: unambiguous words skip the model entirely
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev, prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def load(self, loc):
        """Load the (weights, tagdict, classes) triple pickled by train().

        Raises:
            IOError: if the model file is missing.
        """
        # BUG FIX: open in a context manager so the handle is closed
        # (the original passed a bare open() to pickle.load and leaked it).
        try:
            with open(loc, 'rb') as f:
                w_td_c = pickle.load(f)
        except IOError:
            raise IOError("Missing trontagger.pkg file.")
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None

    def _normalize(self, word):
        """Map rare word shapes onto shared pseudo-tokens."""
        if not word:
            # BUG FIX: guard the empty string; word[0] raised IndexError.
            return word
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        """Build the sparse feature dict for the word at position ``i``.

        ``context`` is the normalized sentence padded with START/END, so
        ``i`` is shifted by len(START) before indexing into it.
        """
        i += len(self.START)
        features = defaultdict(int)

        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        # constant feature
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features

    def _make_tagdict(self, sentences):
        """Record frequent, nearly-unambiguous words so tag() can skip
        the model for them; also collects the tag set into self.classes."""
        counts = defaultdict(lambda: defaultdict(int))
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # only trust words that are both frequent and nearly unambiguous
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag

    def _pc(self, n, d):
        """Return n/d expressed as a percentage."""
        return (float(n) / d) * 100

    def train(self, sentences, save_loc=None, nr_iter=5):
        """Train for ``nr_iter`` epochs over ``(words, tags)`` sentences.

        ``sentences`` is shuffled in place between epochs.  When
        ``save_loc`` is given, pickles (weights, tagdict, classes) there.
        """
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0  # correct guesses this epoch
            n = 0  # total tokens this epoch
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + [self._normalize(w) for w in words] + self.END
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features(i, word, context, prev, prev2)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
            logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, self._pc(c, n)))
        self.model.average_weights()
        if save_loc is not None:
            # BUG FIX: close the file handle via a context manager
            # (the original passed a bare open() to pickle.dump).
            with open(save_loc, 'wb') as f:
                pickle.dump((self.model.weights, self.tagdict, self.classes),
                            f, -1)
        return None
139+
140+
141+
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    tagger = PerceptronTagger(False)
    try:
        # Evaluation path: use an existing model if one is on disk.
        tagger.load(PICKLE)
        print(tagger.tag("how are you ?"))
        logging.info("Start testing...")
        right = 0.0
        total = 0.0
        sentence = ([], [])  # (words, tags) accumulated until a "." token
        # BUG FIX: read the corpus inside a context manager; the original
        # left the file handle open.
        with open(TEST_FILE_PATH) as test_file:
            for line in test_file:
                params = line.split()
                if len(params) != 2:
                    continue
                sentence[0].append(params[0])
                sentence[1].append(params[1])
                if params[0] == ".":  # sentence boundary marker
                    words, tags = sentence
                    # BUG FIX: ' '.join avoids the stray trailing space the
                    # original always appended (its `i < len(words)` guard
                    # was always true).
                    text = " ".join(words)
                    outputs = tagger.tag(text)
                    assert len(tags) == len(outputs)
                    total += len(tags)
                    for o, t in zip(outputs, tags):
                        if o[1].strip() == t:
                            right += 1
                    sentence = ([], [])
        logging.info("Precision : %f", right / total)
    except IOError:
        # Training path: no model found, so build one from the train corpus.
        logging.info("Reading corpus...")
        training_data = []
        sentence = ([], [])
        with open(TRAIN_FILE_PATH) as train_file:
            for line in train_file:
                # BUG FIX: strip the newline so tags don't carry a trailing
                # '\n', and skip malformed lines (the original did neither,
                # so every stored tag ended in '\n').
                params = line.strip().split('\t')
                if len(params) != 2:
                    continue
                sentence[0].append(params[0])
                sentence[1].append(params[1])
                if params[0] == ".":
                    training_data.append(sentence)
                    sentence = ([], [])
        logging.info("training corpus size: %d", len(training_data))
        logging.info("Start training...")
        tagger.train(training_data, save_loc=PICKLE)
        logging.info("training end.")

14BP/__init__.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
# -*- coding: utf-8 -*-
2+
3+
"""
4+
@author: XuMing <[email protected]>
5+
@summary:
6+
"""
7+

0 commit comments

Comments
 (0)