-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathtagger.py
More file actions
127 lines (112 loc) · 4.43 KB
/
tagger.py
File metadata and controls
127 lines (112 loc) · 4.43 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""
Greedy Averaged Perceptron POS Tagger
"""
import os
import pickle
from collections import defaultdict
from perceptron import Perceptron
from utils import DefaultList
START = ['-START-', '-START2-']
END = ['-END-', '-END2-']
class PerceptronTagger(object):
'''Greedy Averaged Perceptron tagger'''
def __init__(self, classes=None, load=True):
self.model_loc = os.path.join('model', 'tagger.pickle')
self.tagdict = {}
if classes:
self.classes = classes
else:
self.classes = set()
self.model = Perceptron(self.classes)
if load:
self.load(self.model_loc)
def tag(self, words, tokenize=True):
prev, prev2 = START
tags = DefaultList('')
context = START + [self._normalize(w) for w in words] + END
for i, word in enumerate(words):
tag = self.tagdict.get(word)
if not tag:
features = self._get_features(i, word, context, prev, prev2)
tag = self.model.predict(features)
tags.append(tag)
prev2 = prev; prev = tag
return tags
def start_training(self, sentences):
self._make_tagdict(sentences)
self.model = Perceptron(self.classes)
def train(self, sentences, save_loc=None, nr_iter=5):
'''Train a model from sentences, and save it at save_loc. nr_iter
controls the number of Perceptron training iterations.'''
self.start_training(sentences)
for iter_ in range(nr_iter):
for words, tags in sentences:
self.train_one(words, tags)
random.shuffle(sentences)
self.end_training(save_loc)
def save(self):
# Pickle as a binary file
pickle.dump((self.model.weights, self.tagdict, self.classes), open(self.model_loc, 'wb'), -1)
def train_one(self, words, tags):
prev, prev2 = START
context = START + [self._normalize(w) for w in words] + END
for i, word in enumerate(words):
guess = self.tagdict.get(word)
if not guess:
feats = self._get_features(i, word, context, prev, prev2)
guess = self.model.predict(feats)
self.model.update(tags[i], guess, feats)
prev2 = prev; prev = guess
def load(self, path):
w_td_c = pickle.load(open(path, 'rb'))
self.model.weights, self.tagdict, self.classes = w_td_c
self.model.classes = self.classes
def _normalize(self, word):
if '-' in word and word[0] != '-':
return '!HYPHEN'
elif word.isdigit() and len(word) == 4:
return '!YEAR'
elif word[0].isdigit():
return '!DIGITS'
else:
return word.lower()
def _get_features(self, i, word, context, prev, prev2):
'''Map tokens into a feature representation, implemented as a
{hashable: float} dict. If the features change, a new model must be
trained.'''
def add(name, *args):
features[' '.join((name,) + tuple(args))] += 1
i += len(START)
features = defaultdict(int)
# It's useful to have a constant feature, which acts sort of like a prior
add('bias')
add('i suffix', word[-3:])
add('i pref1', word[0])
add('i-1 tag', prev)
add('i-2 tag', prev2)
add('i tag+i-2 tag', prev, prev2)
add('i word', context[i])
add('i-1 tag+i word', prev, context[i])
add('i-1 word', context[i-1])
add('i-1 suffix', context[i-1][-3:])
add('i-2 word', context[i-2])
add('i+1 word', context[i+1])
add('i+1 suffix', context[i+1][-3:])
add('i+2 word', context[i+2])
return features
def _make_tagdict(self, sentences):
'''Make a tag dictionary for single-tag words.'''
counts = defaultdict(lambda: defaultdict(int))
for sent in sentences:
for word, tag in zip(sent[0], sent[1]):
counts[word][tag] += 1
self.classes.add(tag)
freq_thresh = 20
ambiguity_thresh = 0.97
for word, tag_freqs in counts.items():
tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
n = sum(tag_freqs.values())
# Don't add rare words to the tag dictionary
# Only add quite unambiguous words
if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
self.tagdict[word] = tag