# Source: 08.pos_classifier.py — forked from shibing624/python-tutorial (79 lines, 2.61 KB)
# -*- coding: utf-8 -*-
"""
@description:
@author:XuMing
"""
from __future__ import print_function # 兼容python3的print写法
from __future__ import unicode_literals # 兼容python3的编码处理
import nltk
from nltk.corpus import brown
# pos features
def pos_features(sentence, i):
    """Build a feature dict for the word at position *i* of *sentence*.

    Features are the last 1/2/3 characters of the word plus the previous
    word ('<START>' when at the beginning of the sentence).
    """
    word = sentence[i]
    features = {
        'suffix(1)': word[-1:],
        'suffix(2)': word[-2:],
        'suffix(3)': word[-3:],
    }
    features['prev_word'] = '<START>' if i == 0 else sentence[i - 1]
    return features
# Demo: features for the 9th token of the first Brown sentence.
feature_data = pos_features(brown.sents()[0], 8)
print(feature_data)

# Turn every (word, tag) pair in the news category into a training example.
tagged_sents = brown.tagged_sents(categories='news')
featuresets = []
for tagged_sent in tagged_sents:
    words = nltk.tag.untag(tagged_sent)  # strip tags once per sentence
    featuresets.extend(
        (pos_features(words, idx), gold_tag)
        for idx, (_, gold_tag) in enumerate(tagged_sent)
    )

# First 10% held out for evaluation, remainder used for training.
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
pos_classifier_nb_rate = nltk.classify.accuracy(classifier, test_set)
print('pos_classifier_nb_rate', pos_classifier_nb_rate)  # 0.789
# optimize classifier
def pos_features(sentence, i, history):
    """Feature dict for word *i*: character suffixes, the previous word,
    and the tag predicted for the previous word (from *history*).

    At sentence start both context features are the sentinel '<START>'.
    """
    word = sentence[i]
    at_start = (i == 0)
    return {
        'suffix(1)': word[-1:],
        'suffix(2)': word[-2:],
        'suffix(3)': word[-3:],
        'prev-word': '<START>' if at_start else sentence[i - 1],
        'prev-tag': '<START>' if at_start else history[i - 1],
    }
class ConsecutivePosTagger(nltk.TaggerI):
    """Greedy left-to-right POS tagger: each word is classified using
    suffix features plus the tag already predicted for the previous word.
    """

    def __init__(self, train_sents):
        """Train a Naive Bayes classifier from tagged sentences.

        :param train_sents: iterable of sentences, each a list of
            (word, tag) pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []  # gold tags seen so far in this sentence
            for i, (word, tag) in enumerate(tagged_sent):
                featuresets = pos_features(untagged_sent, i, history)
                train_set.append((featuresets, tag))
                # Training uses the gold tag as history; at tagging time the
                # model's own predictions fill this role instead.
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """Tag a list of words; returns a list of (word, tag) pairs."""
        history = []
        for i, word in enumerate(sentence):
            featuresets = pos_features(sentence, i, history)
            tag = self.classifier.classify(featuresets)
            history.append(tag)
        # Materialize as a list: under Python 3 (this file targets py3 via
        # __future__ imports) a bare zip object is a one-shot iterator and
        # breaks nltk's evaluation and any caller that iterates twice.
        return list(zip(sentence, history))
# Re-split the news corpus at the sentence level and evaluate the
# history-aware tagger (nltk reports per-token accuracy).
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
test_sents = tagged_sents[:size]
train_sents = tagged_sents[size:]
tagger = ConsecutivePosTagger(train_sents)
print('ConsecutivePosTagger', tagger.evaluate(test_sents))