# -*- coding: utf-8 -*-
# Author: XuMing <[email protected]>
# Date: 17/8/10
# Brief: Averaged perceptron part-of-speech tagger (training and test script)

import os
import random
from collections import defaultdict
import pickle
import logging

from AveragePerceptron import AveragePerceptron
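# Note: AveragePerceptron is assumed (from its use below) to expose `weights`
# and `classes` attributes plus predict(features), update(truth, guess, features)
# and average_weights() methods; other details of that class are not shown here.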

PICKLE = "../data/bp/trontagger-0.1.pkg"
TRAIN_FILE_PATH = "../data/bp/train.txt"
TEST_FILE_PATH = "../data/bp/test.txt"


class PerceptronTagger(object):
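    """Greedy averaged-perceptron POS tagger.

    Frequent, unambiguous words are tagged from a lookup table (tagdict);
    everything else is scored by the averaged perceptron over the features
    built in _get_features().
    """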
    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    def __init__(self, load=True):
        self.model = AveragePerceptron()
        self.tagdict = {}
        self.classes = set()
        if load:
            self.load(self.AP_MODEL_LOC)

    def tag(self, corpus):
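        """Tag a newline/whitespace tokenized corpus string.

        Returns a flat list of (word, tag) tuples; known unambiguous words
        come from the tagdict, everything else from the perceptron.
        """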
        s_split = lambda t: t.split('\n')
        w_split = lambda s: s.split()

        def split_sents(corpus):
            for s in s_split(corpus):
                yield w_split(s)

        prev, prev2 = self.START
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w) for w in words] + self.END
            for i, word in enumerate(words):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev, prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def load(self, loc):
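        """Load a pickled (weights, tagdict, classes) triple from `loc`."""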
        try:
            with open(loc, 'rb') as f:
                w_td_c = pickle.load(f)
        except IOError:
            raise IOError("Missing model file: %s" % loc)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None

    def _normalize(self, word):
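        """Map rare word shapes to coarse classes before feature extraction:
        hyphenated words -> !HYPHEN, 4-digit numbers -> !YEAR, other
        digit-initial tokens -> !DIGITS; everything else is lowercased."""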
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
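        """Map a token and its context to a sparse {feature name: count} dict.

        Feature names are plain strings (e.g. 'i suffix ing'), so the model's
        weight table can be a simple dict keyed by those strings.
        """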
        i += len(self.START)
        features = defaultdict(int)

        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        # constant feature
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i - 1])
        add('i-1 suffix', context[i - 1][-3:])
        add('i-2 word', context[i - 2])
        add('i+1 word', context[i + 1])
        add('i+1 suffix', context[i + 1][-3:])
        add('i+2 word', context[i + 2])
        return features

    def _make_tagdict(self, sentences):
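        """Build a lookup table of frequent, unambiguous word->tag pairs.

        A word is added only if it was seen at least `freq_thresh` times and
        one tag accounts for at least `ambiguity_thresh` of its occurrences;
        such words skip the perceptron entirely at tag time.
        """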
        counts = defaultdict(lambda: defaultdict(int))
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag

    def _pc(self, n, d):
        return (float(n) / d) * 100

    def train(self, sentences, save_loc=None, nr_iter=5):
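        """Train for `nr_iter` passes of greedy, per-token perceptron updates.

        Sentences are shuffled between passes; after training the weights are
        averaged and (optionally) pickled to `save_loc` together with the
        tagdict and tag set.
        """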
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                prev, prev2 = self.START
                context = self.START + [self._normalize(w) for w in words] + self.END
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features(i, word, context, prev, prev2)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
            logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, self._pc(c, n)))
        self.model.average_weights()
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    tagger = PerceptronTagger(load=False)
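    # If a saved model exists it is loaded and evaluated on TEST_FILE_PATH;
    # otherwise the IOError from load() falls through to the except branch,
    # which trains a new model on TRAIN_FILE_PATH and saves it. Both corpus
    # files are assumed to hold one "word<sep>tag" pair per line, with "." as
    # the sentence-final token (whitespace-separated in the test file,
    # tab-separated in the training file).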
    try:
        tagger.load(PICKLE)
        print(tagger.tag("how are you ?"))
        logging.info("Start testing...")
        right = 0.0
        total = 0.0
        sentence = ([], [])
        for line in open(TEST_FILE_PATH):
            params = line.split()
            if len(params) != 2:
                continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
            if params[0] == ".":
                words = sentence[0]
                tags = sentence[1]
                text = " ".join(words)
                outputs = tagger.tag(text)
                assert len(tags) == len(outputs)
                total += len(tags)
                for o, t in zip(outputs, tags):
                    if o[1].strip() == t:
                        right += 1
                sentence = ([], [])
        logging.info("Tagging accuracy: %f", right / total)
    except IOError:
        logging.info("Reading corpus...")
        training_data = []
        sentence = ([], [])
        for line in open(TRAIN_FILE_PATH):
            params = line.strip().split('\t')
            if len(params) != 2:
                continue
            sentence[0].append(params[0])
            sentence[1].append(params[1])
            if params[0] == ".":
                training_data.append(sentence)
                sentence = ([], [])
        logging.info("training corpus size: %d", len(training_data))
        logging.info("Start training...")
        tagger.train(training_data, save_loc=PICKLE)
        logging.info("training end.")