update cnn text with chinese words. xuming 20171020

xuming06 · xuming06 · commit e4be0c56cd12 · 2017-10-20T20:36:56.000+08:00
diff --git a/17tensorflow/4_cnn_text_classification/config.py b/17tensorflow/4_cnn_text_classification/config.py
@@ -6,8 +6,8 @@
 config = {
     # data
     "dev_sample_percentage": 0.1,  # percentage of the training data for validation
-    "positive_data_file": "./data/en_polarity/pos.txt",  # positive data
-    "negative_data_file": "./data/en_polarity/neg.txt",  # negative data
+    "positive_data_file": "./data/zh_polarity/pos.txt",  # positive data
+    "negative_data_file": "./data/zh_polarity/neg.txt",  # negative data
 
     # model
     "embedding_dim": 128,  # dimensionality of character embedding (default: 128)
diff --git a/17tensorflow/4_cnn_text_classification/data/zh_polarity/neg.txt b/17tensorflow/4_cnn_text_classification/data/zh_polarity/neg.txt
diff --git a/17tensorflow/4_cnn_text_classification/data/zh_polarity/neg_sample.txt b/17tensorflow/4_cnn_text_classification/data/zh_polarity/neg_sample.txt
@@ -0,0 +1,13 @@
+simplistic , silly and tedious .
+it's so laddish and juvenile , only teenage boys could possibly find it funny .
+镜面sample设计容易留指纹，降价速度太快，从订货到收货 100元没了，缺少原装包。
+cpu的使用较大，不知道寿命如何？显示屏中间的华硕标志非常热，不知道是有点还是缺点。主要是在贴这个标志的时候，贴的很糟糕。
+找了,好几个系统盘都找不到SATA的硬盘.最后用U盘做了一个启动盘，先启动了一个Win PE操作系统，然后在Win PE环境下使用Ghost安装一个ＸＰ　ｓｐ３的镜像，终于能够装上ＸＰ系统了。
+唯一感觉不足的是硬盘就分了两个驱，每个100多G，要是能分成3个或更多就好了，呵呵
+说明书简陋 随盘带的软件没有详细说明做什么用的 装的时候都不知道 不装的话 有些快捷键没法用 装的话不知道那个是那个 ，另外，键盘的手感不是很好。
+发热量也太大了吧，刚开机没多久，仅上网，机器就很热了，gpu就没有下过50度，cp一直44度以上，不知道是正常的还是我的这台有问题，希望有人指教一下~
+买前不知道这个酷睿双核原来是简版的。。。不过这个价钱也不冤了。
+也系统比较麻烦，还好有高人指点才顺利安装了XP。也不知驱动都下全了没，暂时先凑合用着了。
+下订单付款了 单没有现货 再次到货时已经降价100元 没有通知我 也没有退款 不知如何解决？
+价格不是最便宜的，招商还是浦发银行是238*12=2856.00人家还可以分期的。
+驱动还有系统要自装,还有显卡太鸡巴低了.还有装系统太麻烦了...
diff --git a/17tensorflow/4_cnn_text_classification/data/zh_polarity/pos.txt b/17tensorflow/4_cnn_text_classification/data/zh_polarity/pos.txt
diff --git a/17tensorflow/4_cnn_text_classification/data/zh_polarity/pos_sample.txt b/17tensorflow/4_cnn_text_classification/data/zh_polarity/pos_sample.txt
@@ -0,0 +1,13 @@
+the lively appeal of the last kiss lies in the ease with which it integrates thoughtfulness and pasta-fagioli comedy .
+without resorting to camp or parody , haynes ( like sirk , but differently ) has transformed the rhetoric of hollywood melodrama into something provocative , rich , and strange .
+the performances are an absolute joy .
+键盘keyword手感一般。
+这个价格,这个配置，应该来说还是很好的。HDMI可以用来电脑播放高清，然后接到电视上。
+拿到东西后发现比京东图片要漂亮多了,是深蓝的,很喜欢啊!!做工也不错.完美屏.电源也没见有多热啊!
+硬盘分区好了还带驱动真的太方便了，以前装驱动都要大半天下载。这次迅速完成
+华硕的 大品牌 散热很好 开了很长时间了也不热 4799买的 值这个价钱
+配置算不错，装XP有点麻烦，关键是找有ICH9驱动的盘，按照网上的安装方法，能完美装上XP SP3，解决了喇叭爆音问题，声音图标也能显示，没有别人说的那样发热量高，风扇声响大。
+SSD硬盘，金属外壳，齐全的功能（包括内置麦克风，这点介绍没有）。
+本来忍着DOS系统买的，想到4xxx能买到这个配置也超值了， 没想到一开机竟然自带vista，感觉京东老厚道了呵呵。先用正版vista吧呵呵
+E5400质量不错不会装系统的朋友,一定要注意,linux不好弄,尽量买一个带windows
+性价比比较高，配置也比较平衡，看电影玩普通的游戏都比较流畅。很实惠。在这样的价位上：双核处理器，3470独显，250G硬盘。已经非常可以了。自己加了两条金士顿2G-800，无兼容问题。
diff --git a/17tensorflow/4_cnn_text_classification/data_helpers.py b/17tensorflow/4_cnn_text_classification/data_helpers.py
@@ -4,13 +4,13 @@
 # Brief: 数据处理
 
 import re
-
+import jieba
 import numpy as np
 
 
 def clean_str(string):
     """
-    Tokenization cleaning for dataset
+    Tokenization cleaning for dataset; only keeps English characters
     Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
     :param string:
     :return:
@@ -31,9 +31,20 @@ def clean_str(string):
     return string.strip().lower()
 
 
+def contain_chinese(string):
+    """
+    Check whether the string contains any Chinese character
+    :param string:
+    :return:
+    """
+    zh_pattern = re.compile(r"[\u4E00-\u9FD5]+")
+    return zh_pattern.search(string)
+
+
 def load_data_labels(positive_data_file, negative_data_file):
     """
-    Loads polarity data from files, splits data to words and labels
+    Loads polarity data from files, splits data to words and labels.
+    Sentences containing Chinese are segmented with jieba; others are cleaned with clean_str.
     :param positive_data_file:
     :param negative_data_file:
     :return: split sentence and labels
@@ -44,11 +55,23 @@ def load_data_labels(positive_data_file, negative_data_file):
     negative_data = [s.strip() for s in negative_data]
     # split by words
     x_text = positive_data + negative_data
-    x_text = [clean_str(sent) for sent in x_text]
+    clean_text = []
+    for sent in x_text:
+        # 中文
+        if contain_chinese(sent):
+            # 用1元切分
+            # clean_text.append(" ".join(list(sent)))
+            # jieba切词
+            clean_text.append(" ".join(jieba.cut(sent)))
+        else:
+            # 英文用clean_str切分
+            clean_text.append(clean_str(sent))
+    # x_text = [clean_str(sent) for sent in x_text]
+    x_text = clean_text
     # generate labels
     positive_labels = [[0, 1] for i in positive_data]
     negative_labels = [[1, 0] for i in negative_data]
-    y = np.concatenate([positive_labels, negative_labels], 0)
+    y = np.concatenate([positive_labels, negative_labels], axis=0)
     return [x_text, y]
 
 
diff --git a/17tensorflow/4_cnn_text_classification/eval.py b/17tensorflow/4_cnn_text_classification/eval.py
@@ -23,8 +23,9 @@
                                                   config.config["negative_data_file"])
     y_test = np.argmax(y_test, axis=1)
 else:
-    x_raw = ["many insightful moments .", "everything is off.", "i hate you .", "it is a bad film."]
-    y_test = [1, 0, 0, 1]
+    x_raw = ["many insightful moments .", "everything is off.", "i hate you .", "it is a bad film.",
+             "good man and bad person."]
+    y_test = [1, 0, 0, 1, 1]
 
 # map data into vocabulary
 checkpoint_dir = config.evaluate["checkpoint_dir"]
diff --git a/17tensorflow/4_cnn_text_classification/train.py b/17tensorflow/4_cnn_text_classification/train.py
@@ -75,8 +75,10 @@
         grad_summaries_total = tf.summary.merge(grad_summaries)
 
         # output directory for models and summaries
+        today = str(datetime.datetime.today().strftime("%Y%m%d"))
         timestamp = str(int(time.time()))
-        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
+        folder = today + "-" + timestamp
+        out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", folder))
         print("writing to {}\n".format(out_dir))
 
         # summaries for loss and accuracy