 # Bi-gram: 0.9056 test accuracy after 5 epochs.
 import os
 
+import keras
 import numpy as np
 from keras.layers import Dense
 from keras.layers import Embedding
 from keras.layers import GlobalAveragePooling1D
 from keras.models import Sequential
 from keras.preprocessing import sequence
-from keras.preprocessing.sequence import pad_sequences
 
 
 def get_corpus(data_dir):
@@ -24,21 +24,22 @@ def get_corpus(data_dir):
     for file_name in os.listdir(data_dir):
         with open(os.path.join(data_dir, file_name), mode='r', encoding='utf-8') as f:
             for line in f:
-                parts = line.strip().split(',')
+                parts = line.rstrip().split(',')
                 if parts and len(parts) > 1:
-                    lbl = parts[0]
+                    # keras categorical labels start at 0, so shift the 1-based raw labels
+                    lbl = int(parts[0]) - 1
                     sent = parts[1]
                     sent_split = sent.split()
                     words.append(sent_split)
                     labels.append(lbl)
     return words, labels
 
 
-def vectorize_words(words, word_idx, maxlen):
+def vectorize_words(words, word_idx):
     inputs = []
     for word in words:
         inputs.append([word_idx[w] for w in word])
-    return pad_sequences(inputs, maxlen=maxlen)
+    return inputs
 
 
 def create_ngram_set(input_list, ngram_value=2):
@@ -58,6 +59,11 @@ def add_ngram(sequences, token_indice, ngram_range=2):
     :param token_indice:
     :param ngram_range:
     :return:
+    Example: adding bi-grams
+    >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
+    >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
+    >>> add_ngram(sequences, token_indice, ngram_range=2)
+    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
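+
+    Note: ids of n-grams found in token_indice are appended after the
+    original uni-gram ids, so augmented sequences grow in length.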
     """
     new_seq = []
     for input in sequences:
@@ -72,11 +78,12 @@ def add_ngram(sequences, token_indice, ngram_range=2):
 
 
 ngram_range = 2
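+# number of label classes; raw labels are 1-based and shifted to 0-based in get_corpus()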
+num_classes = 3
 max_features = 20000
 max_len = 400
 batch_size = 32
-embedding_dims = 50
-epochs = 5
+embedding_dims = 200
+epochs = 10
 SAVE_MODEL_PATH = 'fasttext_multi_classification_model.h5'
 pwd_path = os.path.abspath(os.path.dirname(__file__))
 print('pwd_path:', pwd_path)
@@ -87,11 +94,10 @@ def add_ngram(sequences, token_indice, ngram_range=2):
 print('loading data...')
 x_train, y_train = get_corpus(train_data_dir)
 x_test, y_test = get_corpus(test_data_dir)
-
-# Reserve 0 for masking via pad_sequences
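+# one-hot encode the integer labels for multi-class training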
+y_train = keras.utils.to_categorical(y_train)
+y_test = keras.utils.to_categorical(y_test)
 
 sent_maxlen = max(map(len, (x for x in x_train + x_test)))
-
 print('-')
 print('Sentence max length:', sent_maxlen, 'words')
 print('Number of training data:', len(x_train))
@@ -102,11 +108,21 @@ def add_ngram(sequences, token_indice, ngram_range=2):
 print('-')
 print('Vectorizing the word sequences...')
 
-print(len(x_train), 'train seq')
-print(len(x_test), 'test seq')
 print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
 print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))
 
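+# build the vocabulary over both splits; word ids start at 1 so that
+# index 0 stays free as the padding value used by pad_sequences below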
+vocab = set()
+for w in x_train + x_test:
+    vocab |= set(w)
+vocab = sorted(vocab)
+vocab_size = len(vocab) + 1
+print('Vocab size:', vocab_size, 'unique words')
+word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
+ids_2_word = dict((value, key) for key, value in word_idx.items())
+
+x_train = vectorize_words(x_train, word_idx)
+x_test = vectorize_words(x_test, word_idx)
+
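+# augment each sequence with ids for its n-grams (see add_ngram above)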
 if ngram_range > 1:
     print('Adding {}-gram features'.format(ngram_range))
     # n-gram set from train data
@@ -130,22 +146,9 @@ def add_ngram(sequences, token_indice, ngram_range=2):
     print('Average train sequence length: {}'.format(train_mean_len))
     print('Average test sequence length: {}'.format(test_mean_len))
 
-vocab = set()
-for w in x_train + x_test + y_test:
-    vocab |= set(w)
-vocab = sorted(vocab)
-vocab_size = len(vocab) + 1
-print('Vocab size:', vocab_size, 'unique words')
-word_idx = dict((c, i + 1) for i, c in enumerate(vocab))
-ids_2_word = dict((value, key) for key, value in word_idx.items())
-
 print('pad sequences (samples x time)')
-# x_train = sequence.pad_sequences(x_train, maxlen=max_len)
-# x_test = sequence.pad_sequences(x_test, maxlen=max_len)
-x_train = vectorize_words(x_train, word_idx, max_len)
-x_test = vectorize_words(x_test, word_idx, max_len)
-print('x_train shape:', x_train.shape)
-print('x_test shape:', x_test.shape)
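+# pad/truncate the augmented sequences to the fixed length max_len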
+x_train = sequence.pad_sequences(x_train, maxlen=max_len)
+x_test = sequence.pad_sequences(x_test, maxlen=max_len)
 
 print('build model...')
 model = Sequential()
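+# fastText-style stack: presumably Embedding -> GlobalAveragePooling1D ->
+# Dense softmax, per the imports above (layer definitions not shown here)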
@@ -166,5 +169,5 @@ def add_ngram(sequences, token_indice, ngram_range=2):
 print('save model:', SAVE_MODEL_PATH)
 probs = model.predict(x_test, batch_size=batch_size)
 assert len(probs) == len(y_test)
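+# compare the gold label index with the predicted index and its probability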
-for answer, prob in zip(y_test, probs):
-    print('answer_test_index:%s\t prob_index:%s\t prob:%s' % (answer, prob.argmax(), prob.max()))
+for label, prob in zip(y_test, probs):
+    print('label_test_index:%s\t prob_index:%s\t prob:%s' % (label.argmax(), prob.argmax(), prob.max()))