-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathnoising.py
More file actions
94 lines (74 loc) · 2.39 KB
/
noising.py
File metadata and controls
94 lines (74 loc) · 2.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import sys
import codecs
import argparse
import random
def build_vocab(corpus):
    """Count how often each lower-cased token occurs in a corpus file.

    The file is read as UTF-8, one line at a time. Each line is stripped,
    lower-cased and split on whitespace; every resulting token is counted.

    Args:
        corpus: Path of the corpus text file.

    Returns:
        A plain dict mapping each token to its integer occurrence count.
    """
    dict_vocab = {}
    # The ``with`` statement closes the file; no explicit close() is needed.
    with codecs.open(corpus, 'r', 'utf-8') as f:
        for line in f:
            for word in line.strip().lower().split():
                dict_vocab[word] = dict_vocab.get(word, 0) + 1
    return dict_vocab
def read_normalized_dict(norm_dict):
    """Load a two-column, tab-separated dictionary file into a grouping dict.

    Each non-blank line must hold exactly two tab-separated fields. The
    second field is used as the key and the first field is appended to that
    key's list, so several first-column entries can share one key.
    (Presumably the first column is a noisy variant and the second the
    normalized word -- confirm against the dictionary file format.)

    Args:
        norm_dict: Path of the UTF-8 dictionary file.

    Returns:
        A dict mapping each second-column string to the list of
        first-column strings seen with it, in file order.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields.
    """
    dict_norm = {}
    with codecs.open(norm_dict, 'r', 'utf-8') as f:
        for line_num, line in enumerate(f, 1):
            entry = line.strip()
            if not entry:
                # Blank lines (e.g. a trailing empty line) carry no mapping.
                continue
            fields = entry.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    "%s:%d: expected 2 tab-separated fields, got %d"
                    % (norm_dict, line_num, len(fields)))
            wd, word = fields
            dict_norm.setdefault(word, []).append(wd)
    return dict_norm
def load_vocab(corpus_vocab):
    """Load a vocabulary file of ``word<TAB>count`` lines.

    Counts are converted to ``int`` so the result has the same value type
    as the dict produced by ``build_vocab`` (the original kept them as
    strings). Blank lines are ignored.

    Args:
        corpus_vocab: Path of the UTF-8 vocabulary file.

    Returns:
        A dict mapping each word to its integer count.

    Raises:
        ValueError: If a non-blank line does not split into exactly two
            tab-separated fields, or the count is not an integer.
    """
    dict_vocab = {}
    with codecs.open(corpus_vocab, 'r', 'utf-8') as f:
        for line_num, line in enumerate(f, 1):
            entry = line.strip()
            if not entry:
                continue
            fields = entry.split('\t')
            if len(fields) != 2:
                raise ValueError(
                    "%s:%d: expected 2 tab-separated fields, got %d"
                    % (corpus_vocab, line_num, len(fields)))
            word, count = fields
            dict_vocab[word] = int(count)
    return dict_vocab
def save_vocab(dict_vocab, corpus):
    """Write a vocabulary dict to ``<corpus>.vocab`` as UTF-8 text.

    One ``key<TAB>value`` line is written per entry, in the dict's
    iteration order; values are rendered with ``str``.

    Args:
        dict_vocab: Mapping of word to count to be written.
        corpus: Corpus path; ".vocab" is appended to form the output path.
    """
    # The ``with`` statement closes the file; no explicit close() is needed.
    with codecs.open(corpus + ".vocab", 'w', 'utf-8') as f:
        f.writelines(
            key + "\t" + str(value) + "\n"
            for key, value in dict_vocab.items()
        )
def norm_dict_to_corpus(corpus, dict_norm):
    """Record the corpus line numbers on which dictionary keys occur.

    The corpus is read as UTF-8. Each line is stripped, lower-cased and
    split on whitespace; every token that is a key of ``dict_norm`` has the
    current 1-based line number appended to its list. A token that occurs
    several times on one line contributes that line number once per
    occurrence.

    Args:
        corpus: Path of the corpus text file.
        dict_norm: Container of lexicon words; only membership is tested.

    Returns:
        A dict mapping each matched word to the list of 1-based line
        numbers where it was found, in file order.
    """
    dict_norm_corpus = {}
    with codecs.open(corpus, 'r', 'utf-8') as f:
        for line_num, line in enumerate(f, 1):
            for word in line.strip().lower().split():
                if word in dict_norm:
                    # this line contains the lexicon word
                    dict_norm_corpus.setdefault(word, []).append(line_num)
    return dict_norm_corpus
if __name__ == "__main__":
    # Command-line entry point: load the normalization dictionary, build or
    # load the corpus vocabulary, then print which corpus lines contain
    # dictionary words.
    parser = argparse.ArgumentParser(description='Add spelling errors in the corpus')
    # --corpus and --norm_dictionary are always opened below, so they are
    # marked required: omitting one now yields an argparse usage error
    # instead of a TypeError from codecs.open(None, ...).
    parser.add_argument('--corpus', type=str, required=True, help='corpus file')
    parser.add_argument('--corpus_vocab', type=str, help='corpus vocab file. optional parameter')
    parser.add_argument('--norm_dictionary', type=str, required=True, help='normalized dictionary')
    params = parser.parse_args()

    corpus = params.corpus
    norm_dictionary = params.norm_dictionary

    dict_norm = read_normalized_dict(norm_dictionary)

    if params.corpus_vocab is None:
        # No vocabulary file supplied: build it from the corpus and save it
        # next to the corpus for later runs.
        dict_vocab = build_vocab(corpus)
        save_vocab(dict_vocab, corpus)
        print("Save vocabulary file as " + corpus + ".vocab")
    else:
        dict_vocab = load_vocab(params.corpus_vocab)

    dict_norm_corpus = norm_dict_to_corpus(corpus, dict_norm)
    print(dict_norm_corpus)