-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathTransliterate.py
More file actions
162 lines (142 loc) · 5.04 KB
/
Transliterate.py
File metadata and controls
162 lines (142 loc) · 5.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# -*- coding: iso-8859-1 -*-
"""
Google Transliterate API
Public classes: Transliteration, TransliterationError
Public methods: Transliteration.getTransliteration
"""
# Author: rajeshsr <[email protected]>
# http://rajeshsr.co.cc
# Licensed under MIT License
import urllib2
import urllib
import re
#import json
import simplejson
class TransliterationError(Exception):
    """Raised when a destination language is unsupported or when no
    transliteration can be extracted from the service response."""
class Transliteration:
    """Transliterate English text through the Google Transliterate endpoint.

    The destination language is fixed at construction time. Results are
    memoised per instance in ``self._cache``, which the constructor tries
    to pre-fill from a word list published by the service.
    """

    # Two-letter language code -> language name used as the last path
    # segment of the request URL in _getTrans.
    _code = {
        'en': 'ENGLISH',
        'ar': 'ARABIC',
        'bn': 'BENGALI',
        'gu': 'GUJARATI',
        'hi': 'HINDI',
        'kn': 'KANNADA',
        'ml': 'MALAYALAM',
        'mr': 'MARATHI',
        'ne': 'NEPALI',
        'fa': 'PERSIAN',
        'pa': 'PUNJABI',
        'ta': 'TAMIL',
        'te': 'TELUGU',
        'ur': 'URDU',
    }

    # Browser-like request headers. NOTE: these are currently not sent with
    # any request (see the note in _getTrans); the table is kept because it
    # is part of the class's existing attributes.
    _headerData = {
        'Host': 'www.google.com',
        'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.14) Gecko/2009091010 Iceweasel/3.0.6 (Debian-3.0.6-3)',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en-us,en;q=0.5',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
        'Keep-Alive': '300',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
        'Referer': 'http://www.google.com/transliterate/',
        'Content-Length': '389',
        'Pragma': 'no-cache',
        'Cache-Control': ' no-cache',
    }

    # Template of the POST form fields. It is never modified: each instance
    # works on its own copy, and 'langpair' / 'text' are overridden there.
    _postData = {
        'langpair': 'ar|en',
        'num': '5',
        'text': 'vallamai',
        'tl_app': '3',
        'tlqt': ' 1',
        'version': '2',
    }

    # Matches one \uXXXX escape. Raw string so that the regex engine really
    # sees an escaped backslash; exactly four hex digits so that hex-looking
    # text directly after an escape is not swallowed into the code point.
    _escapePattern = re.compile(r'\\u([0-9a-fA-F]{4})')

    def __init__(self, dest='ta'):
        """Create a transliterator from English into ``dest``.

        dest -- a key of ``_code`` (for example 'ta' or 'hi'); defaults
                to 'ta'.

        Raises TransliterationError if ``dest`` is not a key of ``_code``.
        """
        if dest not in self._code:
            raise TransliterationError(
                "Destination language %s not supported" % dest)
        self._dest = dest
        # Copy the class-level template so that setting the language pair
        # here cannot change the language used by other instances.
        self._postData = dict(self._postData)
        self._postData['langpair'] = 'en|%s' % dest
        self._cache = {}
        # google caches some commonly used words at the following URL and
        # uses it to speed things up. This list can serve as an offline
        # cache too.
        try:
            url = 'http://www.google.com/transliterate/indic?tlqt=4&langpair=en|%s&tl_app=3&v=1' % dest
            handle = urllib.FancyURLopener({}).open(url)
            try:
                page = handle.read()
            finally:
                handle.close()
            # The payload uses single quotes; swap them so it parses as JSON.
            page = page.replace('\'', '\"')
            # NOTE(review): presumably the payload is a pair of parallel
            # lists (words, transliterations) -- confirm against a live
            # response.
            self._cache = dict(zip(*simplejson.loads(page)))
        except Exception:
            # Deliberately best-effort: the prefetch is only an
            # optimisation, so any failure leaves the cache empty.
            pass

    def _getUnicode(self, s):
        """
        return the unicode string built from the \\uXXXX escapes found in s

        Only the escapes contribute to the result; any other text in s is
        ignored.
        """
        try:
            toChar = unichr
        except NameError:
            # Interpreters without unichr: chr already returns text there.
            toChar = chr
        return u''.join(
            [toChar(int(x, 16)) for x in self._escapePattern.findall(s)])

    def _getTrans(self, word):
        """Return the transliteration of a single word.

        The empty string maps to u''. Cached words are answered without a
        request; otherwise the word is POSTed to the service, the first
        candidate in the response is decoded, cached and returned.

        Raises TransliterationError if no candidate can be found in the
        response.
        """
        if not word:
            return u''
        if word in self._cache:
            return self._cache[word]
        # Build the form per call instead of writing 'text' into shared
        # state. param text contains the word to transliterate.
        params = dict(self._postData)
        params['text'] = word
        url = 'http://www.google.com/transliterate/%s' % self._code[self._dest]
        req = urllib2.Request(url, urllib.urlencode(params))
        # NOTE: _headerData is intentionally not attached. The previous code
        # assigned it to an attribute that urllib2 never reads, so the
        # headers were never sent. That behaviour is kept: the table
        # hard-codes Content-Length and advertises gzip, which this method
        # does not decompress.
        res = urllib2.urlopen(req)
        try:
            val = res.read()
        finally:
            res.close()
        # The first quoted candidate follows the echoed word, a newline,
        # '[' and another newline.
        pattern = r'"%s",\n\[\n"([^"]+)' % re.escape(word)
        target = re.findall(pattern, val)
        if not target:
            raise TransliterationError(
                'Unable to get transliteration of %s' % word)
        self._cache[word] = self._getUnicode(target[0])
        return self._cache[word]

    def getTransliteration(self, line):
        """
        returns transliteration of line

        The destination language is the one given to the constructor
        (tamil by default).
        """
        # The service seems to return nothing for input containing
        # non-alphanumeric characters, so the line is split into runs of
        # letters and digits and sent word by word, as google transliterate
        # itself does. Every other character is copied through unchanged.
        # Note that digits count as word characters here, so numeric runs
        # are sent as well.
        pieces = []
        word = ''
        for c in line:
            if c.isalpha() or c.isdigit():
                word += c
            else:
                pieces.append(self._getTrans(word))
                pieces.append(c)
                word = ''
        pieces.append(self._getTrans(word))
        return u''.join(pieces)
# Language codes accepted as the ``dest`` argument of the Transliteration
# constructor.
ENGLISH = 'en'
ARABIC = 'ar'
BENGALI = 'bn'
GUJARATI = 'gu'
HINDI = 'hi'
KANNADA = 'kn'
MALAYALAM = 'ml'
MARATHI = 'mr'
NEPALI = 'ne'
PERSIAN = 'fa'
PUNJABI = 'pa'
TAMIL = 'ta'
TELUGU = 'te'
URDU = 'ur'

if __name__ == '__main__':
    # Read lines from standard input until EOF and print each one
    # transliterated to Arabic, encoded as UTF-8. (The earlier entry point
    # printed an undefined name and kept this loop disabled inside a string
    # literal.)
    x = Transliteration(ARABIC)
    while True:
        try:
            inp = raw_input()
        except EOFError:
            break
        print(x.getTransliteration(inp).encode("UTF-8"))