forked from shibing624/python-tutorial
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtemp_parser.py
More file actions
379 lines (311 loc) · 10.4 KB
/
temp_parser.py
File metadata and controls
379 lines (311 loc) · 10.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description:
"""
"""md2pdf
translates markdwon file into html or pdf, and support picture insertion.
Usage:
md2pdf <sourcefile> <outputfile> [options]
Options:
-h --help show help document.
-v --version show version information.
-o --output translate sourcefile into html file.
-p --print translate sourcefile into pdf file and html file respectively.
-P --Print translate sourcefile into pdf file only.
"""
import os
import re
import sys
from enum import Enum
from functools import reduce
from subprocess import call
__version__ = '1.0'
# 定义三个枚举类
# 定义表状态
class TABLE(Enum):
Init = 1
Format = 2
Table = 3
# 有序序列状态
class ORDERLIST(Enum):
Init = 1
List = 2
# 块状态
class BLOCK(Enum):
Init = 1
Block = 2
CodeBlock = 3
# 定义全局状态,并初始化状态
table_state = TABLE.Init
orderList_state = ORDERLIST.Init
block_state = BLOCK.Init
is_code = False
is_normal = True
temp_table_first_line = []
temp_table_first_line_str = ""
need_mathjax = False
def get_state(input):
global table_state, orderList_state, block_state, is_code, temp_table_first_line, temp_table_first_line_str
Code_List = ["python\n", "c++\n", "c\n"]
result = input
# 构建正则表达式规则
# 匹配块标识
pattern = re.compile(r'```(\s)*\n')
a = pattern.match(input)
# 普通块
if a and block_state == BLOCK.Init:
result = "<blockquote>"
block_state = BLOCK.Block
is_normal = False
# 特殊代码块
elif len(input) > 4 and input[0:3] == '```' and (
input[3:9] == "python" or input[3:6] == "c++" or input[3:4] == "c") and block_state == BLOCK.Init:
block_state = BLOCK.Block
result = "<code></br>"
is_code = True
is_normal = False
# 块结束
elif block_state == BLOCK.Block and input == '```\n':
if is_code:
result = "</code>"
else:
result = "</blockquote>"
block_state = BLOCK.Init
is_code = False
is_normal = False
elif block_state == BLOCK.Block:
pattern = re.compile(r'[\n\r\v\f\ ]')
result = pattern.sub(" ", result)
pattern = re.compile(r'\t')
result = pattern.sub(" " * 4, result)
result = "<span>" + result + "</span></br>"
is_normal = False
# 解析有序序列
if len(input) > 2 and input[0].isdigit() and input[1] == '.' and orderList_state == ORDERLIST.Init:
orderList_state = ORDERLIST.List
result = "<ol><li>" + input[2:] + "</li>"
is_normal = False
elif len(input) > 2 and input[0].isdigit() and input[1] == '.' and orderList_state == ORDERLIST.List:
result = "<li>" + input[2:] + "</li>"
is_normal = False
elif orderList_state == ORDERLIST.List and (len(input) <= 2 or input[0].isdigit() == False or input[1] != '.'):
result = "</ol>" + input
orderList_state = ORDERLIST.Init
# 解析表格
pattern = re.compile(r'^((.+)\|)+((.+))$')
match = pattern.match(input)
if match:
l = input.split('|')
l[-1] = l[-1][:-1]
# 将空字符弹出列表
if l[0] == '':
l.pop(0)
if l[-1] == '':
l.pop(-1)
if table_state == TABLE.Init:
table_state = TABLE.Format
temp_table_first_line = l
temp_table_first_line_str = input
result = ""
elif table_state == TABLE.Format:
# 如果是表头与表格主题的分割线
if reduce(lambda a, b: a and b, [all_same(i, '-') for i in l], True):
table_state = TABLE.Table
result = "<table><thread><tr>"
is_normal = False
# 添加表头
for i in temp_table_first_line:
result += "<th>" + i + "</th>"
result += "</tr>"
result += "</thread><tbody>"
is_normal = False
else:
result = temp_table_first_line_str + "</br>" + input
table_state = TABLE.Init
elif table_state == TABLE.Table:
result = "<tr>"
for i in l:
result += "<td>" + i + "</td>"
result += "</tr>"
elif table_state == TABLE.Table:
table_state = TABLE.Init
result = "</tbody></table>" + result
elif table_state == TABLE.Format:
pass
return result
# 判断 lst 是否全由字符 sym 构成
def all_same(lst, sym):
return not lst or sym * len(lst) == lst
# 处理标题
def handleTitle(s, n):
temp = "<h" + repr(n) + ">" + s[n:] + "</h" + repr(n) + ">"
return temp
# 处理无序列表
def handleUnorderd(s):
s = "<ul><li>" + s[1:]
s += "</li></ul>"
return s
def tokenTemplate(s, match):
pattern = ""
if match == '*':
pattern = "\*([^\*]*)\*"
if match == '~~':
pattern = "\~\~([^\~\~]*)\~\~"
if match == '**':
pattern = "\*\*([^\*\*]*)\*\*"
return pattern
# 处理特殊标识,比如 **, *, ~~
def tokenHandler(s):
l = ['b', 'i', 'S']
j = 0
for i in ['**', '*', '~~']:
pattern = re.compile(tokenTemplate(s, i))
match = pattern.finditer(s)
k = 0
for a in match:
if a:
content = a.group(1)
x, y = a.span()
c = 3
if i == '*':
c = 5
s = s[:x + c * k] + "<" + l[j] + ">" + content + "</" + l[j] + ">" + s[y + c * k:]
k += 1
pattern = re.compile(r'\$([^\$]*)\$')
a = pattern.search(s)
if a:
global need_mathjax
need_mathjax = True
j += 1
return s
# 处理链接
def link_image(s):
# 超链接
pattern = re.compile(r'\\\[(.*)\]\((.*)\)')
match = pattern.finditer(s)
for a in match:
if a:
text, url = a.group(1, 2)
x, y = a.span()
s = s[:x] + "<a href=" + url + " target=\"_blank\">" + text + "</a>" + s[y:]
# 图像链接
pattern = re.compile(r'!\[(.*)\]\((.*)\)')
match = pattern.finditer(s)
for a in match:
if a:
text, url = a.group(1, 2)
x, y = a.span()
s = s[:x] + "<img src=" + url + " target=\"_blank\">" + "</a>" + s[y:]
# 角标
pattern = re.compile(r'(.)\^\[([^\]]*)\]')
match = pattern.finditer(s)
k = 0
for a in match:
if a:
sym, index = a.group(1, 2)
x, y = a.span()
s = s[:x + 8 * k] + sym + "<sup>" + index + "</sup>" + s[y + 8 * k:]
k += 1
return s
def parse(input):
global block_state, is_normal
is_normal = True
result = input
# 检测当前 input 解析状态
result = get_state(input)
if block_state == BLOCK.Block:
return result
# 分析标题标记 #
title_rank = 0
for i in range(6, 0, -1):
if input[:i] == '#' * i:
title_rank = i
break
if title_rank != 0:
# 处理标题,转化为相应的 HTML 文本
result = handleTitle(input, title_rank)
return result
# 分析分割线标记 --
if len(input) > 2 and all_same(input[:-1], '-') and input[-1] == '\n':
result = "<hr>"
return result
# 解析无序列表
unorderd = ['+', '-']
if result != "" and result[0] in unorderd:
result = handleUnorderd(result)
is_normal = False
f = input[0]
count = 0
sys_q = False
while f == '>':
count += 1
f = input[count]
sys_q = True
if sys_q:
result = "<blockquote style=\"color:#8fbc8f\"> " * count + "<b>" + input[
count:] + "</b>" + "</blockquote>" * count
is_normal = False
# 处理特殊标记,比如 ***, ~~~
result = tokenHandler(result)
# 解析图像链接
result = link_image(result)
pa = re.compile(r'^(\s)*$')
a = pa.match(input)
if input[-1] == "\n" and is_normal == True and not a:
result += "</br>"
return result
def run(source_file, dest_file, dest_pdf_file, only_pdf):
# 获取文件名
file_name = source_file
# 转换后的 HTML 文件名
dest_name = dest_file
# 转换后的 PDF 文件名
dest_pdf_name = dest_pdf_file
# 获取文件后缀
_, suffix = os.path.splitext(file_name)
if suffix not in [".md", ".markdown", ".mdown", "mkd"]:
print('Error: the file should be in markdown format')
sys.exit(1)
if only_pdf:
dest_name = ".~temp~.html"
f = open(file_name, "r",encoding='utf-8')
f_r = open(dest_name, "w",encoding='utf-8')
# 往文件中填写 HTML 的一些属性
f_r.write("""<style type="text/css">div {display: block;font-family: "Times New Roman",Georgia,Serif}\
#wrapper { width: 100%;height:100%; margin: 0; padding: 0;}#left { float:left; \
width: 10%; height: 100%; }#second { float:left; width: 80%;height: 100%; \
}#right {float:left; width: 10%; height: 100%; \
}</style><div id="wrapper"> <div id="left"></div><div id="second">""")
f_r.write("""<meta charset="utf-8"/>""")
# 逐行解析 markdwon 文件
for eachline in f:
result = parse(eachline)
if result != "":
f_r.write(result)
print(result)
f_r.write("""</br></br></div><div id="right"></div></div>""")
# 公式支持
global need_mathjax
if need_mathjax:
f_r.write("""<script type="text/x-mathjax-config">\
MathJax.Hub.Config({tex2jax: {inlineMath: [['$','$'], ['\\(','\\)']]}});\
</script><script type="text/javascript" \
src="http://cdn.mathjax.org/mathjax/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>""")
# 文件操作完成之后记得关闭!!!
f_r.close()
f.close()
# 调用扩展 wkhtmltopdf 将 HTML 文件转换成 PDF
if dest_pdf_name != "" or only_pdf:
call(["wkhtmltopdf", dest_name, dest_pdf_name])
# 如果有必要,删除中间过程生成的 HTML 文件
if only_pdf:
call(["rm", dest_name])
# 主函数
def main():
dest_file = "translation_result.html"
dest_pdf_file = "translation_result.pdf"
only_pdf = False
run('temp.md', dest_file, '', only_pdf)
if __name__ == "__main__":
main()