-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathhtml_tokenizer.py
More file actions
85 lines (58 loc) · 2.11 KB
/
html_tokenizer.py
File metadata and controls
85 lines (58 loc) · 2.11 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import tokens
class Tokenizer:
    """Pull-style tokenizer for HTML-like markup.

    Call :meth:`get_token` repeatedly; each call consumes input and returns
    one token object built from the ``tokens`` module (``OpenTag``,
    ``OpenTagSlash``, ``CloseTag``, ``Equals``, ``String``, ``Identifier``
    or ``EndOfFile``).

    Cursor state kept on the instance:
        content: the full input text.
        position: index in ``content`` of the most recently consumed
            character (``-1`` before anything has been consumed).
        line: starts at ``-1`` and is incremented once per newline consumed.
        column: offset of ``position`` within the current line; the first
            character after a newline has column ``0``.
        current_character: the character at ``position``, or ``None`` before
            the first read and after trailing whitespace has been skipped
            up to the end of the input.

    NOTE(review): ``line`` starts at -1, so the first line is reported as
    -1 rather than 0 -- confirm whether callers depend on this before
    changing it.
    """

    _WHITESPACE = ' \t\r\n'
    # parse_string() closes on whichever quote opened the string, so both
    # quote styles can be dispatched to it.
    _QUOTES = '"\''

    def __init__(self, content: str):
        """Create a tokenizer positioned just before the start of ``content``."""
        self.content = content
        self.position = -1
        self.line = -1
        self.column = -1
        self.current_character = None

    def _at_end(self) -> bool:
        """Return True when the cursor has moved past the last character."""
        return self.position >= len(self.content)

    def advance(self) -> None:
        """Consume one character and store it in ``current_character``.

        Raises:
            RuntimeError: if no character is left. ``position`` and
                ``column`` have already been incremented when this happens.
        """
        self.position += 1
        self.column += 1
        if self.position >= len(self.content):
            raise RuntimeError(
                f'unexpected end of input at offset {self.position}'
            )
        self.current_character = self.content[self.position]

    def parse_string(self):
        """Read a quoted string whose opening quote is ``current_character``.

        The string ends at the next occurrence of the same quote character;
        there is no escape handling. On return the cursor sits on the
        closing quote.

        Returns:
            ``tokens.String`` wrapping the text between the quotes.

        Raises:
            RuntimeError: if the closing quote is missing.
        """
        start_quote = self.current_character
        content_start = self.position + 1
        string_end = self.content.find(start_quote, content_start)
        if string_end == -1:
            raise RuntimeError(
                f'unterminated string starting at offset {self.position}'
            )
        token_content = self.content[content_start:string_end]

        newline_count = token_content.count('\n')
        if newline_count:
            # Keep line/column in step with skip_whitespaces(): the first
            # character after a newline is column 0.
            last_newline = self.content.rfind('\n', content_start, string_end)
            self.line += newline_count
            self.column = string_end - last_newline - 1
        else:
            self.column += string_end - self.position
        self.position = string_end
        self.current_character = self.content[string_end]
        return tokens.String(token_content)

    def parse_identifier(self):
        """Read a run of alphanumeric characters starting at the cursor.

        On return the cursor sits on the last character of the identifier,
        so an identifier that ends the input is handled without reading
        past the end.

        Returns:
            ``tokens.Identifier`` wrapping the characters read.

        Raises:
            RuntimeError: if the character at the cursor is not
                alphanumeric. Returning an empty identifier here would leave
                the cursor in place and make ``get_token`` yield the same
                empty token forever.
        """
        start = self.position
        end = start
        while 0 <= end < len(self.content) and self.content[end].isalnum():
            end += 1
        if end == start:
            raise RuntimeError(
                f'unexpected character {self.current_character!r} '
                f'at offset {start}'
            )
        last = end - 1
        self.column += last - start
        self.position = last
        self.current_character = self.content[last]
        return tokens.Identifier(self.content[start:end])

    def skip_whitespaces(self) -> None:
        """Move the cursor forward to the next non-whitespace character.

        Does nothing when ``current_character`` is not whitespace. Each
        newline consumed increments ``line`` and restarts ``column``. If
        only whitespace remains, the cursor is moved past the end of the
        input and ``current_character`` becomes ``None`` instead of raising.
        """
        while (
            self.current_character is not None
            and self.current_character in self._WHITESPACE
        ):
            if self.current_character == '\n':
                self.line += 1
                self.column = -1
            if self.position + 1 >= len(self.content):
                # Trailing whitespace: step past the end without raising so
                # that get_token() can report EndOfFile.
                self.position += 1
                self.column += 1
                self.current_character = None
                return
            self.advance()

    def get_token(self):
        """Consume and return the next token.

        Returns:
            ``tokens.EndOfFile`` once the input (ignoring trailing
            whitespace) is exhausted, and again on every later call;
            otherwise ``tokens.OpenTag`` for ``<``, ``tokens.OpenTagSlash``
            for ``<`` followed by optional whitespace and ``/``,
            ``tokens.CloseTag`` for ``>``, ``tokens.Equals`` for ``=``,
            ``tokens.String`` for a quoted string, or ``tokens.Identifier``
            for an alphanumeric run.

        Raises:
            RuntimeError: if the input ends right after ``<``, a string is
                unterminated, or an unexpected character is found.
        """
        if self.position + 1 >= len(self.content):
            return tokens.EndOfFile()
        self.advance()
        self.skip_whitespaces()
        if self._at_end():
            # Nothing but whitespace was left.
            return tokens.EndOfFile()

        character = self.current_character
        if character == '<':
            tag_start = self.position
            self.advance()
            self.skip_whitespaces()
            if self._at_end():
                raise RuntimeError(
                    f"unexpected end of input after '<' at offset {tag_start}"
                )
            if self.current_character == '/':
                return tokens.OpenTagSlash()
            # Not a closing tag: step back one character so the next call
            # re-reads the character that was only looked at here.
            self.position -= 1
            self.column -= 1
            return tokens.OpenTag()
        if character == '>':
            return tokens.CloseTag()
        if character in self._QUOTES:
            return self.parse_string()
        if character == '=':
            return tokens.Equals()
        return self.parse_identifier()