Skip to content

Commit 57f9639

Browse files
bbc2Saurabh Kumar
authored and committed
Refactor parser to fix inconsistencies (theskumar#180)
* Move parser to separate module
* Add tests
* Use unicode strings for unit tests in Python 2

  Using `str` (i.e. `bytes`) is inconsistent with the types and the implementation.

* Refactor parser

  This fixes inconsistencies reported after the release of version 0.10.0:

  * Valid escapes were interpreted as control characters even when in single-quoted strings.
  * `#` was interpreted as the start of a comment even if there was no whitespace preceding it.

  However, we are keeping the interpretation of escapes in double-quoted strings as they didn't make sense in versions before 0.10.0.

  The single large regular expression is replaced with a handwritten top-down parser using smaller regular expressions. The reason for this change is that it would have been very difficult or impossible to satisfy the parsing requirements with a single regex.
1 parent 73124de commit 57f9639

File tree

8 files changed

+265
-164
lines changed

8 files changed

+265
-164
lines changed

README.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -299,8 +299,9 @@ Changelog
299299
Unreleased
300300
-----
301301

302-
- ...
303-
302+
- Refactor parser to fix parsing inconsistencies ([@bbc2])([#170]).
303+
- Interpret escapes as control characters only in double-quoted strings.
304+
- Interpret `#` as start of comment only if preceded by whitespace.
304305

305306
0.10.2
306307
-----
@@ -428,6 +429,7 @@ Unreleased
428429
[#172]: https://github.com/theskumar/python-dotenv/issues/172
429430
[#121]: https://github.com/theskumar/python-dotenv/issues/121
430431
[#176]: https://github.com/theskumar/python-dotenv/issues/176
432+
[#170]: https://github.com/theskumar/python-dotenv/issues/170
431433

432434
[@asyncee]: https://github.com/asyncee
433435
[@greyli]: https://github.com/greyli

setup.cfg

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,9 @@ universal = 1
55
max-line-length = 120
66
exclude = .tox,.git,docs,venv,.venv
77

8+
[mypy]
9+
ignore_missing_imports = true
10+
811
[metadata]
912
description-file = README.rst
1013

src/dotenv/compat.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,23 @@
11
import sys
2+
from typing import Text
23

34
if sys.version_info >= (3, 0):
45
from io import StringIO # noqa
56
else:
67
from StringIO import StringIO # noqa
78

89
PY2 = sys.version_info[0] == 2 # type: bool
10+
11+
12+
def to_text(string):
    # type: (str) -> Text
    """
    Return `string` as Unicode text, decoding it from UTF-8 if it is a byte string.

    This is useful for defining raw unicode strings because `ur"foo"` isn't valid in
    Python 3.

    Values that are already text are returned unchanged.
    """
    # `bytes` is an alias of `str` on Python 2, so plain literals are still decoded
    # there. Checking the type instead of the interpreter version avoids calling
    # `.decode()` on a Python 2 `unicode` object, which implicitly encodes it as
    # ASCII first and fails on non-ASCII text.
    if isinstance(string, bytes):
        return string.decode("utf-8")
    return string

src/dotenv/main.py

Lines changed: 3 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,21 @@
11
# -*- coding: utf-8 -*-
22
from __future__ import absolute_import, print_function, unicode_literals
33

4-
import codecs
54
import io
65
import os
76
import re
87
import shutil
98
import sys
109
from subprocess import Popen
1110
import tempfile
12-
from typing import (Any, Dict, Iterator, List, Match, NamedTuple, Optional, # noqa
13-
Pattern, Union, TYPE_CHECKING, Text, IO, Tuple) # noqa
11+
from typing import (Dict, Iterator, List, Match, Optional, # noqa
12+
Pattern, Union, TYPE_CHECKING, Text, IO, Tuple)
1413
import warnings
1514
from collections import OrderedDict
1615
from contextlib import contextmanager
1716

1817
from .compat import StringIO, PY2
18+
from .parser import parse_stream
1919

2020
if TYPE_CHECKING: # pragma: no cover
2121
if sys.version_info >= (3, 6):
@@ -30,84 +30,6 @@
3030

3131
__posix_variable = re.compile(r'\$\{[^\}]*\}') # type: Pattern[Text]
3232

33-
_binding = re.compile(
34-
r"""
35-
(
36-
\s* # leading whitespace
37-
(?:export{0}+)? # export
38-
39-
( '[^']+' # single-quoted key
40-
| [^=\#\s]+ # or unquoted key
41-
)?
42-
43-
(?:
44-
(?:{0}*={0}*) # equal sign
45-
46-
( '(?:\\'|[^'])*' # single-quoted value
47-
| "(?:\\"|[^"])*" # or double-quoted value
48-
| [^\#\r\n]* # or unquoted value
49-
)
50-
)?
51-
52-
\s* # trailing whitespace
53-
(?:\#[^\r\n]*)? # comment
54-
(?:\r|\n|\r\n)? # newline
55-
)
56-
""".format(r'[^\S\r\n]'),
57-
re.MULTILINE | re.VERBOSE,
58-
) # type: Pattern[Text]
59-
60-
_escape_sequence = re.compile(r"\\[\\'\"abfnrtv]") # type: Pattern[Text]
61-
62-
63-
Binding = NamedTuple("Binding", [("key", Optional[Text]),
64-
("value", Optional[Text]),
65-
("original", Text)])
66-
67-
68-
def decode_escapes(string):
69-
# type: (Text) -> Text
70-
def decode_match(match):
71-
# type: (Match[Text]) -> Text
72-
return codecs.decode(match.group(0), 'unicode-escape') # type: ignore
73-
74-
return _escape_sequence.sub(decode_match, string)
75-
76-
77-
def is_surrounded_by(string, char):
78-
# type: (Text, Text) -> bool
79-
return (
80-
len(string) > 1
81-
and string[0] == string[-1] == char
82-
)
83-
84-
85-
def parse_binding(string, position):
86-
# type: (Text, int) -> Tuple[Binding, int]
87-
match = _binding.match(string, position)
88-
assert match is not None
89-
(matched, key, value) = match.groups()
90-
if key is None or value is None:
91-
key = None
92-
value = None
93-
else:
94-
value_quoted = is_surrounded_by(value, "'") or is_surrounded_by(value, '"')
95-
if value_quoted:
96-
value = decode_escapes(value[1:-1])
97-
else:
98-
value = value.strip()
99-
return (Binding(key=key, value=value, original=matched), match.end())
100-
101-
102-
def parse_stream(stream):
103-
# type:(IO[Text]) -> Iterator[Binding]
104-
string = stream.read()
105-
position = 0
106-
length = len(string)
107-
while position < length:
108-
(binding, position) = parse_binding(string, position)
109-
yield binding
110-
11133

11234
def to_env(text):
11335
# type: (Text) -> str

src/dotenv/parser.py

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,147 @@
1+
import codecs
2+
import re
3+
from typing import (IO, Iterator, Match, NamedTuple, Optional, Pattern, # noqa
4+
Sequence, Text)
5+
6+
from .compat import to_text
7+
8+
9+
def make_regex(string, extra_flags=0):
    # type: (str, int) -> Pattern[Text]
    """
    Compile `string` as a Unicode regular expression.

    The pattern is passed through `to_text` before compilation, and `re.UNICODE`
    is always set; `extra_flags` is OR-ed on top of it.
    """
    flags = re.UNICODE | extra_flags
    pattern = to_text(string)
    return re.compile(pattern, flags)
12+
13+
14+
# Building blocks of the hand-written parser. `[^\S\r\n]` means "whitespace other
# than a line break", i.e. horizontal whitespace only.

# Leading whitespace before a binding; `\s` also matches line breaks.
_whitespace = make_regex(r"\s*", extra_flags=re.MULTILINE)
_export = make_regex(r"(?:export[^\S\r\n]+)?")
_single_quoted_key = make_regex(r"'([^']+)'")
_unquoted_key = make_regex(r"([^=\#\s]+)")
_equal_sign = make_regex(r"[^\S\r\n]*=[^\S\r\n]*")
_single_quoted_value = make_regex(r"'((?:\\'|[^'])*)'")
_double_quoted_value = make_regex(r'"((?:\\"|[^"])*)"')
_unquoted_value_part = make_regex(r"([^ \r\n]*)")
# Optional trailing comment. Only horizontal whitespace may precede the `#`: with
# `\s*` the match could cross the line break and absorb a comment located on the
# *next* line into the current binding's `original` text.
_comment = make_regex(r"(?:[^\S\r\n]*#[^\r\n]*)?")
_end_of_line = make_regex(r"[^\S\r\n]*(?:\r\n|\n|\r)?")
# Alternatives are tried left to right, so `\r\n` has to come before `\r`;
# otherwise a CRLF line ending is consumed only up to the `\r` and the `\n` leaks
# into the next binding.
_rest_of_line = make_regex(r"[^\r\n]*(?:\r\n|\n|\r)?")
# Escape sequences decoded inside double-quoted and single-quoted values.
_double_quote_escapes = make_regex(r"\\[\\'\"abfnrtv]")
_single_quote_escapes = make_regex(r"\\[\\']")
27+
28+
# One parsed chunk of the input. `key` and `value` are both None when the chunk
# could not be parsed as a binding; `original` is the raw text that was consumed.
Binding = NamedTuple(
    "Binding",
    [
        ("key", Optional[Text]),
        ("value", Optional[Text]),
        ("original", Text),
    ],
)
31+
32+
33+
class Error(Exception):
    """Raised by `Reader` when the requested input cannot be read."""
35+
36+
37+
class Reader(object):
    """
    Cursor over the complete text of a stream.

    The reader keeps the current `position` plus a `mark`; the text between the
    two can be retrieved with `get_marked`, which lets callers recover the raw
    input that a parsing step consumed.

    Inherits from `object` explicitly so that it is a new-style class on Python 2.
    """

    def __init__(self, stream):
        # type: (IO[Text]) -> None
        # The stream is read in full so that regexes can be matched at any offset.
        self.string = stream.read()
        self.position = 0  # index of the next unread character
        self.mark = 0  # start of the span returned by `get_marked`

    def has_next(self):
        # type: () -> bool
        """Return True while unread characters remain."""
        return self.position < len(self.string)

    def set_mark(self):
        # type: () -> None
        """Remember the current position as the start of the marked span."""
        self.mark = self.position

    def get_marked(self):
        # type: () -> Text
        """Return the text between the mark and the current position."""
        return self.string[self.mark:self.position]

    def peek(self, count):
        # type: (int) -> Text
        """
        Return up to `count` upcoming characters without consuming them.

        The result is shorter than `count` when fewer characters remain.
        """
        return self.string[self.position:self.position + count]

    def read(self, count):
        # type: (int) -> Text
        """
        Consume and return exactly `count` characters.

        Raises `Error`, leaving the position unchanged, if fewer remain.
        """
        result = self.string[self.position:self.position + count]
        if len(result) < count:
            raise Error("read: End of string")
        self.position += count
        return result

    def read_regex(self, regex):
        # type: (Pattern[Text]) -> Sequence[Text]
        """
        Match `regex` at the current position and consume the matched text.

        Returns the groups of the match. Raises `Error`, leaving the position
        unchanged, if the pattern does not match at the current position.
        """
        match = regex.match(self.string, self.position)
        if match is None:
            raise Error("read_regex: Pattern not found")
        self.position = match.end()
        return match.groups()
75+
76+
77+
def decode_escapes(regex, string):
    # type: (Pattern[Text], Text) -> Text
    """
    Replace every match of `regex` in `string` with its decoded form.

    Each match is decoded with the `unicode-escape` codec; text that `regex`
    does not match is left untouched.
    """
    def _unescape(found):
        # type: (Match[Text]) -> Text
        escaped = found.group(0)
        return codecs.decode(escaped, 'unicode-escape')  # type: ignore

    return regex.sub(_unescape, string)
84+
85+
86+
def parse_key(reader):
    # type: (Reader) -> Text
    """
    Read a key at the reader's current position and return it.

    A key that starts with a single quote is read with `_single_quoted_key`;
    anything else is read with `_unquoted_key`.
    """
    pattern = _single_quoted_key if reader.peek(1) == "'" else _unquoted_key
    (key,) = reader.read_regex(pattern)
    return key
94+
95+
96+
def parse_unquoted_value(reader):
    # type: (Reader) -> Text
    """
    Read an unquoted value and return it.

    The value is assembled from runs matched by `_unquoted_value_part`. After each
    run the next two characters decide whether the value continues: it stops when
    fewer than two characters remain, when the first is a line break, or when the
    second is a space, a `#` or a line break.
    """
    pieces = []
    while True:
        (piece,) = reader.read_regex(_unquoted_value_part)
        pieces.append(piece)
        lookahead = reader.peek(2)
        if len(lookahead) < 2:
            break
        if lookahead[0] in u"\r\n" or lookahead[1] in u" #\r\n":
            break
        # The two characters belong to the value: keep them and carry on.
        pieces.append(reader.read(2))
    return u"".join(pieces)
106+
107+
108+
def parse_value(reader):
    # type: (Reader) -> Text
    """
    Read the value of a binding and return it.

    The first character selects the form: nothing or a line break gives an empty
    value, a quote gives a quoted value whose escapes are decoded with the regex
    for that quote style, and anything else is read as an unquoted value.
    """
    first = reader.peek(1)
    if first in (u"", u"\n", u"\r"):
        return u""
    if first == u"'":
        (quoted,) = reader.read_regex(_single_quoted_value)
        return decode_escapes(_single_quote_escapes, quoted)
    if first == u'"':
        (quoted,) = reader.read_regex(_double_quoted_value)
        return decode_escapes(_double_quote_escapes, quoted)
    return parse_unquoted_value(reader)
121+
122+
123+
def parse_binding(reader):
    # type: (Reader) -> Binding
    """
    Parse one binding starting at the reader's current position.

    On success the returned `Binding` carries the key and the value. If any step
    raises `Error`, the remainder of the current line is consumed instead and a
    `Binding` with `key` and `value` set to None is returned. In both cases
    `original` is the text consumed since this function was entered.
    """
    reader.set_mark()
    try:
        for prefix in (_whitespace, _export):
            reader.read_regex(prefix)
        key = parse_key(reader)
        reader.read_regex(_equal_sign)
        value = parse_value(reader)
        for suffix in (_comment, _end_of_line):
            reader.read_regex(suffix)
    except Error:
        reader.read_regex(_rest_of_line)
        return Binding(key=None, value=None, original=reader.get_marked())
    else:
        return Binding(key=key, value=value, original=reader.get_marked())
138+
139+
140+
def parse_stream(stream):
    # type: (IO[Text]) -> Iterator[Binding]
    """
    Yield the bindings parsed from `stream`, in order.

    Iteration continues until the reader has no input left, and ends early if
    `Error` is raised while a binding is being produced.
    """
    reader = Reader(stream)
    try:
        while reader.has_next():
            yield parse_binding(reader)
    except Error:
        return

0 commit comments

Comments
 (0)