-
Notifications
You must be signed in to change notification settings - Fork 37
Expand file tree
/
Copy pathNameDifferentiatorPython.cpp
More file actions
190 lines (164 loc) · 6.4 KB
/
NameDifferentiatorPython.cpp
File metadata and controls
190 lines (164 loc) · 6.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
// SPDX-License-Identifier: GPL-3.0-only
/**
* @file NameDifferentiatorPython.cpp
*
* @copyright Copyright (C) 2024 srcML, LLC. (www.srcML.org)
*
* This file is part of the srcML Toolkit.
*
* Changes certain statement keywords to names in Python.
* Impacts `case`, `exec`, `match`, `print`, and `type`.
*/
#include <NameDifferentiatorPython.hpp>
/**
 * Returns the next token, retyping certain statement keywords as names.
 *
 * Tokens read from `input` while looking ahead are queued in `buffer`;
 * queued tokens are handed out before `input` is read again.
 */
antlr::RefToken NameDifferentiatorPython::nextToken() {
    // only read from the input once every queued token has been returned
    if (buffer.empty()) {
        auto current = input.nextToken();
        const auto currentType = current->getType();

        const bool needsTwoTokenLookAhead =
            currentType == srcMLParser::PY_2_EXEC
            || currentType == srcMLParser::PY_2_PRINT
            || currentType == srcMLParser::PY_TYPE;

        const bool needsVariableLookAhead =
            currentType == srcMLParser::PY_CASE
            || currentType == srcMLParser::PY_MATCH;

        if (needsTwoTokenLookAhead) {
            // `exec`, `print`, or `type` may become names
            lookAheadTwoDifferentiator(current);
        } else if (needsVariableLookAhead) {
            // `case` or `match` may become names
            variableLookAheadDifferentiator(current);
        } else {
            // keep track of being in/out of `()`, `{}`, or `[]`
            checkBracketToken(current);
        }

        // the token just read goes ahead of any look-ahead tokens queued behind it
        buffer.emplace_front(current);
    }

    // hand out the oldest queued token
    auto result = buffer.front();
    buffer.pop_front();

    return result;
}
/**
 * Reads the two tokens following `token` to decide if a keyword should be a name.
 *
 * Both look-ahead tokens are appended to `buffer` and counted by `checkBracketToken()`.
 * `token` is only retyped to `NAME` when it was read outside of all brackets.
 *
 * Handles `exec`, `print`, and `type`.
 */
void NameDifferentiatorPython::lookAheadTwoDifferentiator(antlr::RefToken token) {
    // record the bracket depth before the look-ahead tokens can change it
    const bool outsideBrackets = (numBrackets == 0);

    auto firstAhead = input.nextToken();
    checkBracketToken(firstAhead);
    buffer.emplace_back(firstAhead);

    auto secondAhead = input.nextToken();
    checkBracketToken(secondAhead);
    buffer.emplace_back(secondAhead);

    // statements cannot exist inside brackets; the token type is left as-is there
    if (!outsideBrackets)
        return;

    const auto keywordType = token->getType();

    if (keywordType == srcMLParser::PY_2_EXEC || keywordType == srcMLParser::PY_2_PRINT) {
        // statement form: WS followed by anything other than `(`
        const bool isStatement =
            firstAhead->getType() == srcMLParser::WS
            && secondAhead->getType() != srcMLParser::LPAREN;

        if (!isStatement)
            token->setType(srcMLParser::NAME);
    } else if (keywordType == srcMLParser::PY_TYPE) {
        // statement form: WS followed by a NAME (or a token from the identifier list set)
        const bool isStatement =
            firstAhead->getType() == srcMLParser::WS
            && (
                secondAhead->getType() == srcMLParser::NAME
                || srcMLParser::identifier_list_tokens_set.member(secondAhead->getType())
            );

        if (!isStatement)
            token->setType(srcMLParser::NAME);
    }
}
/**
 * Uses (at least) the next token after `token` to check if a keyword should be a name.
 *
 * Every token read ahead is appended to `buffer`. Scanning stops at the first of:
 *   - a block start token directly after `token` (e.g., a type annotation) => NAME
 *   - the end of the input                                                 => keyword
 *   - the bracket enclosing `token` being closed                           => NAME
 *   - a newline at the same bracket depth as `token`                       => NAME
 *   - a block start token at the same bracket depth as `token`             => keyword
 *
 * A nested soft keyword should always be marked as a `NAME`;
 * That logic goes in `identifier_list[]` in `srcMLParser.g` and
 * its corresponding bitset token set, not here.
 *
 * Supports Python soft keywords `case` and `match`.
 */
void NameDifferentiatorPython::variableLookAheadDifferentiator(antlr::RefToken token) {
    numCurrentBrackets = numBrackets;  // ensure block start token is not buried in brackets
    isName = true;  // assume `token` is a name until shown otherwise

    bool isFirstLookAhead = true;

    while (true) {
        auto aheadToken = input.nextToken();
        checkBracketToken(aheadToken);  // Detect if currently in/out of `()`, `{}`, or `[]`
        buffer.emplace_back(aheadToken);

        const int aheadType = aheadToken->getType();

        // [IS A NAME] `token` + `:` should automatically set `token` to a NAME (for type annotations)
        // if `try` ever becomes a soft keyword, this entire method will break
        if (isFirstLookAhead && aheadType == blockStartToken)
            break;

        isFirstLookAhead = false;

        // [NOT A NAME] failsafe to break out of the loop; never read past the end of the input
        if (aheadType == srcMLParser::EOF_) {
            isName = false;
            break;
        }

        // [IS A NAME] the bracket enclosing `token` was closed without finding a block start token;
        // without this check the depths could not match again at the end of this line,
        // so the scan would run on into later lines (or to the end of the input)
        if (numBrackets < numCurrentBrackets)
            break;

        // anything buried inside additional brackets cannot end the search
        if (numBrackets != numCurrentBrackets)
            continue;

        // [IS A NAME] found a newline not buried inside additional brackets
        // (this now also covers a newline directly after `token`)
        if (aheadType == srcMLParser::EOL || aheadType == srcMLParser::WS_EOL)
            break;

        // [NOT A NAME] found a block start token
        if (aheadType == blockStartToken) {
            isName = false;
            break;
        }
    }

    if (isName)
        token->setType(srcMLParser::NAME);
}
/**
 * Updates the bracket depth (`numBrackets`) for opening and closing brackets
 * (e.g., `()`, `{}`, and `[]`).
 *
 * The depth ensures certain colon (`:`) tokens do not start blocks in Python.
 * Examples include array slicing, dictionaries, and type annotations.
 *
 * Operates under the assumption the code contains balanced brackets.
 */
void NameDifferentiatorPython::checkBracketToken(antlr::RefToken token) {
    const auto tokenType = token->getType();

    // an opening bracket goes one level deeper
    if (srcMLParser::left_bracket_py_token_set.member(tokenType)) {
        ++numBrackets;
        return;
    }

    // a closing bracket goes one level up, but the depth never drops below zero
    if (numBrackets > 0 && srcMLParser::right_bracket_py_token_set.member(tokenType))
        --numBrackets;
}
/**
* Assigns the value of `token` to `blockStartToken`.
*
* Existing logic was built around the colon (`:`) in Python.
* Other values may not work as expected, especially in niche situations.
*
* Refer to `srcMLParserTokenTypes.hpp` for all supported values.
*/
void NameDifferentiatorPython::setBlockStartToken(int token) {
blockStartToken = token;
}
/**
* Returns `blockStartToken`.
*/
int NameDifferentiatorPython::getBlockStartToken() const {
return blockStartToken;
}