-
Notifications
You must be signed in to change notification settings - Fork 37
Expand file tree
/
Copy pathDocstringPython.cpp
More file actions
113 lines (96 loc) · 3.8 KB
/
DocstringPython.cpp
File metadata and controls
113 lines (96 loc) · 3.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
// SPDX-License-Identifier: GPL-3.0-only
/**
* @file DocstringPython.cpp
*
* @copyright Copyright (C) 2024 srcML, LLC. (www.srcML.org)
*
* This file is part of the srcML Toolkit.
*
* Alters string literals that should be docstrings in the token stream.
* As a consequence, it will fix any incorrect string literal line numbers.
*/
#include <DocstringPython.hpp>
// Hands out the next token from the input stream, retyping a STRING_START or
// CHAR_START token as a docstring start when it is the first token after a
// function/class block start that is not whitespace, an EOL, or the block start itself
antlr::RefToken DocstringPython::nextToken() {

    // pull one token from the input whenever the buffer has run dry
    if (buffer.empty()) {
        auto incoming = input.nextToken();
        const auto kind = incoming->getType();

        // maintain the bracket nesting depth (covers (), {}, and []);
        // the depth is never decremented below zero
        if (srcMLParser::left_bracket_py_token_set.member(kind)) {
            ++numBrackets;
        } else if (numBrackets > 0 && srcMLParser::right_bracket_py_token_set.member(kind)) {
            --numBrackets;
        }

        // a function or class token arms the docstring search
        const bool opensDefinition = (kind == srcMLParser::PY_FUNCTION) || (kind == srcMLParser::CLASS);
        if (opensDefinition)
            isFunctionOrClass = true;

        // a block start token only counts when it is outside of all brackets
        const bool isBlockStartKind = (kind == blockStartToken);
        if (isBlockStartKind && isFunctionOrClass && numBrackets == 0)
            isBlockStart = true;

        // once inside the block, the first token that is not the block start,
        // an EOL, or whitespace decides the outcome: a string/char start becomes
        // a docstring start, and in every case the search ends
        if (isFunctionOrClass && isBlockStart) {
            const bool skippable = isBlockStartKind
                || (kind == srcMLParser::EOL)
                || srcMLParser::whitespace_token_set.member(kind);

            if (!skippable) {
                if (kind == srcMLParser::STRING_START) {
                    incoming->setType(srcMLParser::DQUOTE_DOCSTRING_START);
                } else if (kind == srcMLParser::CHAR_START) {
                    incoming->setType(srcMLParser::SQUOTE_DOCSTRING_START);
                }

                isFunctionOrClass = false;
                isBlockStart = false;
            }
        }

        // the type may have just been rewritten, so read it again for the line bookkeeping
        const auto emittedKind = incoming->getType();

        // comments and multi-line string literals may carry newlines in their text
        const bool mayHoldNewlines = srcMLParser::comment_py_token_set.member(emittedKind)
            || srcMLParser::multiline_literals_py_token_set.member(emittedKind);
        if (mayHoldNewlines)
            countWSNewlineTokens(incoming);

        // each end-of-line token moves on to the next line
        const bool endsLine = (emittedKind == srcMLParser::EOL)
            || (emittedKind == srcMLParser::WS_EOL)
            || (emittedKind == srcMLParser::EOL_BACKSLASH);
        if (endsLine)
            ++lineNumber;

        // queue the token that was just read
        buffer.emplace_back(incoming);
    }

    // hand back the token at the front of the buffer
    auto outgoing = buffer.front();
    buffer.pop_front();

    return outgoing;
}
/**
 * Advances `lineNumber` once for every newline character found in the text of `token`.
 *
 * When `token` is a multi-line string literal, its line is then set to the
 * updated `lineNumber`.
 *
 * @param token the token (e.g., comment, docstring, etc.) whose text is scanned.
 */
void DocstringPython::countWSNewlineTokens(antlr::RefToken token) {

    const std::string contents = token->getText();

    // one line per newline character in the token's text
    for (const char ch : contents) {
        if (ch == '\n')
            ++lineNumber;
    }

    // keep the recorded line of a multi-line string token in step with the running count
    const bool isMultilineLiteral = srcMLParser::multiline_literals_py_token_set.member(token->getType());
    if (isMultilineLiteral)
        token->setLine(lineNumber);
}
/**
* Assigns the value of `token` to `blockStartToken`.
*
* Existing logic was built around the colon (`:`) in Python.
* Other values may not work as expected, especially in niche situations.
*
* Refer to `srcMLParserTokenTypes.hpp` for all supported values.
*/
void DocstringPython::setBlockStartToken(int token) {
blockStartToken = token;
}
/**
* Returns `blockStartToken`.
*/
int DocstringPython::getBlockStartToken() const {
return blockStartToken;
}