Skip to content

Commit 75f4eb4

Browse files
SONARPY-341 CPD based on PyCharm frontend (SonarSource#249)
1 parent 8b372ca commit 75f4eb4

5 files changed

Lines changed: 79 additions & 59 deletions

File tree

python-frontend/src/main/java/org/sonar/python/frontend/PythonParser.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ private static PyFile parseAs(String content, LanguageLevel languageLevel) {
9999
}
100100

101101
@NotNull
102-
private static String normalizeEol(String content) {
102+
public static String normalizeEol(String content) {
103103
return content.replaceAll("\\r\\n?", "\n");
104104
}
105105

python-frontend/src/main/java/org/sonar/python/frontend/PythonTokenLocation.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,13 @@ public class PythonTokenLocation {
3030
private final int endLineOffset;
3131

3232
public PythonTokenLocation(@NotNull PsiElement element) {
33-
Document psiDocument = element.getContainingFile().getViewProvider().getDocument();
34-
int startOffset = element.getTextRange().getStartOffset();
33+
this(element.getTextRange().getStartOffset(), element.getTextRange().getEndOffset(), element.getContainingFile().getViewProvider().getDocument());
34+
}
35+
36+
public PythonTokenLocation(int startOffset, int endOffset, Document psiDocument) {
3537
startLine = psiDocument.getLineNumber(startOffset);
3638
int startLineNumberOffset = psiDocument.getLineStartOffset(startLine);
3739
startLineOffset = startOffset - startLineNumberOffset;
38-
int endOffset = element.getTextRange().getEndOffset();
3940
endLine = psiDocument.getLineNumber(endOffset);
4041
int endLineNumberOffset = psiDocument.getLineStartOffset(endLine);
4142
endLineOffset = endOffset - endLineNumberOffset;

sonar-python-plugin/src/main/java/org/sonar/plugins/python/PythonScanner.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ private void scanFile(InputFile inputFile) {
9595
try {
9696
visitorContext = new PythonVisitorContext(parser.parse(fileContent), pythonFile);
9797
pyFile = new org.sonar.python.frontend.PythonParser().parse(fileContent);
98-
saveMeasures(inputFile, visitorContext, pyFile);
98+
saveMeasures(inputFile, visitorContext, pyFile, fileContent);
9999
} catch (RecognitionException e) {
100100
visitorContext = new PythonVisitorContext(pythonFile, e);
101101
LOG.error("Unable to parse file: " + inputFile.toString());
@@ -162,12 +162,12 @@ private static NewIssueLocation newLocation(InputFile inputFile, NewIssue issue,
162162
return newLocation;
163163
}
164164

165-
private void saveMeasures(InputFile inputFile, PythonVisitorContext visitorContext, PyFile pyFile) {
165+
private void saveMeasures(InputFile inputFile, PythonVisitorContext visitorContext, PyFile pyFile, String fileContent) {
166166
boolean ignoreHeaderComments = new PythonConfiguration(context.fileSystem().encoding()).getIgnoreHeaderComments();
167167
FileMetrics fileMetrics = new FileMetrics(visitorContext, ignoreHeaderComments, pyFile);
168168
MetricsVisitor metricsVisitor = fileMetrics.metricsVisitor();
169169

170-
cpdAnalyzer.pushCpdTokens(inputFile, visitorContext);
170+
cpdAnalyzer.pushCpdTokens(inputFile, pyFile, fileContent);
171171
noSonarFilter.noSonarInFile(inputFile, metricsVisitor.getLinesWithNoSonar());
172172

173173
Set<Integer> linesOfCode = metricsVisitor.getLinesOfCode();

sonar-python-plugin/src/main/java/org/sonar/plugins/python/cpd/PythonCpdAnalyzer.java

Lines changed: 54 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -19,56 +19,79 @@
1919
*/
2020
package org.sonar.plugins.python.cpd;
2121

22-
import com.sonar.sslr.api.AstNode;
23-
import com.sonar.sslr.api.GenericTokenType;
24-
import com.sonar.sslr.api.Token;
25-
import com.sonar.sslr.api.TokenType;
26-
import java.util.List;
22+
import com.intellij.openapi.editor.Document;
23+
import com.intellij.psi.PsiElement;
24+
import com.intellij.psi.tree.IElementType;
25+
import com.jetbrains.python.PyTokenTypes;
26+
import com.jetbrains.python.lexer.PythonIndentingLexer;
27+
import com.jetbrains.python.psi.PyElementType;
28+
import com.jetbrains.python.psi.PyFile;
29+
import java.util.Arrays;
30+
import java.util.HashSet;
31+
import java.util.Set;
32+
import javax.annotation.CheckForNull;
2733
import org.sonar.api.batch.fs.InputFile;
2834
import org.sonar.api.batch.sensor.SensorContext;
2935
import org.sonar.api.batch.sensor.cpd.NewCpdTokens;
30-
import org.sonar.python.PythonVisitorContext;
31-
import org.sonar.python.TokenLocation;
32-
import org.sonar.python.api.PythonTokenType;
36+
import org.sonar.api.utils.log.Logger;
37+
import org.sonar.api.utils.log.Loggers;
38+
import org.sonar.python.frontend.PythonParser;
39+
import org.sonar.python.frontend.PythonTokenLocation;
3340

3441
public class PythonCpdAnalyzer {
3542

3643
private final SensorContext context;
44+
private static final Set<PyElementType> IGNORED_TOKEN_TYPES = new HashSet<>(Arrays.asList(
45+
PyTokenTypes.LINE_BREAK, PyTokenTypes.DEDENT, PyTokenTypes.INDENT, PyTokenTypes.END_OF_LINE_COMMENT, PyTokenTypes.SPACE, PyTokenTypes.STATEMENT_BREAK));
46+
private static final Logger LOG = Loggers.get(PythonCpdAnalyzer.class);
3747

3848
public PythonCpdAnalyzer(SensorContext context) {
3949
this.context = context;
4050
}
4151

42-
public void pushCpdTokens(InputFile inputFile, PythonVisitorContext visitorContext) {
43-
AstNode root = visitorContext.rootTree();
44-
if (root != null) {
45-
NewCpdTokens cpdTokens = context.newCpdTokens().onFile(inputFile);
46-
List<Token> tokens = root.getTokens();
47-
for (int i = 0; i < tokens.size(); i++) {
48-
Token token = tokens.get(i);
49-
TokenType currentTokenType = token.getType();
50-
TokenType nextTokenType = i + 1 < tokens.size() ? tokens.get(i + 1).getType() : GenericTokenType.EOF;
51-
// INDENT/DEDENT could not be completely ignored during CPD see https://docs.python.org/3/reference/lexical_analysis.html#indentation
52-
// Just taking into account DEDENT is enough, but because the DEDENT token has an empty value, it's the
53-
// preceding new line which is added in its place to create a difference
54-
if (isNewLineWithIndentationChange(currentTokenType, nextTokenType) || !isIgnoredType(currentTokenType)) {
55-
TokenLocation location = new TokenLocation(token);
56-
cpdTokens.addToken(location.startLine(), location.startLineOffset(), location.endLine(), location.endLineOffset(), token.getValue());
52+
public void pushCpdTokens(InputFile inputFile, PyFile pyFile, String fileContent) {
53+
Document document = getDocument(pyFile);
54+
if (document == null) {
55+
LOG.debug("Cannot complete CPD analysis: PSIDocument is null.");
56+
return;
57+
}
58+
PythonIndentingLexer lexer = new PythonIndentingLexer();
59+
lexer.start(PythonParser.normalizeEol(fileContent));
60+
NewCpdTokens cpdTokens = context.newCpdTokens().onFile(inputFile);
61+
IElementType prevTokenType = null;
62+
while (lexer.getTokenType() != null) {
63+
IElementType currentTokenType = lexer.getTokenType();
64+
// INDENT/DEDENT could not be completely ignored during CPD see https://docs.python.org/3/reference/lexical_analysis.html#indentation
65+
// Just taking into account DEDENT is enough, but because the DEDENT token has an empty value, it's the
66+
// following new line which is added in its place to create a difference
67+
if (isNewLineWithIndentationChange(prevTokenType, currentTokenType) || !IGNORED_TOKEN_TYPES.contains(currentTokenType)) {
68+
int tokenEnd = lexer.getTokenEnd();
69+
String tokenText = lexer.getTokenText();
70+
if (currentTokenType == PyTokenTypes.LINE_BREAK) {
71+
tokenText = "\n";
72+
tokenEnd = lexer.getTokenStart() + 1;
5773
}
74+
PythonTokenLocation location = new PythonTokenLocation(lexer.getTokenStart(), tokenEnd, document);
75+
cpdTokens.addToken(location.startLine(), location.startLineOffset(), location.endLine(), location.endLineOffset(), tokenText);
5876
}
59-
cpdTokens.save();
77+
prevTokenType = currentTokenType;
78+
lexer.advance();
6079
}
80+
81+
cpdTokens.save();
6182
}
6283

63-
private static boolean isNewLineWithIndentationChange(TokenType currentTokenType, TokenType nextTokenType) {
64-
return currentTokenType.equals(PythonTokenType.NEWLINE) && nextTokenType.equals(PythonTokenType.DEDENT);
84+
private static boolean isNewLineWithIndentationChange(@CheckForNull IElementType prevTokenType, IElementType currentTokenType) {
85+
return prevTokenType != null && prevTokenType == PyTokenTypes.DEDENT && currentTokenType == PyTokenTypes.LINE_BREAK;
6586
}
6687

67-
private static boolean isIgnoredType(TokenType type) {
68-
return type.equals(PythonTokenType.NEWLINE) ||
69-
type.equals(PythonTokenType.DEDENT) ||
70-
type.equals(PythonTokenType.INDENT) ||
71-
type.equals(GenericTokenType.EOF);
88+
@CheckForNull
89+
private static Document getDocument(PyFile pyFile) {
90+
PsiElement root = pyFile.getFirstChild();
91+
if (root == null) {
92+
return null;
93+
}
94+
return root.getContainingFile().getViewProvider().getDocument();
7295
}
7396

7497
}

sonar-python-plugin/src/test/java/org/sonar/plugins/python/cpd/PythonCpdAnalyzerTest.java

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
*/
2020
package org.sonar.plugins.python.cpd;
2121

22+
import com.jetbrains.python.psi.PyFile;
2223
import java.io.File;
23-
import java.nio.charset.StandardCharsets;
2424
import java.nio.file.Paths;
2525
import java.util.List;
2626
import java.util.stream.Collectors;
@@ -32,8 +32,7 @@
3232
import org.sonar.api.batch.sensor.internal.SensorContextTester;
3333
import org.sonar.plugins.python.Python;
3434
import org.sonar.plugins.python.TestUtils;
35-
import org.sonar.python.PythonVisitorContext;
36-
import org.sonar.python.TestPythonVisitorRunner;
35+
import org.sonar.python.frontend.PythonParser;
3736

3837
import static java.nio.charset.StandardCharsets.UTF_8;
3938
import static org.assertj.core.api.Assertions.assertThat;
@@ -46,9 +45,21 @@ public class PythonCpdAnalyzerTest {
4645

4746
@Test
4847
public void code_chunks_2() {
49-
DefaultInputFile inputFile = inputFile("code_chunks_2.py");
50-
PythonVisitorContext visitorContext = TestPythonVisitorRunner.createContext(inputFile.path().toFile());
51-
cpdAnalyzer.pushCpdTokens(inputFile, visitorContext);
48+
File file = new File(BASE_DIR, "code_chunks_2.py");
49+
50+
String content = TestUtils.fileContent(file, UTF_8);
51+
DefaultInputFile inputFile = TestInputFileBuilder.create("moduleKey", file.getName())
52+
.setModuleBaseDir(Paths.get(BASE_DIR))
53+
.setCharset(UTF_8)
54+
.setType(InputFile.Type.MAIN)
55+
.setLanguage(Python.KEY)
56+
.initMetadata(content)
57+
.build();
58+
59+
context.fileSystem().add(inputFile);
60+
61+
PyFile pyFile = new PythonParser().parse(content);
62+
cpdAnalyzer.pushCpdTokens(inputFile, pyFile, content);
5263

5364
List<TokensLine> lines = context.cpdTokens("moduleKey:code_chunks_2.py");
5465
assertThat(lines).isNotNull().hasSize(25);
@@ -89,19 +100,4 @@ public void code_chunks_2() {
89100
"[itemforiteminitems]");
90101
}
91102

92-
private DefaultInputFile inputFile(String fileName) {
93-
File file = new File(BASE_DIR, fileName);
94-
95-
DefaultInputFile inputFile = TestInputFileBuilder.create("moduleKey", file.getName())
96-
.setModuleBaseDir(Paths.get(BASE_DIR))
97-
.setCharset(UTF_8)
98-
.setType(InputFile.Type.MAIN)
99-
.setLanguage(Python.KEY)
100-
.initMetadata(TestUtils.fileContent(file, StandardCharsets.UTF_8))
101-
.build();
102-
103-
context.fileSystem().add(inputFile);
104-
105-
return inputFile;
106-
}
107103
}

0 commit comments

Comments (0)