Skip to content

Commit 75f4eb4

Browse files
SONARPY-341 CPD based on PyCharm frontend (SonarSource#249)
1 parent 8b372ca commit 75f4eb4

5 files changed

Lines changed: 79 additions & 59 deletions

File tree

python-frontend/src/main/java/org/sonar/python/frontend/PythonParser.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ private static PyFile parseAs(String content, LanguageLevel languageLevel) {
9999
}
100100

101101
@NotNull
102-
private static String normalizeEol(String content) {
102+
public static String normalizeEol(String content) {
103103
return content.replaceAll("\\r\\n?", "\n");
104104
}
105105

python-frontend/src/main/java/org/sonar/python/frontend/PythonTokenLocation.java

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -30,12 +30,13 @@ public class PythonTokenLocation {
3030
private final int endLineOffset;
3131

3232
public PythonTokenLocation(@NotNull PsiElement element) {
33-
Document psiDocument = element.getContainingFile().getViewProvider().getDocument();
34-
int startOffset = element.getTextRange().getStartOffset();
33+
this(element.getTextRange().getStartOffset(), element.getTextRange().getEndOffset(), element.getContainingFile().getViewProvider().getDocument());
34+
}
35+
36+
public PythonTokenLocation(int startOffset, int endOffset, Document psiDocument) {
3537
startLine = psiDocument.getLineNumber(startOffset);
3638
int startLineNumberOffset = psiDocument.getLineStartOffset(startLine);
3739
startLineOffset = startOffset - startLineNumberOffset;
38-
int endOffset = element.getTextRange().getEndOffset();
3940
endLine = psiDocument.getLineNumber(endOffset);
4041
int endLineNumberOffset = psiDocument.getLineStartOffset(endLine);
4142
endLineOffset = endOffset - endLineNumberOffset;

sonar-python-plugin/src/main/java/org/sonar/plugins/python/PythonScanner.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ private void scanFile(InputFile inputFile) {
9595
try {
9696
visitorContext = new PythonVisitorContext(parser.parse(fileContent), pythonFile);
9797
pyFile = new org.sonar.python.frontend.PythonParser().parse(fileContent);
98-
saveMeasures(inputFile, visitorContext, pyFile);
98+
saveMeasures(inputFile, visitorContext, pyFile, fileContent);
9999
} catch (RecognitionException e) {
100100
visitorContext = new PythonVisitorContext(pythonFile, e);
101101
LOG.error("Unable to parse file: " + inputFile.toString());
@@ -162,12 +162,12 @@ private static NewIssueLocation newLocation(InputFile inputFile, NewIssue issue,
162162
return newLocation;
163163
}
164164

165-
private void saveMeasures(InputFile inputFile, PythonVisitorContext visitorContext, PyFile pyFile) {
165+
private void saveMeasures(InputFile inputFile, PythonVisitorContext visitorContext, PyFile pyFile, String fileContent) {
166166
boolean ignoreHeaderComments = new PythonConfiguration(context.fileSystem().encoding()).getIgnoreHeaderComments();
167167
FileMetrics fileMetrics = new FileMetrics(visitorContext, ignoreHeaderComments, pyFile);
168168
MetricsVisitor metricsVisitor = fileMetrics.metricsVisitor();
169169

170-
cpdAnalyzer.pushCpdTokens(inputFile, visitorContext);
170+
cpdAnalyzer.pushCpdTokens(inputFile, pyFile, fileContent);
171171
noSonarFilter.noSonarInFile(inputFile, metricsVisitor.getLinesWithNoSonar());
172172

173173
Set<Integer> linesOfCode = metricsVisitor.getLinesOfCode();

sonar-python-plugin/src/main/java/org/sonar/plugins/python/cpd/PythonCpdAnalyzer.java

Lines changed: 54 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -19,56 +19,79 @@
1919
*/
2020
package org.sonar.plugins.python.cpd;
2121

22-
import com.sonar.sslr.api.AstNode;
23-
import com.sonar.sslr.api.GenericTokenType;
24-
import com.sonar.sslr.api.Token;
25-
import com.sonar.sslr.api.TokenType;
26-
import java.util.List;
22+
import com.intellij.openapi.editor.Document;
23+
import com.intellij.psi.PsiElement;
24+
import com.intellij.psi.tree.IElementType;
25+
import com.jetbrains.python.PyTokenTypes;
26+
import com.jetbrains.python.lexer.PythonIndentingLexer;
27+
import com.jetbrains.python.psi.PyElementType;
28+
import com.jetbrains.python.psi.PyFile;
29+
import java.util.Arrays;
30+
import java.util.HashSet;
31+
import java.util.Set;
32+
import javax.annotation.CheckForNull;
2733
import org.sonar.api.batch.fs.InputFile;
2834
import org.sonar.api.batch.sensor.SensorContext;
2935
import org.sonar.api.batch.sensor.cpd.NewCpdTokens;
30-
import org.sonar.python.PythonVisitorContext;
31-
import org.sonar.python.TokenLocation;
32-
import org.sonar.python.api.PythonTokenType;
36+
import org.sonar.api.utils.log.Logger;
37+
import org.sonar.api.utils.log.Loggers;
38+
import org.sonar.python.frontend.PythonParser;
39+
import org.sonar.python.frontend.PythonTokenLocation;
3340

3441
public class PythonCpdAnalyzer {
3542

3643
private final SensorContext context;
44+
private static final Set<PyElementType> IGNORED_TOKEN_TYPES = new HashSet<>(Arrays.asList(
45+
PyTokenTypes.LINE_BREAK, PyTokenTypes.DEDENT, PyTokenTypes.INDENT, PyTokenTypes.END_OF_LINE_COMMENT, PyTokenTypes.SPACE, PyTokenTypes.STATEMENT_BREAK));
46+
private static final Logger LOG = Loggers.get(PythonCpdAnalyzer.class);
3747

3848
public PythonCpdAnalyzer(SensorContext context) {
3949
this.context = context;
4050
}
4151

42-
public void pushCpdTokens(InputFile inputFile, PythonVisitorContext visitorContext) {
43-
AstNode root = visitorContext.rootTree();
44-
if (root != null) {
45-
NewCpdTokens cpdTokens = context.newCpdTokens().onFile(inputFile);
46-
List<Token> tokens = root.getTokens();
47-
for (int i = 0; i < tokens.size(); i++) {
48-
Token token = tokens.get(i);
49-
TokenType currentTokenType = token.getType();
50-
TokenType nextTokenType = i + 1 < tokens.size() ? tokens.get(i + 1).getType() : GenericTokenType.EOF;
51-
// INDENT/DEDENT could not be completely ignored during CPD see https://docs.python.org/3/reference/lexical_analysis.html#indentation
52-
// Just taking into account DEDENT is enough, but because the DEDENT token has an empty value, it's the
53-
// preceding new line which is added in its place to create a difference
54-
if (isNewLineWithIndentationChange(currentTokenType, nextTokenType) || !isIgnoredType(currentTokenType)) {
55-
TokenLocation location = new TokenLocation(token);
56-
cpdTokens.addToken(location.startLine(), location.startLineOffset(), location.endLine(), location.endLineOffset(), token.getValue());
52+
public void pushCpdTokens(InputFile inputFile, PyFile pyFile, String fileContent) {
53+
Document document = getDocument(pyFile);
54+
if (document == null) {
55+
LOG.debug("Cannot complete CPD analysis: PSIDocument is null.");
56+
return;
57+
}
58+
PythonIndentingLexer lexer = new PythonIndentingLexer();
59+
lexer.start(PythonParser.normalizeEol(fileContent));
60+
NewCpdTokens cpdTokens = context.newCpdTokens().onFile(inputFile);
61+
IElementType prevTokenType = null;
62+
while (lexer.getTokenType() != null) {
63+
IElementType currentTokenType = lexer.getTokenType();
64+
// INDENT/DEDENT could not be completely ignored during CPD see https://docs.python.org/3/reference/lexical_analysis.html#indentation
65+
// Just taking into account DEDENT is enough, but because the DEDENT token has an empty value, it's the
66+
// following new line which is added in its place to create a difference
67+
if (isNewLineWithIndentationChange(prevTokenType, currentTokenType) || !IGNORED_TOKEN_TYPES.contains(currentTokenType)) {
68+
int tokenEnd = lexer.getTokenEnd();
69+
String tokenText = lexer.getTokenText();
70+
if (currentTokenType == PyTokenTypes.LINE_BREAK) {
71+
tokenText = "\n";
72+
tokenEnd = lexer.getTokenStart() + 1;
5773
}
74+
PythonTokenLocation location = new PythonTokenLocation(lexer.getTokenStart(), tokenEnd, document);
75+
cpdTokens.addToken(location.startLine(), location.startLineOffset(), location.endLine(), location.endLineOffset(), tokenText);
5876
}
59-
cpdTokens.save();
77+
prevTokenType = currentTokenType;
78+
lexer.advance();
6079
}
80+
81+
cpdTokens.save();
6182
}
6283

63-
private static boolean isNewLineWithIndentationChange(TokenType currentTokenType, TokenType nextTokenType) {
64-
return currentTokenType.equals(PythonTokenType.NEWLINE) && nextTokenType.equals(PythonTokenType.DEDENT);
84+
private static boolean isNewLineWithIndentationChange(@CheckForNull IElementType prevTokenType, IElementType currentTokenType) {
85+
return prevTokenType != null && prevTokenType == PyTokenTypes.DEDENT && currentTokenType == PyTokenTypes.LINE_BREAK;
6586
}
6687

67-
private static boolean isIgnoredType(TokenType type) {
68-
return type.equals(PythonTokenType.NEWLINE) ||
69-
type.equals(PythonTokenType.DEDENT) ||
70-
type.equals(PythonTokenType.INDENT) ||
71-
type.equals(GenericTokenType.EOF);
88+
@CheckForNull
89+
private static Document getDocument(PyFile pyFile) {
90+
PsiElement root = pyFile.getFirstChild();
91+
if (root == null) {
92+
return null;
93+
}
94+
return root.getContainingFile().getViewProvider().getDocument();
7295
}
7396

7497
}

sonar-python-plugin/src/test/java/org/sonar/plugins/python/cpd/PythonCpdAnalyzerTest.java

Lines changed: 17 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@
1919
*/
2020
package org.sonar.plugins.python.cpd;
2121

22+
import com.jetbrains.python.psi.PyFile;
2223
import java.io.File;
23-
import java.nio.charset.StandardCharsets;
2424
import java.nio.file.Paths;
2525
import java.util.List;
2626
import java.util.stream.Collectors;
@@ -32,8 +32,7 @@
3232
import org.sonar.api.batch.sensor.internal.SensorContextTester;
3333
import org.sonar.plugins.python.Python;
3434
import org.sonar.plugins.python.TestUtils;
35-
import org.sonar.python.PythonVisitorContext;
36-
import org.sonar.python.TestPythonVisitorRunner;
35+
import org.sonar.python.frontend.PythonParser;
3736

3837
import static java.nio.charset.StandardCharsets.UTF_8;
3938
import static org.assertj.core.api.Assertions.assertThat;
@@ -46,9 +45,21 @@ public class PythonCpdAnalyzerTest {
4645

4746
@Test
4847
public void code_chunks_2() {
49-
DefaultInputFile inputFile = inputFile("code_chunks_2.py");
50-
PythonVisitorContext visitorContext = TestPythonVisitorRunner.createContext(inputFile.path().toFile());
51-
cpdAnalyzer.pushCpdTokens(inputFile, visitorContext);
48+
File file = new File(BASE_DIR, "code_chunks_2.py");
49+
50+
String content = TestUtils.fileContent(file, UTF_8);
51+
DefaultInputFile inputFile = TestInputFileBuilder.create("moduleKey", file.getName())
52+
.setModuleBaseDir(Paths.get(BASE_DIR))
53+
.setCharset(UTF_8)
54+
.setType(InputFile.Type.MAIN)
55+
.setLanguage(Python.KEY)
56+
.initMetadata(content)
57+
.build();
58+
59+
context.fileSystem().add(inputFile);
60+
61+
PyFile pyFile = new PythonParser().parse(content);
62+
cpdAnalyzer.pushCpdTokens(inputFile, pyFile, content);
5263

5364
List<TokensLine> lines = context.cpdTokens("moduleKey:code_chunks_2.py");
5465
assertThat(lines).isNotNull().hasSize(25);
@@ -89,19 +100,4 @@ public void code_chunks_2() {
89100
"[itemforiteminitems]");
90101
}
91102

92-
private DefaultInputFile inputFile(String fileName) {
93-
File file = new File(BASE_DIR, fileName);
94-
95-
DefaultInputFile inputFile = TestInputFileBuilder.create("moduleKey", file.getName())
96-
.setModuleBaseDir(Paths.get(BASE_DIR))
97-
.setCharset(UTF_8)
98-
.setType(InputFile.Type.MAIN)
99-
.setLanguage(Python.KEY)
100-
.initMetadata(TestUtils.fileContent(file, StandardCharsets.UTF_8))
101-
.build();
102-
103-
context.fileSystem().add(inputFile);
104-
105-
return inputFile;
106-
}
107103
}

0 commit comments

Comments (0)