SONARPY-1493: Support lexing of PEP701 f-strings (#1606)

SonarSource · Oct 19, 2023 · 8f382be · 8f382be
1 parent 4666eba
commit 8f382be
Show file tree

Hide file tree

Showing 7 changed files with 691 additions and 56 deletions.
diff --git a/python-frontend/src/main/java/org/sonar/python/api/PythonTokenType.java b/python-frontend/src/main/java/org/sonar/python/api/PythonTokenType.java
@@ -25,6 +25,10 @@
 public enum PythonTokenType implements TokenType {
   NUMBER,
   STRING,
+
+  FSTRING_START,
+  FSTRING_MIDDLE,
+  FSTRING_END,
 
   INDENT,
   DEDENT,

diff --git a/python-frontend/src/main/java/org/sonar/python/lexer/FStringChannel312.java b/python-frontend/src/main/java/org/sonar/python/lexer/FStringChannel312.java
@@ -0,0 +1,204 @@
+/*
+ * SonarQube Python Plugin
+ * Copyright (C) 2011-2023 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+package org.sonar.python.lexer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.IntStream;
+
+import org.sonar.python.api.PythonPunctuator;
+import org.sonar.python.api.PythonTokenType;
+import org.sonar.python.lexer.FStringState.Mode;
+import org.sonar.sslr.channel.Channel;
+import org.sonar.sslr.channel.CodeReader;
+
+import com.sonar.sslr.api.Token;
+import com.sonar.sslr.api.TokenType;
+import com.sonar.sslr.impl.Lexer;
+
+/**
+ * A channel to handle f-strings.
+ * See https://docs.python.org/3.12/reference/lexical_analysis.html#formatted-string-literals
+ *
+ * <p>The channel emits {@code FSTRING_START}, {@code FSTRING_MIDDLE} and {@code FSTRING_END} tokens
+ * as well as the {@code {}, {@code }} and {@code :} punctuators that delimit replacement fields
+ * and format specifiers. The tokenizer mode is tracked through the stack of {@link FStringState}
+ * held by the shared {@link LexerState}: this channel only reacts in regular mode when it sees an
+ * f-string prefix, or when the stack shows that the lexer is inside a replacement field.
+ */
+public class FStringChannel312 extends Channel<Lexer> {
+
+  private static final char EOF = (char) -1;
+
+  private static final List<Character> QUOTES = List.of('\"', '\'');
+  private static final List<Character> PREFIXES = List.of('F', 'R');
+
+  private final LexerState lexerState;
+  // Scratch buffer shared by all calls; it must be empty whenever consume() returns.
+  private final StringBuilder sb = new StringBuilder();
+
+  public FStringChannel312(LexerState lexerState) {
+    this.lexerState = lexerState;
+  }
+
+  /**
+   * Tries to lex f-string related tokens at the current position.
+   *
+   * <p>Three situations are handled:
+   * <ul>
+   *   <li>an f-string prefix followed by a quote: a new f-string state is pushed and the literal part is lexed;</li>
+   *   <li>a {@code }} while in regular mode inside an f-string: the replacement field is closed and
+   *   lexing resumes with the state that was active before the field was opened;</li>
+   *   <li>a {@code :} while in regular mode inside an f-string, at the bracket nesting level recorded
+   *   when the field was opened: a format specifier state is pushed and the specifier is lexed.</li>
+   * </ul>
+   *
+   * @param code the reader positioned on the next character to lex
+   * @param output the lexer receiving the tokens
+   * @return {@code true} when tokens were added to {@code output}, {@code false} otherwise
+   */
+  @Override
+  public boolean consume(CodeReader code, Lexer output) {
+    char c = code.charAt(0);
+    int line = code.getLinePosition();
+    int column = code.getColumnPosition();
+
+    FStringState currentState = lexerState.fStringStateStack.peek();
+
+    if (canConsumeFStringPrefix(sb, code)) {
+      // The prefix has been popped into sb, the reader is now on the opening quote.
+      char quote = code.charAt(0);
+      StringBuilder quotes = consumeFStringQuotes(code, quote);
+      FStringState newState = new FStringState(Mode.FSTRING_MODE, lexerState.brackets);
+      newState.setQuote(quote);
+      newState.setNumberOfQuotes(quotes.length());
+      lexerState.fStringStateStack.push(newState);
+      Token fStringStartToken = buildToken(PythonTokenType.FSTRING_START, sb.append(quotes).toString(), output, line, column);
+      sb.setLength(0);
+      List<Token> tokens = new ArrayList<>();
+      tokens.add(fStringStartToken);
+      return consumeFStringMiddle(tokens, sb, newState, code, output);
+    }
+
+    FStringState.Mode currentMode = currentState.getTokenizerMode();
+
+    // A stack size greater than one means that at least one f-string is currently open.
+    if (currentMode == Mode.REGULAR_MODE && lexerState.fStringStateStack.size() > 1) {
+      if (c == '}') {
+        Token rCurlyBraceToken = buildToken(PythonPunctuator.RCURLYBRACE, "}", output, line, column);
+        code.pop();
+        List<Token> tokens = new ArrayList<>();
+        tokens.add(rCurlyBraceToken);
+        // Leave the replacement field and continue with the enclosing state.
+        lexerState.fStringStateStack.pop();
+        FStringState previousState = lexerState.fStringStateStack.peek();
+        return consumeFStringMiddle(tokens, sb, previousState, code, output);
+      } else if (c == ':' && lexerState.brackets == currentState.getBrackets()) {
+        // Do not lex the colon as a format specifier if the nesting level differs from the one of the opening curly brace.
+        Token formatSpecifier = buildToken(PythonPunctuator.COLON, ":", output, line, column);
+        code.pop();
+        List<Token> tokens = new ArrayList<>();
+        tokens.add(formatSpecifier);
+        FStringState newState = new FStringState(Mode.FORMAT_SPECIFIER_MODE, lexerState.brackets);
+        lexerState.fStringStateStack.push(newState);
+        return consumeFStringMiddle(tokens, sb, newState, code, output);
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Accumulates literal characters until a replacement field starts, the format specifier ends
+   * or the f-string is closed, then flushes the pending tokens to the lexer.
+   *
+   * @param tokens tokens already built by the caller, emitted together with the ones built here
+   * @param sb buffer receiving the literal characters, expected to be empty on entry
+   * @param state the f-string or format specifier state driving the lexing
+   * @param code the reader positioned on the first character of the literal part
+   * @param output the lexer receiving the tokens
+   * @return {@code true} when tokens were emitted, {@code false} when the end of input was reached first
+   */
+  private boolean consumeFStringMiddle(List<Token> tokens, StringBuilder sb, FStringState state, CodeReader code, Lexer output) {
+    // Position of the first literal character, used for the FSTRING_MIDDLE token.
+    int line = code.getLinePosition();
+    int column = code.getColumnPosition();
+    FStringState.Mode currentMode = state.getTokenizerMode();
+    // NOTE(review): backslash-escaped quotes are not treated specially here - confirm whether this is intended.
+    while (code.charAt(0) != EOF) {
+      if (currentMode == Mode.FSTRING_MODE && isEscapedCurlyBrace(code)) {
+        sb.append((char) code.pop());
+        sb.append((char) code.pop());
+      } else if (code.charAt(0) == '{') {
+        addFStringMiddleToTokens(tokens, sb, output, line, column);
+        addLCurlBraceAndSwitchToRegularMode(tokens, code, output);
+        addTokens(tokens, output);
+        return true;
+      } else if (currentMode == Mode.FORMAT_SPECIFIER_MODE && code.charAt(0) == '}') {
+        // The '}' itself is not consumed here: it is lexed by the next call to consume().
+        addFStringMiddleToTokens(tokens, sb, output, line, column);
+        lexerState.fStringStateStack.pop();
+        addTokens(tokens, output);
+        return true;
+      } else if (currentMode == Mode.FSTRING_MODE && areClosingQuotes(code, state)) {
+        addFStringMiddleToTokens(tokens, sb, output, line, column);
+        addFStringEndToTokens(code, state, tokens, output);
+        addTokens(tokens, output);
+        return true;
+      } else {
+        sb.append((char) code.pop());
+      }
+    }
+    // Unterminated f-string: drop the buffered text so that it cannot leak into the next token
+    // built from the shared buffer.
+    sb.setLength(0);
+    return false;
+  }
+
+  /**
+   * Pops the f-string prefix into {@code sb} when the reader is positioned on one.
+   * Accepted prefixes (case-insensitive) are {@code f}, {@code fr} and {@code rf}, immediately followed by a quote.
+   *
+   * @return {@code true} when a prefix was consumed, {@code false} when nothing was consumed
+   */
+  private static boolean canConsumeFStringPrefix(StringBuilder sb, CodeReader code) {
+    Character firstChar = Character.toUpperCase(code.charAt(0));
+    Character secondChar = Character.toUpperCase(code.charAt(1));
+    if (firstChar == 'F' && QUOTES.contains(code.charAt(1))) {
+      sb.append((char) code.pop());
+      return true;
+    } else if (PREFIXES.contains(firstChar) && PREFIXES.contains(secondChar) &&
+      !firstChar.equals(secondChar) && QUOTES.contains(code.charAt(2))) {
+      sb.append((char) code.pop());
+      sb.append((char) code.pop());
+      return true;
+    }
+    return false;
+  }
+
+  /** Returns whether the next two characters are <code>{{</code> or <code>}}</code>. */
+  private static boolean isEscapedCurlyBrace(CodeReader code) {
+    return Arrays.equals(code.peek(2), "{{".toCharArray()) || Arrays.equals(code.peek(2), "}}".toCharArray());
+  }
+
+  /** Returns whether the next characters are the closing delimiter of the f-string described by {@code state}. */
+  private static boolean areClosingQuotes(CodeReader code, FStringState state) {
+    char[] quotes = code.peek(state.getNumberOfQuotes());
+    return IntStream.range(0, quotes.length).mapToObj(i -> quotes[i]).allMatch(state.getQuote()::equals);
+  }
+
+  /** Turns the buffered literal text, if any, into an FSTRING_MIDDLE token and empties the buffer. */
+  private static void addFStringMiddleToTokens(List<Token> tokens, StringBuilder sb, Lexer output, int line, int column) {
+    if (sb.length() != 0) {
+      Token fStringMiddleToken = buildToken(PythonTokenType.FSTRING_MIDDLE, sb.toString(), output, line, column);
+      sb.setLength(0);
+      tokens.add(fStringMiddleToken);
+    }
+  }
+
+  /**
+   * Consumes the closing delimiter of the f-string described by {@code state}, pops that state
+   * and appends the FSTRING_END token to {@code tokens}.
+   *
+   * <p>Exactly as many quotes as the opening delimiter had are consumed, so that a quote belonging
+   * to an adjacent string literal is never swallowed. The token is positioned on its first quote.
+   * Callers must have checked {@link #areClosingQuotes(CodeReader, FStringState)} beforehand.
+   */
+  private void addFStringEndToTokens(CodeReader code, FStringState state, List<Token> tokens, Lexer output) {
+    // Capture the position before popping, otherwise the token would point past the delimiter.
+    int line = code.getLinePosition();
+    int column = code.getColumnPosition();
+    StringBuilder endQuotes = new StringBuilder();
+    for (int i = 0; i < state.getNumberOfQuotes(); i++) {
+      endQuotes.append((char) code.pop());
+    }
+    lexerState.fStringStateStack.pop();
+    Token fStringEndToken = buildToken(PythonTokenType.FSTRING_END, endQuotes.toString(), output, line, column);
+    tokens.add(fStringEndToken);
+  }
+
+  /** Consumes a <code>{</code>, appends the matching punctuator token and pushes a regular mode state. */
+  private void addLCurlBraceAndSwitchToRegularMode(List<Token> tokens, CodeReader code, Lexer output) {
+    Token curlyBraceToken = buildToken(PythonPunctuator.LCURLYBRACE, "{", output, code.getLinePosition(), code.getColumnPosition());
+    code.pop();
+    FStringState updatedState = new FStringState(FStringState.Mode.REGULAR_MODE, lexerState.brackets);
+    lexerState.fStringStateStack.push(updatedState);
+    tokens.add(curlyBraceToken);
+  }
+
+  /**
+   * Consumes the opening delimiter of an f-string: three quotes when the reader is positioned on
+   * three consecutive {@code quote} characters, a single one otherwise.
+   *
+   * @return the consumed quotes
+   */
+  private static StringBuilder consumeFStringQuotes(CodeReader code, char quote) {
+    StringBuilder quotes = new StringBuilder();
+    if (code.charAt(1) == quote && code.charAt(2) == quote) {
+      quotes.append((char) code.pop());
+      quotes.append((char) code.pop());
+      quotes.append((char) code.pop());
+    } else {
+      quotes.append((char) code.pop());
+    }
+    return quotes;
+  }
+
+  /** Hands all pending tokens over to the lexer in one call. */
+  private static void addTokens(List<Token> tokens, Lexer output) {
+    output.addToken(tokens.toArray(Token[]::new));
+  }
+
+  /** Builds a token whose value and original value are both {@code value}, located at the given position. */
+  private static Token buildToken(TokenType tokenType, String value, Lexer output, int line, int column) {
+    return Token.builder()
+      .setType(tokenType)
+      .setValueAndOriginalValue(value)
+      .setURI(output.getURI())
+      .setLine(line)
+      .setColumn(column)
+      .build();
+  }
+}
diff --git a/python-frontend/src/main/java/org/sonar/python/lexer/FStringState.java b/python-frontend/src/main/java/org/sonar/python/lexer/FStringState.java
@@ -0,0 +1,65 @@
+/*
+ * SonarQube Python Plugin
+ * Copyright (C) 2011-2023 SonarSource SA
+ * mailto:info AT sonarsource DOT com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 3 of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this program; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+package org.sonar.python.lexer;
+
+/**
+ * One entry of the f-string lexing stack: a tokenizer mode together with the quote character,
+ * the number of quotes and the bracket count that were stored for it.
+ */
+public class FStringState {
+
+  /** The tokenizer modes a state can be in. */
+  public enum Mode {
+    REGULAR_MODE,
+    FSTRING_MODE,
+    FORMAT_SPECIFIER_MODE
+  }
+
+  // Quote character stored through setQuote; null until it is set.
+  Character quote;
+  // Number of quotes stored through setNumberOfQuotes; 0 until it is set.
+  int numberOfQuotes;
+  // Bracket count supplied at construction.
+  int brackets;
+
+  private Mode tokenizerMode;
+
+  /**
+   * @param mode the tokenizer mode of this state
+   * @param brackets the bracket count to remember for this state
+   */
+  public FStringState(Mode mode, int brackets) {
+    this.brackets = brackets;
+    this.tokenizerMode = mode;
+  }
+
+  /** @return the tokenizer mode given at construction */
+  public Mode getTokenizerMode() {
+    return this.tokenizerMode;
+  }
+
+  /** @return the bracket count given at construction */
+  public int getBrackets() {
+    return this.brackets;
+  }
+
+  /** @return the stored quote character, or {@code null} when none was set */
+  public Character getQuote() {
+    return this.quote;
+  }
+
+  /** @param quote the quote character to store */
+  public void setQuote(Character quote) {
+    this.quote = quote;
+  }
+
+  /** @return the stored number of quotes */
+  public int getNumberOfQuotes() {
+    return this.numberOfQuotes;
+  }
+
+  /** @param numberOfQuotes the number of quotes to store */
+  public void setNumberOfQuotes(int numberOfQuotes) {
+    this.numberOfQuotes = numberOfQuotes;
+  }
+}
diff --git a/python-frontend/src/main/java/org/sonar/python/lexer/LexerState.java b/python-frontend/src/main/java/org/sonar/python/lexer/LexerState.java
@@ -22,10 +22,14 @@
 import java.util.ArrayDeque;
 import java.util.Deque;
 
+import org.sonar.python.lexer.FStringState.Mode;
+
 public class LexerState {
 
   public final Deque<Integer> indentationStack = new ArrayDeque<>();
 
+  public final Deque<FStringState> fStringStateStack = new ArrayDeque<>();
+
   int brackets;
   boolean joined;
   int initialLine = 1;
@@ -37,6 +41,8 @@ public void reset() {
 
     brackets = 0;
     joined = false;
+    fStringStateStack.clear();
+    fStringStateStack.push(new FStringState(Mode.REGULAR_MODE, brackets));
   }
 
   public void reset(int initialLine, int initialColumn) {