-
Notifications
You must be signed in to change notification settings - Fork 93
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
SONARPY-1493: Support lexing of PEP701 f-strings (#1606)
- Loading branch information
Showing
7 changed files
with
691 additions
and
56 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
204 changes: 204 additions & 0 deletions
204
python-frontend/src/main/java/org/sonar/python/lexer/FStringChannel312.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
/* | ||
* SonarQube Python Plugin | ||
* Copyright (C) 2011-2023 SonarSource SA | ||
* mailto:info AT sonarsource DOT com | ||
* | ||
* This program is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 3 of the License, or (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public License | ||
* along with this program; if not, write to the Free Software Foundation, | ||
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
*/ | ||
package org.sonar.python.lexer; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.stream.IntStream; | ||
|
||
import org.sonar.python.api.PythonPunctuator; | ||
import org.sonar.python.api.PythonTokenType; | ||
import org.sonar.python.lexer.FStringState.Mode; | ||
import org.sonar.sslr.channel.Channel; | ||
import org.sonar.sslr.channel.CodeReader; | ||
|
||
import com.sonar.sslr.api.Token; | ||
import com.sonar.sslr.api.TokenType; | ||
import com.sonar.sslr.impl.Lexer; | ||
|
||
/** | ||
* A channel to handle f-strings. | ||
* See https://docs.python.org/3.12/reference/lexical_analysis.html#formatted-string-literals | ||
*/ | ||
public class FStringChannel312 extends Channel<Lexer> { | ||
|
||
private static final char EOF = (char) -1; | ||
|
||
private final LexerState lexerState; | ||
private final StringBuilder sb = new StringBuilder(); | ||
|
||
private static final List<Character> QUOTES = List.of('\"', '\''); | ||
private static final List<Character> PREFIXES = List.of('F', 'R'); | ||
|
||
public FStringChannel312(LexerState lexerState) { | ||
this.lexerState = lexerState; | ||
} | ||
|
||
@Override | ||
public boolean consume(CodeReader code, Lexer output) { | ||
char c = code.charAt(0); | ||
int line = code.getLinePosition(); | ||
int column = code.getColumnPosition(); | ||
|
||
FStringState currentState = lexerState.fStringStateStack.peek(); | ||
|
||
if (canConsumeFStringPrefix(sb, code)) { | ||
char quote = code.charAt(0); | ||
StringBuilder quotes = consumeFStringQuotes(code, quote); | ||
FStringState newState = new FStringState(Mode.FSTRING_MODE, lexerState.brackets); | ||
newState.setQuote(quote); | ||
newState.setNumberOfQuotes(quotes.length()); | ||
lexerState.fStringStateStack.push(newState); | ||
Token fStringStartToken = buildToken(PythonTokenType.FSTRING_START, sb.append(quotes).toString(), output, line, column); | ||
sb.setLength(0); | ||
List<Token> tokens = new ArrayList<>(); | ||
tokens.add(fStringStartToken); | ||
return consumeFStringMiddle(tokens, sb, newState, code, output); | ||
} | ||
|
||
FStringState.Mode currentMode = currentState.getTokenizerMode(); | ||
|
||
if (currentMode == Mode.REGULAR_MODE && lexerState.fStringStateStack.size() > 1) { | ||
if (c == '}') { | ||
Token rCurlyBraceToken = buildToken(PythonPunctuator.RCURLYBRACE, "}", output, line, column); | ||
code.pop(); | ||
List<Token> tokens = new ArrayList<>(); | ||
tokens.add(rCurlyBraceToken); | ||
lexerState.fStringStateStack.pop(); | ||
FStringState previousState = lexerState.fStringStateStack.peek(); | ||
return consumeFStringMiddle(tokens, sb, previousState, code, output); | ||
// do not lexer colon if the nesting level is different from the open curly brace | ||
} else if (c == ':' && lexerState.brackets == currentState.getBrackets()) { | ||
Token formatSpecifier = buildToken(PythonPunctuator.COLON, ":", output, line, column); | ||
code.pop(); | ||
List<Token> tokens = new ArrayList<>(); | ||
tokens.add(formatSpecifier); | ||
FStringState newState = new FStringState(Mode.FORMAT_SPECIFIER_MODE, lexerState.brackets); | ||
lexerState.fStringStateStack.push(newState); | ||
return consumeFStringMiddle(tokens, sb, newState, code, output); | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
private boolean consumeFStringMiddle(List<Token> tokens, StringBuilder sb, FStringState state, CodeReader code, Lexer output) { | ||
int line = code.getLinePosition(); | ||
int column = code.getColumnPosition(); | ||
FStringState.Mode currentMode = state.getTokenizerMode(); | ||
while (code.charAt(0) != EOF) { | ||
if (currentMode == Mode.FSTRING_MODE && isEscapedCurlyBrace(code)) { | ||
sb.append((char) code.pop()); | ||
sb.append((char) code.pop()); | ||
} else if (code.charAt(0) == '{') { | ||
addFStringMiddleToTokens(tokens, sb, output, line, column); | ||
addLCurlBraceAndSwitchToRegularMode(tokens, code, output); | ||
addTokens(tokens, output); | ||
return true; | ||
} else if (currentMode == Mode.FORMAT_SPECIFIER_MODE && code.charAt(0) == '}') { | ||
addFStringMiddleToTokens(tokens, sb, output, line, column); | ||
lexerState.fStringStateStack.pop(); | ||
addTokens(tokens, output); | ||
return true; | ||
} else if (currentMode == Mode.FSTRING_MODE && areClosingQuotes(code, state)) { | ||
addFStringMiddleToTokens(tokens, sb, output, line, column); | ||
addFStringEndToTokens(code, state.getQuote(), tokens, output); | ||
addTokens(tokens, output); | ||
return true; | ||
} else { | ||
sb.append((char) code.pop()); | ||
} | ||
} | ||
return false; | ||
} | ||
|
||
private static boolean canConsumeFStringPrefix(StringBuilder sb, CodeReader code) { | ||
Character firstChar = Character.toUpperCase(code.charAt(0)); | ||
Character secondChar = Character.toUpperCase(code.charAt(1)); | ||
if (firstChar == 'F' && QUOTES.contains(code.charAt(1))) { | ||
sb.append((char) code.pop()); | ||
return true; | ||
} else if (PREFIXES.contains(firstChar) && PREFIXES.contains(secondChar) && | ||
!firstChar.equals(secondChar) && QUOTES.contains(code.charAt(2))) { | ||
sb.append((char) code.pop()); | ||
sb.append((char) code.pop()); | ||
return true; | ||
} | ||
return false; | ||
} | ||
|
||
private static boolean isEscapedCurlyBrace(CodeReader code) { | ||
return Arrays.equals(code.peek(2), "{{".toCharArray()) || Arrays.equals(code.peek(2), "}}".toCharArray()); | ||
} | ||
|
||
private static boolean areClosingQuotes(CodeReader code, FStringState state) { | ||
char[] quotes = code.peek(state.getNumberOfQuotes()); | ||
return IntStream.range(0, quotes.length).mapToObj(i -> quotes[i]).allMatch(state.getQuote()::equals); | ||
} | ||
|
||
private static void addFStringMiddleToTokens(List<Token> tokens, StringBuilder sb, Lexer output, int line, int column) { | ||
if (sb.length() != 0) { | ||
Token fStringMiddleToken = buildToken(PythonTokenType.FSTRING_MIDDLE, sb.toString(), output, line, column); | ||
sb.setLength(0); | ||
tokens.add(fStringMiddleToken); | ||
} | ||
} | ||
|
||
private void addFStringEndToTokens(CodeReader code, char quote, List<Token> tokens, Lexer output) { | ||
StringBuilder endQuotes = consumeFStringQuotes(code, quote); | ||
lexerState.fStringStateStack.pop(); | ||
Token fStringEndToken = buildToken(PythonTokenType.FSTRING_END, endQuotes.toString(), output, code.getLinePosition(), code.getColumnPosition()); | ||
tokens.add(fStringEndToken); | ||
} | ||
|
||
private void addLCurlBraceAndSwitchToRegularMode(List<Token> tokens, CodeReader code, Lexer output) { | ||
Token curlyBraceToken = buildToken(PythonPunctuator.LCURLYBRACE, "{", output, code.getLinePosition(), code.getColumnPosition()); | ||
code.pop(); | ||
FStringState updatedState = new FStringState(FStringState.Mode.REGULAR_MODE, lexerState.brackets); | ||
lexerState.fStringStateStack.push(updatedState); | ||
tokens.add(curlyBraceToken); | ||
} | ||
|
||
private static StringBuilder consumeFStringQuotes(CodeReader code, char quote) { | ||
StringBuilder quotes = new StringBuilder(); | ||
if (code.charAt(1) == quote && code.charAt(2) == quote) { | ||
quotes.append((char) code.pop()); | ||
quotes.append((char) code.pop()); | ||
quotes.append((char) code.pop()); | ||
} else { | ||
quotes.append((char) code.pop()); | ||
} | ||
return quotes; | ||
} | ||
|
||
private static void addTokens(List<Token> tokens, Lexer output) { | ||
output.addToken(tokens.toArray(Token[]::new)); | ||
} | ||
|
||
private static Token buildToken(TokenType tokenType, String value, Lexer output, int line, int column) { | ||
return Token.builder() | ||
.setType(tokenType) | ||
.setValueAndOriginalValue(value) | ||
.setURI(output.getURI()) | ||
.setLine(line) | ||
.setColumn(column) | ||
.build(); | ||
} | ||
} |
65 changes: 65 additions & 0 deletions
65
python-frontend/src/main/java/org/sonar/python/lexer/FStringState.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
/* | ||
* SonarQube Python Plugin | ||
* Copyright (C) 2011-2023 SonarSource SA | ||
* mailto:info AT sonarsource DOT com | ||
* | ||
* This program is free software; you can redistribute it and/or | ||
* modify it under the terms of the GNU Lesser General Public | ||
* License as published by the Free Software Foundation; either | ||
* version 3 of the License, or (at your option) any later version. | ||
* | ||
* This program is distributed in the hope that it will be useful, | ||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | ||
* Lesser General Public License for more details. | ||
* | ||
* You should have received a copy of the GNU Lesser General Public License | ||
* along with this program; if not, write to the Free Software Foundation, | ||
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. | ||
*/ | ||
package org.sonar.python.lexer; | ||
|
||
/**
 * Mutable holder describing one level of f-string lexing: the tokenizer mode, the bracket
 * counter value supplied at creation and, when set by the creator, the quote character and
 * the number of quotes of the delimiter.
 */
public class FStringState {

  /** Tokenizer modes a state can be created with. */
  public enum Mode {
    REGULAR_MODE,
    FSTRING_MODE,
    FORMAT_SPECIFIER_MODE
  }

  /** Quote character of the delimiter; stays {@code null} until {@link #setQuote(Character)} is called. */
  Character quote;

  /** Length of the delimiter; stays {@code 0} until {@link #setNumberOfQuotes(int)} is called. */
  int numberOfQuotes;

  /** Bracket counter value given to the constructor. */
  int brackets;

  private final Mode tokenizerMode;

  /**
   * @param mode the tokenizer mode of this state
   * @param brackets the bracket counter value to remember for this state
   */
  public FStringState(Mode mode, int brackets) {
    this.brackets = brackets;
    this.tokenizerMode = mode;
  }

  public Mode getTokenizerMode() {
    return this.tokenizerMode;
  }

  public int getBrackets() {
    return this.brackets;
  }

  public Character getQuote() {
    return this.quote;
  }

  public void setQuote(Character quote) {
    this.quote = quote;
  }

  public int getNumberOfQuotes() {
    return this.numberOfQuotes;
  }

  public void setNumberOfQuotes(int numberOfQuotes) {
    this.numberOfQuotes = numberOfQuotes;
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.