
Commit

SONARPY-1494: Support grammar of PEP701 f-strings (#1620)
joke1196 authored Oct 26, 2023
1 parent 4554af9 commit bbb023e
Showing 20 changed files with 543 additions and 638 deletions.
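For context: PEP 701 (Python 3.12) formalizes f-strings in the grammar itself and lifts several old tokenizer restrictions, which is what this commit teaches the sonar-python grammar and lexer to handle. A minimal sketch of f-string constructs the updated grammar needs to parse (illustrative snippets, not taken from this commit):

# Reusing the outer quote character inside a replacement field (new in 3.12):
songs = ["Take me back to Eden", "Alkaline"]
print(f"Playlist: {", ".join(songs)}")

# Arbitrarily nested f-strings (new in 3.12):
print(f"{f"{1 + 1}"}")

# Nested replacement fields inside a format specifier (already legal, shown for completeness):
width = 8
print(f"{3.14159:.{width}f}")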
@@ -40,6 +40,7 @@
import org.sonar.plugins.python.api.tree.ParenthesizedExpression;
import org.sonar.plugins.python.api.tree.Pattern;
import org.sonar.plugins.python.api.tree.ReturnStatement;
import org.sonar.plugins.python.api.tree.StringElement;
import org.sonar.plugins.python.api.tree.Tree;
import org.sonar.plugins.python.api.tree.Tree.Kind;
import org.sonar.plugins.python.api.tree.TryStatement;
@@ -156,7 +157,9 @@ private static boolean haveTheSameValue(LatestExecutedBlock leftBlock, Tree left
return binaryExpressionsHaveTheSameValue(leftBlock, (BinaryExpression) left, rightBlock, (BinaryExpression) right);
} else if (left.is(Kind.STRING_LITERAL)) {
return haveTheSameValue(leftBlock, left.children(), rightBlock, right.children());
} else if (left.is(Kind.NUMERIC_LITERAL, Kind.STRING_ELEMENT)) {
} else if (left.is(Kind.STRING_ELEMENT)) {
return ((StringElement) left).value().equals(((StringElement) right).value());
} else if (left.is(Kind.NUMERIC_LITERAL)) {
return left.firstToken().value().equals(right.firstToken().value());
} else if (left.is(Kind.NAME)) {
return identifierHaveTheSameValue(leftBlock, (Name) left, rightBlock, (Name) right);
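Presumably the STRING_ELEMENT comparison switches from firstToken().value() to value() because, under the new PEP 701 grammar, an f-string element spans several tokens (FSTRING_START, FSTRING_MIDDLE parts, replacement fields, FSTRING_END), so comparing only the first token would make any two f-strings with the same prefix and quote look identical. An illustrative Python pair (an assumption, not part of the commit):

# Both string elements begin with the same FSTRING_START token (f");
# only a comparison of the full element text tells them apart:
a = f"{1 + 1}"
b = f"{2 + 2}"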
@@ -19,6 +19,7 @@
*/
package org.sonar.python.checks;

import java.text.ParseException;
import java.util.ArrayList;
import java.util.List;
import org.junit.jupiter.api.Test;
@@ -34,7 +35,10 @@
import org.sonar.python.semantic.SymbolTableBuilder;
import org.sonar.python.tree.PythonTreeMaker;

import com.sonar.sslr.api.RecognitionException;

import static org.assertj.core.api.Assertions.assertThat;
import static org.assertj.core.api.Assertions.assertThatThrownBy;
import static org.sonar.python.checks.Expressions.isFalsy;
import static org.sonar.python.checks.Expressions.isTruthy;
import static org.sonar.python.checks.Expressions.removeParentheses;
@@ -208,8 +212,6 @@ void unescape_string_element_invalid_escape_sequences() {
assertThat(unescape(stringElement("'\\u000'"))).isEqualTo("\\u000");
assertThat(unescape(stringElement("'\\U0000000'"))).isEqualTo("\\U0000000");

// Python error: f-string expression part cannot include a backslash
assertThat(unescape(stringElement("f'name:\\n{na\\\nme}'"))).isEqualTo("name:\n{name}");
}

private StringElement stringElement(String source) {
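The assertion removed above exercised the pre-3.12 rule that an f-string expression part could not contain a backslash; PEP 701 drops that restriction, so the escape-sequence test no longer applies. A hedged Python 3.12 sketch of what is now accepted (illustrative, not from the test suite):

# Raises "SyntaxError: f-string expression part cannot include a backslash"
# on Python <= 3.11, but is valid under PEP 701:
names = "Alice\nBob"
print(f'first: {names.split("\n")[0]}')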
15 changes: 15 additions & 0 deletions python-checks/src/test/resources/checks/invariantReturn.py
@@ -37,6 +37,21 @@ def f_same_string(x): # Noncompliant {{Refactor this method to not always return
return "ab"
# ^^^^^^^^^^^<

def f_same_fstring(x): # Noncompliant {{Refactor this method to not always return the same value.}}
# ^^^^^^^^^^^^^^
if x:
return f"{x.foo()}('{str(x.bar)}')"
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^<
else:
return f"{x.foo()}('{str(x.bar)}')"
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^<

def f_different_fstring(x): # Compliant
if x:
return f"{x.foo()}('{str(x.bar)}')"
else:
return f"{x.foo()}('{str(x.bar)}...{str(x.foobar)}')"

def f_same_number(x): # Noncompliant
if x:
return 42
@@ -76,9 +76,10 @@ public enum PythonGrammar implements GrammarRuleKey {
NAMED_EXPR_TEST,
STAR_NAMED_EXPRESSIONS,
STAR_NAMED_EXPRESSION,
FORMATTED_EXPR,
F_STRING_CONTENT,
FSTRING_REPLACEMENT_FIELD,
FSTRING,
FORMAT_SPECIFIER,
STRINGS,

COMPARISON,
COMP_OPERATOR,
@@ -19,7 +19,6 @@
*/
package org.sonar.python.api;

import com.sonar.sslr.api.GenericTokenType;
import com.sonar.sslr.api.Grammar;
import org.sonar.sslr.grammar.LexerfulGrammarBuilder;

@@ -91,20 +90,26 @@ protected void grammar(LexerfulGrammarBuilder b) {
b.rule(STAR_EXPR).is("*", EXPR);
b.rule(EXPR).is(XOR_EXPR, b.zeroOrMore("|", XOR_EXPR));

// https://docs.python.org/3/reference/lexical_analysis.html#formatted-string-literals
b.rule(F_STRING_CONTENT).is(b.zeroOrMore(b.firstOf(GenericTokenType.UNKNOWN_CHAR, FORMATTED_EXPR)));
b.rule(FORMATTED_EXPR).is(
// https://docs.python.org/3.12/reference/lexical_analysis.html#formatted-string-literals
b.rule(FSTRING).is(
PythonTokenType.FSTRING_START,
b.zeroOrMore(b.firstOf(FSTRING_REPLACEMENT_FIELD, PythonTokenType.FSTRING_MIDDLE)),
PythonTokenType.FSTRING_END
);
b.rule(FSTRING_REPLACEMENT_FIELD).is(
PythonPunctuator.LCURLYBRACE,
TESTLIST,
b.firstOf(YIELD_EXPR, TESTLIST_STAR_EXPR),
b.optional(PythonPunctuator.ASSIGN),
b.optional("!", b.firstOf("s", "r", "a")),
b.optional(FORMAT_SPECIFIER),
PythonPunctuator.RCURLYBRACE);
b.rule(FORMAT_SPECIFIER).is(
":",
b.oneOrMore(b.firstOf(FORMATTED_EXPR, b.anyTokenButNot(PythonPunctuator.RCURLYBRACE)))
b.zeroOrMore(b.firstOf(PythonTokenType.FSTRING_MIDDLE, FSTRING_REPLACEMENT_FIELD))
);

b.rule(STRINGS).is(b.oneOrMore(b.firstOf(FSTRING, PythonTokenType.STRING)));

b.rule(FACTOR).is(b.firstOf(
b.sequence(b.firstOf("+", "-", "~"), FACTOR),
POWER)).skipIfOneChild();
@@ -120,7 +125,7 @@ protected void grammar(LexerfulGrammarBuilder b) {
b.sequence("`", TEST, b.zeroOrMore(",", TEST), "`"),
NAME,
PythonTokenType.NUMBER,
b.oneOrMore(PythonTokenType.STRING),
STRINGS,
ELLIPSIS,
PythonKeyword.NONE));
b.rule(ELLIPSIS).is(b.sequence(".", ".", "."));
@@ -369,7 +374,7 @@ protected void compoundStatements(LexerfulGrammarBuilder b) {
b.rule(LITERAL_PATTERN).is(b.firstOf(
COMPLEX_NUMBER,
SIGNED_NUMBER,
b.oneOrMore(PythonTokenType.STRING),
STRINGS,
PythonKeyword.NONE,
"True",
"False"
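As a rough illustration of how the new rules are meant to decompose an f-string (a sketch against the rule names above, not actual lexer output), consider:

value = 3.14159
width = 6
# f"                 -> FSTRING_START
# pi =               -> FSTRING_MIDDLE
# {value:.{width}f}  -> FSTRING_REPLACEMENT_FIELD, whose FORMAT_SPECIFIER
#                       ":.{width}f" itself nests another replacement field
# "                  -> FSTRING_END
print(f"pi = {value:.{width}f}")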
200 changes: 162 additions & 38 deletions python-frontend/src/main/java/org/sonar/python/lexer/FStringChannel.java
@@ -19,15 +19,24 @@
*/
package org.sonar.python.lexer;

import com.sonar.sslr.api.GenericTokenType;
import com.sonar.sslr.api.Token;
import com.sonar.sslr.impl.Lexer;
import java.util.ArrayList;
import java.util.List;
import java.util.Set;
import java.util.stream.IntStream;

import org.sonar.python.api.PythonPunctuator;
import org.sonar.python.api.PythonTokenType;
import org.sonar.python.lexer.FStringState.Mode;
import org.sonar.sslr.channel.Channel;
import org.sonar.sslr.channel.CodeReader;

import com.sonar.sslr.api.Token;
import com.sonar.sslr.api.TokenType;
import com.sonar.sslr.impl.Lexer;

/**
* A channel to handle the literal_char parts inside f-strings.
* See https://docs.python.org/3/reference/lexical_analysis.html#f-strings
* A channel to handle f-strings.
* See https://docs.python.org/3.12/reference/lexical_analysis.html#formatted-string-literals
*/
public class FStringChannel extends Channel<Lexer> {

@@ -36,56 +45,171 @@ public class FStringChannel extends Channel<Lexer> {
private final LexerState lexerState;
private final StringBuilder sb = new StringBuilder();

private static final Set<Character> QUOTES = Set.of('\"', '\'');
private static final Set<Character> PREFIXES = Set.of('F', 'R');
private static final Set<String> ESCAPED_CHARS = Set.of("{{", "}}","\\\"","\\\'");

public FStringChannel(LexerState lexerState) {
this.lexerState = lexerState;
}

@Override
public boolean consume(CodeReader code, Lexer output) {
setInitialLineAndColumn(code);
if (code.charAt(0) == '#') {
// disable comments
addUnknownCharToken("#", output, code.getLinePosition(), code.getColumnPosition());
code.pop();
return true;
char c = code.charAt(0);
int line = code.getLinePosition();
int column = code.getColumnPosition();

FStringState currentState = lexerState.fStringStateStack.peek();

if (canConsumeFStringPrefix(sb, code)) {
char quote = code.charAt(0);
StringBuilder quotes = consumeFStringQuotes(code, quote);
FStringState newState = new FStringState(Mode.FSTRING_MODE, lexerState.brackets);
newState.setQuote(quote);
newState.setNumberOfQuotes(quotes.length());
lexerState.fStringStateStack.push(newState);
Token fStringStartToken = buildToken(PythonTokenType.FSTRING_START, sb.append(quotes).toString(), output, line, column);
sb.setLength(0);
List<Token> tokens = new ArrayList<>();
tokens.add(fStringStartToken);
return consumeFStringMiddle(tokens, sb, newState, code, output);
}
if (lexerState.brackets == 0) {
int line = code.getLinePosition();
int column = code.getColumnPosition();
while (code.charAt(0) != EOF) {
char c = code.charAt(0);
if (c != '{') {
sb.append((char) code.pop());
} else if (code.charAt(1) == '{') {
sb.append((char) code.pop());
sb.append((char) code.pop());
} else {
break;
}

FStringState.Mode currentMode = currentState.getTokenizerMode();

if (currentMode == Mode.REGULAR_MODE && lexerState.fStringStateStack.size() > 1) {
// because the lexerState decrements the bracket count before entering this channel,
// we need to adjust the comparison
if (c == '}' && currentState.getBrackets() -1 == lexerState.brackets) {
Token rCurlyBraceToken = buildToken(PythonPunctuator.RCURLYBRACE, "}", output, line, column);
code.pop();
List<Token> tokens = new ArrayList<>();
tokens.add(rCurlyBraceToken);
lexerState.fStringStateStack.pop();
FStringState previousState = lexerState.fStringStateStack.peek();
return consumeFStringMiddle(tokens, sb, previousState, code, output);
// do not lex the colon if the nesting level differs from that of the opening curly brace
} else if (c == ':' && lexerState.brackets == currentState.getBrackets()) {
Token formatSpecifier = buildToken(PythonPunctuator.COLON, ":", output, line, column);
code.pop();
List<Token> tokens = new ArrayList<>();
tokens.add(formatSpecifier);
FStringState newState = new FStringState(Mode.FORMAT_SPECIFIER_MODE, lexerState.brackets);
lexerState.fStringStateStack.push(newState);
return consumeFStringMiddle(tokens, sb, newState, code, output);
}
if (sb.length() != 0) {
addUnknownCharToken(sb.toString(), output, line, column);
sb.setLength(0);
}
return false;
}

private boolean consumeFStringMiddle(List<Token> tokens, StringBuilder sb, FStringState state, CodeReader code, Lexer output) {
int line = code.getLinePosition();
int column = code.getColumnPosition();
FStringState.Mode currentMode = state.getTokenizerMode();
while (code.charAt(0) != EOF) {
if (currentMode == Mode.FSTRING_MODE && isEscapedChar(code) ) {
sb.append((char) code.pop());
sb.append((char) code.pop());
} else if (code.charAt(0) == '{' && !isUnicodeChar(sb)) {
addFStringMiddleToTokens(tokens, sb, output, line, column);
addLCurlBraceAndSwitchToRegularMode(tokens, code, output);
addTokens(tokens, output);
return true;
} else if (currentMode == Mode.FORMAT_SPECIFIER_MODE && code.charAt(0) == '}') {
addFStringMiddleToTokens(tokens, sb, output, line, column);
lexerState.fStringStateStack.pop();
addTokens(tokens, output);
return true;
} else if (currentMode == Mode.FSTRING_MODE && areClosingQuotes(code, state)) {
addFStringMiddleToTokens(tokens, sb, output, line, column);
addFStringEndToTokens(code, state.getQuote(), tokens, output);
addTokens(tokens, output);
return true;
} else {
sb.append((char) code.pop());
}
}
return false;
}

private static void addUnknownCharToken(String value, Lexer output, int line, int column) {
output.addToken(Token.builder()
.setType(GenericTokenType.UNKNOWN_CHAR)
private static boolean canConsumeFStringPrefix(StringBuilder sb, CodeReader code) {
Character firstChar = Character.toUpperCase(code.charAt(0));
Character secondChar = Character.toUpperCase(code.charAt(1));
if (firstChar == 'F' && QUOTES.contains(code.charAt(1))) {
sb.append((char) code.pop());
return true;
} else if (PREFIXES.contains(firstChar) && PREFIXES.contains(secondChar) &&
!firstChar.equals(secondChar) && QUOTES.contains(code.charAt(2))) {
sb.append((char) code.pop());
sb.append((char) code.pop());
return true;
}
return false;
}

private static boolean isUnicodeChar(StringBuilder sb ){
int lastIndexOfUnicodeChar = sb.lastIndexOf("\\N");
return lastIndexOfUnicodeChar >= 0 && lastIndexOfUnicodeChar == sb.length() - 2;
}

private static boolean isEscapedChar(CodeReader code) {
return ESCAPED_CHARS.contains(String.valueOf(code.peek(2)));
}

private static boolean areClosingQuotes(CodeReader code, FStringState state) {
char[] quotes = code.peek(state.getNumberOfQuotes());
return IntStream.range(0, quotes.length).mapToObj(i -> quotes[i]).allMatch(state.getQuote()::equals);
}

private static void addFStringMiddleToTokens(List<Token> tokens, StringBuilder sb, Lexer output, int line, int column) {
if (sb.length() != 0) {
Token fStringMiddleToken = buildToken(PythonTokenType.FSTRING_MIDDLE, sb.toString(), output, line, column);
sb.setLength(0);
tokens.add(fStringMiddleToken);
}
}

private void addFStringEndToTokens(CodeReader code, char quote, List<Token> tokens, Lexer output) {
int line = code.getLinePosition();
int column = code.getColumnPosition();
StringBuilder endQuotes = consumeFStringQuotes(code, quote);
lexerState.fStringStateStack.pop();
Token fStringEndToken = buildToken(PythonTokenType.FSTRING_END, endQuotes.toString(), output, line, column);
tokens.add(fStringEndToken);
}

private void addLCurlBraceAndSwitchToRegularMode(List<Token> tokens, CodeReader code, Lexer output) {
Token curlyBraceToken = buildToken(PythonPunctuator.LCURLYBRACE, "{", output, code.getLinePosition(), code.getColumnPosition());
code.pop();
lexerState.brackets++;
FStringState updatedState = new FStringState(FStringState.Mode.REGULAR_MODE, lexerState.brackets);
lexerState.fStringStateStack.push(updatedState);
tokens.add(curlyBraceToken);
}

private static StringBuilder consumeFStringQuotes(CodeReader code, char quote) {
StringBuilder quotes = new StringBuilder();
if (code.charAt(1) == quote && code.charAt(2) == quote) {
quotes.append((char) code.pop());
quotes.append((char) code.pop());
quotes.append((char) code.pop());
} else {
quotes.append((char) code.pop());
}
return quotes;
}

private static void addTokens(List<Token> tokens, Lexer output) {
output.addToken(tokens.toArray(Token[]::new));
}

private static Token buildToken(TokenType tokenType, String value, Lexer output, int line, int column) {
return Token.builder()
.setType(tokenType)
.setValueAndOriginalValue(value)
.setURI(output.getURI())
.setLine(line)
.setColumn(column)
.build());
}

private void setInitialLineAndColumn(CodeReader code) {
if (code.getLinePosition() == 1 && code.getColumnPosition() == 0) {
code.setLinePosition(lexerState.initialLine);
code.setColumnPosition(lexerState.initialColumn);
}
.build();
}
}
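To make the mode switching in FStringChannel concrete, here is a hedged walk-through of how the three modes (FSTRING_MODE, REGULAR_MODE, FORMAT_SPECIFIER_MODE) would alternate over a single input; this is an approximation of the control flow above, not a trace of the real lexer:

x = 41
# FSTRING_MODE:        f" becomes FSTRING_START, then "x is " becomes FSTRING_MIDDLE
# '{' emits LCURLYBRACE and pushes REGULAR_MODE, so x + 1 is lexed as ordinary Python tokens
# ':' at the matching bracket depth emits COLON and pushes FORMAT_SPECIFIER_MODE for 04d
# '}' pops back out, and the closing quote is emitted as FSTRING_END
print(f"x is {x + 1:04d}")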