From a39fd54df70a117ee9c8eac11d55b4d40d5d38d0 Mon Sep 17 00:00:00 2001 From: Glyphack Date: Thu, 29 Aug 2024 23:19:09 +0200 Subject: [PATCH 1/7] Separate compatibility tests Signed-off-by: Shaygan --- Cargo.toml | 4 +- compat/Cargo.toml | 23 + compat/src/lexer_compat.rs | 893 +++++++++++++++++ compat/src/main.rs | 74 ++ compat/src/parser_compat.rs | 469 +++++++++ {parser => compat}/src/runpython.rs | 0 parser/Cargo.toml | 5 - parser/src/lexer/compat.rs | 903 ------------------ parser/src/lexer/mod.rs | 24 +- parser/src/lib.rs | 3 +- parser/src/parser/compat.rs | 474 +-------- ...checker__tests__annotations_coroutine.snap | 16 +- ...ecker__checker__tests__basic_generics.snap | 152 ++- 13 files changed, 1560 insertions(+), 1480 deletions(-) create mode 100644 compat/Cargo.toml create mode 100644 compat/src/lexer_compat.rs create mode 100644 compat/src/main.rs create mode 100644 compat/src/parser_compat.rs rename {parser => compat}/src/runpython.rs (100%) delete mode 100644 parser/src/lexer/compat.rs diff --git a/Cargo.toml b/Cargo.toml index 1ae112be..3dcd54c8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,5 +1,5 @@ [workspace] -members = ["parser", "enderpy", "typechecker", "lsp", "benchmark"] +members = ["parser", "enderpy", "typechecker", "lsp", "benchmark", "compat"] resolver = "2" [workspace.package] @@ -31,5 +31,3 @@ opt-level = 3 lto = "fat" opt-level = 3 # panic = "abort" - - diff --git a/compat/Cargo.toml b/compat/Cargo.toml new file mode 100644 index 00000000..11b624a7 --- /dev/null +++ b/compat/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "enderpy-compat" +version = "0.0.0" +authors = { workspace = true } +edition = { workspace = true } +rust-version = { workspace = true } +homepage = { workspace = true } +documentation = { workspace = true } +repository = { workspace = true } +license = { workspace = true } + +[dependencies] +serde_json = "1.0" +serde = "1.0" +miette = "5.5" +enderpy_python_parser = { path = "../parser"} +tabled = "0.15" +terminal_size = "0.3" +assert-json-diff = "2.0" +pretty_assertions = "1.4" +which = "6.0.1" +reqwest = { version = "0.11", features = ["blocking"] } +zip = "0.6" diff --git a/compat/src/lexer_compat.rs b/compat/src/lexer_compat.rs new file mode 100644 index 00000000..664c3f13 --- /dev/null +++ b/compat/src/lexer_compat.rs @@ -0,0 +1,893 @@ +use enderpy_python_parser::get_row_col_position; +use enderpy_python_parser::token::Kind; +use enderpy_python_parser::{token::Token, Lexer}; +use miette::{bail, IntoDiagnostic, Result}; +use serde::{Deserialize, Serialize}; +use std::io::Write; +use tabled::{ + builder::Builder, + settings::peaker::PriorityMax, + settings::{Style, Width}, +}; + +use crate::runpython::{default_python_path, spawn_python_script_command}; +use terminal_size::{terminal_size, Width as TerminalWidth}; + +// Derived from: +// https://github.com/python/cpython/blob/main/Lib/token.py +#[allow(dead_code)] +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] +#[serde(rename_all = "UPPERCASE")] +pub enum PythonKind { + EndMarker, + Name, + Number, + String, + NewLine, + Indent, + Dedent, + LPar, + RPar, + LSqb, + RSqb, + Colon, + Comma, + Semi, + Plus, + Minus, + Star, + Slash, + VBar, + Amper, + Less, + Greater, + Equal, + Dot, + Percent, + LBrace, + RBrace, + EqEqual, + NotEqual, + LessEqual, + GreaterEqual, + Tilde, + Circumflex, + LeftShift, + RightShift, + DoubleStar, + PlusEqual, + MinEqual, + StarEqual, + SlashEqual, + PercentEqual, + AmperEqual, + VBarEqual, + CircumflexEqual, + LeftShiftEqual, + RightShiftEqual, + 
DoubleStarEqual, + DoubleSlash, + DoubleSlashEqual, + At, + AtEqual, + RArrow, + Ellipsis, + ColonEqual, + Exclamation, + Op, + Await, + Async, + #[serde(rename = "TYPE_IGNORE")] + TypeIgnore, + #[serde(rename = "TYPE_COMMENT")] + TypeComment, + #[serde(rename = "SOFT_KEYWORD")] + SoftKeyword, + #[serde(rename = "FSTRING_START")] + FstringStart, + #[serde(rename = "FSTRING_MIDDLE")] + FstringMiddle, + #[serde(rename = "FSTRING_END")] + FstringEnd, + Comment, + NL, + ErrorToken, + Encoding, + #[serde(rename = "N_TOKENS")] + NTokens, + #[serde(rename = "NT_OFFSET")] + NTOffset, +} +enum TokenMismatch { + MissingToken(Option, Option), + WrongKind(PythonToken, Token), + WrongValue(PythonToken, Token, String, String), + WrongStartEnd( + PythonToken, + Token, + (u32, u32), + (u32, u32), + (u32, u32), + (u32, u32), + ), +} + +#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] +pub struct PythonToken { + kind: PythonKind, + value: String, + start: (u32, u32), + end: (u32, u32), +} + +pub fn lex_python_source(source: &str) -> Result> { + let mut process = spawn_python_script_command( + "parser/lex_python.py", + vec!["--stdin", "--output-format", "json"], + default_python_path()?, + )?; + + // Get process stdin and write the input string. + if let Some(mut stdin) = process.stdin.take() { + stdin.write_all(source.as_bytes()).into_diagnostic()?; + } else { + bail!("Failed to open stdin when running `parser/lex_python.py`"); + } + // Get process stdout and parse result. + let output = process.wait_with_output().into_diagnostic()?; + let python_tokens: Vec = + serde_json::from_str(String::from_utf8_lossy(&output.stdout).as_ref()).into_diagnostic()?; + Ok(python_tokens) +} + +pub fn assert_tokens_eq( + python_tokens: Vec, + enderpy_tokens: Vec, + lexer: &Lexer, +) { + let num_python_tokens = python_tokens.len(); + let num_enderpy_tokens = enderpy_tokens.len(); + let mut mismatches: Vec = vec![]; + let mut python_index = 0; + let mut enderpy_index = 0; + while python_index < num_python_tokens || enderpy_index < num_enderpy_tokens { + let python_token = if python_index > num_python_tokens - 1 { + None + } else { + Some(python_tokens[python_index].clone()) + }; + let enderpy_token = if enderpy_index > num_enderpy_tokens - 1 { + None + } else { + Some(enderpy_tokens[enderpy_index].clone()) + }; + if python_token.is_none() || enderpy_token.is_none() { + mismatches.push(TokenMismatch::MissingToken(python_token, enderpy_token)); + } else { + let python_token = python_token.unwrap(); + let enderpy_token = enderpy_token.unwrap(); + if let Some(mismatch) = check_tokens_match(python_token, enderpy_token, lexer) { + if is_python_trailing_newline_mismatch( + &mismatch, + &python_tokens[python_index + 1..], + ) { + // If we found Python's trailing newline, we've read the end of file. + break; + } else if is_python_fstring_mismatch( + &mismatch, + &enderpy_tokens[enderpy_index + 1..], + &mut enderpy_index, // <-- `enderpy_index` may be updated + ) { + // Nothing, but don't add the mismatch. + } else { + mismatches.push(mismatch); + } + } + } + python_index += 1; + enderpy_index += 1; + } + if mismatches.is_empty() { + return; + } + + let include_source = std::env::var("INCLUDE_SOURCE").is_ok(); + let include_all_tokens = std::env::var("INCLUDE_ALL_TOKENS").is_ok(); + + let mut table_builder = Builder::default(); + // Add the table header. + let header = vec!["Python token", "Enderpy token", "Failure"]; + table_builder.push_record(header); + // Add the table rows. Each row represents a lexer token mismatch. 
+ let num_mismatches = mismatches.len(); + for mismatch in mismatches { + let mut row: Vec = vec![]; + let (python_token, enderpy_token, message) = match mismatch { + TokenMismatch::MissingToken(python_token, enderpy_token) => { + (python_token, enderpy_token, "Missing token".to_string()) + } + TokenMismatch::WrongKind(python_token, enderpy_token) => { + let message = format!( + "Wrong token kind.\nPython: {:?}\nEnderpy: {:?}", + python_token.kind, enderpy_token.kind + ); + (Some(python_token), Some(enderpy_token), message) + } + TokenMismatch::WrongValue( + python_token, + enderpy_token, + expected_value, + actual_value, + ) => ( + Some(python_token), + Some(enderpy_token), + format!( + "Wrong token value.\nPython: {:?}\nEnderpy: {:?}", + expected_value, actual_value + ), + ), + TokenMismatch::WrongStartEnd( + python_token, + enderpy_token, + expected_start, + expected_end, + actual_start, + actual_end, + ) => ( + Some(python_token), + Some(enderpy_token), + format!( + "Wrong token start/end offset.\nPython: {:?} - {:?}\nEnderpy: {:?} - {:?}", + expected_start, expected_end, actual_start, actual_end, + ), + ), + }; + if include_all_tokens { + row.extend_from_slice(&[ + python_tokens + .iter() + .map(|token| { + let is_this_token = python_token.as_ref().is_some_and(|tok| tok == token); + format!("{}{:?}", if is_this_token { "→ " } else { "" }, token) + }) + .collect::>() + .join("\n"), + enderpy_tokens + .iter() + .map(|token| { + let is_this_token = enderpy_token.as_ref().is_some_and(|tok| tok == token); + format!("{}{:?}", if is_this_token { "→ " } else { "" }, token) + }) + .collect::>() + .join("\n"), + message, + ]); + } else { + row.extend_from_slice(&[ + python_token.map_or("None".to_string(), |t| format!("{:?}", t)), + enderpy_token.map_or("None".to_string(), |t| format!("{:?}", t)), + message, + ]); + } + table_builder.push_record(row); + } + let mut table = table_builder.build(); + table.with(Style::modern()); + // If run in a terminal, don't expand table beyond terminal width. + if let Some((TerminalWidth(width), _)) = terminal_size() { + table + .with( + Width::wrap(width as usize) + .keep_words() + .priority::(), + ) + .with(Width::increase(width as usize)); + } + let formatted_source = if include_source { + format!("\nSource:\n{}", lexer.source) + } else { + "".to_string() + }; + panic!( + "Enderpy tokens do not match Python tokens.{}\n{}\n{} token mismatches found", + formatted_source, table, num_mismatches + ); +} + +fn check_tokens_match( + python_token: PythonToken, + enderpy_token: Token, + lexer: &Lexer, +) -> Option { + let kind_matches = match python_token.kind { + PythonKind::EndMarker => enderpy_token.kind == Kind::Eof, + // For some reason, Python maintains a kind for these tokens but doesn't use them + // during tokenization. + // Instead, it slams keywords together into a generic Name kind. + PythonKind::Name => { + matches_python_name_token(python_token.value.as_str(), &enderpy_token.kind) + } + PythonKind::Number => matches!( + enderpy_token.kind, + Kind::Integer + | Kind::PointFloat + | Kind::ExponentFloat + | Kind::Binary + | Kind::Octal + | Kind::Hexadecimal + | Kind::ImaginaryPointFloat + | Kind::ImaginaryExponentFloat + | Kind::ImaginaryInteger + ), + // NOTE: The Python tokenizer doesn't appear to track differences in string modifiers. + // For example, "hello"/u"hello"/r"hello" are all just String. 
+ PythonKind::String => matches!( + enderpy_token.kind, + Kind::StringLiteral | Kind::Bytes | Kind::RawBytes | Kind::Unicode + ), + PythonKind::NewLine => enderpy_token.kind == Kind::NewLine, + PythonKind::Indent => enderpy_token.kind == Kind::Indent, + PythonKind::Dedent => enderpy_token.kind == Kind::Dedent, + PythonKind::LPar => enderpy_token.kind == Kind::LeftParen, + PythonKind::RPar => enderpy_token.kind == Kind::RightParen, + PythonKind::LSqb => enderpy_token.kind == Kind::LeftBrace, + PythonKind::RSqb => enderpy_token.kind == Kind::RightBrace, + PythonKind::Colon => enderpy_token.kind == Kind::Colon, + PythonKind::Comma => enderpy_token.kind == Kind::Comma, + PythonKind::Semi => enderpy_token.kind == Kind::SemiColon, + PythonKind::Plus => enderpy_token.kind == Kind::Plus, + PythonKind::Minus => enderpy_token.kind == Kind::Minus, + PythonKind::Star => enderpy_token.kind == Kind::Mul, + PythonKind::Slash => enderpy_token.kind == Kind::Div, + PythonKind::VBar => enderpy_token.kind == Kind::BitOr, + PythonKind::Amper => enderpy_token.kind == Kind::BitAnd, + PythonKind::Less => enderpy_token.kind == Kind::Less, + PythonKind::Greater => enderpy_token.kind == Kind::Greater, + PythonKind::Equal => enderpy_token.kind == Kind::Assign, + PythonKind::Dot => enderpy_token.kind == Kind::Dot, + PythonKind::Percent => enderpy_token.kind == Kind::Mod, + PythonKind::LBrace => enderpy_token.kind == Kind::LeftBracket, + PythonKind::RBrace => enderpy_token.kind == Kind::RightBracket, + PythonKind::EqEqual => enderpy_token.kind == Kind::Eq, + PythonKind::NotEqual => enderpy_token.kind == Kind::NotEq, + PythonKind::LessEqual => enderpy_token.kind == Kind::LessEq, + PythonKind::GreaterEqual => enderpy_token.kind == Kind::GreaterEq, + PythonKind::Tilde => enderpy_token.kind == Kind::BitNot, + PythonKind::Circumflex => enderpy_token.kind == Kind::BitXor, + PythonKind::LeftShift => enderpy_token.kind == Kind::LeftShift, + PythonKind::RightShift => enderpy_token.kind == Kind::RightShift, + PythonKind::DoubleStar => enderpy_token.kind == Kind::Pow, + PythonKind::PlusEqual => enderpy_token.kind == Kind::AddAssign, + PythonKind::MinEqual => enderpy_token.kind == Kind::SubAssign, + PythonKind::StarEqual => enderpy_token.kind == Kind::MulAssign, + PythonKind::SlashEqual => enderpy_token.kind == Kind::DivAssign, + PythonKind::PercentEqual => enderpy_token.kind == Kind::ModAssign, + PythonKind::AmperEqual => enderpy_token.kind == Kind::BitAndAssign, + PythonKind::VBarEqual => enderpy_token.kind == Kind::BitOrAssign, + PythonKind::CircumflexEqual => enderpy_token.kind == Kind::BitXorAssign, + PythonKind::LeftShiftEqual => enderpy_token.kind == Kind::ShiftLeftAssign, + PythonKind::RightShiftEqual => enderpy_token.kind == Kind::ShiftRightAssign, + PythonKind::DoubleStarEqual => enderpy_token.kind == Kind::PowAssign, + PythonKind::DoubleSlash => enderpy_token.kind == Kind::IntDiv, + PythonKind::DoubleSlashEqual => enderpy_token.kind == Kind::IntDivAssign, + PythonKind::At => enderpy_token.kind == Kind::MatrixMul, + PythonKind::AtEqual => enderpy_token.kind == Kind::MatrixMulAssign, + PythonKind::RArrow => enderpy_token.kind == Kind::Arrow, + PythonKind::Ellipsis => enderpy_token.kind == Kind::Ellipsis, + PythonKind::ColonEqual => enderpy_token.kind == Kind::Walrus, + PythonKind::Exclamation => false, // doesn't exist + // For some reason, Python maintains a kind for these tokens but doesn't use them + // during tokenization. + // Instead, it slams all operators together into a generic Op kind. 
+ PythonKind::Op => matches_python_op_token(python_token.value.as_str(), &enderpy_token.kind), + PythonKind::Await => enderpy_token.kind == Kind::Await, + PythonKind::Async => enderpy_token.kind == Kind::Async, + PythonKind::TypeIgnore => false, // doesn't exist + PythonKind::TypeComment => false, // doesn't exist + PythonKind::SoftKeyword => false, // doesn't exist + PythonKind::FstringStart => matches!( + enderpy_token.kind, + Kind::FStringStart | Kind::RawFStringStart + ), + PythonKind::FstringMiddle => enderpy_token.kind == Kind::FStringMiddle, + PythonKind::FstringEnd => enderpy_token.kind == Kind::FStringEnd, + PythonKind::Comment => enderpy_token.kind == Kind::Comment, + PythonKind::NL => enderpy_token.kind == Kind::NL, + PythonKind::ErrorToken => { + match python_token.value.as_str() { + // Python 3.11 chokes on these tokens. + "$" => enderpy_token.kind == Kind::Dollar, + "?" => enderpy_token.kind == Kind::QuestionMark, + "`" => enderpy_token.kind == Kind::BackTick, + _ => enderpy_token.kind == Kind::Error, + } + } + PythonKind::Encoding => false, // doesn't exist + PythonKind::NTokens => false, // doesn't exist, + PythonKind::NTOffset => false, // doesn't exist + }; + if !kind_matches { + return Some(TokenMismatch::WrongKind(python_token, enderpy_token)); + } + + let python_token_value = python_token.value.clone(); + let enderpy_token_value = enderpy_token.value.to_string(); + // The Python tokenizer sets values in a number of places where Enderpy simply relies + // on kind to assume value. Handle those cases here. + let value_matches = matches_python_name_token(python_token.value.as_str(), &enderpy_token.kind) + || matches_python_op_token(python_token.value.as_str(), &enderpy_token.kind) + || matches_python_indent_dedent_token(&python_token.kind, &enderpy_token.kind) + || (python_token.kind == PythonKind::EndMarker && enderpy_token.kind == Kind::Eof) + || (python_token.value.as_str() == "\n" + && (matches!(enderpy_token.kind, Kind::NewLine | Kind::NL))) + || python_token_value == enderpy_token_value; + if !value_matches { + return Some(TokenMismatch::WrongValue( + python_token, + enderpy_token, + python_token_value, + enderpy_token_value, + )); + } + + let (enderpy_start_row, enderpy_start_col, enderpy_end_row, enderpy_end_col) = + get_row_col_position(enderpy_token.start, enderpy_token.end, &lexer.line_starts); + let python_token_start = python_token.start; + let python_token_end = python_token.end; + if enderpy_start_row != python_token_start.0 + || enderpy_start_col != python_token_start.1 + || enderpy_end_row != python_token_end.0 + || enderpy_end_col != python_token_end.1 + { + return Some(TokenMismatch::WrongStartEnd( + python_token, + enderpy_token, + python_token_start, + python_token_end, + (enderpy_start_row, enderpy_start_col), + (enderpy_end_row, enderpy_end_col), + )); + } + None +} + +fn matches_python_name_token(python_token_value: &str, token_kind: &Kind) -> bool { + match python_token_value { + "if" => token_kind == &Kind::If, + "elif" => token_kind == &Kind::Elif, + "else" => token_kind == &Kind::Else, + "False" => token_kind == &Kind::False, + "None" => token_kind == &Kind::None, + "True" => token_kind == &Kind::True, + "and" => token_kind == &Kind::And, + "as" => token_kind == &Kind::As, + "assert" => token_kind == &Kind::Assert, + "async" => token_kind == &Kind::Async, + "await" => token_kind == &Kind::Await, + "break" => token_kind == &Kind::Break, + "class" => token_kind == &Kind::Class, + "continue" => token_kind == &Kind::Continue, + "def" => token_kind 
== &Kind::Def, + "del" => token_kind == &Kind::Del, + "except" => token_kind == &Kind::Except, + "finally" => token_kind == &Kind::Finally, + "for" => token_kind == &Kind::For, + "from" => token_kind == &Kind::From, + "global" => token_kind == &Kind::Global, + "import" => token_kind == &Kind::Import, + "in" => token_kind == &Kind::In, + "is" => token_kind == &Kind::Is, + "lambda" => token_kind == &Kind::Lambda, + "nonlocal" => token_kind == &Kind::Nonlocal, + "not" => token_kind == &Kind::Not, + "or" => token_kind == &Kind::Or, + "pass" => token_kind == &Kind::Pass, + "raise" => token_kind == &Kind::Raise, + "return" => token_kind == &Kind::Return, + "try" => token_kind == &Kind::Try, + "while" => token_kind == &Kind::While, + "with" => token_kind == &Kind::With, + "yield" => token_kind == &Kind::Yield, + _ => token_kind == &Kind::Identifier, + } +} + +fn matches_python_op_token(python_token_value: &str, token_kind: &Kind) -> bool { + match python_token_value { + "!=" => token_kind == &Kind::NotEq, + "$" => token_kind == &Kind::Dollar, + "%" => token_kind == &Kind::Mod, + "%=" => token_kind == &Kind::ModAssign, + "&" => token_kind == &Kind::BitAnd, + "&=" => token_kind == &Kind::BitAndAssign, + "(" => token_kind == &Kind::LeftParen, + ")" => token_kind == &Kind::RightParen, + "*" => token_kind == &Kind::Mul, + "**" => token_kind == &Kind::Pow, + "**=" => token_kind == &Kind::PowAssign, + "*=" => token_kind == &Kind::MulAssign, + "+" => token_kind == &Kind::Plus, + "+=" => token_kind == &Kind::AddAssign, + "," => token_kind == &Kind::Comma, + "-" => token_kind == &Kind::Minus, + "-=" => token_kind == &Kind::SubAssign, + "->" => token_kind == &Kind::Arrow, + "." => token_kind == &Kind::Dot, + "/" => token_kind == &Kind::Div, + "//" => token_kind == &Kind::IntDiv, + "//=" => token_kind == &Kind::IntDivAssign, + "/=" => token_kind == &Kind::DivAssign, + ":" => token_kind == &Kind::Colon, + ":=" => token_kind == &Kind::Walrus, + ";" => token_kind == &Kind::SemiColon, + "<" => token_kind == &Kind::Less, + "<<" => token_kind == &Kind::LeftShift, + "<<=" => token_kind == &Kind::ShiftLeftAssign, + "<=" => token_kind == &Kind::LessEq, + "=" => token_kind == &Kind::Assign, + "==" => token_kind == &Kind::Eq, + ">" => token_kind == &Kind::Greater, + ">=" => token_kind == &Kind::GreaterEq, + ">>" => token_kind == &Kind::RightShift, + ">>=" => token_kind == &Kind::ShiftRightAssign, + "?" => token_kind == &Kind::QuestionMark, + "@" => token_kind == &Kind::MatrixMul, + "@=" => token_kind == &Kind::MatrixMulAssign, + "[" => token_kind == &Kind::LeftBrace, + "]" => token_kind == &Kind::RightBrace, + "^" => token_kind == &Kind::BitXor, + "^=" => token_kind == &Kind::BitXorAssign, + "`" => token_kind == &Kind::BackTick, + "{" => token_kind == &Kind::LeftBracket, + "|" => token_kind == &Kind::BitOr, + "|=" => token_kind == &Kind::BitOrAssign, + "}" => token_kind == &Kind::RightBracket, + "~" => token_kind == &Kind::BitNot, + "..." => token_kind == &Kind::Ellipsis, + _ => false, + } +} + +fn matches_python_indent_dedent_token(python_kind: &PythonKind, enderpy_kind: &Kind) -> bool { + // TODO lex_python: There's no obvious way with the Python lexer to determine what is + // considered one indent level. Instead, it simply stores the literal whitespace. This + // makes it really difficult to determine whether indentation levels actually match + // (without looking around at the larger context), so for now we'll just make sure the + // Kind lines up. 
+ (python_kind == &PythonKind::Indent && enderpy_kind == &Kind::Indent) + || (python_kind == &PythonKind::Dedent && enderpy_kind == &Kind::Dedent) +} + +/// The Python tokenizer adds a cheeky newline to the end of the source, causing mismatches. We +/// handle this by ignoring mismatches that meet all of the following criteria. +/// - The mismatch type is `WrongKind`. +/// - The Python kind is a known whitespace value. +/// - The Enderpy kind is a EOF. +/// - The only remaining Python tokens before EOF are known whitespace values. +fn is_python_trailing_newline_mismatch( + mismatch: &TokenMismatch, + remaining_tokens: &[PythonToken], +) -> bool { + if let TokenMismatch::WrongKind(python_token, enderpy_token) = mismatch { + if !matches!(python_token.kind, PythonKind::NewLine | PythonKind::NL) + || enderpy_token.kind != Kind::Eof + { + return false; + } + return remaining_tokens.iter().all(|t| { + matches!( + t.kind, + PythonKind::NewLine | PythonKind::NL | PythonKind::Dedent | PythonKind::EndMarker + ) + }); + } + false +} + +/// Python 3.11 and earlier tokenizes fstrings as just.. strings. +fn is_python_fstring_mismatch( + mismatch: &TokenMismatch, + remaining_tokens: &[Token], + enderpy_index: &mut usize, +) -> bool { + if let TokenMismatch::WrongKind(python_token, enderpy_token) = mismatch { + if !matches!( + enderpy_token.kind, + Kind::FStringStart | Kind::RawFStringStart + ) || python_token.kind != PythonKind::String + { + return false; + } + let mut num_skipped = 0; + for token in remaining_tokens { + num_skipped += 1; + if matches!(token.kind, Kind::FStringEnd | Kind::Eof) { + break; + } + } + *enderpy_index += num_skipped; + return true; + } + false +} + +#[cfg(test)] +mod tests { + + use super::*; + + #[test] + fn test_simple_compat() { + let source = r#" +a: int = 1 +print(a) +"#; + let mut lexer = Lexer::new(source); + let enderpy_tokens = lexer.lex(); + let python_tokens = lex_python_source(source).unwrap(); + assert_tokens_eq(python_tokens, enderpy_tokens, &lexer); + } + + fn python_tokenize_test_lexer(inputs: &[&str]) { + for test_input in inputs.iter() { + let mut lexer = Lexer::new(test_input); + let tokens = lexer.lex(); + let python_tokens = lex_python_source(test_input).unwrap(); + assert_tokens_eq(python_tokens, tokens, &lexer); + } + } + + #[test] + fn test_lex_operators() { + python_tokenize_test_lexer(&[ + "1+2", + "a+b", + "a + b", + "+=2", + "xX = 2", + "if else elif", + "()", + "[]", + "{}:", + ".", + ",", + ";", + "@", + "=", + "#", + "$", + "?", + "`", + "->", + "+=", + "-=", + "*=", + "/=", + "%=", + "@=", + "&=", + "|=", + "^=", + "//=", + "<<=", + ">>=", + "**=", + "**", + "//", + "<<", + ">>", + "+", + "-", + "*", + "**", + "/", + "//", + "%", + "@", + "<<", + ">>", + "&", + "|", + "^", + "~", + ":=", + "<", + ">", + "<=", + ">=", + "==", + "!=", + ]); + } + + #[test] + fn test_lex_keywords() { + python_tokenize_test_lexer(&[ + "False None True and as assert async await", + "break class continue def del elif else except", + "finally for from global if import in is lambda", + "nonlocal not or pass raise return try while with yield", + ]); + } + + #[test] + fn test_lex_identifiers() { + python_tokenize_test_lexer(&["a", "a_a", "_a", "a_", "a_a_a", "a_a_"]); + } + + #[test] + fn test_lex_literals() { + // Binary + python_tokenize_test_lexer(&[ + "0b0", "0b1", "0b10", "0b11", "0b100", "0b101", "0b110", "0b111", + ]); + + // Octal + python_tokenize_test_lexer(&["0o0", "0o1", "0o2", "0o3", "0o4", "0o5", "0o6", "0o7"]); + + // Hexadecimal + 
python_tokenize_test_lexer(&[ + "0x0", "0x1", "0x2", "0x3", "0x4", "0x5", "0x6", "0x7", "0x8", "0x9", "0xa", "0xb", + "0xc", "0xd", "0xe", "0xf", "0xA", "0xB", "0xC", "0xD", "0xE", "0xF", + ]); + + // Point float + python_tokenize_test_lexer(&["0.0 0.1 00.0 00.1 0.1j 0.01J"]); + + // Exponent float + python_tokenize_test_lexer(&["0e0 0e-1 0e+2 0e+3j 0e+3J"]); + + // Integer + python_tokenize_test_lexer(&["11 33 1j 1_000_000j"]); + + // Strings + python_tokenize_test_lexer(&[ + "\"hello\" ", + "\"world\"", + "\"\"", + "a = \"hello\"", + "'hello'", + "\"\"\"hello\"\"\"", + "'''hello'''", + ]); + + // Bytes + python_tokenize_test_lexer(&[ + "b\"hello\"", + "b\"world\"", + "b\"\"", + "a = b\"hello\"", + "b'hello'", + "b\"\"\"hello\"\"\"", + "b'''hello'''", + ]); + + // Raw strings + python_tokenize_test_lexer(&[ + "r\"hello\"", + "r\"world\"", + "r\"\"", + "a = r\"hello\"", + "r'hello'", + "r\"\"\"hello\"\"\"", + "r'''hello'''", + ]); + + // Raw bytes + python_tokenize_test_lexer(&[ + "rb\"hello\"", + "rb\"world\"", + "rb\"\"", + "a = rb\"hello\"", + "rb'hello'", + "rb\"\"\"hello\"\"\"", + "rb'''hello'''", + ]); + + // Unicode strings + python_tokenize_test_lexer(&[ + "u\"hello\"", + "u\"world\"", + "u\"\"", + "a = u\"hello\"", + "u'hello'", + "u\"\"\"hello\"\"\"", + "u'''hello'''", + ]); + } + + #[test] + fn test_lex_imports() { + python_tokenize_test_lexer(&["import a", "import a.b", "import a.b.c", "import a from b"]); + } + + #[test] + fn test_lex_other() { + python_tokenize_test_lexer(&["(a, + + )"]); + } + + #[test] + fn test_lex_indentation() { + python_tokenize_test_lexer(&[ + "if True: + pass\n", + "if True: + pass +else: + pass", + "if True: + if True: + pass +def", + "def f(x): + y = z + + print(y) +", + "if a: + + f = c + + # Path: test_local.py +", + ]); + } + + #[test] + fn test_lex_fstring() { + python_tokenize_test_lexer(&[ + "f\"hello\"", + "f'hello_{var}'", + "f\"world\"", + "f\"\"", + "a = f\"hello\"", + "f\"\"\"hello\"\"\"", + "f'''hello'''", + // TODO lex_python: Python lexes these poorly. + // "f\"{{hey}}\"", + // "f\"oh_{{hey}}\"", + "f'a' 'c'", + // TODO lex_python: Python 3.11 chokes on this input. 
+ // "f'hello_{f'''{a}'''}'", + ]); + + // Raw F-strings + python_tokenize_test_lexer(&[ + "rf\"hello\"", + "rf\"world\"", + "rf\"\"", + "a = rf\"hello\"", + "rf'hello_{var}'", + "rf\"\"\"hello\"\"\"", + "rf'''hello'''", + ]); + } + + #[test] + fn test_lex_ellipsis() { + python_tokenize_test_lexer(&[ + "...", + "def a(): + ...", + ]); + } + + #[test] + fn test_logical_and_physical_lines() { + python_tokenize_test_lexer(&[ + // This case the first line should have physical line + " +a: int = 1 +print(a) +", + ]); + } + + #[test] + #[should_panic] + fn test_lex_unterminated_string_double_quotes() { + python_tokenize_test_lexer(&["\"hello", "'hello", "'''hello''", "'''hello'"]); + } +} diff --git a/compat/src/main.rs b/compat/src/main.rs new file mode 100644 index 00000000..72a6d528 --- /dev/null +++ b/compat/src/main.rs @@ -0,0 +1,74 @@ +use enderpy_python_parser::Lexer; +use miette::{IntoDiagnostic, Result}; +use reqwest::blocking::get; +use std::fs; +use std::io::Cursor; +use std::path::Path; +use zip::ZipArchive; + +use self::lexer_compat::{assert_tokens_eq, lex_python_source}; +use self::parser_compat::python_parser_test_ast; + +pub mod lexer_compat; +pub mod parser_compat; +pub mod runpython; + +fn main() -> Result<()> { + let url = "https://github.com/python/mypy/archive/refs/heads/master.zip"; + let repo_path = "mypy-master"; + + // Download the ZIP file + if !Path::new(repo_path).exists() { + println!("Downloading repository..."); + let response = get(url).into_diagnostic()?; + let mut zip = + ZipArchive::new(Cursor::new(response.bytes().into_diagnostic()?)).into_diagnostic()?; + + // Extract the ZIP file + println!("Extracting repository..."); + zip.extract(".").into_diagnostic()?; + } else { + println!("Repository already downloaded and extracted."); + } + + // Get all Python files in the extracted repo + let python_files = get_python_files(repo_path)?; + + // Run the compatibility tests on each Python file + for file in python_files { + run_compatibility_test(&file)?; + } + + Ok(()) +} + +/// Recursively finds all Python files in the specified directory. +fn get_python_files(dir: &str) -> Result> { + let mut python_files = Vec::new(); + for entry in fs::read_dir(dir).into_diagnostic()? { + let entry = entry.into_diagnostic()?; + let path = entry.path(); + + if path.is_dir() { + python_files.extend(get_python_files(path.to_str().unwrap())?); + } else if path.extension().and_then(|ext| ext.to_str()) == Some("py") { + python_files.push(path.to_str().unwrap().to_string()); + } + } + Ok(python_files) +} + +/// Runs the compatibility test on a single Python file. 
+fn run_compatibility_test(file: &str) -> Result<()> { + println!("Running compatibility test on {}", file); + + let source = fs::read_to_string(file).into_diagnostic()?; + let mut lexer = Lexer::new(&source); + let enderpy_tokens = lexer.lex(); + let python_tokens = lex_python_source(&source)?; + + assert_tokens_eq(python_tokens, enderpy_tokens, &lexer); + python_parser_test_ast(&vec![source.as_str()]); + + Ok(()) +} diff --git a/compat/src/parser_compat.rs b/compat/src/parser_compat.rs new file mode 100644 index 00000000..037f994f --- /dev/null +++ b/compat/src/parser_compat.rs @@ -0,0 +1,469 @@ +use assert_json_diff::assert_json_matches_no_panic; +use miette::{bail, IntoDiagnostic, Result}; +use serde_json::Value; +use std::convert::From; +use std::io::Write; + +use crate::runpython::{default_python_path, spawn_python_script_command}; +use enderpy_python_parser::parser::compat::AsPythonCompat; +use enderpy_python_parser::Parser; +use tabled::{ + builder::Builder, + settings::peaker::PriorityMax, + settings::{Style, Width}, +}; + +use terminal_size::{terminal_size, Width as TerminalWidth}; + +fn parse_python_source(source: &str) -> Result { + let mut process = spawn_python_script_command( + "parser/ast_python.py", + vec!["--stdin"], + default_python_path()?, + )?; + + // Get process stdin and write the input string. + if let Some(mut stdin) = process.stdin.take() { + stdin.write_all(source.as_bytes()).into_diagnostic()?; + } else { + bail!("Failed to open stdin when running `parser/ast_python.py`"); + } + // Get process stdout and parse result. + let output = process.wait_with_output().into_diagnostic()?; + let mut ast = + serde_json::from_str(String::from_utf8_lossy(&output.stdout).as_ref()).into_diagnostic()?; + remove_unimplemented_attributes(&mut ast); + Ok(ast) +} +pub fn python_parser_test_ast(inputs: &[&str]) { + for test_input in inputs.iter() { + let enderpy_ast = parse_enderpy_source(test_input).unwrap(); + let python_ast = parse_python_source(test_input).unwrap(); + assert_ast_eq(&python_ast, &enderpy_ast, test_input); + } +} + +fn assert_ast_eq(python_ast: &Value, enderpy_ast: &Value, source: &str) { + let include_source = std::env::var("INCLUDE_SOURCE").is_ok(); + let side_by_side = std::env::var("SIDE_BY_SIDE").is_ok(); + + let formatted_source = if include_source { + format!("\nSource:\n{}\n", source) + } else { + "".to_string() + }; + if !side_by_side { + pretty_assertions::assert_eq!( + &python_ast, + &enderpy_ast, + "Enderpy AST does not match Python AST.\n{}\x1b[31mPython AST\x1b[0m / \x1b[32mEnderpy AST\x1b[0m", + formatted_source, + ); + } else if let Err(message) = assert_json_matches_no_panic( + &python_ast, + &enderpy_ast, + assert_json_diff::Config::new(assert_json_diff::CompareMode::Strict), + ) { + let mut table_builder = Builder::default(); + table_builder.push_record(["Python AST", "Enderpy AST"]); + table_builder.push_record([ + serde_json::to_string_pretty(&python_ast).unwrap(), + serde_json::to_string_pretty(&enderpy_ast).unwrap(), + ]); + let mut table = table_builder.build(); + table.with(Style::modern()); + // If run in a terminal, don't expand table beyond terminal width. 
+ if let Some((TerminalWidth(width), _)) = terminal_size() { + table + .with( + Width::wrap(width as usize) + .keep_words() + .priority::(), + ) + .with(Width::increase(width as usize)); + } + panic!( + "Enderpy AST does not match Python AST.\n{}{}\n{}", + formatted_source, table, message + ); + } +} +fn remove_unimplemented_attributes(value: &mut Value) { + match value { + Value::Object(map) => { + // TODO ast_python: Adjust these ignored values as Enderpy adds support. + map.retain(|key, _| !matches!(key.as_str(), "ctx" | "type_ignores" | "kind")); + for (_, v) in map.iter_mut() { + remove_unimplemented_attributes(v); + } + } + Value::Array(vec) => { + for v in vec.iter_mut() { + remove_unimplemented_attributes(v); + } + } + _ => { + // Nothing to do for other value types. + } + }; +} + +fn parse_enderpy_source(source: &str) -> Result { + let mut parser = Parser::new(source, "string"); + let typed_ast = parser.parse().into_diagnostic()?; + let ast = typed_ast.as_python_compat(&parser); + Ok(ast) +} + +macro_rules! parser_test { + ($test_name:ident, $test_file:expr) => { + #[test] + fn $test_name() { + let test_case = std::fs::read_to_string($test_file).unwrap(); + python_parser_test_ast(&[test_case.as_str()]); + } + }; +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_simple_compat() { + // let source = r#" + // def x(a: int) -> int: + // return 1 + 1 + // b = x(1) + // print(b) + // "#; + + let source = r#"(a +, b, c) +"#; + + let enderpy_ast = parse_enderpy_source(source).unwrap(); + let python_ast = parse_python_source(source).unwrap(); + assert_ast_eq(&python_ast, &enderpy_ast, source); + } + + #[test] + fn test_parse_assignment() { + python_parser_test_ast(&[ + "a = 1", + "a = None", + "a = True", + "a = False", + "a = 1j", + // TODO ast_python: Python does not evaluate bytes. + // "a = b'1'", + // "a = rb'1'", + // "a = br'1'", + "a = \"a\"", + "a = '''a'''", + "a = \"\"\"a\"\"\"", + "a = 'a'", + "a = 1, 2", + "a = 1, 2, ", + "a = b = 1", + "a,b = c,d = 1,2", + // augmented assignment + "a += 1", + "a -= 1", + "a *= 1", + "a /= 1", + "a //= 1", + "a %= 1", + "a **= 1", + "a <<= 1", + "a >>= 1", + "a &= 1", + "a ^= 1", + "a |= 1", + // annotated assignment + ]); + } + + #[test] + fn test_parse_assert_stmt() { + python_parser_test_ast(&["assert a", "assert a, b", "assert True, 'fancy message'"]); + } + + #[test] + fn test_pass_stmt() { + python_parser_test_ast(&["pass", "pass ", "pass\n"]); + } + + #[test] + fn test_parse_del_stmt() { + python_parser_test_ast(&["del a", "del a, b", "del a, b, "]); + } + + #[test] + fn parse_yield_statement() { + python_parser_test_ast(&["yield", "yield a", "yield a, b", "yield a, b, "]); + } + + #[test] + fn test_raise_statement() { + python_parser_test_ast(&["raise", "raise a", "raise a from c"]); + } + + #[test] + fn test_parse_break_continue() { + python_parser_test_ast(&["break", "continue"]); + } + + #[test] + fn test_parse_bool_op() { + python_parser_test_ast(&[ + "a or b", + "a and b", + // TODO ast_python: Python parses this as a BoolOp with 3 values. + // i.e. {"op": "or", "values": ["a", "b", "c"]} + // Enderpy parses this as a nested set of BoolOps. + // i.e. {"op": "or", "values": ["a", {"op": "or", "values": ["b", "c"]}]} + // I'm not sure which is correct. + // "a or b or c", + "a and b or c", + ]); + } + + #[test] + fn test_parse_unary_op() { + python_parser_test_ast(&["not a", "+ a", "~ a", "-a"]); + } + + #[test] + fn test_named_expression() { + // TODO ast_python: Enderpy chokes on this. 
+ // python_parser_test_ast(&["(a := b)"]); + } + + #[test] + fn test_tuple() { + python_parser_test_ast(&[ + "(a, b, c)", + // TODO ast_python: Enderpy doesn't handle newlines within a nested context. + "(a, + b, c)", + "(a + , b, c)", + // "(a, + // b, + // c)", + // "(a, + // )", + "(a, b, c,)", + ]); + } + + #[test] + fn test_yield_expression() { + python_parser_test_ast(&["yield", "yield a", "yield from a"]); + } + + #[test] + fn test_starred() { + // TODO ast_python: Enderpy chokes on this. + // python_parser_test_ast(&["(*a)"]); + } + + #[test] + fn test_await_expression() { + python_parser_test_ast(&["await a"]); + } + + #[test] + fn test_attribute_ref() { + python_parser_test_ast(&["a.b", "a.b.c", "a.b_c", "a.b.c.d"]); + } + #[test] + fn test_subscript() { + python_parser_test_ast(&["a[1]", "a.b[1]"]); + } + + #[test] + fn parse_call() { + python_parser_test_ast(&[ + "a()", + "a(b)", + "a(b, c)", + "func(b=c)", + "func(a, b=c, d=e)", + "func(a, b=c, d=e, *f)", + "func(a, b=c, d=e, *f, **g)", + "func(a,)", + ]); + } + + #[test] + fn test_lambda() { + python_parser_test_ast(&[ + "lambda: a", + "lambda a: a", + "lambda a, b: a", + "lambda a, b, c: a", + "lambda a, *b: a", + "lambda a, *b, c: a", + "lambda a, *b, c, **d: a", + "lambda a=1 : a", + "lambda a=1 : a,", + ]); + } + + #[test] + fn test_conditional_expression() { + python_parser_test_ast(&["a if b else c if d else e"]); + } + + #[test] + fn test_string_literal_concatenation() { + python_parser_test_ast(&[ + "'a' 'b'", + // TODO ast_python: Python evaluates this as "ab". + // "b'a' b'b'", + "'a' 'b'", + // TODO ast_python: Enderpy evaluates this as 'r"a"b'. This seems wrong. + // "r'a' 'b'", + // TODO ast_python: Enderpy doesn't handle newlines within a nested context. + // "('a' + // 'b')", + // "('a' + // 'b', 'c')", + // "('a' + // 'b' + // 'c')", + // TODO ast_python: Python evaluates this as "ac". Enderpy creates 2 constants. + // "f'a' 'c'", + // TODO ast_python: Python evaluates this as "abc". Enderpy creates 3 constants. + // "f'a' 'b' 'c'", + // TODO ast_python: Python evaluates this as "dab". Enderpy creates 3 constants. + // "'d' f'a' 'b'", + "f'a_{1}' 'b' ", + ]); + } + + #[test] + fn test_fstring() { + python_parser_test_ast(&[ + "f'a'", + "f'hello_{a}'", + "f'hello_{a} {b}'", + "f'hello_{a} {b} {c}'", + // unsupported + // "f'hello_{f'''{a}'''}'", + ]); + } + + #[test] + fn test_comparison() { + python_parser_test_ast(&[ + "a == b", + "a != b", + "a > b", + "a < b", + "a >= b", + "a <= b", + "a is b", + "a is not b", + "a in b", + "a not in b", + "a < b < c", + ]); + } + + #[test] + fn test_while_statement() { + python_parser_test_ast(&[ + "while a: pass", + "while a: + pass", + "while a: + a = 1 +else: + b = 1 +", + ]); + } + + #[test] + fn test_try_statement() { + python_parser_test_ast(&[ + "try: + pass +except: + pass", + "try: + pass +except Exception: + pass", + "try: + pass +except Exception as e: + pass", + "try: + pass +except Exception as e: + pass +else: + pass", + "try: + pass +except Exception as e: + pass +else: + pass +finally: + pass", + "try: + pass +except *Exception as e: + pass +", + ]); + } + + #[test] + fn test_ellipsis_statement() { + python_parser_test_ast(&[ + "def a(): ...", + "def a(): + ...", + "a = ...", + "... 
+ 1", + ]); + } + + // parser_test!(test_functions, "test_data/inputs/functions.py"); + // parser_test!(test_if, "test_data/inputs/if.py"); + // parser_test!(test_indentation, "test_data/inputs/indentation.py"); + // parser_test!( + // test_separate_statements, + // "test_data/inputs/separate_statements.py" + // ); + // parser_test!(test_try, "test_data/inputs/try.py"); + // parser_test!( + // annotated_assignment, + // "test_data/inputs/annotated_assignment.py" + // ); + // parser_test!(binary_op, "test_data/inputs/binary_op.py"); + // parser_test!(class, "test_data/inputs/class.py"); + // parser_test!(dict, "test_data/inputs/dict.py"); + // parser_test!(test_for, "test_data/inputs/for.py"); + // parser_test!(from_import, "test_data/inputs/from_import.py"); + // parser_test!(function_def, "test_data/inputs/function_def.py"); + // parser_test!( + // generator_expressions, + // "test_data/inputs/generator_expressions.py" + // ); + // parser_test!(lists, "test_data/inputs/lists.py"); + // parser_test!(test_match, "test_data/inputs/match.py"); + // parser_test!(sets, "test_data/inputs/sets.py"); + // parser_test!(string, "test_data/inputs/string.py"); + // parser_test!(subscript, "test_data/inputs/subscript.py"); + // parser_test!(with, "test_data/inputs/with.py"); + // parser_test!(newlines, "test_data/inputs/newlines.py"); + // parser_test!(comments, "test_data/inputs/comments.py"); + // parser_test!(types_alias, "test_data/inputs/type_alias.py"); +} diff --git a/parser/src/runpython.rs b/compat/src/runpython.rs similarity index 100% rename from parser/src/runpython.rs rename to compat/src/runpython.rs diff --git a/parser/Cargo.toml b/parser/Cargo.toml index 5284493b..5ba28567 100644 --- a/parser/Cargo.toml +++ b/parser/Cargo.toml @@ -20,7 +20,6 @@ unicode-id-start = "1.0.3" miette = { version = "5.6.0", features = ["fancy"] } thiserror = "1.0.40" is-macro = "0.3.5" -which = "6.0.1" [dev-dependencies] codspeed-criterion-compat.workspace = true @@ -28,10 +27,6 @@ criterion.workspace = true insta.workspace = true reqwest = { version= "0.12.4", features = ["blocking"] } tokio.workspace = true -tabled = "0.15" -terminal_size = "0.3" -assert-json-diff = "2.0" -pretty_assertions = "1.4" [lib] bench = false diff --git a/parser/src/lexer/compat.rs b/parser/src/lexer/compat.rs deleted file mode 100644 index d89d962c..00000000 --- a/parser/src/lexer/compat.rs +++ /dev/null @@ -1,903 +0,0 @@ -use serde::{Deserialize, Serialize}; - -// Derived from: -// https://github.com/python/cpython/blob/main/Lib/token.py -#[allow(dead_code)] -#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] -#[serde(rename_all = "UPPERCASE")] -pub enum PythonKind { - EndMarker, - Name, - Number, - String, - NewLine, - Indent, - Dedent, - LPar, - RPar, - LSqb, - RSqb, - Colon, - Comma, - Semi, - Plus, - Minus, - Star, - Slash, - VBar, - Amper, - Less, - Greater, - Equal, - Dot, - Percent, - LBrace, - RBrace, - EqEqual, - NotEqual, - LessEqual, - GreaterEqual, - Tilde, - Circumflex, - LeftShift, - RightShift, - DoubleStar, - PlusEqual, - MinEqual, - StarEqual, - SlashEqual, - PercentEqual, - AmperEqual, - VBarEqual, - CircumflexEqual, - LeftShiftEqual, - RightShiftEqual, - DoubleStarEqual, - DoubleSlash, - DoubleSlashEqual, - At, - AtEqual, - RArrow, - Ellipsis, - ColonEqual, - Exclamation, - Op, - Await, - Async, - #[serde(rename = "TYPE_IGNORE")] - TypeIgnore, - #[serde(rename = "TYPE_COMMENT")] - TypeComment, - #[serde(rename = "SOFT_KEYWORD")] - SoftKeyword, - #[serde(rename = "FSTRING_START")] - FstringStart, - 
#[serde(rename = "FSTRING_MIDDLE")] - FstringMiddle, - #[serde(rename = "FSTRING_END")] - FstringEnd, - Comment, - NL, - ErrorToken, - Encoding, - #[serde(rename = "N_TOKENS")] - NTokens, - #[serde(rename = "NT_OFFSET")] - NTOffset, -} - -#[derive(Debug, Serialize, Deserialize, Clone, PartialEq)] -pub struct PythonToken { - kind: PythonKind, - value: String, - start: (u32, u32), - end: (u32, u32), -} - -#[cfg(test)] -mod tests { - - use super::*; - use crate::get_row_col_position; - use crate::token::Kind; - use crate::{lexer::Lexer, token::Token}; - use miette::{bail, IntoDiagnostic, Result}; - use std::io::Write; - use tabled::{ - builder::Builder, - settings::peaker::PriorityMax, - settings::{Style, Width}, - }; - - use crate::runpython::{default_python_path, spawn_python_script_command}; - use terminal_size::{terminal_size, Width as TerminalWidth}; - - fn lex_python_source(source: &str) -> Result> { - let mut process = spawn_python_script_command( - "parser/lex_python.py", - vec!["--stdin", "--output-format", "json"], - default_python_path()?, - )?; - - // Get process stdin and write the input string. - if let Some(mut stdin) = process.stdin.take() { - stdin.write_all(source.as_bytes()).into_diagnostic()?; - } else { - bail!("Failed to open stdin when running `parser/lex_python.py`"); - } - // Get process stdout and parse result. - let output = process.wait_with_output().into_diagnostic()?; - let python_tokens: Vec = - serde_json::from_str(String::from_utf8_lossy(&output.stdout).as_ref()) - .into_diagnostic()?; - Ok(python_tokens) - } - - #[test] - fn test_simple_compat() { - let source = r#" -a: int = 1 -print(a) -"#; - let mut lexer = Lexer::new(source); - let enderpy_tokens = lexer.lex(); - let python_tokens = lex_python_source(source).unwrap(); - assert_tokens_eq(python_tokens, enderpy_tokens, &lexer); - } - - fn python_tokenize_test_lexer(inputs: &[&str]) { - for test_input in inputs.iter() { - let mut lexer = Lexer::new(test_input); - let tokens = lexer.lex(); - let python_tokens = lex_python_source(test_input).unwrap(); - assert_tokens_eq(python_tokens, tokens, &lexer); - } - } - - #[test] - fn test_lex_operators() { - python_tokenize_test_lexer(&[ - "1+2", - "a+b", - "a + b", - "+=2", - "xX = 2", - "if else elif", - "()", - "[]", - "{}:", - ".", - ",", - ";", - "@", - "=", - "#", - "$", - "?", - "`", - "->", - "+=", - "-=", - "*=", - "/=", - "%=", - "@=", - "&=", - "|=", - "^=", - "//=", - "<<=", - ">>=", - "**=", - "**", - "//", - "<<", - ">>", - "+", - "-", - "*", - "**", - "/", - "//", - "%", - "@", - "<<", - ">>", - "&", - "|", - "^", - "~", - ":=", - "<", - ">", - "<=", - ">=", - "==", - "!=", - ]); - } - - #[test] - fn test_lex_keywords() { - python_tokenize_test_lexer(&[ - "False None True and as assert async await", - "break class continue def del elif else except", - "finally for from global if import in is lambda", - "nonlocal not or pass raise return try while with yield", - ]); - } - - #[test] - fn test_lex_identifiers() { - python_tokenize_test_lexer(&["a", "a_a", "_a", "a_", "a_a_a", "a_a_"]); - } - - #[test] - fn test_lex_literals() { - // Binary - python_tokenize_test_lexer(&[ - "0b0", "0b1", "0b10", "0b11", "0b100", "0b101", "0b110", "0b111", - ]); - - // Octal - python_tokenize_test_lexer(&["0o0", "0o1", "0o2", "0o3", "0o4", "0o5", "0o6", "0o7"]); - - // Hexadecimal - python_tokenize_test_lexer(&[ - "0x0", "0x1", "0x2", "0x3", "0x4", "0x5", "0x6", "0x7", "0x8", "0x9", "0xa", "0xb", - "0xc", "0xd", "0xe", "0xf", "0xA", "0xB", "0xC", "0xD", "0xE", "0xF", - ]); - 
- // Point float - python_tokenize_test_lexer(&["0.0 0.1 00.0 00.1 0.1j 0.01J"]); - - // Exponent float - python_tokenize_test_lexer(&["0e0 0e-1 0e+2 0e+3j 0e+3J"]); - - // Integer - python_tokenize_test_lexer(&["11 33 1j 1_000_000j"]); - - // Strings - python_tokenize_test_lexer(&[ - "\"hello\" ", - "\"world\"", - "\"\"", - "a = \"hello\"", - "'hello'", - "\"\"\"hello\"\"\"", - "'''hello'''", - ]); - - // Bytes - python_tokenize_test_lexer(&[ - "b\"hello\"", - "b\"world\"", - "b\"\"", - "a = b\"hello\"", - "b'hello'", - "b\"\"\"hello\"\"\"", - "b'''hello'''", - ]); - - // Raw strings - python_tokenize_test_lexer(&[ - "r\"hello\"", - "r\"world\"", - "r\"\"", - "a = r\"hello\"", - "r'hello'", - "r\"\"\"hello\"\"\"", - "r'''hello'''", - ]); - - // Raw bytes - python_tokenize_test_lexer(&[ - "rb\"hello\"", - "rb\"world\"", - "rb\"\"", - "a = rb\"hello\"", - "rb'hello'", - "rb\"\"\"hello\"\"\"", - "rb'''hello'''", - ]); - - // Unicode strings - python_tokenize_test_lexer(&[ - "u\"hello\"", - "u\"world\"", - "u\"\"", - "a = u\"hello\"", - "u'hello'", - "u\"\"\"hello\"\"\"", - "u'''hello'''", - ]); - } - - #[test] - fn test_lex_imports() { - python_tokenize_test_lexer(&["import a", "import a.b", "import a.b.c", "import a from b"]); - } - - #[test] - fn test_lex_other() { - python_tokenize_test_lexer(&["(a, - - )"]); - } - - #[test] - fn test_lex_indentation() { - python_tokenize_test_lexer(&[ - "if True: - pass\n", - "if True: - pass -else: - pass", - "if True: - if True: - pass -def", - "def f(x): - y = z - - print(y) -", - "if a: - - f = c - - # Path: test_local.py -", - ]); - } - - #[test] - fn test_lex_fstring() { - python_tokenize_test_lexer(&[ - "f\"hello\"", - "f'hello_{var}'", - "f\"world\"", - "f\"\"", - "a = f\"hello\"", - "f\"\"\"hello\"\"\"", - "f'''hello'''", - // TODO lex_python: Python lexes these poorly. - // "f\"{{hey}}\"", - // "f\"oh_{{hey}}\"", - "f'a' 'c'", - // TODO lex_python: Python 3.11 chokes on this input. 
- // "f'hello_{f'''{a}'''}'", - ]); - - // Raw F-strings - python_tokenize_test_lexer(&[ - "rf\"hello\"", - "rf\"world\"", - "rf\"\"", - "a = rf\"hello\"", - "rf'hello_{var}'", - "rf\"\"\"hello\"\"\"", - "rf'''hello'''", - ]); - } - - #[test] - fn test_lex_ellipsis() { - python_tokenize_test_lexer(&[ - "...", - "def a(): - ...", - ]); - } - - #[test] - fn test_logical_and_physical_lines() { - python_tokenize_test_lexer(&[ - // This case the first line should have physical line - " -a: int = 1 -print(a) -", - ]); - } - - #[test] - #[should_panic] - fn test_lex_unterminated_string_double_quotes() { - python_tokenize_test_lexer(&["\"hello", "'hello", "'''hello''", "'''hello'"]); - } - - enum TokenMismatch { - MissingToken(Option, Option), - WrongKind(PythonToken, Token), - WrongValue(PythonToken, Token, String, String), - WrongStartEnd( - PythonToken, - Token, - (u32, u32), - (u32, u32), - (u32, u32), - (u32, u32), - ), - } - - fn assert_tokens_eq( - python_tokens: Vec, - enderpy_tokens: Vec, - lexer: &Lexer, - ) { - let num_python_tokens = python_tokens.len(); - let num_enderpy_tokens = enderpy_tokens.len(); - let mut mismatches: Vec = vec![]; - let mut python_index = 0; - let mut enderpy_index = 0; - while python_index < num_python_tokens || enderpy_index < num_enderpy_tokens { - let python_token = if python_index > num_python_tokens - 1 { - None - } else { - Some(python_tokens[python_index].clone()) - }; - let enderpy_token = if enderpy_index > num_enderpy_tokens - 1 { - None - } else { - Some(enderpy_tokens[enderpy_index].clone()) - }; - if python_token.is_none() || enderpy_token.is_none() { - mismatches.push(TokenMismatch::MissingToken(python_token, enderpy_token)); - } else { - let python_token = python_token.unwrap(); - let enderpy_token = enderpy_token.unwrap(); - if let Some(mismatch) = check_tokens_match(python_token, enderpy_token, lexer) { - if is_python_trailing_newline_mismatch( - &mismatch, - &python_tokens[python_index + 1..], - ) { - // If we found Python's trailing newline, we've read the end of file. - break; - } else if is_python_fstring_mismatch( - &mismatch, - &enderpy_tokens[enderpy_index + 1..], - &mut enderpy_index, // <-- `enderpy_index` may be updated - ) { - // Nothing, but don't add the mismatch. - } else { - mismatches.push(mismatch); - } - } - } - python_index += 1; - enderpy_index += 1; - } - if mismatches.is_empty() { - return; - } - - let include_source = std::env::var("INCLUDE_SOURCE").is_ok(); - let include_all_tokens = std::env::var("INCLUDE_ALL_TOKENS").is_ok(); - - let mut table_builder = Builder::default(); - // Add the table header. - let header = vec!["Python token", "Enderpy token", "Failure"]; - table_builder.push_record(header); - // Add the table rows. Each row represents a lexer token mismatch. 
- let num_mismatches = mismatches.len(); - for mismatch in mismatches { - let mut row: Vec = vec![]; - let (python_token, enderpy_token, message) = match mismatch { - TokenMismatch::MissingToken(python_token, enderpy_token) => { - (python_token, enderpy_token, "Missing token".to_string()) - } - TokenMismatch::WrongKind(python_token, enderpy_token) => { - let message = format!( - "Wrong token kind.\nPython: {:?}\nEnderpy: {:?}", - python_token.kind, enderpy_token.kind - ); - (Some(python_token), Some(enderpy_token), message) - } - TokenMismatch::WrongValue( - python_token, - enderpy_token, - expected_value, - actual_value, - ) => ( - Some(python_token), - Some(enderpy_token), - format!( - "Wrong token value.\nPython: {:?}\nEnderpy: {:?}", - expected_value, actual_value - ), - ), - TokenMismatch::WrongStartEnd( - python_token, - enderpy_token, - expected_start, - expected_end, - actual_start, - actual_end, - ) => ( - Some(python_token), - Some(enderpy_token), - format!( - "Wrong token start/end offset.\nPython: {:?} - {:?}\nEnderpy: {:?} - {:?}", - expected_start, expected_end, actual_start, actual_end, - ), - ), - }; - if include_all_tokens { - row.extend_from_slice(&[ - python_tokens - .iter() - .map(|token| { - let is_this_token = - python_token.as_ref().is_some_and(|tok| tok == token); - format!("{}{:?}", if is_this_token { "→ " } else { "" }, token) - }) - .collect::>() - .join("\n"), - enderpy_tokens - .iter() - .map(|token| { - let is_this_token = - enderpy_token.as_ref().is_some_and(|tok| tok == token); - format!("{}{:?}", if is_this_token { "→ " } else { "" }, token) - }) - .collect::>() - .join("\n"), - message, - ]); - } else { - row.extend_from_slice(&[ - python_token.map_or("None".to_string(), |t| format!("{:?}", t)), - enderpy_token.map_or("None".to_string(), |t| format!("{:?}", t)), - message, - ]); - } - table_builder.push_record(row); - } - let mut table = table_builder.build(); - table.with(Style::modern()); - // If run in a terminal, don't expand table beyond terminal width. - if let Some((TerminalWidth(width), _)) = terminal_size() { - table - .with( - Width::wrap(width as usize) - .keep_words() - .priority::(), - ) - .with(Width::increase(width as usize)); - } - let formatted_source = if include_source { - format!("\nSource:\n{}", lexer.source) - } else { - "".to_string() - }; - panic!( - "Enderpy tokens do not match Python tokens.{}\n{}\n{} token mismatches found", - formatted_source, table, num_mismatches - ); - } - - fn check_tokens_match( - python_token: PythonToken, - enderpy_token: Token, - lexer: &Lexer, - ) -> Option { - let kind_matches = match python_token.kind { - PythonKind::EndMarker => enderpy_token.kind == Kind::Eof, - // For some reason, Python maintains a kind for these tokens but doesn't use them - // during tokenization. - // Instead, it slams keywords together into a generic Name kind. - PythonKind::Name => { - matches_python_name_token(python_token.value.as_str(), &enderpy_token.kind) - } - PythonKind::Number => matches!( - enderpy_token.kind, - Kind::Integer - | Kind::PointFloat - | Kind::ExponentFloat - | Kind::Binary - | Kind::Octal - | Kind::Hexadecimal - | Kind::ImaginaryPointFloat - | Kind::ImaginaryExponentFloat - | Kind::ImaginaryInteger - ), - // NOTE: The Python tokenizer doesn't appear to track differences in string modifiers. - // For example, "hello"/u"hello"/r"hello" are all just String. 
- PythonKind::String => matches!( - enderpy_token.kind, - Kind::StringLiteral | Kind::Bytes | Kind::RawBytes | Kind::Unicode - ), - PythonKind::NewLine => enderpy_token.kind == Kind::NewLine, - PythonKind::Indent => enderpy_token.kind == Kind::Indent, - PythonKind::Dedent => enderpy_token.kind == Kind::Dedent, - PythonKind::LPar => enderpy_token.kind == Kind::LeftParen, - PythonKind::RPar => enderpy_token.kind == Kind::RightParen, - PythonKind::LSqb => enderpy_token.kind == Kind::LeftBrace, - PythonKind::RSqb => enderpy_token.kind == Kind::RightBrace, - PythonKind::Colon => enderpy_token.kind == Kind::Colon, - PythonKind::Comma => enderpy_token.kind == Kind::Comma, - PythonKind::Semi => enderpy_token.kind == Kind::SemiColon, - PythonKind::Plus => enderpy_token.kind == Kind::Plus, - PythonKind::Minus => enderpy_token.kind == Kind::Minus, - PythonKind::Star => enderpy_token.kind == Kind::Mul, - PythonKind::Slash => enderpy_token.kind == Kind::Div, - PythonKind::VBar => enderpy_token.kind == Kind::BitOr, - PythonKind::Amper => enderpy_token.kind == Kind::BitAnd, - PythonKind::Less => enderpy_token.kind == Kind::Less, - PythonKind::Greater => enderpy_token.kind == Kind::Greater, - PythonKind::Equal => enderpy_token.kind == Kind::Assign, - PythonKind::Dot => enderpy_token.kind == Kind::Dot, - PythonKind::Percent => enderpy_token.kind == Kind::Mod, - PythonKind::LBrace => enderpy_token.kind == Kind::LeftBracket, - PythonKind::RBrace => enderpy_token.kind == Kind::RightBracket, - PythonKind::EqEqual => enderpy_token.kind == Kind::Eq, - PythonKind::NotEqual => enderpy_token.kind == Kind::NotEq, - PythonKind::LessEqual => enderpy_token.kind == Kind::LessEq, - PythonKind::GreaterEqual => enderpy_token.kind == Kind::GreaterEq, - PythonKind::Tilde => enderpy_token.kind == Kind::BitNot, - PythonKind::Circumflex => enderpy_token.kind == Kind::BitXor, - PythonKind::LeftShift => enderpy_token.kind == Kind::LeftShift, - PythonKind::RightShift => enderpy_token.kind == Kind::RightShift, - PythonKind::DoubleStar => enderpy_token.kind == Kind::Pow, - PythonKind::PlusEqual => enderpy_token.kind == Kind::AddAssign, - PythonKind::MinEqual => enderpy_token.kind == Kind::SubAssign, - PythonKind::StarEqual => enderpy_token.kind == Kind::MulAssign, - PythonKind::SlashEqual => enderpy_token.kind == Kind::DivAssign, - PythonKind::PercentEqual => enderpy_token.kind == Kind::ModAssign, - PythonKind::AmperEqual => enderpy_token.kind == Kind::BitAndAssign, - PythonKind::VBarEqual => enderpy_token.kind == Kind::BitOrAssign, - PythonKind::CircumflexEqual => enderpy_token.kind == Kind::BitXorAssign, - PythonKind::LeftShiftEqual => enderpy_token.kind == Kind::ShiftLeftAssign, - PythonKind::RightShiftEqual => enderpy_token.kind == Kind::ShiftRightAssign, - PythonKind::DoubleStarEqual => enderpy_token.kind == Kind::PowAssign, - PythonKind::DoubleSlash => enderpy_token.kind == Kind::IntDiv, - PythonKind::DoubleSlashEqual => enderpy_token.kind == Kind::IntDivAssign, - PythonKind::At => enderpy_token.kind == Kind::MatrixMul, - PythonKind::AtEqual => enderpy_token.kind == Kind::MatrixMulAssign, - PythonKind::RArrow => enderpy_token.kind == Kind::Arrow, - PythonKind::Ellipsis => enderpy_token.kind == Kind::Ellipsis, - PythonKind::ColonEqual => enderpy_token.kind == Kind::Walrus, - PythonKind::Exclamation => false, // doesn't exist - // For some reason, Python maintains a kind for these tokens but doesn't use them - // during tokenization. - // Instead, it slams all operators together into a generic Op kind. 
- PythonKind::Op => { - matches_python_op_token(python_token.value.as_str(), &enderpy_token.kind) - } - PythonKind::Await => enderpy_token.kind == Kind::Await, - PythonKind::Async => enderpy_token.kind == Kind::Async, - PythonKind::TypeIgnore => false, // doesn't exist - PythonKind::TypeComment => false, // doesn't exist - PythonKind::SoftKeyword => false, // doesn't exist - PythonKind::FstringStart => matches!( - enderpy_token.kind, - Kind::FStringStart | Kind::RawFStringStart - ), - PythonKind::FstringMiddle => enderpy_token.kind == Kind::FStringMiddle, - PythonKind::FstringEnd => enderpy_token.kind == Kind::FStringEnd, - PythonKind::Comment => enderpy_token.kind == Kind::Comment, - PythonKind::NL => enderpy_token.kind == Kind::NL, - PythonKind::ErrorToken => { - match python_token.value.as_str() { - // Python 3.11 chokes on these tokens. - "$" => enderpy_token.kind == Kind::Dollar, - "?" => enderpy_token.kind == Kind::QuestionMark, - "`" => enderpy_token.kind == Kind::BackTick, - _ => enderpy_token.kind == Kind::Error, - } - } - PythonKind::Encoding => false, // doesn't exist - PythonKind::NTokens => false, // doesn't exist, - PythonKind::NTOffset => false, // doesn't exist - }; - if !kind_matches { - return Some(TokenMismatch::WrongKind(python_token, enderpy_token)); - } - - let python_token_value = python_token.value.clone(); - let enderpy_token_value = enderpy_token.value.to_string(); - // The Python tokenizer sets values in a number of places where Enderpy simply relies - // on kind to assume value. Handle those cases here. - let value_matches = - matches_python_name_token(python_token.value.as_str(), &enderpy_token.kind) - || matches_python_op_token(python_token.value.as_str(), &enderpy_token.kind) - || matches_python_indent_dedent_token(&python_token.kind, &enderpy_token.kind) - || (python_token.kind == PythonKind::EndMarker && enderpy_token.kind == Kind::Eof) - || (python_token.value.as_str() == "\n" - && (matches!(enderpy_token.kind, Kind::NewLine | Kind::NL))) - || python_token_value == enderpy_token_value; - if !value_matches { - return Some(TokenMismatch::WrongValue( - python_token, - enderpy_token, - python_token_value, - enderpy_token_value, - )); - } - - let (enderpy_start_row, enderpy_start_col, enderpy_end_row, enderpy_end_col) = - get_row_col_position(enderpy_token.start, enderpy_token.end, &lexer.line_starts); - let python_token_start = python_token.start; - let python_token_end = python_token.end; - if enderpy_start_row != python_token_start.0 - || enderpy_start_col != python_token_start.1 - || enderpy_end_row != python_token_end.0 - || enderpy_end_col != python_token_end.1 - { - return Some(TokenMismatch::WrongStartEnd( - python_token, - enderpy_token, - python_token_start, - python_token_end, - (enderpy_start_row, enderpy_start_col), - (enderpy_end_row, enderpy_end_col), - )); - } - None - } - - fn matches_python_name_token(python_token_value: &str, token_kind: &Kind) -> bool { - match python_token_value { - "if" => token_kind == &Kind::If, - "elif" => token_kind == &Kind::Elif, - "else" => token_kind == &Kind::Else, - "False" => token_kind == &Kind::False, - "None" => token_kind == &Kind::None, - "True" => token_kind == &Kind::True, - "and" => token_kind == &Kind::And, - "as" => token_kind == &Kind::As, - "assert" => token_kind == &Kind::Assert, - "async" => token_kind == &Kind::Async, - "await" => token_kind == &Kind::Await, - "break" => token_kind == &Kind::Break, - "class" => token_kind == &Kind::Class, - "continue" => token_kind == &Kind::Continue, - "def" => 
token_kind == &Kind::Def, - "del" => token_kind == &Kind::Del, - "except" => token_kind == &Kind::Except, - "finally" => token_kind == &Kind::Finally, - "for" => token_kind == &Kind::For, - "from" => token_kind == &Kind::From, - "global" => token_kind == &Kind::Global, - "import" => token_kind == &Kind::Import, - "in" => token_kind == &Kind::In, - "is" => token_kind == &Kind::Is, - "lambda" => token_kind == &Kind::Lambda, - "nonlocal" => token_kind == &Kind::Nonlocal, - "not" => token_kind == &Kind::Not, - "or" => token_kind == &Kind::Or, - "pass" => token_kind == &Kind::Pass, - "raise" => token_kind == &Kind::Raise, - "return" => token_kind == &Kind::Return, - "try" => token_kind == &Kind::Try, - "while" => token_kind == &Kind::While, - "with" => token_kind == &Kind::With, - "yield" => token_kind == &Kind::Yield, - _ => token_kind == &Kind::Identifier, - } - } - - fn matches_python_op_token(python_token_value: &str, token_kind: &Kind) -> bool { - match python_token_value { - "!=" => token_kind == &Kind::NotEq, - "$" => token_kind == &Kind::Dollar, - "%" => token_kind == &Kind::Mod, - "%=" => token_kind == &Kind::ModAssign, - "&" => token_kind == &Kind::BitAnd, - "&=" => token_kind == &Kind::BitAndAssign, - "(" => token_kind == &Kind::LeftParen, - ")" => token_kind == &Kind::RightParen, - "*" => token_kind == &Kind::Mul, - "**" => token_kind == &Kind::Pow, - "**=" => token_kind == &Kind::PowAssign, - "*=" => token_kind == &Kind::MulAssign, - "+" => token_kind == &Kind::Plus, - "+=" => token_kind == &Kind::AddAssign, - "," => token_kind == &Kind::Comma, - "-" => token_kind == &Kind::Minus, - "-=" => token_kind == &Kind::SubAssign, - "->" => token_kind == &Kind::Arrow, - "." => token_kind == &Kind::Dot, - "/" => token_kind == &Kind::Div, - "//" => token_kind == &Kind::IntDiv, - "//=" => token_kind == &Kind::IntDivAssign, - "/=" => token_kind == &Kind::DivAssign, - ":" => token_kind == &Kind::Colon, - ":=" => token_kind == &Kind::Walrus, - ";" => token_kind == &Kind::SemiColon, - "<" => token_kind == &Kind::Less, - "<<" => token_kind == &Kind::LeftShift, - "<<=" => token_kind == &Kind::ShiftLeftAssign, - "<=" => token_kind == &Kind::LessEq, - "=" => token_kind == &Kind::Assign, - "==" => token_kind == &Kind::Eq, - ">" => token_kind == &Kind::Greater, - ">=" => token_kind == &Kind::GreaterEq, - ">>" => token_kind == &Kind::RightShift, - ">>=" => token_kind == &Kind::ShiftRightAssign, - "?" => token_kind == &Kind::QuestionMark, - "@" => token_kind == &Kind::MatrixMul, - "@=" => token_kind == &Kind::MatrixMulAssign, - "[" => token_kind == &Kind::LeftBrace, - "]" => token_kind == &Kind::RightBrace, - "^" => token_kind == &Kind::BitXor, - "^=" => token_kind == &Kind::BitXorAssign, - "`" => token_kind == &Kind::BackTick, - "{" => token_kind == &Kind::LeftBracket, - "|" => token_kind == &Kind::BitOr, - "|=" => token_kind == &Kind::BitOrAssign, - "}" => token_kind == &Kind::RightBracket, - "~" => token_kind == &Kind::BitNot, - "..." => token_kind == &Kind::Ellipsis, - _ => false, - } - } - - fn matches_python_indent_dedent_token(python_kind: &PythonKind, enderpy_kind: &Kind) -> bool { - // TODO lex_python: There's no obvious way with the Python lexer to determine what is - // considered one indent level. Instead, it simply stores the literal whitespace. This - // makes it really difficult to determine whether indentation levels actually match - // (without looking around at the larger context), so for now we'll just make sure the - // Kind lines up. 
- (python_kind == &PythonKind::Indent && enderpy_kind == &Kind::Indent) - || (python_kind == &PythonKind::Dedent && enderpy_kind == &Kind::Dedent) - } - - /// The Python tokenizer adds a cheeky newline to the end of the source, causing mismatches. We - /// handle this by ignoring mismatches that meet all of the following criteria. - /// - The mismatch type is `WrongKind`. - /// - The Python kind is a known whitespace value. - /// - The Enderpy kind is a EOF. - /// - The only remaining Python tokens before EOF are known whitespace values. - fn is_python_trailing_newline_mismatch( - mismatch: &TokenMismatch, - remaining_tokens: &[PythonToken], - ) -> bool { - if let TokenMismatch::WrongKind(python_token, enderpy_token) = mismatch { - if !matches!(python_token.kind, PythonKind::NewLine | PythonKind::NL) - || enderpy_token.kind != Kind::Eof - { - return false; - } - return remaining_tokens.iter().all(|t| { - matches!( - t.kind, - PythonKind::NewLine - | PythonKind::NL - | PythonKind::Dedent - | PythonKind::EndMarker - ) - }); - } - false - } - - /// Python 3.11 and earlier tokenizes fstrings as just.. strings. - fn is_python_fstring_mismatch( - mismatch: &TokenMismatch, - remaining_tokens: &[Token], - enderpy_index: &mut usize, - ) -> bool { - if let TokenMismatch::WrongKind(python_token, enderpy_token) = mismatch { - if !matches!( - enderpy_token.kind, - Kind::FStringStart | Kind::RawFStringStart - ) || python_token.kind != PythonKind::String - { - return false; - } - let mut num_skipped = 0; - for token in remaining_tokens { - num_skipped += 1; - if matches!(token.kind, Kind::FStringEnd | Kind::Eof) { - break; - } - } - *enderpy_index += num_skipped; - return true; - } - false - } -} diff --git a/parser/src/lexer/mod.rs b/parser/src/lexer/mod.rs index 750372d2..cf782418 100644 --- a/parser/src/lexer/mod.rs +++ b/parser/src/lexer/mod.rs @@ -1,5 +1,3 @@ -pub mod compat; - use unicode_id_start::{is_id_continue, is_id_start}; use crate::{ @@ -44,7 +42,7 @@ enum TokenizationMode { #[derive(Debug, Clone)] pub struct Lexer<'a> { /// The source code - source: &'a str, + pub source: &'a str, /// The current position in the source code current: u32, current_line: u16, @@ -316,7 +314,7 @@ impl<'a> Lexer<'a> { } // String Literals str_start @ '"' | str_start @ '\'' => { - self.skip_to_str_end(str_start)?; + self.consume_str(str_start)?; return Ok(Kind::StringLiteral); } // Operators @@ -580,7 +578,7 @@ impl<'a> Lexer<'a> { Some('b') | Some('B') => match self.double_peek() { Some(str_start @ '"') | Some(str_start @ '\'') => { self.double_next(); - self.skip_to_str_end(str_start)?; + self.consume_str(str_start)?; return Ok(Some(Kind::RawBytes)); } _ => {} @@ -602,7 +600,7 @@ impl<'a> Lexer<'a> { }, Some(str_start @ '"') | Some(str_start @ '\'') => { self.next(); - self.skip_to_str_end(str_start)?; + self.consume_str(str_start)?; return Ok(Some(Kind::StringLiteral)); } _ => {} @@ -611,14 +609,14 @@ impl<'a> Lexer<'a> { Some('r') | Some('R') => match self.double_peek() { Some(str_start @ '"') | Some(str_start @ '\'') => { self.double_next(); - self.skip_to_str_end(str_start)?; + self.consume_str(str_start)?; return Ok(Some(Kind::RawBytes)); } _ => {} }, Some(str_start @ '"') | Some(str_start @ '\'') => { self.next(); - self.skip_to_str_end(str_start)?; + self.consume_str(str_start)?; return Ok(Some(Kind::Bytes)); } _ => {} @@ -656,7 +654,7 @@ impl<'a> Lexer<'a> { 'u' | 'U' => match self.peek() { Some(str_start @ '"') | Some(str_start @ '\'') => { self.next(); - self.skip_to_str_end(str_start)?; + 
self.consume_str(str_start)?; return Ok(Some(Kind::Unicode)); } _ => {} @@ -732,7 +730,7 @@ impl<'a> Lexer<'a> { } } - fn skip_to_str_end(&mut self, str_start: char) -> Result<(), LexError> { + fn consume_str(&mut self, str_start: char) -> Result<(), LexError> { // string start position is current position - 1 because we already consumed the // quote let _str_start_pos = self.current - 1; @@ -743,6 +741,9 @@ impl<'a> Lexer<'a> { if self.peek() == Some(str_start) && self.double_peek() == Some(str_start) { self.next(); while let Some(c) = self.next() { + if c == '\n' { + self.line_starts.push(self.current); + } if c == str_start && self.peek() == Some(str_start) && self.double_peek() == Some(str_start) @@ -756,6 +757,9 @@ impl<'a> Lexer<'a> { } } else { while let Some(c) = self.next() { + if c == '\n' { + self.line_starts.push(self.current); + } if c == str_start && last_read_char != '\\' { string_terminated = true; break; diff --git a/parser/src/lib.rs b/parser/src/lib.rs index 78c08f63..e9e5638a 100644 --- a/parser/src/lib.rs +++ b/parser/src/lib.rs @@ -1,12 +1,11 @@ mod lexer; -mod parser; +pub mod parser; pub use crate::{ lexer::Lexer, parser::{ast, parser::Parser}, }; pub mod error; -pub mod runpython; pub mod token; pub fn get_row_col_position(start: u32, end: u32, line_starts: &[u32]) -> (u32, u32, u32, u32) { diff --git a/parser/src/parser/compat.rs b/parser/src/parser/compat.rs index e70de26e..144db22a 100644 --- a/parser/src/parser/compat.rs +++ b/parser/src/parser/compat.rs @@ -1,63 +1,10 @@ -use miette::{bail, IntoDiagnostic, Result}; +use crate::ast::*; +use crate::Parser; use serde_json::Number; use serde_json::{json, Value}; -use std::convert::From; -use std::io::Write; use std::str::FromStr; -use crate::ast::*; -use crate::runpython::{default_python_path, spawn_python_script_command}; -use crate::Parser; - -fn parse_python_source(source: &str) -> Result { - let mut process = spawn_python_script_command( - "parser/ast_python.py", - vec!["--stdin"], - default_python_path()?, - )?; - - // Get process stdin and write the input string. - if let Some(mut stdin) = process.stdin.take() { - stdin.write_all(source.as_bytes()).into_diagnostic()?; - } else { - bail!("Failed to open stdin when running `parser/ast_python.py`"); - } - // Get process stdout and parse result. - let output = process.wait_with_output().into_diagnostic()?; - let mut ast = - serde_json::from_str(String::from_utf8_lossy(&output.stdout).as_ref()).into_diagnostic()?; - remove_unimplemented_attributes(&mut ast); - Ok(ast) -} - -fn remove_unimplemented_attributes(value: &mut Value) { - match value { - Value::Object(map) => { - // TODO ast_python: Adjust these ignored values as Enderpy adds support. - map.retain(|key, _| !matches!(key.as_str(), "ctx" | "type_ignores" | "kind")); - for (_, v) in map.iter_mut() { - remove_unimplemented_attributes(v); - } - } - Value::Array(vec) => { - for v in vec.iter_mut() { - remove_unimplemented_attributes(v); - } - } - _ => { - // Nothing to do for other value types. 
- } - }; -} - -fn parse_enderpy_source(source: &str) -> Result { - let mut parser = Parser::new(source, "string"); - let typed_ast = parser.parse().into_diagnostic()?; - let ast = typed_ast.as_python_compat(&parser); - Ok(ast) -} - -trait AsPythonCompat { +pub trait AsPythonCompat { fn as_python_compat(&self, parser: &Parser) -> Value; } @@ -965,418 +912,3 @@ impl AsPythonCompat for TypeAlias { }) } } - -#[cfg(test)] -mod tests { - use super::{parse_enderpy_source, parse_python_source}; - use assert_json_diff::assert_json_matches_no_panic; - use serde_json::Value; - use tabled::{ - builder::Builder, - settings::peaker::PriorityMax, - settings::{Style, Width}, - }; - use terminal_size::{terminal_size, Width as TerminalWidth}; - - #[test] - fn test_simple_compat() { - // let source = r#" - // def x(a: int) -> int: - // return 1 + 1 - // b = x(1) - // print(b) - // "#; - - let source = r#"(a -, b, c) -"#; - - let enderpy_ast = parse_enderpy_source(source).unwrap(); - let python_ast = parse_python_source(source).unwrap(); - assert_ast_eq(&python_ast, &enderpy_ast, source); - } - - fn python_parser_test_ast(inputs: &[&str]) { - for test_input in inputs.iter() { - let enderpy_ast = parse_enderpy_source(test_input).unwrap(); - let python_ast = parse_python_source(test_input).unwrap(); - assert_ast_eq(&python_ast, &enderpy_ast, test_input); - } - } - - #[test] - fn test_parse_assignment() { - python_parser_test_ast(&[ - "a = 1", - "a = None", - "a = True", - "a = False", - "a = 1j", - // TODO ast_python: Python does not evaluate bytes. - // "a = b'1'", - // "a = rb'1'", - // "a = br'1'", - "a = \"a\"", - "a = '''a'''", - "a = \"\"\"a\"\"\"", - "a = 'a'", - "a = 1, 2", - "a = 1, 2, ", - "a = b = 1", - "a,b = c,d = 1,2", - // augmented assignment - "a += 1", - "a -= 1", - "a *= 1", - "a /= 1", - "a //= 1", - "a %= 1", - "a **= 1", - "a <<= 1", - "a >>= 1", - "a &= 1", - "a ^= 1", - "a |= 1", - // annotated assignment - ]); - } - - #[test] - fn test_parse_assert_stmt() { - python_parser_test_ast(&["assert a", "assert a, b", "assert True, 'fancy message'"]); - } - - #[test] - fn test_pass_stmt() { - python_parser_test_ast(&["pass", "pass ", "pass\n"]); - } - - #[test] - fn test_parse_del_stmt() { - python_parser_test_ast(&["del a", "del a, b", "del a, b, "]); - } - - #[test] - fn parse_yield_statement() { - python_parser_test_ast(&["yield", "yield a", "yield a, b", "yield a, b, "]); - } - - #[test] - fn test_raise_statement() { - python_parser_test_ast(&["raise", "raise a", "raise a from c"]); - } - - #[test] - fn test_parse_break_continue() { - python_parser_test_ast(&["break", "continue"]); - } - - #[test] - fn test_parse_bool_op() { - python_parser_test_ast(&[ - "a or b", - "a and b", - // TODO ast_python: Python parses this as a BoolOp with 3 values. - // i.e. {"op": "or", "values": ["a", "b", "c"]} - // Enderpy parses this as a nested set of BoolOps. - // i.e. {"op": "or", "values": ["a", {"op": "or", "values": ["b", "c"]}]} - // I'm not sure which is correct. - // "a or b or c", - "a and b or c", - ]); - } - - #[test] - fn test_parse_unary_op() { - python_parser_test_ast(&["not a", "+ a", "~ a", "-a"]); - } - - #[test] - fn test_named_expression() { - // TODO ast_python: Enderpy chokes on this. - // python_parser_test_ast(&["(a := b)"]); - } - - #[test] - fn test_tuple() { - python_parser_test_ast(&[ - "(a, b, c)", - // TODO ast_python: Enderpy doesn't handle newlines within a nested context. 
- "(a, - b, c)", - "(a - , b, c)", - // "(a, - // b, - // c)", - // "(a, - // )", - "(a, b, c,)", - ]); - } - - #[test] - fn test_yield_expression() { - python_parser_test_ast(&["yield", "yield a", "yield from a"]); - } - - #[test] - fn test_starred() { - // TODO ast_python: Enderpy chokes on this. - // python_parser_test_ast(&["(*a)"]); - } - - #[test] - fn test_await_expression() { - python_parser_test_ast(&["await a"]); - } - - #[test] - fn test_attribute_ref() { - python_parser_test_ast(&["a.b", "a.b.c", "a.b_c", "a.b.c.d"]); - } - #[test] - fn test_subscript() { - python_parser_test_ast(&["a[1]", "a.b[1]"]); - } - - #[test] - fn parse_call() { - python_parser_test_ast(&[ - "a()", - "a(b)", - "a(b, c)", - "func(b=c)", - "func(a, b=c, d=e)", - "func(a, b=c, d=e, *f)", - "func(a, b=c, d=e, *f, **g)", - "func(a,)", - ]); - } - - #[test] - fn test_lambda() { - python_parser_test_ast(&[ - "lambda: a", - "lambda a: a", - "lambda a, b: a", - "lambda a, b, c: a", - "lambda a, *b: a", - "lambda a, *b, c: a", - "lambda a, *b, c, **d: a", - "lambda a=1 : a", - "lambda a=1 : a,", - ]); - } - - #[test] - fn test_conditional_expression() { - python_parser_test_ast(&["a if b else c if d else e"]); - } - - #[test] - fn test_string_literal_concatenation() { - python_parser_test_ast(&[ - "'a' 'b'", - // TODO ast_python: Python evaluates this as "ab". - // "b'a' b'b'", - "'a' 'b'", - // TODO ast_python: Enderpy evaluates this as 'r"a"b'. This seems wrong. - // "r'a' 'b'", - // TODO ast_python: Enderpy doesn't handle newlines within a nested context. - // "('a' - // 'b')", - // "('a' - // 'b', 'c')", - // "('a' - // 'b' - // 'c')", - // TODO ast_python: Python evaluates this as "ac". Enderpy creates 2 constants. - // "f'a' 'c'", - // TODO ast_python: Python evaluates this as "abc". Enderpy creates 3 constants. - // "f'a' 'b' 'c'", - // TODO ast_python: Python evaluates this as "dab". Enderpy creates 3 constants. - // "'d' f'a' 'b'", - "f'a_{1}' 'b' ", - ]); - } - - #[test] - fn test_fstring() { - python_parser_test_ast(&[ - "f'a'", - "f'hello_{a}'", - "f'hello_{a} {b}'", - "f'hello_{a} {b} {c}'", - // unsupported - // "f'hello_{f'''{a}'''}'", - ]); - } - - #[test] - fn test_comparison() { - python_parser_test_ast(&[ - "a == b", - "a != b", - "a > b", - "a < b", - "a >= b", - "a <= b", - "a is b", - "a is not b", - "a in b", - "a not in b", - "a < b < c", - ]); - } - - #[test] - fn test_while_statement() { - python_parser_test_ast(&[ - "while a: pass", - "while a: - pass", - "while a: - a = 1 -else: - b = 1 -", - ]); - } - - #[test] - fn test_try_statement() { - python_parser_test_ast(&[ - "try: - pass -except: - pass", - "try: - pass -except Exception: - pass", - "try: - pass -except Exception as e: - pass", - "try: - pass -except Exception as e: - pass -else: - pass", - "try: - pass -except Exception as e: - pass -else: - pass -finally: - pass", - "try: - pass -except *Exception as e: - pass -", - ]); - } - - #[test] - fn test_ellipsis_statement() { - python_parser_test_ast(&[ - "def a(): ...", - "def a(): - ...", - "a = ...", - "... + 1", - ]); - } - - macro_rules! 
parser_test { - ($test_name:ident, $test_file:expr) => { - #[test] - fn $test_name() { - let test_case = std::fs::read_to_string($test_file).unwrap(); - python_parser_test_ast(&[test_case.as_str()]); - } - }; - } - - // parser_test!(test_functions, "test_data/inputs/functions.py"); - // parser_test!(test_if, "test_data/inputs/if.py"); - // parser_test!(test_indentation, "test_data/inputs/indentation.py"); - // parser_test!( - // test_separate_statements, - // "test_data/inputs/separate_statements.py" - // ); - // parser_test!(test_try, "test_data/inputs/try.py"); - // parser_test!( - // annotated_assignment, - // "test_data/inputs/annotated_assignment.py" - // ); - // parser_test!(binary_op, "test_data/inputs/binary_op.py"); - // parser_test!(class, "test_data/inputs/class.py"); - // parser_test!(dict, "test_data/inputs/dict.py"); - // parser_test!(test_for, "test_data/inputs/for.py"); - // parser_test!(from_import, "test_data/inputs/from_import.py"); - // parser_test!(function_def, "test_data/inputs/function_def.py"); - // parser_test!( - // generator_expressions, - // "test_data/inputs/generator_expressions.py" - // ); - // parser_test!(lists, "test_data/inputs/lists.py"); - // parser_test!(test_match, "test_data/inputs/match.py"); - // parser_test!(sets, "test_data/inputs/sets.py"); - // parser_test!(string, "test_data/inputs/string.py"); - // parser_test!(subscript, "test_data/inputs/subscript.py"); - // parser_test!(with, "test_data/inputs/with.py"); - // parser_test!(newlines, "test_data/inputs/newlines.py"); - // parser_test!(comments, "test_data/inputs/comments.py"); - // parser_test!(types_alias, "test_data/inputs/type_alias.py"); - - fn assert_ast_eq(python_ast: &Value, enderpy_ast: &Value, source: &str) { - let include_source = std::env::var("INCLUDE_SOURCE").is_ok(); - let side_by_side = std::env::var("SIDE_BY_SIDE").is_ok(); - - let formatted_source = if include_source { - format!("\nSource:\n{}\n", source) - } else { - "".to_string() - }; - if !side_by_side { - pretty_assertions::assert_eq!( - &python_ast, - &enderpy_ast, - "Enderpy AST does not match Python AST.\n{}\x1b[31mPython AST\x1b[0m / \x1b[32mEnderpy AST\x1b[0m", - formatted_source, - ); - } else if let Err(message) = assert_json_matches_no_panic( - &python_ast, - &enderpy_ast, - assert_json_diff::Config::new(assert_json_diff::CompareMode::Strict), - ) { - let mut table_builder = Builder::default(); - table_builder.push_record(["Python AST", "Enderpy AST"]); - table_builder.push_record([ - serde_json::to_string_pretty(&python_ast).unwrap(), - serde_json::to_string_pretty(&enderpy_ast).unwrap(), - ]); - let mut table = table_builder.build(); - table.with(Style::modern()); - // If run in a terminal, don't expand table beyond terminal width. 
- if let Some((TerminalWidth(width), _)) = terminal_size() { - table - .with( - Width::wrap(width as usize) - .keep_words() - .priority::(), - ) - .with(Width::increase(width as usize)); - } - panic!( - "Enderpy AST does not match Python AST.\n{}{}\n{}", - formatted_source, table, message - ); - } - } -} diff --git a/typechecker/test_data/output/enderpy_python_type_checker__checker__tests__annotations_coroutine.snap b/typechecker/test_data/output/enderpy_python_type_checker__checker__tests__annotations_coroutine.snap index cc5d75e0..bc2fdc45 100644 --- a/typechecker/test_data/output/enderpy_python_type_checker__checker__tests__annotations_coroutine.snap +++ b/typechecker/test_data/output/enderpy_python_type_checker__checker__tests__annotations_coroutine.snap @@ -4,8 +4,6 @@ description: "1: \"\"\"\n2: Tests for annotating coroutines.\n3: \"\"\"\n4: \n5: expression: result --- Line 1: """ -Tests for annotating coroutines. -""" Expr types in the line --->: """ @@ -13,7 +11,7 @@ Tests for annotating coroutines. """ => (class) str --- -Line 10: from typing import Any, Callable, Coroutine, assert_type +Line 12: from typing import Any, Callable, Coroutine, assert_type Expr types in the line --->: typing => Module @@ -23,19 +21,19 @@ Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] --- -Line 13: async def func1(ignored: int, /) -> str: +Line 15: async def func1(ignored: int, /) -> str: Expr types in the line --->: func => (function) Callable[[(class) int], Coroutine[Any, Any, (class) str]] --- -Line 14: return "spam" +Line 16: return "spam" Expr types in the line --->: "spam" => (class) str --- -Line 17: assert_type(func1, Callable[[int], Coroutine[Any, Any, str]]) +Line 19: assert_type(func1, Callable[[int], Coroutine[Any, Any, str]]) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -54,13 +52,13 @@ Expr types in the line --->: str => (class) str --- -Line 19: async def func2() -> None: +Line 21: async def func2() -> None: Expr types in the line --->: func => (function) Callable[[], Coroutine[Any, Any, None]] --- -Line 20: x = await func1(42) +Line 22: x = await func1(42) Expr types in the line --->: x => (class) str @@ -70,7 +68,7 @@ Expr types in the line --->: 42 => (class) int --- -Line 21: assert_type(x, str) +Line 23: assert_type(x, str) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] diff --git a/typechecker/test_data/output/enderpy_python_type_checker__checker__tests__basic_generics.snap b/typechecker/test_data/output/enderpy_python_type_checker__checker__tests__basic_generics.snap index 35a298dd..16fe79dd 100644 --- a/typechecker/test_data/output/enderpy_python_type_checker__checker__tests__basic_generics.snap +++ b/typechecker/test_data/output/enderpy_python_type_checker__checker__tests__basic_generics.snap @@ -4,8 +4,6 @@ description: "1: \"\"\"\n2: Tests for basic usage of generics.\n3: \"\"\"\n4: \n expression: result --- Line 1: """ -Tests for basic usage of generics. -""" Expr types in the line --->: """ @@ -13,21 +11,21 @@ Tests for basic usage of generics. 
""" => (class) str --- -Line 5: from __future__ import annotations +Line 7: from __future__ import annotations Expr types in the line --->: __future__ => Module annotations => (class) _Feature --- -Line 7: from collections.abc import Sequence +Line 9: from collections.abc import Sequence Expr types in the line --->: collections.abc => Module Sequence => (class) typing.Sequence[TypeVar[_T_co, ]] --- -Line 8: from typing import Any, Generic, TypeVar, assert_type +Line 10: from typing import Any, Generic, TypeVar, assert_type Expr types in the line --->: typing => Module @@ -37,7 +35,7 @@ Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] --- -Line 10: T = TypeVar("T") +Line 12: T = TypeVar("T") Expr types in the line --->: T => TypeVar[T, ] @@ -46,7 +44,7 @@ Expr types in the line --->: "T" => (class) str --- -Line 16: def first(l: Sequence[T]) -> T: +Line 18: def first(l: Sequence[T]) -> T: Expr types in the line --->: first => (function) Callable[[(class) typing.Sequence[TypeVar[T, ]]], TypeVar[T, ]] @@ -55,7 +53,7 @@ Expr types in the line --->: T => TypeVar[T, ] --- -Line 17: return l[0] +Line 19: return l[0] Expr types in the line --->: l => (class) typing.Sequence[TypeVar[T, ]] @@ -63,7 +61,7 @@ Expr types in the line --->: 0 => (class) int --- -Line 20: def test_first(seq_int: Sequence[int], seq_str: Sequence[str]) -> None: +Line 22: def test_first(seq_int: Sequence[int], seq_str: Sequence[str]) -> None: Expr types in the line --->: test_first => (function) Callable[[(class) typing.Sequence[(class) int], (class) typing.Sequence[(class) str]], None] @@ -74,7 +72,7 @@ Expr types in the line --->: None => None --- -Line 21: assert_type(first(seq_int), int) +Line 23: assert_type(first(seq_int), int) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -85,7 +83,7 @@ Expr types in the line --->: int => (class) int --- -Line 22: assert_type(first(seq_str), str) +Line 24: assert_type(first(seq_str), str) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -96,7 +94,7 @@ Expr types in the line --->: str => (class) str --- -Line 28: AnyStr = TypeVar("AnyStr", str, bytes) +Line 30: AnyStr = TypeVar("AnyStr", str, bytes) Expr types in the line --->: AnyStr => TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]] @@ -107,7 +105,7 @@ Expr types in the line --->: bytes => (class) builtins.bytes[(class) int] --- -Line 31: def concat(x: AnyStr, y: AnyStr) -> AnyStr: +Line 33: def concat(x: AnyStr, y: AnyStr) -> AnyStr: Expr types in the line --->: concat => (function) Callable[[TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]] @@ -118,7 +116,7 @@ Expr types in the line --->: AnyStr => TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]] --- -Line 32: return x + y +Line 34: return x + y Expr types in the line --->: x => TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]] @@ -126,7 +124,7 @@ Expr types in the line --->: y => TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]] --- -Line 35: def test_concat(s: str, b: bytes, a: Any) -> None: +Line 37: def test_concat(s: str, b: bytes, a: Any) -> None: Expr types in the line --->: test_concat => (function) Callable[[(class) str, Unknown, (class) object], None] @@ -139,7 
+137,7 @@ Expr types in the line --->: None => None --- -Line 36: concat(s, s) # OK +Line 38: concat(s, s) # OK Expr types in the line --->: concat => (function) Callable[[TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]] @@ -148,7 +146,7 @@ Expr types in the line --->: s => (class) str --- -Line 37: concat(b, b) # OK +Line 39: concat(b, b) # OK Expr types in the line --->: concat => (function) Callable[[TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]] @@ -157,7 +155,7 @@ Expr types in the line --->: b => Unknown --- -Line 38: concat(s, b) # E +Line 40: concat(s, b) # E Expr types in the line --->: concat => (function) Callable[[TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]] @@ -166,7 +164,7 @@ Expr types in the line --->: b => Unknown --- -Line 39: concat(b, s) # E +Line 41: concat(b, s) # E Expr types in the line --->: concat => (function) Callable[[TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]] @@ -175,7 +173,7 @@ Expr types in the line --->: s => (class) str --- -Line 41: concat(s, a) # OK +Line 43: concat(s, a) # OK Expr types in the line --->: concat => (function) Callable[[TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]] @@ -184,7 +182,7 @@ Expr types in the line --->: a => (class) object --- -Line 42: concat(a, b) # OK +Line 44: concat(a, b) # OK Expr types in the line --->: concat => (function) Callable[[TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]] @@ -193,7 +191,7 @@ Expr types in the line --->: b => Unknown --- -Line 47: BadConstraint1 = TypeVar("BadConstraint1", str) # E +Line 49: BadConstraint1 = TypeVar("BadConstraint1", str) # E Expr types in the line --->: BadConstraint1 => Unknown @@ -203,7 +201,7 @@ Expr types in the line --->: str => (class) str --- -Line 52: class Test(Generic[T]): +Line 54: class Test(Generic[T]): Expr types in the line --->: Test => (class) basic_generics.Test[TypeVar[T, ]] @@ -212,7 +210,7 @@ Expr types in the line --->: T => TypeVar[T, ] --- -Line 53: BadConstraint2 = TypeVar("BadConstraint2", str, list[T]) # E +Line 55: BadConstraint2 = TypeVar("BadConstraint2", str, list[T]) # E Expr types in the line --->: BadConstraint2 => TypeVar[BadConstraint2, (class) str, (class) builtins.list[TypeVar[_T, ]]] @@ -225,7 +223,7 @@ Expr types in the line --->: T => TypeVar[T, ] --- -Line 61: class MyStr(str): ... +Line 63: class MyStr(str): ... Expr types in the line --->: MyStr => (class) MyStr @@ -233,7 +231,7 @@ Expr types in the line --->: ... 
=> Any --- -Line 64: def test_concat_subtype(s: str, b: bytes, a: Any, m: MyStr) -> None: +Line 66: def test_concat_subtype(s: str, b: bytes, a: Any, m: MyStr) -> None: Expr types in the line --->: test_concat_subtype => (function) Callable[[(class) str, Unknown, (class) object, (class) MyStr], None] @@ -248,7 +246,7 @@ Expr types in the line --->: None => None --- -Line 65: assert_type(concat(m, m), str) +Line 67: assert_type(concat(m, m), str) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -260,7 +258,7 @@ Expr types in the line --->: str => (class) str --- -Line 66: assert_type(concat(m, s), str) +Line 68: assert_type(concat(m, s), str) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -272,7 +270,7 @@ Expr types in the line --->: str => (class) str --- -Line 67: concat(m, b) # E +Line 69: concat(m, b) # E Expr types in the line --->: concat => (function) Callable[[TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]], TypeVar[AnyStr, (class) str, (class) builtins.bytes[(class) int]]] @@ -281,21 +279,21 @@ Expr types in the line --->: b => Unknown --- -Line 79: from logging import Logger +Line 81: from logging import Logger Expr types in the line --->: logging => Module Logger => (class) Logger --- -Line 80: from collections.abc import Iterable +Line 82: from collections.abc import Iterable Expr types in the line --->: collections.abc => Module Iterable => (class) typing.Iterable[TypeVar[_T_co, ]] --- -Line 83: class LoggedVar(Generic[T]): +Line 85: class LoggedVar(Generic[T]): Expr types in the line --->: LoggedVar => (class) basic_generics.LoggedVar[TypeVar[T, ]] @@ -304,7 +302,7 @@ Expr types in the line --->: T => TypeVar[T, ] --- -Line 84: def __init__(self, value: T, name: str, logger: Logger) -> None: +Line 86: def __init__(self, value: T, name: str, logger: Logger) -> None: Expr types in the line --->: __init__ => (function) Callable[[Unknown, TypeVar[T, ], (class) str, (class) Logger], None] @@ -318,7 +316,7 @@ Expr types in the line --->: None => None --- -Line 85: self.name = name +Line 87: self.name = name Expr types in the line --->: self => (class) basic_generics.LoggedVar[TypeVar[T, ]] @@ -326,7 +324,7 @@ Expr types in the line --->: name => (class) str --- -Line 86: self.logger = logger +Line 88: self.logger = logger Expr types in the line --->: self => (class) basic_generics.LoggedVar[TypeVar[T, ]] @@ -334,7 +332,7 @@ Expr types in the line --->: logger => (class) Logger --- -Line 87: self.value = value +Line 89: self.value = value Expr types in the line --->: self => (class) basic_generics.LoggedVar[TypeVar[T, ]] @@ -342,7 +340,7 @@ Expr types in the line --->: value => TypeVar[T, ] --- -Line 89: def set(self, new: T) -> None: +Line 91: def set(self, new: T) -> None: Expr types in the line --->: set => (function) Callable[[Unknown, TypeVar[T, ]], None] @@ -352,7 +350,7 @@ Expr types in the line --->: None => None --- -Line 90: self.log("Set " + repr(self.value)) +Line 92: self.log("Set " + repr(self.value)) Expr types in the line --->: self.log => (function) Callable[[Unknown, (class) str], None] @@ -362,7 +360,7 @@ Expr types in the line --->: repr(self.value) => (class) str --- -Line 91: self.value = new +Line 93: self.value = new Expr types in the line --->: self => (class) basic_generics.LoggedVar[TypeVar[T, ]] @@ -370,7 +368,7 @@ Expr types in the line 
--->: new => TypeVar[T, ] --- -Line 93: def get(self) -> T: +Line 95: def get(self) -> T: Expr types in the line --->: get => (function) Callable[[Unknown], TypeVar[T, ]] @@ -378,7 +376,7 @@ Expr types in the line --->: T => TypeVar[T, ] --- -Line 94: self.log("Get " + repr(self.value)) +Line 96: self.log("Get " + repr(self.value)) Expr types in the line --->: self.log => (function) Callable[[Unknown, (class) str], None] @@ -388,14 +386,14 @@ Expr types in the line --->: repr(self.value) => (class) str --- -Line 95: return self.value +Line 97: return self.value Expr types in the line --->: self => (class) basic_generics.LoggedVar[TypeVar[T, ]] self.value => TypeVar[T, ] --- -Line 97: def log(self, message: str) -> None: +Line 99: def log(self, message: str) -> None: Expr types in the line --->: log => (function) Callable[[Unknown, (class) str], None] @@ -405,7 +403,7 @@ Expr types in the line --->: None => None --- -Line 98: self.logger.info("{}: {}".format(self.name, message)) +Line 100: self.logger.info("{}: {}".format(self.name, message)) Expr types in the line --->: self.logger.info => (function) Callable[[Unknown, Unknown, Unknown, (class) TypeAlias, (class) bool, (class) int, Union[(class) typing.Mapping[Unknown], None]], None] @@ -417,7 +415,7 @@ Expr types in the line --->: message => (class) str --- -Line 101: def zero_all_vars(vars: Iterable[LoggedVar[int]]) -> None: +Line 103: def zero_all_vars(vars: Iterable[LoggedVar[int]]) -> None: Expr types in the line --->: zero_all_vars => (function) Callable[[(class) typing.Iterable[(class) basic_generics.LoggedVar[(class) int]]], None] @@ -426,13 +424,13 @@ Expr types in the line --->: None => None --- -Line 102: for var in vars: +Line 104: for var in vars: Expr types in the line --->: vars => (function) Callable[[(class) type], Unknown] --- -Line 103: var.set(0) +Line 105: var.set(0) Expr types in the line --->: var.set => Unknown @@ -440,7 +438,7 @@ Expr types in the line --->: 0 => (class) int --- -Line 104: assert_type(var.get(), int) +Line 106: assert_type(var.get(), int) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -450,7 +448,7 @@ Expr types in the line --->: int => (class) int --- -Line 110: S = TypeVar("S") +Line 112: S = TypeVar("S") Expr types in the line --->: S => TypeVar[S, ] @@ -459,7 +457,7 @@ Expr types in the line --->: "S" => (class) str --- -Line 113: class Pair1(Generic[T, S]): ... +Line 115: class Pair1(Generic[T, S]): ... Expr types in the line --->: Pair1 => (class) basic_generics.Pair1[TypeVar[T, ], TypeVar[S, ]] @@ -471,7 +469,7 @@ Expr types in the line --->: ... => Any --- -Line 119: class Pair2(Generic[T, T]): # E +Line 121: class Pair2(Generic[T, T]): # E Expr types in the line --->: Pair2 => (class) basic_generics.Pair2[Unknown] @@ -482,13 +480,13 @@ Expr types in the line --->: T => TypeVar[T, ] --- -Line 120: ... +Line 122: ... Expr types in the line --->: ... => Any --- -Line 127: from collections.abc import Iterator, Mapping +Line 129: from collections.abc import Iterator, Mapping Expr types in the line --->: collections.abc => Module @@ -496,7 +494,7 @@ Expr types in the line --->: Mapping => (class) typing.Mapping[TypeVar[_KT, ], TypeVar[_KT, ], TypeVar[_VT_co, ]] --- -Line 130: class MyIter1(Iterator[T]): ... +Line 132: class MyIter1(Iterator[T]): ... Expr types in the line --->: MyIter1 => (class) basic_generics.MyIter1[TypeVar[T, ]] @@ -506,7 +504,7 @@ Expr types in the line --->: ... 
=> Any --- -Line 133: class MyIter2(Iterator[T], Generic[T]): ... +Line 135: class MyIter2(Iterator[T], Generic[T]): ... Expr types in the line --->: MyIter2 => (class) basic_generics.MyIter2[TypeVar[T, ]] @@ -519,7 +517,7 @@ Expr types in the line --->: ... => Any --- -Line 136: def test_my_iter(m1: MyIter1[int], m2: MyIter2[int]): +Line 138: def test_my_iter(m1: MyIter1[int], m2: MyIter2[int]): Expr types in the line --->: test_my_iter => (function) Callable[[(class) basic_generics.MyIter1[(class) int], (class) basic_generics.MyIter2[(class) int]], Unknown] @@ -529,7 +527,7 @@ Expr types in the line --->: MyIter2[int] => (class) basic_generics.MyIter2[TypeVar[T, ]] --- -Line 137: assert_type(next(m1), int) +Line 139: assert_type(next(m1), int) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -540,7 +538,7 @@ Expr types in the line --->: int => (class) int --- -Line 138: assert_type(next(m2), int) +Line 140: assert_type(next(m2), int) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -551,7 +549,7 @@ Expr types in the line --->: int => (class) int --- -Line 141: K = TypeVar("K") +Line 143: K = TypeVar("K") Expr types in the line --->: K => TypeVar[K, ] @@ -560,7 +558,7 @@ Expr types in the line --->: "K" => (class) str --- -Line 142: V = TypeVar("V") +Line 144: V = TypeVar("V") Expr types in the line --->: V => TypeVar[V, ] @@ -569,7 +567,7 @@ Expr types in the line --->: "V" => (class) str --- -Line 145: class MyMap1(Mapping[K, V], Generic[K, V]): ... +Line 147: class MyMap1(Mapping[K, V], Generic[K, V]): ... Expr types in the line --->: MyMap1 => (class) basic_generics.MyMap1[TypeVar[K, ], TypeVar[V, ], TypeVar[K, ], TypeVar[V, ]] @@ -586,7 +584,7 @@ Expr types in the line --->: ... => Any --- -Line 148: class MyMap2(Mapping[K, V], Generic[V, K]): ... +Line 150: class MyMap2(Mapping[K, V], Generic[V, K]): ... Expr types in the line --->: MyMap2 => (class) basic_generics.MyMap2[TypeVar[K, ], TypeVar[V, ], TypeVar[V, ], TypeVar[K, ]] @@ -603,7 +601,7 @@ Expr types in the line --->: ... 
=> Any --- -Line 151: def test_my_map(m1: MyMap1[str, int], m2: MyMap2[int, str]): +Line 153: def test_my_map(m1: MyMap1[str, int], m2: MyMap2[int, str]): Expr types in the line --->: test_my_map => (function) Callable[[(class) basic_generics.MyMap1[Unknown], (class) basic_generics.MyMap2[Unknown]], Unknown] @@ -613,7 +611,7 @@ Expr types in the line --->: MyMap2[int, str] => (class) basic_generics.MyMap2[TypeVar[K, ], TypeVar[V, ], TypeVar[V, ], TypeVar[K, ]] --- -Line 152: assert_type(m1["key"], int) +Line 154: assert_type(m1["key"], int) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -624,7 +622,7 @@ Expr types in the line --->: int => (class) int --- -Line 153: assert_type(m2["key"], int) +Line 155: assert_type(m2["key"], int) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -635,7 +633,7 @@ Expr types in the line --->: int => (class) int --- -Line 155: m1[0] # E +Line 157: m1[0] # E Expr types in the line --->: m1 => (class) basic_generics.MyMap1[Unknown] @@ -643,7 +641,7 @@ Expr types in the line --->: 0 => (class) int --- -Line 156: m2[0] # E +Line 158: m2[0] # E Expr types in the line --->: m2 => (class) basic_generics.MyMap2[Unknown] @@ -651,7 +649,7 @@ Expr types in the line --->: 0 => (class) int --- -Line 161: from collections.abc import Sized, Container +Line 163: from collections.abc import Sized, Container Expr types in the line --->: collections.abc => Module @@ -659,7 +657,7 @@ Expr types in the line --->: Container => (class) typing.Container[TypeVar[_T_co, ]] --- -Line 164: class LinkedList(Sized, Generic[T]): ... +Line 166: class LinkedList(Sized, Generic[T]): ... Expr types in the line --->: LinkedList => (class) basic_generics.LinkedList[TypeVar[T, ]] @@ -670,7 +668,7 @@ Expr types in the line --->: ... => Any --- -Line 167: class MyMapping(Iterable[tuple[K, V]], Container[tuple[K, V]], Generic[K, V]): ... +Line 169: class MyMapping(Iterable[tuple[K, V]], Container[tuple[K, V]], Generic[K, V]): ... Expr types in the line --->: MyMapping => (class) basic_generics.MyMapping[TypeVar[K, ], TypeVar[V, ]] @@ -696,20 +694,20 @@ Expr types in the line --->: ... => Any --- -Line 175: class MyIterableAny(Iterable): # Same as Iterable[Any] +Line 177: class MyIterableAny(Iterable): # Same as Iterable[Any] Expr types in the line --->: MyIterableAny => (class) basic_generics.MyIterableAny[TypeVar[_T_co, ]] Iterable => (class) typing.Iterable[TypeVar[_T_co, ]] --- -Line 176: ... +Line 178: ... Expr types in the line --->: ... => Any --- -Line 179: def test_my_iterable_any(m: MyIterableAny): +Line 181: def test_my_iterable_any(m: MyIterableAny): Expr types in the line --->: test_my_iterable_any => (function) Callable[[(class) basic_generics.MyIterableAny[TypeVar[_T_co, ]]], Unknown] @@ -717,7 +715,7 @@ Expr types in the line --->: MyIterableAny => (class) basic_generics.MyIterableAny[TypeVar[_T_co, ]] --- -Line 180: assert_type(iter(m), Iterator[Any]) +Line 182: assert_type(iter(m), Iterator[Any]) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -730,7 +728,7 @@ Expr types in the line --->: Any => (class) object --- -Line 186: class GenericMeta(type, Generic[T]): ... +Line 188: class GenericMeta(type, Generic[T]): ... Expr types in the line --->: GenericMeta => (class) basic_generics.GenericMeta[TypeVar[T, ]] @@ -741,7 +739,7 @@ Expr types in the line --->: ... 
=> Any --- -Line 189: class GenericMetaInstance(metaclass=GenericMeta[T]): # E +Line 191: class GenericMetaInstance(metaclass=GenericMeta[T]): # E Expr types in the line --->: GenericMetaInstance => (class) basic_generics.GenericMetaInstance[TypeVar[T, ]] @@ -750,7 +748,7 @@ Expr types in the line --->: T => TypeVar[T, ] --- -Line 190: ... +Line 192: ... Expr types in the line --->: ... => Any From a5412426434284cb03415eae5393d9d5836d1257 Mon Sep 17 00:00:00 2001 From: Glyphack Date: Thu, 29 Aug 2024 23:19:38 +0200 Subject: [PATCH 2/7] Run compatibility tests on CI Signed-off-by: Shaygan --- .github/workflows/test.yml | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4fdd078d..b26dbc07 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,13 +23,12 @@ jobs: with: rust-version: stable components: rustfmt, clippy - # Required for compatibility tests + # Required for compatibility unit tests - uses: actions/setup-python@v5 with: python-version: '3.12' - name: Build run: cargo build - - name: Run tests env: RUST_BACKTRACE: 1 @@ -38,6 +37,24 @@ jobs: run: make format-check - name: clippy run: make lint + compatibility-test-complete: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + submodules: true + - uses: Swatinem/rust-cache@v2 + - name: setup toolchain + uses: hecrj/setup-rust-action@v2 + with: + rust-version: stable + - uses: actions/setup-python@v5 + with: + python-version: '3.12' + - name: Run Compatibility Tests + run: | + cargo run -p enderpy-compat --bin enderpy-compat + # coverage: # name: coverage # runs-on: ubuntu-latest From 08c305c495ecc811572bf33e5c6a42a5bbfb9edd Mon Sep 17 00:00:00 2001 From: Glyphack Date: Thu, 29 Aug 2024 23:25:58 +0200 Subject: [PATCH 3/7] Update snapshots --- ...er__checker__tests__specialtypes_none.snap | 38 +++++++++---------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/typechecker/test_data/output/enderpy_python_type_checker__checker__tests__specialtypes_none.snap b/typechecker/test_data/output/enderpy_python_type_checker__checker__tests__specialtypes_none.snap index b012f1c9..f9ea9259 100644 --- a/typechecker/test_data/output/enderpy_python_type_checker__checker__tests__specialtypes_none.snap +++ b/typechecker/test_data/output/enderpy_python_type_checker__checker__tests__specialtypes_none.snap @@ -4,8 +4,6 @@ description: "1: \"\"\"\n2: Tests the handling of builtins.None in a type annota expression: result --- Line 1: """ -Tests the handling of builtins.None in a type annotation. -""" Expr types in the line --->: """ @@ -13,14 +11,14 @@ Tests the handling of builtins.None in a type annotation. 
""" => (class) str --- -Line 5: from types import NoneType +Line 7: from types import NoneType Expr types in the line --->: types => Module NoneType => (class) NoneType --- -Line 6: from typing import Hashable, Iterable, assert_type +Line 8: from typing import Hashable, Iterable, assert_type Expr types in the line --->: typing => Module @@ -29,7 +27,7 @@ Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] --- -Line 12: def func1(val1: None) -> None: +Line 14: def func1(val1: None) -> None: Expr types in the line --->: func1 => (function) Callable[[None], None] @@ -38,7 +36,7 @@ Expr types in the line --->: None => None --- -Line 13: assert_type(val1, None) +Line 15: assert_type(val1, None) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -47,20 +45,20 @@ Expr types in the line --->: None => None --- -Line 14: t1: None = None +Line 16: t1: None = None Expr types in the line --->: t1 => None None => None --- -Line 15: return None # OK +Line 17: return None # OK Expr types in the line --->: None => None --- -Line 18: func1(None) # OK +Line 20: func1(None) # OK Expr types in the line --->: func1 => (function) Callable[[None], None] @@ -68,7 +66,7 @@ Expr types in the line --->: None => None --- -Line 19: func1(type(None)) # E +Line 21: func1(type(None)) # E Expr types in the line --->: func1 => (function) Callable[[None], None] @@ -78,35 +76,35 @@ Expr types in the line --->: None => None --- -Line 22: none1: Hashable = None # OK +Line 24: none1: Hashable = None # OK Expr types in the line --->: none1 => (class) Hashable None => None --- -Line 25: none2: Iterable = None # E: not iterable +Line 27: none2: Iterable = None # E: not iterable Expr types in the line --->: none2 => (class) typing.Iterable[TypeVar[_T_co, ]] None => None --- -Line 28: None.__class__ # OK +Line 30: None.__class__ # OK Expr types in the line --->: None => None None.__class__ => (function) Callable[[Unknown], (class) builtins.type[(class) Self]] --- -Line 29: None.__doc__ # OK +Line 31: None.__doc__ # OK Expr types in the line --->: None => None None.__doc__ => Union[(class) str, None] --- -Line 30: None.__eq__(0) # OK +Line 32: None.__eq__(0) # OK Expr types in the line --->: None.__eq__ => (function) Callable[[Unknown, (class) object], (class) bool] @@ -114,7 +112,7 @@ Expr types in the line --->: 0 => (class) int --- -Line 33: def func2(val1: type[None]): +Line 35: def func2(val1: type[None]): Expr types in the line --->: func2 => (function) Callable[[(class) builtins.type[None]], Unknown] @@ -122,7 +120,7 @@ Expr types in the line --->: type[None] => (class) type --- -Line 34: assert_type(val1, type[None]) +Line 36: assert_type(val1, type[None]) Expr types in the line --->: assert_type => (function) Callable[[TypeVar[_T, ], (class) object], TypeVar[_T, ]] @@ -133,7 +131,7 @@ Expr types in the line --->: None => None --- -Line 37: func2(None.__class__) # OK +Line 39: func2(None.__class__) # OK Expr types in the line --->: func2 => (function) Callable[[(class) builtins.type[None]], Unknown] @@ -142,7 +140,7 @@ Expr types in the line --->: None.__class__ => (function) Callable[[Unknown], (class) builtins.type[(class) Self]] --- -Line 38: func2(type(None)) # OK +Line 40: func2(type(None)) # OK Expr types in the line --->: func2 => (function) Callable[[(class) builtins.type[None]], Unknown] @@ -152,7 +150,7 @@ Expr types in the line --->: None => None --- -Line 39: func2(None) # E: not compatible 
+Line 41: func2(None) # E: not compatible Expr types in the line --->: func2 => (function) Callable[[(class) builtins.type[None]], Unknown] From 03991b4b86edb6b7935571f814835fa0b6323284 Mon Sep 17 00:00:00 2001 From: Glyphack Date: Sat, 31 Aug 2024 18:47:29 +0200 Subject: [PATCH 4/7] Add exclamation token --- compat/src/lexer_compat.rs | 1 + compat/src/main.rs | 2 +- parser/src/lexer/mod.rs | 317 +++++++++--------- parser/src/parser/parser.rs | 8 +- parser/src/token.rs | 3 + ...ts__snapshot_test_lexer@indentation-2.snap | 2 +- ...ot_test_lexer_and_errors@functions.py.snap | 6 +- ..._snapshot_test_lexer_and_errors@if.py.snap | 4 +- ...apshot_test_lexer_and_errors@match.py.snap | 34 +- ...pshot_test_lexer_and_errors@string.py.snap | 3 +- ...snapshot_test_lexer_and_errors@try.py.snap | 4 +- 11 files changed, 190 insertions(+), 194 deletions(-) diff --git a/compat/src/lexer_compat.rs b/compat/src/lexer_compat.rs index 664c3f13..a2514e80 100644 --- a/compat/src/lexer_compat.rs +++ b/compat/src/lexer_compat.rs @@ -544,6 +544,7 @@ fn matches_python_op_token(python_token_value: &str, token_kind: &Kind) -> bool "}" => token_kind == &Kind::RightBracket, "~" => token_kind == &Kind::BitNot, "..." => token_kind == &Kind::Ellipsis, + "!" => token_kind == &Kind::Exclamation, _ => false, } } diff --git a/compat/src/main.rs b/compat/src/main.rs index 72a6d528..bd735ec8 100644 --- a/compat/src/main.rs +++ b/compat/src/main.rs @@ -68,7 +68,7 @@ fn run_compatibility_test(file: &str) -> Result<()> { let python_tokens = lex_python_source(&source)?; assert_tokens_eq(python_tokens, enderpy_tokens, &lexer); - python_parser_test_ast(&vec![source.as_str()]); + // python_parser_test_ast(&vec![source.as_str()]); Ok(()) } diff --git a/parser/src/lexer/mod.rs b/parser/src/lexer/mod.rs index cf782418..00379c02 100644 --- a/parser/src/lexer/mod.rs +++ b/parser/src/lexer/mod.rs @@ -58,12 +58,13 @@ pub struct Lexer<'a> { next_token_is_dedent: u8, /// Array of all line starts offsets. 
Starts from line 0 pub line_starts: Vec, - peak_mode: bool, + peek_mode: bool, /// Previous token was a Newline token non_logical_line_state: bool, /// Cursor at position after the indentation in line indented: bool, + // tokens: Vec, } impl<'a> Lexer<'a> { @@ -78,7 +79,7 @@ impl<'a> Lexer<'a> { tokenization_mode_stack: vec![], next_token_is_dedent: 0, line_starts: vec![0], - peak_mode: false, + peek_mode: false, non_logical_line_state: true, indented: false, } @@ -108,7 +109,8 @@ impl<'a> Lexer<'a> { }; } - let start = self.current; + // NOTE: for dedent we want to set the token start to the token end + let mut start = self.current; let kind = match self.next_kind() { Ok(kind) => kind, Err(e) => { @@ -136,16 +138,22 @@ impl<'a> Lexer<'a> { let value = self.parse_token_value(kind, start); let end = self.current; - if (kind == Kind::NewLine || kind == Kind::NL) && !self.peak_mode { + if (kind == Kind::NewLine || kind == Kind::NL) && !self.peek_mode { self.line_starts.push(self.current); } - Token { + if kind == Kind::Dedent { + start = end + } + + let token = Token { kind, value, start, end, - } + }; + + token } // peek_token is a side-effect free version of next_token @@ -157,11 +165,11 @@ impl<'a> Lexer<'a> { let next_token_is_dedent = self.next_token_is_dedent; let prev_token_newline = self.non_logical_line_state; let indented = self.indented; - self.peak_mode = true; + self.peek_mode = true; let token = self.next_token(); self.indented = indented; self.non_logical_line_state = prev_token_newline; - self.peak_mode = false; + self.peek_mode = false; self.current = current; self.current_line = current_line; self.nesting = nesting; @@ -170,106 +178,6 @@ impl<'a> Lexer<'a> { token } - // https://peps.python.org/pep-0701/#how-to-produce-these-new-tokens - fn next_fstring_token(&mut self, str_finisher: StringQuotation, _fstring_nesting: u8) -> Kind { - let mut read_chars = false; - loop { - let peeked_char = self.peek(); - let double_peek = self.double_peek(); - if peeked_char == Some('{') && peeked_char == double_peek { - self.next(); - self.next(); - read_chars = true; - continue; - } - if peeked_char == Some('{') && peeked_char != double_peek { - if read_chars { - if !self.peak_mode { - self.tokenization_mode_stack - .push(TokenizationMode::PythonWithinFstring(self.nesting + 1)); - } - return Kind::FStringMiddle; - } else { - if !self.peak_mode { - self.tokenization_mode_stack - .push(TokenizationMode::PythonWithinFstring(self.nesting + 1)); - } - self.next(); - return Kind::LeftBracket; - } - } - - let Some(curr) = self.next() else { - panic!("eof while parsing fstring") - }; - read_chars = true; - - match str_finisher { - StringQuotation::Single => { - if self.peek() == Some('\'') { - return Kind::FStringMiddle; - } - if curr == '\'' { - if !self.peak_mode { - let last = self.tokenization_mode_stack.pop(); - assert!(matches!(last, Some(TokenizationMode::Fstring(_)))) - } - return Kind::FStringEnd; - } - } - StringQuotation::Double => { - if self.peek() == Some('"') { - return Kind::FStringMiddle; - } - if curr == '"' { - if !self.peak_mode { - let last = self.tokenization_mode_stack.pop(); - assert!(matches!(last, Some(TokenizationMode::Fstring(_)))) - } - return Kind::FStringEnd; - } - } - StringQuotation::TripleSingle => { - if self.peek() == Some('\'') - && self.peek() == self.double_peek() - && self.peek() == self.triple_peek() - { - return Kind::FStringMiddle; - } - - if curr == '\'' - && self.peek() == Some(curr) - && self.peek() == self.double_peek() - { - if !self.peak_mode { - 
let last = self.tokenization_mode_stack.pop(); - assert!(matches!(last, Some(TokenizationMode::Fstring(_)))) - } - self.double_next(); - return Kind::FStringEnd; - } - } - StringQuotation::TripleDouble => { - if self.peek() == Some('\"') - && self.peek() == self.double_peek() - && self.peek() == self.triple_peek() - { - return Kind::FStringMiddle; - } - if curr == '"' && self.peek() == Some(curr) && self.peek() == self.double_peek() - { - if !self.peak_mode { - let last = self.tokenization_mode_stack.pop(); - assert!(matches!(last, Some(TokenizationMode::Fstring(_)))) - } - self.double_next(); - return Kind::FStringEnd; - } - } - } - } - } - fn next_kind(&mut self) -> Result { if self.start_of_line && self.nesting == 0 { if let Some(indent_kind) = self.match_indentation()? { @@ -294,7 +202,7 @@ impl<'a> Lexer<'a> { } if read_chars > 0 { return Ok(Kind::FStringMiddle); - } else if !self.peak_mode { + } else if !self.peek_mode { self.tokenization_mode_stack.pop(); } } @@ -413,7 +321,7 @@ impl<'a> Lexer<'a> { if let Some(TokenizationMode::PythonWithinFstring(i)) = self.tokenization_mode_stack.last() { - if self.nesting == *i && !self.peak_mode { + if self.nesting == *i && !self.peek_mode { self.tokenization_mode_stack .push(TokenizationMode::FstringFormatSpecifier); } @@ -426,6 +334,7 @@ impl<'a> Lexer<'a> { self.next(); return Ok(Kind::NotEq); } + return Ok(Kind::Exclamation); } // Delimiters '(' => { @@ -447,7 +356,7 @@ impl<'a> Lexer<'a> { if self.peek() != Some('}') { if let Some(mode) = self.tokenization_mode_stack.last() { if matches!(mode, TokenizationMode::PythonWithinFstring(_)) { - if !self.peak_mode { + if !self.peek_mode { self.tokenization_mode_stack.pop(); } return Ok(Kind::RightBracket); @@ -539,6 +448,106 @@ impl<'a> Lexer<'a> { Ok(Kind::Eof) } + // https://peps.python.org/pep-0701/#how-to-produce-these-new-tokens + fn next_fstring_token(&mut self, str_finisher: StringQuotation, _fstring_nesting: u8) -> Kind { + let mut read_chars = false; + loop { + let peeked_char = self.peek(); + let double_peek = self.double_peek(); + if peeked_char == Some('{') && peeked_char == double_peek { + self.next(); + self.next(); + read_chars = true; + continue; + } + if peeked_char == Some('{') && peeked_char != double_peek { + if read_chars { + if !self.peek_mode { + self.tokenization_mode_stack + .push(TokenizationMode::PythonWithinFstring(self.nesting + 1)); + } + return Kind::FStringMiddle; + } else { + if !self.peek_mode { + self.tokenization_mode_stack + .push(TokenizationMode::PythonWithinFstring(self.nesting + 1)); + } + self.next(); + return Kind::LeftBracket; + } + } + + let Some(curr) = self.next() else { + panic!("eof while parsing fstring") + }; + read_chars = true; + + match str_finisher { + StringQuotation::Single => { + if self.peek() == Some('\'') { + return Kind::FStringMiddle; + } + if curr == '\'' { + if !self.peek_mode { + let last = self.tokenization_mode_stack.pop(); + assert!(matches!(last, Some(TokenizationMode::Fstring(_)))) + } + return Kind::FStringEnd; + } + } + StringQuotation::Double => { + if self.peek() == Some('"') { + return Kind::FStringMiddle; + } + if curr == '"' { + if !self.peek_mode { + let last = self.tokenization_mode_stack.pop(); + assert!(matches!(last, Some(TokenizationMode::Fstring(_)))) + } + return Kind::FStringEnd; + } + } + StringQuotation::TripleSingle => { + if self.peek() == Some('\'') + && self.peek() == self.double_peek() + && self.peek() == self.triple_peek() + { + return Kind::FStringMiddle; + } + + if curr == '\'' + && self.peek() == 
Some(curr) + && self.peek() == self.double_peek() + { + if !self.peek_mode { + let last = self.tokenization_mode_stack.pop(); + assert!(matches!(last, Some(TokenizationMode::Fstring(_)))) + } + self.double_next(); + return Kind::FStringEnd; + } + } + StringQuotation::TripleDouble => { + if self.peek() == Some('\"') + && self.peek() == self.double_peek() + && self.peek() == self.triple_peek() + { + return Kind::FStringMiddle; + } + if curr == '"' && self.peek() == Some(curr) && self.peek() == self.double_peek() + { + if !self.peek_mode { + let last = self.tokenization_mode_stack.pop(); + assert!(matches!(last, Some(TokenizationMode::Fstring(_)))) + } + self.double_next(); + return Kind::FStringEnd; + } + } + } + } + } + fn match_id_keyword(&mut self, id_start: char) -> Result { if let Some(str_kind) = self.match_str(id_start)? { return Ok(str_kind); @@ -587,7 +596,7 @@ impl<'a> Lexer<'a> { Some(str_start @ '"') | Some(str_start @ '\'') => { self.double_next(); let fstring_start = self.f_string_quote_count(str_start); - if !self.peak_mode { + if !self.peek_mode { self.tokenization_mode_stack .push(TokenizationMode::Fstring(( self.nesting, @@ -626,7 +635,7 @@ impl<'a> Lexer<'a> { Some(str_start @ '"') | Some(str_start @ '\'') => { self.double_next(); let fstring_start = self.f_string_quote_count(str_start); - if !self.peak_mode { + if !self.peek_mode { self.tokenization_mode_stack .push(TokenizationMode::Fstring(( self.nesting, @@ -640,7 +649,7 @@ impl<'a> Lexer<'a> { Some(str_start @ '"') | Some(str_start @ '\'') => { self.next(); let fstring_start = self.f_string_quote_count(str_start); - if !self.peak_mode { + if !self.peek_mode { self.tokenization_mode_stack .push(TokenizationMode::Fstring(( self.nesting, @@ -742,7 +751,9 @@ impl<'a> Lexer<'a> { self.next(); while let Some(c) = self.next() { if c == '\n' { - self.line_starts.push(self.current); + if !self.peek_mode { + self.line_starts.push(self.current); + } } if c == str_start && self.peek() == Some(str_start) @@ -758,7 +769,9 @@ impl<'a> Lexer<'a> { } else { while let Some(c) = self.next() { if c == '\n' { - self.line_starts.push(self.current); + if !self.peek_mode { + self.line_starts.push(self.current); + } } if c == str_start && last_read_char != '\\' { string_terminated = true; @@ -987,12 +1000,35 @@ impl<'a> Lexer<'a> { if !indentation_matches_outer_level { return Err(LexError::UnindentDoesNotMatchAnyOuterIndentationLevel); } + if !self.peek_mode { + let mut de_indents = 0; + while let Some(top) = self.indent_stack.last() { + match top.cmp(&spaces_count) { + Ordering::Greater => { + self.indent_stack.pop(); + de_indents += 1; + } + Ordering::Equal => break, + // We only see a Kind::Dedent when the indentation level is less than the + // top of the stack. 
So this should never happen and if it happens it's a + // bug in code not an error for the user + Ordering::Less => { + unreachable!() + } + } + } + if de_indents > 1 { + // minus 1 because the dedent with actual Indent value is already added + // This is super hacky and I don't like it + self.next_token_is_dedent += de_indents - 1; + } + } Ok(Some(Kind::Dedent)) } // Returning whitespace to ignore these spaces Ordering::Equal => Ok(Some(Kind::WhiteSpace)), Ordering::Greater => { - if !self.peak_mode { + if !self.peek_mode { self.indent_stack.push(spaces_count); } Ok(Some(Kind::Indent)) @@ -1005,7 +1041,6 @@ impl<'a> Lexer<'a> { fn parse_token_value(&mut self, kind: Kind, start: u32) -> TokenValue { let kind_value = &self.source[start as usize..self.current as usize]; - use std::cmp::Ordering; match kind { Kind::Integer | Kind::Hexadecimal @@ -1030,51 +1065,7 @@ impl<'a> Lexer<'a> { | Kind::Bytes | Kind::Unicode | Kind::Comment => TokenValue::Str(kind_value.to_string()), - Kind::Dedent => { - let mut spaces_count = 0; - for c in kind_value.chars() { - match c { - '\t' => { - spaces_count += 4; - } - ' ' => { - spaces_count += 1; - } - _ => { - break; - } - } - } - let mut de_indents = 0; - // TODO: This is not correct. But since we don't use the value inside parser it's - // it's okay to do. - // The reason for doing this is that we don't want to modify the indent_stack in - // the peak mode which alters lexer state. - if self.peak_mode { - return TokenValue::Indent(0); - } - while let Some(top) = self.indent_stack.last() { - match top.cmp(&spaces_count) { - Ordering::Greater => { - self.indent_stack.pop(); - de_indents += 1; - } - Ordering::Equal => break, - // We only see a Kind::Dedent when the indentation level is less than the - // top of the stack. So this should never happen and if it happens it's a - // bug in code not an error for the user - Ordering::Less => { - unreachable!() - } - } - } - if de_indents != 1 { - // minus 1 because the dedent with actual Indent value is already added - // This is super hacky and I don't like it - self.next_token_is_dedent += de_indents - 1; - } - TokenValue::Indent(de_indents.into()) - } + Kind::Dedent => TokenValue::Indent(1), Kind::Indent => TokenValue::Indent(1), Kind::Error => TokenValue::Str(kind_value.to_string()), _ => TokenValue::None, diff --git a/parser/src/parser/parser.rs b/parser/src/parser/parser.rs index 5bde119a..ca53d61c 100644 --- a/parser/src/parser/parser.rs +++ b/parser/src/parser/parser.rs @@ -3446,11 +3446,11 @@ impl<'a> Parser<'a> { if self.eat(Kind::Assign) { conversion = 114; } - if self.at(Kind::Identifier) { + if self.eat(Kind::Exclamation) { conversion = match self.cur_token.value.take_string().as_str() { - "!s" => 115, - "!r" => 114, - "!a" => 97, + "s" => 115, + "r" => 114, + "a" => 97, _ => panic!("should not happen"), }; self.bump_any(); diff --git a/parser/src/token.rs b/parser/src/token.rs index 264f1f3e..3ad6d550 100644 --- a/parser/src/token.rs +++ b/parser/src/token.rs @@ -113,6 +113,8 @@ pub enum Kind { BitAnd, // & BitOr, // | BitXor, // ^ + /// This operator is not listed in https://docs.python.org/3/reference/lexical_analysis.html#operators + Exclamation, // ! 
/// ~ BitNot, Walrus, // := @@ -364,6 +366,7 @@ impl From for &str { Kind::Indent => "Indent", Kind::Dedent => "Dedent", Kind::Ellipsis => "Ellipsis", + Kind::Exclamation => "!", } } } diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@indentation-2.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@indentation-2.snap index 719f7698..a2431cd1 100644 --- a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@indentation-2.snap +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@indentation-2.snap @@ -14,6 +14,6 @@ description: "if True:\n if True:\n pass\ndef" 22,30: Indent (Indent(1)) 30,34: Pass (None) 34,35: NewLine (None) -35,35: Dedent (Indent(2)) +35,35: Dedent (Indent(1)) 35,35: Dedent (None) 35,38: Def (None) diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@functions.py.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@functions.py.snap index 1d4aaf11..a4d47628 100644 --- a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@functions.py.snap +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@functions.py.snap @@ -56,7 +56,7 @@ input_file: parser/test_data/inputs/functions.py 309,310: . (None) 310,315: Identifier (Str("index")) 315,316: NewLine (None) -316,320: Dedent (Indent(2)) +320,320: Dedent (Indent(1)) 320,320: Dedent (None) 320,326: Return (None) 327,339: Identifier (Str("ticker_index")) @@ -107,7 +107,7 @@ input_file: parser/test_data/inputs/functions.py 554,555: Integer (Number("5")) 555,556: ) (None) 556,557: NewLine (None) -557,561: Dedent (Indent(1)) +561,561: Dedent (Indent(1)) 561,565: Identifier (Str("data")) 566,567: = (None) 568,576: Identifier (Str("response")) @@ -167,7 +167,7 @@ input_file: parser/test_data/inputs/functions.py 752,753: ) (None) 753,754: ) (None) 754,755: NewLine (None) -755,759: Dedent (Indent(1)) +759,759: Dedent (Indent(1)) 759,761: Identifier (Str("df")) 761,762: . 
(None) 762,768: Identifier (Str("rename")) diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@if.py.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@if.py.snap index 05b13bcc..e1cfdbcd 100644 --- a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@if.py.snap +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@if.py.snap @@ -138,7 +138,7 @@ input_file: parser/test_data/inputs/if.py 333,392: StringLiteral (Str("\"adjust is True and adjusted_daily_records_csv_path exists\"")) 392,393: ) (None) 393,394: NewLine (None) -394,398: Dedent (Indent(1)) +398,398: Dedent (Indent(1)) 398,402: Else (None) 402,403: : (None) 403,404: NewLine (None) @@ -148,7 +148,7 @@ input_file: parser/test_data/inputs/if.py 418,434: StringLiteral (Str("\"adjust is True\"")) 434,435: ) (None) 435,436: NewLine (None) -436,436: Dedent (Indent(2)) +436,436: Dedent (Indent(1)) 436,436: Dedent (None) 436,440: Else (None) 440,441: : (None) diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@match.py.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@match.py.snap index a3f68804..f09c966b 100644 --- a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@match.py.snap +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@match.py.snap @@ -16,7 +16,7 @@ input_file: parser/test_data/inputs/match.py 29,33: Pass (None) 33,34: NewLine (None) 34,35: NL (None) -35,35: Dedent (Indent(2)) +35,35: Dedent (Indent(1)) 35,35: Dedent (None) 35,40: Identifier (Match) 41,42: Identifier (Str("a")) @@ -33,7 +33,7 @@ input_file: parser/test_data/inputs/match.py 68,72: Pass (None) 72,73: NewLine (None) 73,74: NL (None) -74,74: Dedent (Indent(2)) +74,74: Dedent (Indent(1)) 74,74: Dedent (None) 74,79: Identifier (Match) 80,81: Identifier (Str("a")) @@ -50,7 +50,7 @@ input_file: parser/test_data/inputs/match.py 105,109: Pass (None) 109,110: NewLine (None) 110,111: NL (None) -111,111: Dedent (Indent(2)) +111,111: Dedent (Indent(1)) 111,111: Dedent (None) 111,116: Identifier (Match) 117,118: Identifier (Str("a")) @@ -64,7 +64,7 @@ input_file: parser/test_data/inputs/match.py 135,143: Indent (Indent(1)) 143,147: Pass (None) 147,148: NewLine (None) -148,152: Dedent (Indent(1)) +152,152: Dedent (Indent(1)) 152,156: Identifier (Str("case")) 157,161: True (None) 161,162: : (None) @@ -72,7 +72,7 @@ input_file: parser/test_data/inputs/match.py 163,171: Indent (Indent(1)) 171,175: Pass (None) 175,176: NewLine (None) -176,180: Dedent (Indent(1)) +180,180: Dedent (Indent(1)) 180,184: Identifier (Str("case")) 185,190: False (None) 190,191: : (None) @@ -80,7 +80,7 @@ input_file: parser/test_data/inputs/match.py 192,200: Indent (Indent(1)) 200,204: Pass (None) 204,205: NewLine (None) -205,209: Dedent (Indent(1)) +209,209: Dedent (Indent(1)) 209,213: Identifier (Str("case")) 214,215: - (None) 215,216: Integer (Number("1")) @@ -89,7 +89,7 @@ input_file: parser/test_data/inputs/match.py 218,226: Indent (Indent(1)) 226,230: Pass (None) 230,231: NewLine (None) -231,235: Dedent (Indent(1)) +235,235: Dedent (Indent(1)) 235,239: Identifier (Str("case")) 240,243: PointFloat (Number("1.0")) 243,244: : (None) @@ -97,7 +97,7 @@ input_file: parser/test_data/inputs/match.py 245,253: Indent (Indent(1)) 253,257: 
Pass (None) 257,258: NewLine (None) -258,262: Dedent (Indent(1)) +262,262: Dedent (Indent(1)) 262,266: Identifier (Str("case")) 267,268: Identifier (Str("_")) 268,269: : (None) @@ -106,7 +106,7 @@ input_file: parser/test_data/inputs/match.py 278,282: Pass (None) 282,283: NewLine (None) 283,284: NL (None) -284,284: Dedent (Indent(2)) +284,284: Dedent (Indent(1)) 284,284: Dedent (None) 284,289: Identifier (Match) 290,291: Identifier (Str("a")) @@ -122,7 +122,7 @@ input_file: parser/test_data/inputs/match.py 307,315: Indent (Indent(1)) 315,319: Pass (None) 319,320: NewLine (None) -320,324: Dedent (Indent(1)) +324,324: Dedent (Indent(1)) 324,328: Identifier (Str("case")) 329,330: Identifier (Str("a")) 330,331: : (None) @@ -131,7 +131,7 @@ input_file: parser/test_data/inputs/match.py 340,344: Pass (None) 344,345: NewLine (None) 345,346: NL (None) -346,346: Dedent (Indent(2)) +346,346: Dedent (Indent(1)) 346,346: Dedent (None) 346,351: Identifier (Match) 352,353: Identifier (Str("a")) @@ -149,7 +149,7 @@ input_file: parser/test_data/inputs/match.py 372,380: Indent (Indent(1)) 380,384: Pass (None) 384,385: NewLine (None) -385,389: Dedent (Indent(1)) +389,389: Dedent (Indent(1)) 389,393: Identifier (Str("case")) 394,395: { (None) 395,396: Integer (Number("1")) @@ -165,7 +165,7 @@ input_file: parser/test_data/inputs/match.py 408,416: Indent (Indent(1)) 416,420: Pass (None) 420,421: NewLine (None) -421,425: Dedent (Indent(1)) +425,425: Dedent (Indent(1)) 425,429: Identifier (Str("case")) 430,431: { (None) 431,433: ** (None) @@ -177,7 +177,7 @@ input_file: parser/test_data/inputs/match.py 448,452: Pass (None) 452,453: NewLine (None) 453,454: NL (None) -454,454: Dedent (Indent(2)) +454,454: Dedent (Indent(1)) 454,454: Dedent (None) 454,459: Identifier (Match) 460,461: Identifier (Str("x")) @@ -196,7 +196,7 @@ input_file: parser/test_data/inputs/match.py 487,495: Indent (Indent(1)) 495,499: Pass (None) 499,500: NewLine (None) -500,504: Dedent (Indent(1)) +504,504: Dedent (Indent(1)) 504,508: Identifier (Str("case")) 509,516: Identifier (Str("Point3D")) 516,517: ( (None) @@ -218,7 +218,7 @@ input_file: parser/test_data/inputs/match.py 541,545: Pass (None) 545,546: NewLine (None) 546,547: NL (None) -547,547: Dedent (Indent(2)) +547,547: Dedent (Indent(1)) 547,547: Dedent (None) 547,552: Identifier (Match) 553,554: Identifier (Str("x")) @@ -238,5 +238,5 @@ input_file: parser/test_data/inputs/match.py 576,584: Indent (Indent(1)) 584,588: Pass (None) 588,589: NewLine (None) -589,589: Dedent (Indent(2)) +589,589: Dedent (Indent(1)) 589,589: Dedent (None) diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@string.py.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@string.py.snap index 7c232b9f..9aeb2be6 100644 --- a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@string.py.snap +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@string.py.snap @@ -33,7 +33,8 @@ input_file: parser/test_data/inputs/string.py 93,101: Identifier (Str("_display")) 101,102: ( (None) 102,103: ) (None) -103,105: Identifier (Str("!r")) +103,104: ! 
(None) +104,105: Identifier (Str("r")) 105,106: } (None) 106,107: FstringMiddle (Str(")")) 107,108: FStringEnd (Str("\"")) diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@try.py.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@try.py.snap index 7d1e97e7..b4df2763 100644 --- a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@try.py.snap +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@try.py.snap @@ -149,7 +149,7 @@ input_file: parser/test_data/inputs/try.py 659,660: Integer (Number("1")) 660,661: ) (None) 661,662: NewLine (None) -662,670: Dedent (Indent(1)) +670,670: Dedent (Indent(1)) 670,674: Else (None) 674,675: : (None) 675,676: NewLine (None) @@ -197,7 +197,7 @@ input_file: parser/test_data/inputs/try.py 911,912: ) (None) 912,913: ] (None) 913,914: NewLine (None) -914,914: Dedent (Indent(3)) +914,914: Dedent (Indent(1)) 914,914: Dedent (None) 914,914: Dedent (None) 914,920: Except (None) From 91820786474b58c11b3f92d3cdc411d45d03b676 Mon Sep 17 00:00:00 2001 From: Glyphack Date: Sun, 1 Sep 2024 13:48:00 +0200 Subject: [PATCH 5/7] Fix lexer incompatibility with python on mypy source --- compat/src/lexer_compat.rs | 63 +++++++--- compat/src/main.rs | 2 +- parser/src/lexer/mod.rs | 110 +++++++++++------- parser/src/lib.rs | 2 +- ...apshot_test_lexer@f-string-literals-7.snap | 2 +- ...apshot_test_lexer@f-string-literals-8.snap | 2 +- ..._snapshot_test_lexer@fstring_escape-0.snap | 7 ++ ..._snapshot_test_lexer@fstring_escape-1.snap | 16 +++ ...hot_test_lexer_and_errors@comments.py.snap | 6 +- ...xer_and_errors@separate_statements.py.snap | 6 +- ...snapshot_test_lexer_and_errors@try.py.snap | 2 +- ...rser__parser__parser__tests__comments.snap | 4 +- ...r__parser__tests__separate_statements.snap | 4 +- 13 files changed, 154 insertions(+), 72 deletions(-) create mode 100644 parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@fstring_escape-0.snap create mode 100644 parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@fstring_escape-1.snap diff --git a/compat/src/lexer_compat.rs b/compat/src/lexer_compat.rs index a2514e80..2b3c516d 100644 --- a/compat/src/lexer_compat.rs +++ b/compat/src/lexer_compat.rs @@ -164,8 +164,23 @@ pub fn assert_tokens_eq( if python_token.is_none() || enderpy_token.is_none() { mismatches.push(TokenMismatch::MissingToken(python_token, enderpy_token)); } else { - let python_token = python_token.unwrap(); - let enderpy_token = enderpy_token.unwrap(); + let mut python_token = python_token.unwrap(); + let mut enderpy_token = enderpy_token.unwrap(); + if python_token.kind == PythonKind::FstringStart { + if enderpy_token.kind == Kind::FStringStart { + // Python tokenizes fstring with more tokens than needed. + // So let's just skip the whole fstring part. For now. + // Skip until the end of the fstring. 
+ while python_token.kind != PythonKind::FstringEnd { + python_index += 1; + python_token = python_tokens[python_index].clone(); + } + while enderpy_token.kind != Kind::FStringEnd { + enderpy_index += 1; + enderpy_token = enderpy_tokens[enderpy_index].clone() + } + } + } if let Some(mismatch) = check_tokens_match(python_token, enderpy_token, lexer) { if is_python_trailing_newline_mismatch( &mismatch, @@ -176,11 +191,14 @@ pub fn assert_tokens_eq( } else if is_python_fstring_mismatch( &mismatch, &enderpy_tokens[enderpy_index + 1..], + &python_tokens[python_index + 1..], + &mut python_index, &mut enderpy_index, // <-- `enderpy_index` may be updated ) { // Nothing, but don't add the mismatch. } else { mismatches.push(mismatch); + break; } } } @@ -589,25 +607,36 @@ fn is_python_trailing_newline_mismatch( fn is_python_fstring_mismatch( mismatch: &TokenMismatch, remaining_tokens: &[Token], + remaining_python_tokens: &[PythonToken], + python_index: &mut usize, enderpy_index: &mut usize, ) -> bool { - if let TokenMismatch::WrongKind(python_token, enderpy_token) = mismatch { - if !matches!( - enderpy_token.kind, - Kind::FStringStart | Kind::RawFStringStart - ) || python_token.kind != PythonKind::String - { - return false; - } - let mut num_skipped = 0; - for token in remaining_tokens { - num_skipped += 1; - if matches!(token.kind, Kind::FStringEnd | Kind::Eof) { - break; + match mismatch { + TokenMismatch::WrongKind(python_token, enderpy_token) => { + if !matches!( + enderpy_token.kind, + Kind::FStringStart | Kind::RawFStringStart + ) || python_token.kind != PythonKind::String + { + return false; + } + let mut num_skipped = 0; + for token in remaining_tokens { + num_skipped += 1; + if matches!(token.kind, Kind::FStringEnd | Kind::Eof) { + break; + } } + *enderpy_index += num_skipped; + return true; } - *enderpy_index += num_skipped; - return true; + // TokenMismatch::WrongValue(python_token, token, python_value, enderpy_value) => { + // if python_value == "{" { + // *python_index += 1; + // return true; + // } + // } + _ => (), } false } diff --git a/compat/src/main.rs b/compat/src/main.rs index bd735ec8..72a6d528 100644 --- a/compat/src/main.rs +++ b/compat/src/main.rs @@ -68,7 +68,7 @@ fn run_compatibility_test(file: &str) -> Result<()> { let python_tokens = lex_python_source(&source)?; assert_tokens_eq(python_tokens, enderpy_tokens, &lexer); - // python_parser_test_ast(&vec![source.as_str()]); + python_parser_test_ast(&vec![source.as_str()]); Ok(()) } diff --git a/parser/src/lexer/mod.rs b/parser/src/lexer/mod.rs index 00379c02..1b88415b 100644 --- a/parser/src/lexer/mod.rs +++ b/parser/src/lexer/mod.rs @@ -2,10 +2,11 @@ use unicode_id_start::{is_id_continue, is_id_start}; use crate::{ error::LexError, + get_row_col_position, token::{Kind, Token, TokenValue}, }; -#[derive(Debug, Clone, Copy)] +#[derive(Debug, Clone, Copy, PartialEq)] enum StringQuotation { Single, Double, @@ -46,6 +47,9 @@ pub struct Lexer<'a> { /// The current position in the source code current: u32, current_line: u16, + /// Array of all line starts offsets. Starts from line 0 + pub line_starts: Vec, + /// Keeps track of whether the lexer is at the start of a line start_of_line: bool, /// keeps track of the indentation level @@ -56,15 +60,10 @@ pub struct Lexer<'a> { tokenization_mode_stack: Vec, // When not zero lexer is in de indent mode next_token_is_dedent: u8, - /// Array of all line starts offsets. 
Starts from line 0 - pub line_starts: Vec, peek_mode: bool, /// Previous token was a Newline token non_logical_line_state: bool, - /// Cursor at position after the indentation in line - indented: bool, - // tokens: Vec, } impl<'a> Lexer<'a> { @@ -81,7 +80,6 @@ impl<'a> Lexer<'a> { line_starts: vec![0], peek_mode: false, non_logical_line_state: true, - indented: false, } } @@ -111,9 +109,14 @@ impl<'a> Lexer<'a> { // NOTE: for dedent we want to set the token start to the token end let mut start = self.current; + let kind = match self.next_kind() { Ok(kind) => kind, Err(e) => { + println!( + "position {:?}", + get_row_col_position(start, self.current, &self.line_starts) + ); panic!("Invalid token {e}"); } }; @@ -136,11 +139,10 @@ impl<'a> Lexer<'a> { self.non_logical_line_state = kind == Kind::NewLine; } let value = self.parse_token_value(kind, start); - let end = self.current; - - if (kind == Kind::NewLine || kind == Kind::NL) && !self.peek_mode { - self.line_starts.push(self.current); - } + let end = match kind { + Kind::FStringMiddle => start + value.as_str().expect("").len() as u32, + _ => self.current, + }; if kind == Kind::Dedent { start = end @@ -164,10 +166,8 @@ impl<'a> Lexer<'a> { let start_of_line = self.start_of_line; let next_token_is_dedent = self.next_token_is_dedent; let prev_token_newline = self.non_logical_line_state; - let indented = self.indented; self.peek_mode = true; let token = self.next_token(); - self.indented = indented; self.non_logical_line_state = prev_token_newline; self.peek_mode = false; self.current = current; @@ -353,14 +353,12 @@ impl<'a> Lexer<'a> { return Ok(Kind::LeftBracket); } '}' => { - if self.peek() != Some('}') { - if let Some(mode) = self.tokenization_mode_stack.last() { - if matches!(mode, TokenizationMode::PythonWithinFstring(_)) { - if !self.peek_mode { - self.tokenization_mode_stack.pop(); - } - return Ok(Kind::RightBracket); + if let Some(mode) = self.tokenization_mode_stack.last() { + if matches!(mode, TokenizationMode::PythonWithinFstring(_)) { + if !self.peek_mode { + self.tokenization_mode_stack.pop(); } + return Ok(Kind::RightBracket); } } return Ok(Kind::RightBracket); @@ -451,6 +449,7 @@ impl<'a> Lexer<'a> { // https://peps.python.org/pep-0701/#how-to-produce-these-new-tokens fn next_fstring_token(&mut self, str_finisher: StringQuotation, _fstring_nesting: u8) -> Kind { let mut read_chars = false; + let mut last_read_char: Option = None; loop { let peeked_char = self.peek(); let double_peek = self.double_peek(); @@ -476,6 +475,20 @@ impl<'a> Lexer<'a> { return Kind::LeftBracket; } } + if last_read_char != Some('\\') + && self.peek() == Some('\'') + && read_chars + && str_finisher == StringQuotation::Single + { + return Kind::FStringMiddle; + } + if last_read_char != Some('\\') + && self.peek() == Some('"') + && read_chars + && str_finisher == StringQuotation::Double + { + return Kind::FStringMiddle; + } let Some(curr) = self.next() else { panic!("eof while parsing fstring") @@ -484,10 +497,10 @@ impl<'a> Lexer<'a> { match str_finisher { StringQuotation::Single => { - if self.peek() == Some('\'') { + if curr != '\\' && self.peek() == Some('\'') { return Kind::FStringMiddle; } - if curr == '\'' { + if last_read_char != Some('\\') && curr == '\'' { if !self.peek_mode { let last = self.tokenization_mode_stack.pop(); assert!(matches!(last, Some(TokenizationMode::Fstring(_)))) @@ -545,6 +558,7 @@ impl<'a> Lexer<'a> { } } } + last_read_char = Some(curr); } } @@ -677,6 +691,9 @@ impl<'a> Lexer<'a> { let c = self.peek(); if let Some(c) = c { 
self.current += c.len_utf8() as u32; + if c == '\n' && !self.peek_mode { + self.line_starts.push(self.current); + } } c } @@ -740,21 +757,13 @@ impl<'a> Lexer<'a> { } fn consume_str(&mut self, str_start: char) -> Result<(), LexError> { - // string start position is current position - 1 because we already consumed the - // quote - let _str_start_pos = self.current - 1; let mut string_terminated = false; let mut last_read_char = str_start; // Check if string starts with triple quotes // if string started with triple quotes, we need to read 3 characters at a time if self.peek() == Some(str_start) && self.double_peek() == Some(str_start) { - self.next(); + self.double_next(); while let Some(c) = self.next() { - if c == '\n' { - if !self.peek_mode { - self.line_starts.push(self.current); - } - } if c == str_start && self.peek() == Some(str_start) && self.double_peek() == Some(str_start) @@ -764,20 +773,24 @@ impl<'a> Lexer<'a> { self.double_next(); break; } - last_read_char = c; + if last_read_char == '\\' && c == '\\' { + last_read_char = ' '; + } else { + last_read_char = c; + } } } else { while let Some(c) = self.next() { - if c == '\n' { - if !self.peek_mode { - self.line_starts.push(self.current); - } - } + // Two consecutive backslashes cancel out if c == str_start && last_read_char != '\\' { string_terminated = true; break; } - last_read_char = c; + if last_read_char == '\\' && c == '\\' { + last_read_char = ' '; + } else { + last_read_char = c; + } } } @@ -984,6 +997,9 @@ impl<'a> Lexer<'a> { return Ok(None); } } + if self.peek() == Some('#') { + return Ok(Some(Kind::WhiteSpace)); + } if let Some(top) = self.indent_stack.last() { match spaces_count.cmp(top) { Ordering::Less => { @@ -1058,13 +1074,17 @@ impl<'a> Lexer<'a> { }, Kind::StringLiteral | Kind::FStringStart - | Kind::FStringMiddle | Kind::FStringEnd | Kind::RawBytes | Kind::RawFStringStart | Kind::Bytes | Kind::Unicode | Kind::Comment => TokenValue::Str(kind_value.to_string()), + Kind::FStringMiddle => { + let value = kind_value.replace("{{", "{"); + let value = value.replace("}}", "}"); + TokenValue::Str(value) + } Kind::Dedent => TokenValue::Indent(1), Kind::Indent => TokenValue::Indent(1), Kind::Error => TokenValue::Str(kind_value.to_string()), @@ -1122,6 +1142,7 @@ mod tests { let mut snapshot = String::from(""); loop { let token = lexer.next_token(); + println!("{:?}", token); if token.kind == Kind::Eof { break; } @@ -1435,6 +1456,15 @@ def", .unwrap(); } + #[test] + fn test_fstring_escape() { + snapshot_test_lexer( + "fstring_escape", + &["f'Can\\'t find plugin'", "f\"{{{', '.join(dict_items)}}}\""], + ) + .unwrap(); + } + #[test] #[should_panic] fn test_unterminated_string_double_quotes() { diff --git a/parser/src/lib.rs b/parser/src/lib.rs index e9e5638a..1dfe4b29 100644 --- a/parser/src/lib.rs +++ b/parser/src/lib.rs @@ -28,7 +28,7 @@ pub fn get_row_col_position(start: u32, end: u32, line_starts: &[u32]) -> (u32, // When end line offset is exactly on line start it means that this is the new line // token end offset. We want to set the new line token line number same for start and // end. 
- Ok(idx) => (idx - 1, line_starts[idx - 1]), + Ok(idx) => (start_line_num, line_starts[idx - 1]), Err(idx) => (idx - 1, line_starts[idx - 1]), }; let end_line_column = end.saturating_sub(end_line_offset); diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@f-string-literals-7.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@f-string-literals-7.snap index 951b3ef9..ccd1da85 100644 --- a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@f-string-literals-7.snap +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@f-string-literals-7.snap @@ -3,5 +3,5 @@ source: parser/src/lexer/mod.rs description: "f\"{{hey}}\"" --- 0,2: FStringStart (Str("f\"")) -2,9: FstringMiddle (Str("{{hey}}")) +2,7: FstringMiddle (Str("{hey}")) 9,10: FStringEnd (Str("\"")) diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@f-string-literals-8.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@f-string-literals-8.snap index 7fb669f6..247f0b2b 100644 --- a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@f-string-literals-8.snap +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@f-string-literals-8.snap @@ -3,5 +3,5 @@ source: parser/src/lexer/mod.rs description: "f\"oh_{{hey}}\"" --- 0,2: FStringStart (Str("f\"")) -2,12: FstringMiddle (Str("oh_{{hey}}")) +2,10: FstringMiddle (Str("oh_{hey}")) 12,13: FStringEnd (Str("\"")) diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@fstring_escape-0.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@fstring_escape-0.snap new file mode 100644 index 00000000..2ba64438 --- /dev/null +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@fstring_escape-0.snap @@ -0,0 +1,7 @@ +--- +source: parser/src/lexer/mod.rs +description: "f'Can\\'t find plugin'" +--- +0,2: FStringStart (Str("f'")) +2,20: FstringMiddle (Str("Can\\'t find plugin")) +20,21: FStringEnd (Str("'")) diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@fstring_escape-1.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@fstring_escape-1.snap new file mode 100644 index 00000000..1c18bd4e --- /dev/null +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@fstring_escape-1.snap @@ -0,0 +1,16 @@ +--- +source: parser/src/lexer/mod.rs +description: "f\"{{{', '.join(dict_items)}}}\"" +--- +0,2: FStringStart (Str("f\"")) +2,3: FstringMiddle (Str("{")) +4,5: { (None) +5,9: StringLiteral (Str("', '")) +9,10: . 
(None) +10,14: Identifier (Str("join")) +14,15: ( (None) +15,25: Identifier (Str("dict_items")) +25,26: ) (None) +26,27: } (None) +27,28: FstringMiddle (Str("}")) +29,30: FStringEnd (Str("\"")) diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@comments.py.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@comments.py.snap index a839acbd..27517a5b 100644 --- a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@comments.py.snap +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@comments.py.snap @@ -27,13 +27,13 @@ input_file: parser/test_data/inputs/comments.py 41,42: ) (None) 42,43: : (None) 43,44: NewLine (None) -44,48: Indent (Indent(1)) 48,51: Comment (Str("# a")) -51,52: NewLine (None) +51,52: NL (None) +52,56: Indent (Indent(1)) 56,59: Ellipsis (None) 60,63: Comment (Str("# a")) 63,64: NewLine (None) 64,65: NL (None) -65,65: Dedent (Indent(1)) 65,94: Comment (Str("# this is a comment only line")) 94,95: NL (None) +95,95: Dedent (Indent(1)) diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@separate_statements.py.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@separate_statements.py.snap index b7ba3d79..abf939ab 100644 --- a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@separate_statements.py.snap +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@separate_statements.py.snap @@ -34,19 +34,19 @@ input_file: parser/test_data/inputs/separate_statements.py 153,154: ) (None) 154,155: : (None) 155,156: NewLine (None) -156,160: Indent (Indent(1)) 160,203: Comment (Str("# TODO(parser): enable after error handling")) -203,204: NewLine (None) +203,204: NL (None) 208,221: Comment (Str("# x = 1 y = 2")) 221,222: NL (None) +222,226: Indent (Indent(1)) 226,232: Return (None) 233,234: Identifier (Str("x")) 235,236: + (None) 237,238: Identifier (Str("y")) 238,239: NewLine (None) 239,240: NL (None) -240,240: Dedent (Indent(1)) 240,283: Comment (Str("# TODO(parser): enable after error handling")) 283,284: NL (None) 284,298: Comment (Str("# a = 1 b = 2")) 298,299: NL (None) +299,299: Dedent (Indent(1)) diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@try.py.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@try.py.snap index b4df2763..457ab2f2 100644 --- a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@try.py.snap +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer_and_errors@try.py.snap @@ -19,9 +19,9 @@ input_file: parser/test_data/inputs/try.py 94,95: ] (None) 95,96: ) (None) 96,97: NewLine (None) -97,97: Dedent (Indent(1)) 97,134: Comment (Str("# when instead of number value is `F`")) 134,135: NL (None) +135,135: Dedent (Indent(1)) 135,141: Except (None) 142,143: ( (None) 143,153: Identifier (Str("ValueError")) diff --git a/parser/test_data/output/enderpy_python_parser__parser__parser__tests__comments.snap b/parser/test_data/output/enderpy_python_parser__parser__parser__tests__comments.snap index 26531029..66bca8e2 100644 --- a/parser/test_data/output/enderpy_python_parser__parser__parser__tests__comments.snap +++ 
b/parser/test_data/output/enderpy_python_parser__parser__parser__tests__comments.snap @@ -5,7 +5,7 @@ description: "test file: test_data/inputs/comments.py\n# a\n# a\n\n# a\n# b\ndef Module { node: Node { start: 0, - end: 65, + end: 95, }, body: [ FunctionDef( @@ -51,7 +51,7 @@ Module { FunctionDef { node: Node { start: 35, - end: 65, + end: 64, }, name: "b", args: Arguments { diff --git a/parser/test_data/output/enderpy_python_parser__parser__parser__tests__separate_statements.snap b/parser/test_data/output/enderpy_python_parser__parser__parser__tests__separate_statements.snap index 9335a230..ed980d63 100644 --- a/parser/test_data/output/enderpy_python_parser__parser__parser__tests__separate_statements.snap +++ b/parser/test_data/output/enderpy_python_parser__parser__parser__tests__separate_statements.snap @@ -5,7 +5,7 @@ description: "test file: test_data/inputs/separate_statements.py\n# Test case to Module { node: Node { start: 0, - end: 240, + end: 299, }, body: [ FunctionDef( @@ -133,7 +133,7 @@ Module { FunctionDef { node: Node { start: 145, - end: 240, + end: 239, }, name: "bar", args: Arguments { From 02571505987187d6ebe3a1581d4789c3a83f746a Mon Sep 17 00:00:00 2001 From: Glyphack Date: Sun, 1 Sep 2024 15:41:17 +0200 Subject: [PATCH 6/7] Fix clippy warnings --- {parser => compat}/ast_python.py | 45 +++++--- {parser => compat}/lex_python.py | 0 compat/src/lexer_compat.rs | 25 ++--- compat/src/main.rs | 2 - compat/src/parser_compat.rs | 100 ++++++++---------- lsp/src/main.rs | 8 +- parser/src/lexer/mod.rs | 2 +- parser/src/parser/compat.rs | 25 +++-- ...snapshot_test_lexer@string-literals-7.snap | 5 + typechecker/src/ast_visitor.rs | 1 + typechecker/src/build.rs | 19 ++-- typechecker/src/file.rs | 12 +-- typechecker/src/semantic_analyzer.rs | 2 +- typechecker/src/types.rs | 3 - 14 files changed, 129 insertions(+), 120 deletions(-) rename {parser => compat}/ast_python.py (76%) rename {parser => compat}/lex_python.py (100%) create mode 100644 parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@string-literals-7.snap diff --git a/parser/ast_python.py b/compat/ast_python.py similarity index 76% rename from parser/ast_python.py rename to compat/ast_python.py index e9720532..b87a3992 100644 --- a/parser/ast_python.py +++ b/compat/ast_python.py @@ -1,17 +1,19 @@ import sys import ast -from _ast import AST # Python internals I guess? +from _ast import AST # Python internals I guess? import argparse import pathlib import codecs import json -arg_parser = argparse.ArgumentParser( - description="Parse a Python program to AST." -) +arg_parser = argparse.ArgumentParser(description="Parse a Python program to AST.") arg_parser.add_argument("--input-file", help="Read and parse input file.") -arg_parser.add_argument("--stdin", action="store_true", help="Read and parse input from stdin.") -arg_parser.add_argument("--type-comments", action="store_true", help="Produce an AST with type comments.") +arg_parser.add_argument( + "--stdin", action="store_true", help="Read and parse input from stdin." +) +arg_parser.add_argument( + "--type-comments", action="store_true", help="Produce an AST with type comments." +) args = arg_parser.parse_args() if args.input_file is not None: @@ -19,7 +21,10 @@ elif args.stdin: source = sys.stdin.read() else: - print("Missing input parameter. Please specify one of --input-file or --stdin.", file=sys.stderr) + print( + "Missing input parameter. 
Please specify one of --input-file or --stdin.", + file=sys.stderr, + ) sys.exit(1) # ----- Begin inline dependency ------------------------------------------------------------------- @@ -53,16 +58,19 @@ BUILTIN_PURE = (int, float, bool) BUILTIN_BYTES = (bytearray, bytes) -BUILTIN_STR = (str) +BUILTIN_STR = str + def decode_str(value): return value + def decode_bytes(value): try: - return value.decode('utf-8') + return value.decode("utf-8") except: - return codecs.getencoder('hex_codec')(value)[0].decode('utf-8') + return codecs.getencoder("hex_codec")(value)[0].decode("utf-8") + def ast2json(node): assert isinstance(node, AST) @@ -72,8 +80,13 @@ def ast2json(node): if attr.startswith("_") or attr == "n" or attr == "s": continue to_return[attr] = get_value(getattr(node, attr)) + to_return.pop("lineno", None) + to_return.pop("end_lineno", None) + to_return.pop("col_offset", None) + to_return.pop("end_col_offset", None) return to_return + def get_value(attr_value): if attr_value is None: return attr_value @@ -92,11 +105,19 @@ def get_value(attr_value): if isinstance(attr_value, type(Ellipsis)): return "..." else: - raise Exception("Unknown case for '%s' of type '%s'" % (attr_value, type(attr_value))) + raise Exception( + "Unknown case for '%s' of type '%s'" % (attr_value, type(attr_value)) + ) + # -------------------------------------------------------------------- End inline dependency ------ -tree = ast.parse(source, filename=args.input_file or "stdin", mode="exec", type_comments=args.type_comments) +tree = ast.parse( + source, + filename=args.input_file or "stdin", + mode="exec", + type_comments=args.type_comments, +) tree_json = ast2json(tree) print(json.dumps(tree_json, indent=4)) diff --git a/parser/lex_python.py b/compat/lex_python.py similarity index 100% rename from parser/lex_python.py rename to compat/lex_python.py diff --git a/compat/src/lexer_compat.rs b/compat/src/lexer_compat.rs index 2b3c516d..b7b26927 100644 --- a/compat/src/lexer_compat.rs +++ b/compat/src/lexer_compat.rs @@ -122,7 +122,7 @@ pub struct PythonToken { pub fn lex_python_source(source: &str) -> Result> { let mut process = spawn_python_script_command( - "parser/lex_python.py", + "compat/lex_python.py", vec!["--stdin", "--output-format", "json"], default_python_path()?, )?; @@ -131,7 +131,7 @@ pub fn lex_python_source(source: &str) -> Result> { if let Some(mut stdin) = process.stdin.take() { stdin.write_all(source.as_bytes()).into_diagnostic()?; } else { - bail!("Failed to open stdin when running `parser/lex_python.py`"); + bail!("Failed to open stdin when running `compat/lex_python.py`"); } // Get process stdout and parse result. let output = process.wait_with_output().into_diagnostic()?; @@ -166,6 +166,9 @@ pub fn assert_tokens_eq( } else { let mut python_token = python_token.unwrap(); let mut enderpy_token = enderpy_token.unwrap(); + // (compat_fstrings) TODO: python fstring is a bit different than enderpy. + // We merge multiple fstring middle tokens together and emit one token but python emits + // multiple fstring middle tokens. Here we skip to the end and do not check fstrings. if python_token.kind == PythonKind::FstringStart { if enderpy_token.kind == Kind::FStringStart { // Python tokenizes fstring with more tokens than needed. 
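For reference, the FSTRING_MIDDLE splitting described in the comment above can be observed directly with CPython's own tokenizer. A minimal sketch, assuming CPython 3.12 or newer (where the PEP 701 f-string tokens exist); the source string and variable names are only illustrative:

import io
import tokenize

# Print every token CPython produces for an f-string that contains both an
# interpolation and escaped braces. CPython can report the literal text as
# several FSTRING_MIDDLE tokens, whereas enderpy merges them into a single
# FStringMiddle token, which is why the comparison skips ahead to the
# f-string end once both sides report an f-string start.
src = 'f"one {x} two {{literal}}"\n'
for tok in tokenize.generate_tokens(io.StringIO(src).readline):
    print(tokenize.tok_name[tok.type], tok.start, tok.end, repr(tok.string))

The exact token stream differs between interpreter versions, which is another reason the compatibility check treats the whole f-string as one skipped region rather than comparing it token by token.
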
@@ -191,8 +194,6 @@ pub fn assert_tokens_eq( } else if is_python_fstring_mismatch( &mismatch, &enderpy_tokens[enderpy_index + 1..], - &python_tokens[python_index + 1..], - &mut python_index, &mut enderpy_index, // <-- `enderpy_index` may be updated ) { // Nothing, but don't add the mismatch. @@ -607,8 +608,6 @@ fn is_python_trailing_newline_mismatch( fn is_python_fstring_mismatch( mismatch: &TokenMismatch, remaining_tokens: &[Token], - remaining_python_tokens: &[PythonToken], - python_index: &mut usize, enderpy_index: &mut usize, ) -> bool { match mismatch { @@ -630,12 +629,6 @@ fn is_python_fstring_mismatch( *enderpy_index += num_skipped; return true; } - // TokenMismatch::WrongValue(python_token, token, python_value, enderpy_value) => { - // if python_value == "{" { - // *python_index += 1; - // return true; - // } - // } _ => (), } false @@ -875,12 +868,10 @@ def", "a = f\"hello\"", "f\"\"\"hello\"\"\"", "f'''hello'''", - // TODO lex_python: Python lexes these poorly. - // "f\"{{hey}}\"", - // "f\"oh_{{hey}}\"", + "f\"{{hey}}\"", + "f\"oh_{{hey}}\"", "f'a' 'c'", - // TODO lex_python: Python 3.11 chokes on this input. - // "f'hello_{f'''{a}'''}'", + "f'hello_{f'''{a}'''}'", ]); // Raw F-strings diff --git a/compat/src/main.rs b/compat/src/main.rs index 72a6d528..690e62d1 100644 --- a/compat/src/main.rs +++ b/compat/src/main.rs @@ -7,7 +7,6 @@ use std::path::Path; use zip::ZipArchive; use self::lexer_compat::{assert_tokens_eq, lex_python_source}; -use self::parser_compat::python_parser_test_ast; pub mod lexer_compat; pub mod parser_compat; @@ -68,7 +67,6 @@ fn run_compatibility_test(file: &str) -> Result<()> { let python_tokens = lex_python_source(&source)?; assert_tokens_eq(python_tokens, enderpy_tokens, &lexer); - python_parser_test_ast(&vec![source.as_str()]); Ok(()) } diff --git a/compat/src/parser_compat.rs b/compat/src/parser_compat.rs index 037f994f..34944332 100644 --- a/compat/src/parser_compat.rs +++ b/compat/src/parser_compat.rs @@ -1,3 +1,4 @@ +#![allow(clippy::all)] use assert_json_diff::assert_json_matches_no_panic; use miette::{bail, IntoDiagnostic, Result}; use serde_json::Value; @@ -17,7 +18,7 @@ use terminal_size::{terminal_size, Width as TerminalWidth}; fn parse_python_source(source: &str) -> Result { let mut process = spawn_python_script_command( - "parser/ast_python.py", + "compat/ast_python.py", vec!["--stdin"], default_python_path()?, )?; @@ -26,7 +27,7 @@ fn parse_python_source(source: &str) -> Result { if let Some(mut stdin) = process.stdin.take() { stdin.write_all(source.as_bytes()).into_diagnostic()?; } else { - bail!("Failed to open stdin when running `parser/ast_python.py`"); + bail!("Failed to open stdin when running `compat/ast_python.py`"); } // Get process stdout and parse result. let output = process.wait_with_output().into_diagnostic()?; @@ -115,6 +116,7 @@ fn parse_enderpy_source(source: &str) -> Result { Ok(ast) } +#[allow(unused_macros)] macro_rules! parser_test { ($test_name:ident, $test_file:expr) => { #[test] @@ -219,11 +221,10 @@ mod tests { python_parser_test_ast(&[ "a or b", "a and b", - // TODO ast_python: Python parses this as a BoolOp with 3 values. + // TODO: Python parses this as a BoolOp with 3 values. // i.e. {"op": "or", "values": ["a", "b", "c"]} // Enderpy parses this as a nested set of BoolOps. // i.e. {"op": "or", "values": ["a", {"op": "or", "values": ["b", "c"]}]} - // I'm not sure which is correct. 
// "a or b or c", "a and b or c", ]); @@ -236,24 +237,22 @@ mod tests { #[test] fn test_named_expression() { - // TODO ast_python: Enderpy chokes on this. - // python_parser_test_ast(&["(a := b)"]); + python_parser_test_ast(&["(a := b)"]); } #[test] fn test_tuple() { python_parser_test_ast(&[ "(a, b, c)", - // TODO ast_python: Enderpy doesn't handle newlines within a nested context. "(a, b, c)", "(a , b, c)", - // "(a, - // b, - // c)", - // "(a, - // )", + "(a, + b, + c)", + "(a, + )", "(a, b, c,)", ]); } @@ -263,12 +262,6 @@ mod tests { python_parser_test_ast(&["yield", "yield a", "yield from a"]); } - #[test] - fn test_starred() { - // TODO ast_python: Enderpy chokes on this. - // python_parser_test_ast(&["(*a)"]); - } - #[test] fn test_await_expression() { python_parser_test_ast(&["await a"]); @@ -326,14 +319,13 @@ mod tests { "'a' 'b'", // TODO ast_python: Enderpy evaluates this as 'r"a"b'. This seems wrong. // "r'a' 'b'", - // TODO ast_python: Enderpy doesn't handle newlines within a nested context. - // "('a' - // 'b')", - // "('a' - // 'b', 'c')", - // "('a' - // 'b' - // 'c')", + "('a' + 'b')", + "('a' + 'b', 'c')", + "('a' + 'b' + 'c')", // TODO ast_python: Python evaluates this as "ac". Enderpy creates 2 constants. // "f'a' 'c'", // TODO ast_python: Python evaluates this as "abc". Enderpy creates 3 constants. @@ -351,8 +343,7 @@ mod tests { "f'hello_{a}'", "f'hello_{a} {b}'", "f'hello_{a} {b} {c}'", - // unsupported - // "f'hello_{f'''{a}'''}'", + "f'hello_{f'''{a}'''}'", ]); } @@ -435,35 +426,38 @@ except *Exception as e: ]); } - // parser_test!(test_functions, "test_data/inputs/functions.py"); - // parser_test!(test_if, "test_data/inputs/if.py"); - // parser_test!(test_indentation, "test_data/inputs/indentation.py"); - // parser_test!( - // test_separate_statements, - // "test_data/inputs/separate_statements.py" - // ); - // parser_test!(test_try, "test_data/inputs/try.py"); + parser_test!(test_functions, "../parser/test_data/inputs/functions.py"); + parser_test!(test_if, "../parser/test_data/inputs/if.py"); + parser_test!( + test_indentation, + "../parser/test_data/inputs/indentation.py" + ); + parser_test!( + test_separate_statements, + "../parser/test_data/inputs/separate_statements.py" + ); + // parser_test!(test_try, "../parser/test_data/inputs/try.py"); // parser_test!( // annotated_assignment, - // "test_data/inputs/annotated_assignment.py" + // "../parser/test_data/inputs/annotated_assignment.py" // ); - // parser_test!(binary_op, "test_data/inputs/binary_op.py"); - // parser_test!(class, "test_data/inputs/class.py"); - // parser_test!(dict, "test_data/inputs/dict.py"); - // parser_test!(test_for, "test_data/inputs/for.py"); - // parser_test!(from_import, "test_data/inputs/from_import.py"); - // parser_test!(function_def, "test_data/inputs/function_def.py"); + parser_test!(binary_op, "../parser/test_data/inputs/binary_op.py"); + parser_test!(class, "../parser/test_data/inputs/class.py"); + // parser_test!(dict, "../parser/test_data/inputs/dict.py"); + // parser_test!(test_for, "../parser/test_data/inputs/for.py"); + parser_test!(from_import, "../parser/test_data/inputs/from_import.py"); + parser_test!(function_def, "../parser/test_data/inputs/function_def.py"); // parser_test!( // generator_expressions, - // "test_data/inputs/generator_expressions.py" + // "../parser/test_data/inputs/generator_expressions.py" // ); - // parser_test!(lists, "test_data/inputs/lists.py"); - // parser_test!(test_match, "test_data/inputs/match.py"); - // parser_test!(sets, 
"test_data/inputs/sets.py"); - // parser_test!(string, "test_data/inputs/string.py"); - // parser_test!(subscript, "test_data/inputs/subscript.py"); - // parser_test!(with, "test_data/inputs/with.py"); - // parser_test!(newlines, "test_data/inputs/newlines.py"); - // parser_test!(comments, "test_data/inputs/comments.py"); - // parser_test!(types_alias, "test_data/inputs/type_alias.py"); + // parser_test!(lists, "../parser/test_data/inputs/lists.py"); + // parser_test!(test_match, "../parser/test_data/inputs/match.py"); + // parser_test!(sets, "../parser/test_data/inputs/sets.py"); + // parser_test!(string, "../parser/test_data/inputs/string.py"); + // parser_test!(subscript, "../parser/test_data/inputs/subscript.py"); + // parser_test!(with, "../parser/test_data/inputs/with.py"); + // parser_test!(newlines, "../parser/test_data/inputs/newlines.py"); + parser_test!(comments, "../parser/test_data/inputs/comments.py"); + // parser_test!(types_alias, "../parser/test_data/inputs/type_alias.py"); } diff --git a/lsp/src/main.rs b/lsp/src/main.rs index 239dc773..8b6368b4 100644 --- a/lsp/src/main.rs +++ b/lsp/src/main.rs @@ -6,12 +6,12 @@ use log::LevelFilter; use tower_lsp::{jsonrpc::Result, lsp_types::*, Client, LanguageServer, LspService, Server}; #[derive(Debug)] -struct Backend<'a> { +struct Backend { client: Client, - manager: BuildManager<'a>, + manager: BuildManager, } -impl<'a> Backend<'a> { +impl<'a> Backend { fn build(&self, path: PathBuf) { let root = find_project_root(&path); self.manager.build_one(root, &path); @@ -20,7 +20,7 @@ impl<'a> Backend<'a> { } #[tower_lsp::async_trait] -impl LanguageServer for Backend<'static> { +impl LanguageServer for Backend { async fn initialize(&self, i: InitializeParams) -> Result { let root = match i.root_uri { Some(v) => v.to_file_path().unwrap_or(PathBuf::from("")), diff --git a/parser/src/lexer/mod.rs b/parser/src/lexer/mod.rs index 1b88415b..d8fbfa93 100644 --- a/parser/src/lexer/mod.rs +++ b/parser/src/lexer/mod.rs @@ -1142,7 +1142,6 @@ mod tests { let mut snapshot = String::from(""); loop { let token = lexer.next_token(); - println!("{:?}", token); if token.kind == Kind::Eof { break; } @@ -1295,6 +1294,7 @@ mod tests { "'hello'", "\"\"\"hello\"\"\"", "'''hello'''", + "'\\n ([^ ].*)'", ], ) .unwrap(); diff --git a/parser/src/parser/compat.rs b/parser/src/parser/compat.rs index 144db22a..a18e012c 100644 --- a/parser/src/parser/compat.rs +++ b/parser/src/parser/compat.rs @@ -1,3 +1,5 @@ +#![allow(clippy::all, unused_variables)] + use crate::ast::*; use crate::Parser; use serde_json::Number; @@ -28,10 +30,11 @@ macro_rules! 
json_python_compat_node { let (start_row, start_col, end_row, end_col) = $parser.to_row_col($instance.node.start, $instance.node.end); node["_type"] = json!($name); - node["lineno"] = json!(start_row); - node["col_offset"] = json!(start_col); - node["end_lineno"] = json!(end_row); - node["end_col_offset"] = json!(end_col); + // TODO: (offset_compat) + // node["lineno"] = json!(start_row); + // node["col_offset"] = json!(start_col); + // node["end_lineno"] = json!(end_row); + // node["end_col_offset"] = json!(end_col); node }}; } @@ -52,10 +55,12 @@ impl AsPythonCompat for Statement { let expr = e.as_python_compat(parser); json!({ "_type": "Expr", - "lineno": expr["lineno"], - "col_offset": expr["col_offset"], - "end_lineno": expr["end_lineno"], - "end_col_offset": expr["end_col_offset"], + // TODO: (offset_compat) python parser and enderpy start/end offsets are different + // Fix it when have some time + // "lineno": expr["lineno"], + // "col_offset": expr["col_offset"], + // "end_lineno": expr["end_lineno"], + // "end_col_offset": expr["end_col_offset"], "value": expr, }) } @@ -204,7 +209,7 @@ impl AsPythonCompat for Import { impl AsPythonCompat for Alias { fn as_python_compat(&self, parser: &Parser) -> Value { - json_python_compat_node!("Alias", self, parser, { + json_python_compat_node!("alias", self, parser, { "name": self.name, "asname": self.asname, }) @@ -414,7 +419,7 @@ impl AsPythonCompat for BinaryOperator { impl AsPythonCompat for NamedExpression { fn as_python_compat(&self, parser: &Parser) -> Value { - json_python_compat_node!("NamedExpression", self, parser, { + json_python_compat_node!("NamedExpr", self, parser, { "target": self.target.as_python_compat(parser), "value": self.value.as_python_compat(parser), }) diff --git a/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@string-literals-7.snap b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@string-literals-7.snap new file mode 100644 index 00000000..5c0dc3d1 --- /dev/null +++ b/parser/test_data/output/enderpy_python_parser__lexer__tests__snapshot_test_lexer@string-literals-7.snap @@ -0,0 +1,5 @@ +--- +source: parser/src/lexer/mod.rs +description: "'\\n ([^ ].*)'" +--- +0,16: StringLiteral (Str("'\\n ([^ ].*)'")) diff --git a/typechecker/src/ast_visitor.rs b/typechecker/src/ast_visitor.rs index e3591491..979574f6 100644 --- a/typechecker/src/ast_visitor.rs +++ b/typechecker/src/ast_visitor.rs @@ -7,6 +7,7 @@ use enderpy_python_parser::ast::*; /// This is useful for visitors that only need to visit a few nodes /// and don't want to implement all the methods. /// The overridden methods must make sure to continue the traversal. 
+#[allow(dead_code)]
 pub trait TraversalVisitor {
     fn visit_stmt(&mut self, s: &Statement) {
         // map all statements and call visit
diff --git a/typechecker/src/build.rs b/typechecker/src/build.rs
index 28b89480..83d9a4b2 100755
--- a/typechecker/src/build.rs
+++ b/typechecker/src/build.rs
@@ -21,8 +21,8 @@ use crate::{
 };
 
 #[derive(Debug)]
-pub struct BuildManager<'a> {
-    pub modules: DashMap>,
+pub struct BuildManager {
+    pub modules: DashMap,
     pub diagnostics: DashMap>,
     pub symbol_tables: DashMap,
     pub module_ids: DashMap,
@@ -31,7 +31,7 @@ pub struct BuildManager<'a> {
     host: ruff_python_resolver::host::StaticHost,
 }
 #[allow(unused)]
-impl<'a> BuildManager<'a> {
+impl<'a> BuildManager {
     pub fn new(settings: Settings) -> Self {
         let mut builder = Builder::new();
@@ -65,7 +65,7 @@ impl<'a> BuildManager<'a> {
         }
     }
 
-    pub fn get_state(&self, path: &Path) -> EnderpyFile<'a> {
+    pub fn get_state(&self, path: &Path) -> EnderpyFile {
         let id = self.module_ids.get(path).expect("path not found");
         let result = self.modules.get(&id).unwrap();
         return result.value().clone();
@@ -154,24 +154,23 @@ impl<'a> BuildManager<'a> {
 #[derive(Debug, Clone)]
 pub struct ResolvedImport {
     pub resolved_ids: Vec,
-    result: ImportResult,
+    _result: ImportResult,
 }
 
 pub type ResolvedImports = HashMap>;
 
 fn gather_files<'a>(
-    mut initial_files: Vec>,
+    mut initial_files: Vec,
     root: &Path,
     import_config: &ruff_python_resolver::config::Config,
     host: &ruff_python_resolver::host::StaticHost,
-) -> (ResolvedImports, HashSet>) {
+) -> (ResolvedImports, HashSet) {
     let execution_environment = &execution_environment::ExecutionEnvironment {
         root: root.to_path_buf(),
        python_version: ruff_python_resolver::python_version::PythonVersion::Py312,
         python_platform: ruff_python_resolver::python_platform::PythonPlatform::Darwin,
         extra_paths: vec![],
     };
-    let mut path_to_id: HashMap<&Path, Id> = HashMap::with_capacity(initial_files.len() * 5);
     let mut new_modules = HashSet::with_capacity(initial_files.len() * 5);
     let mut import_results = HashMap::new();
     let mut seen = HashSet::new();
@@ -227,7 +226,7 @@ fn gather_files<'a>(
                 import_desc,
                 Arc::new(ResolvedImport {
                     resolved_ids,
-                    result: resolved,
+                    _result: resolved,
                 }),
             );
         }
@@ -249,7 +248,7 @@ fn gather_files<'a>(
 }
 
 fn resolve_file_imports(
-    file: &EnderpyFile<'_>,
+    file: &EnderpyFile,
     execution_environment: &ruff_python_resolver::execution_environment::ExecutionEnvironment,
     import_config: &ruff_python_resolver::config::Config,
     host: &ruff_python_resolver::host::StaticHost,
diff --git a/typechecker/src/file.rs b/typechecker/src/file.rs
index b1aa2a9b..c8c77c61 100755
--- a/typechecker/src/file.rs
+++ b/typechecker/src/file.rs
@@ -22,7 +22,7 @@ pub enum ImportKinds<'a> {
 /// EnderpyFile<'a>holds information about the files in the analyze
 /// and methods to perform semantic analysis and type check on them
 #[derive(Clone, Debug)]
-pub struct EnderpyFile<'a> {
+pub struct EnderpyFile {
     pub id: symbol_table::Id,
     pub module: String,
     // if this source is found by following an import
@@ -31,18 +31,17 @@ pub struct EnderpyFile<'a> {
     pub source: String,
     pub line_starts: Vec,
     pub tree: ast::Module,
-    dummy: &'a str,
 }
 
-impl<'a> Eq for EnderpyFile<'a> {}
+impl<'a> Eq for EnderpyFile {}
 
-impl<'a> PartialEq for EnderpyFile<'a> {
+impl<'a> PartialEq for EnderpyFile {
     fn eq(&self, other: &Self) -> bool {
         self.id == other.id && self.path == other.path
     }
 }
 
-impl<'a> std::hash::Hash for EnderpyFile<'a> {
+impl<'a> std::hash::Hash for EnderpyFile {
     fn hash(&self, state: &mut H) {
         self.id.hash(state);
         self.path.hash(state);
@@ -54,7 +53,7 @@ fn get_id() -> u32 {
     COUNTER.fetch_add(1, Ordering::SeqCst) as u32
 }
 
-impl<'a> EnderpyFile<'a> {
+impl<'a> EnderpyFile {
     pub fn new(path: PathBuf, followed: bool) -> Self {
         let source = std::fs::read_to_string(&path)
             .unwrap_or_else(|_| panic!("cannot read file {path:?}"));
@@ -77,7 +76,6 @@ impl<'a> EnderpyFile<'a> {
             module,
             tree,
             path: Arc::new(path),
-            dummy: "sdfsd",
         }
     }
     pub fn module_name(&self) -> String {
diff --git a/typechecker/src/semantic_analyzer.rs b/typechecker/src/semantic_analyzer.rs
index 813e474e..542cf833 100644
--- a/typechecker/src/semantic_analyzer.rs
+++ b/typechecker/src/semantic_analyzer.rs
@@ -39,7 +39,7 @@ pub struct FunctionInformation {
 
 #[allow(unused)]
 impl<'a> SemanticAnalyzer<'a> {
-    pub fn new(file: &'a EnderpyFile<'a>, imports: &'a ResolvedImports) -> Self {
+    pub fn new(file: &'a EnderpyFile, imports: &'a ResolvedImports) -> Self {
         let symbols = SymbolTable::new(&file.path, file.id);
         SemanticAnalyzer {
             symbol_table: symbols,
diff --git a/typechecker/src/types.rs b/typechecker/src/types.rs
index 29b1c138..d6298fd1 100644
--- a/typechecker/src/types.rs
+++ b/typechecker/src/types.rs
@@ -1,8 +1,5 @@
 use is_macro::Is;
 use std::fmt::Display;
-use std::path::PathBuf;
-
-use enderpy_python_parser::ast;
 
 use crate::symbol_table::{self, Id};

From c61a913008649968cd85869e9048b18a830eb573 Mon Sep 17 00:00:00 2001
From: Glyphack
Date: Sun, 1 Sep 2024 15:41:54 +0200
Subject: [PATCH 7/7] Ignore mypy master

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 5d3cdc45..dab45ae7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,5 @@ out/
 **/dist/
 # vscode extension
 *.vsix
+
+mypy-master/
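
Note on the lifetime changes in the typechecker patches above: EnderpyFile<'a> only carried the 'a parameter because of its dummy: &'a str field, so deleting that field lets EnderpyFile (and everything that stores it, such as BuildManager and the DashMap fields) drop the lifetime entirely. A minimal, self-contained Rust sketch of that pattern follows; the type names are illustrative stand-ins, not code from this repository.

    // Sketch only: why removing the last borrowed field removes the lifetime parameter.
    // `WithBorrow` / `Owned` are hypothetical stand-ins for EnderpyFile before/after.

    #[allow(dead_code)]
    struct WithBorrow<'a> {
        source: String,
        dummy: &'a str, // the single borrowed field forces <'a> on every holder
    }

    #[allow(dead_code)]
    struct Owned {
        source: String, // owned data only: no lifetime parameter needed
    }

    fn main() {
        let text = String::from("x = 1");
        let _before = WithBorrow { source: text.clone(), dummy: &text };
        let _after = Owned { source: text.clone() };
        // Containers of `Owned` (maps keyed by an id, vectors, etc.) no longer
        // have to thread a lifetime through their own signatures.
        println!("both structs built; only WithBorrow needed a lifetime");
    }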