From 9820c046707be80103f21379da1434ad9bec4a06 Mon Sep 17 00:00:00 2001 From: Dhruv Manilawala Date: Thu, 14 Sep 2023 07:16:49 +0530 Subject: [PATCH] Add support for the new f-string tokens per PEP 701 (#6659) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Summary This PR adds support in the lexer for the newly added f-string tokens as per PEP 701. The following new tokens are added: * `FStringStart`: Token value for the start of an f-string. This includes the `f`/`F`/`fr` prefix and the opening quote(s). * `FStringMiddle`: Token value that includes the portion of text inside the f-string that's not part of the expression part and isn't an opening or closing brace. * `FStringEnd`: Token value for the end of an f-string. This includes the closing quote. Additionally, a new `Exclamation` token is added for conversion (`f"{foo!s}"`) as that's part of an expression. ## Test Plan New test cases are added for various possibilities using snapshot testing. The output has been verified using python/cpython@f2cc00527e. ## Benchmarks _I've put the number of f-strings for each of the following files after the file name_ ``` lexer/large/dataset.py (1) 1.05 612.6±91.60µs 66.4 MB/sec 1.00 584.7±33.72µs 69.6 MB/sec lexer/numpy/ctypeslib.py (0) 1.01 131.8±3.31µs 126.3 MB/sec 1.00 130.9±5.37µs 127.2 MB/sec lexer/numpy/globals.py (1) 1.02 13.2±0.43µs 222.7 MB/sec 1.00 13.0±0.41µs 226.8 MB/sec lexer/pydantic/types.py (8) 1.13 285.0±11.72µs 89.5 MB/sec 1.00 252.9±10.13µs 100.8 MB/sec lexer/unicode/pypinyin.py (0) 1.03 32.9±1.92µs 127.5 MB/sec 1.00 31.8±1.25µs 132.0 MB/sec ``` It seems that overall the lexer has regressed. I profiled every file mentioned above and I saw one improvement which is done in (098ee5d493ca83238754a8cb4629fa1b91144b84). But otherwise I don't see anything else. 
A few notes by isolating the f-string part in the profile: * As we're adding new tokens and functionality to emit them, I expect the lexer to take more time because of more code. * The `lex_fstring_middle_or_end` takes the most amount of time followed by the `current_mut` line when lexing the `:` token. The latter is to check if we're at the start of a format spec or not. * In an f-string heavy file such as https://github.com/python/cpython/blob/main/Lib/test/test_fstring.py [^1] (293), most of the time in `lex_fstring_middle_or_end` is accounted for by string allocation for the string literal part of `FStringMiddle` token (https://share.firefox.dev/3ErEa1W) I don't see anything out of the ordinary for `pydantic/types` profile (https://share.firefox.dev/45XcLRq) fixes: #7042 [^1]: We could add this in lexer and parser benchmark --- Cargo.lock | 1 + crates/ruff_python_parser/Cargo.toml | 1 + crates/ruff_python_parser/src/lexer.rs | 422 +++++++++++++++++- crates/ruff_python_parser/src/lexer/cursor.rs | 12 + .../ruff_python_parser/src/lexer/fstring.rs | 158 +++++++ ..._parser__lexer__tests__empty_fstrings.snap | 66 +++ ..._python_parser__lexer__tests__fstring.snap | 88 ++++ ...arser__lexer__tests__fstring_comments.snap | 60 +++ ...ser__lexer__tests__fstring_conversion.snap | 116 +++++ ..._parser__lexer__tests__fstring_escape.snap | 78 ++++ ...ser__lexer__tests__fstring_escape_raw.snap | 78 ++++ ...__tests__fstring_expression_multiline.snap | 72 +++ ...rser__lexer__tests__fstring_multiline.snap | 99 ++++ ...__lexer__tests__fstring_named_unicode.snap | 25 ++ ...xer__tests__fstring_named_unicode_raw.snap | 46 ++ ..._parser__lexer__tests__fstring_nested.snap | 163 +++++++ ...er__lexer__tests__fstring_parentheses.snap | 154 +++++++ ..._parser__lexer__tests__fstring_prefix.snap | 90 ++++ ...exer__tests__fstring_with_format_spec.snap | 222 +++++++++ ...ests__fstring_with_ipy_escape_command.snap | 50 +++ ...tests__fstring_with_lambda_expression.snap | 117 +++++ 
..._tests__fstring_with_named_expression.snap | 170 +++++++ crates/ruff_python_parser/src/string.rs | 7 +- crates/ruff_python_parser/src/token.rs | 33 ++ 24 files changed, 2317 insertions(+), 11 deletions(-) create mode 100644 crates/ruff_python_parser/src/lexer/fstring.rs create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__empty_fstrings.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_comments.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_conversion.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape_raw.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_expression_multiline.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_multiline.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode_raw.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_nested.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_parentheses.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_prefix.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_format_spec.snap create mode 100644 
crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_ipy_escape_command.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_lambda_expression.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_named_expression.snap diff --git a/Cargo.lock b/Cargo.lock index a8b182b0fd86e..9bf9b25850c8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2376,6 +2376,7 @@ name = "ruff_python_parser" version = "0.0.0" dependencies = [ "anyhow", + "bitflags 2.4.0", "insta", "is-macro", "itertools", diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index a4bb8f6c500a8..9bf93dce626a3 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -18,6 +18,7 @@ ruff_python_ast = { path = "../ruff_python_ast" } ruff_text_size = { path = "../ruff_text_size" } anyhow = { workspace = true } +bitflags = { workspace = true } is-macro = { workspace = true } itertools = { workspace = true } lalrpop-util = { version = "0.20.0", default-features = false } diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index fca0ba5b90fbe..68e61697da573 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -38,6 +38,7 @@ use ruff_text_size::{TextLen, TextRange, TextSize}; use unicode_ident::{is_xid_continue, is_xid_start}; use crate::lexer::cursor::{Cursor, EOF_CHAR}; +use crate::lexer::fstring::{FStringContext, FStringContextFlags, FStrings}; use crate::lexer::indentation::{Indentation, Indentations}; use crate::{ soft_keywords::SoftKeywordTransformer, @@ -47,6 +48,7 @@ use crate::{ }; mod cursor; +mod fstring; mod indentation; /// A lexer for Python source code. @@ -63,6 +65,8 @@ pub struct Lexer<'source> { pending_indentation: Option, // Lexer mode. mode: Mode, + // F-string contexts. 
+ fstrings: FStrings, } /// Contains a Token along with its `range`. @@ -155,6 +159,7 @@ impl<'source> Lexer<'source> { source: input, cursor: Cursor::new(input), mode, + fstrings: FStrings::default(), }; // TODO: Handle possible mismatch between BOM and explicit encoding declaration. // spell-checker:ignore feff @@ -166,16 +171,24 @@ impl<'source> Lexer<'source> { /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. fn lex_identifier(&mut self, first: char) -> Result { // Detect potential string like rb'' b'' f'' u'' r'' - match self.cursor.first() { - quote @ ('\'' | '"') => { + match (first, self.cursor.first()) { + ('f' | 'F', quote @ ('\'' | '"')) => { + self.cursor.bump(); + return Ok(self.lex_fstring_start(quote, false)); + } + ('r' | 'R', 'f' | 'F') | ('f' | 'F', 'r' | 'R') if is_quote(self.cursor.second()) => { + self.cursor.bump(); + let quote = self.cursor.bump().unwrap(); + return Ok(self.lex_fstring_start(quote, true)); + } + (_, quote @ ('\'' | '"')) => { if let Ok(string_kind) = StringKind::try_from(first) { self.cursor.bump(); return self.lex_string(string_kind, quote); } } - second @ ('f' | 'F' | 'r' | 'R' | 'b' | 'B') if is_quote(self.cursor.second()) => { + (_, second @ ('r' | 'R' | 'b' | 'B')) if is_quote(self.cursor.second()) => { self.cursor.bump(); - if let Ok(string_kind) = StringKind::try_from([first, second]) { let quote = self.cursor.bump().unwrap(); return self.lex_string(string_kind, quote); @@ -491,6 +504,162 @@ impl<'source> Lexer<'source> { } } + /// Lex a f-string start token. 
+ fn lex_fstring_start(&mut self, quote: char, is_raw_string: bool) -> Tok { + #[cfg(debug_assertions)] + debug_assert_eq!(self.cursor.previous(), quote); + + let mut flags = FStringContextFlags::empty(); + if quote == '"' { + flags |= FStringContextFlags::DOUBLE; + } + if is_raw_string { + flags |= FStringContextFlags::RAW; + } + if self.cursor.eat_char2(quote, quote) { + flags |= FStringContextFlags::TRIPLE; + } + + self.fstrings.push(FStringContext::new(flags, self.nesting)); + Tok::FStringStart + } + + /// Lex a f-string middle or end token. + fn lex_fstring_middle_or_end(&mut self) -> Result, LexicalError> { + // SAFETY: Safe because the function is only called when `self.fstrings` is not empty. + let fstring = self.fstrings.current().unwrap(); + self.cursor.start_token(); + + // Check if we're at the end of the f-string. + if fstring.is_triple_quoted() { + let quote_char = fstring.quote_char(); + if self.cursor.eat_char3(quote_char, quote_char, quote_char) { + return Ok(Some(Tok::FStringEnd)); + } + } else if self.cursor.eat_char(fstring.quote_char()) { + return Ok(Some(Tok::FStringEnd)); + } + + // We have to decode `{{` and `}}` into `{` and `}` respectively. As an + // optimization, we only allocate a new string we find any escaped curly braces, + // otherwise this string will remain empty and we'll use a source slice instead. + let mut normalized = String::new(); + + // Tracks the last offset of token value that has been written to `normalized`. 
+ let mut last_offset = self.offset(); + + let mut in_named_unicode = false; + let mut end_format_spec = false; + + loop { + match self.cursor.first() { + EOF_CHAR => { + let error = if fstring.is_triple_quoted() { + FStringErrorType::UnterminatedTripleQuotedString + } else { + FStringErrorType::UnterminatedString + }; + return Err(LexicalError { + error: LexicalErrorType::FStringError(error), + location: self.offset(), + }); + } + '\n' if !fstring.is_triple_quoted() => { + return Err(LexicalError { + error: LexicalErrorType::FStringError(FStringErrorType::UnterminatedString), + location: self.offset(), + }); + } + '\\' => { + self.cursor.bump(); // '\' + if matches!(self.cursor.first(), '{' | '}') { + // Don't consume `{` or `}` as we want them to be emitted as tokens. + break; + } else if !fstring.is_raw_string() { + if self.cursor.eat_char2('N', '{') { + in_named_unicode = true; + continue; + } + } + // Consume the escaped character. + self.cursor.bump(); + } + quote @ ('\'' | '"') if quote == fstring.quote_char() => { + if let Some(triple_quotes) = fstring.triple_quotes() { + if self.cursor.rest().starts_with(triple_quotes) { + break; + } + self.cursor.bump(); + } else { + break; + } + } + '{' => { + if self.cursor.second() == '{' { + self.cursor.bump(); + normalized + .push_str(&self.source[TextRange::new(last_offset, self.offset())]); + self.cursor.bump(); // Skip the second `{` + last_offset = self.offset(); + } else { + break; + } + } + '}' => { + if in_named_unicode { + in_named_unicode = false; + self.cursor.bump(); + } else if self.cursor.second() == '}' + && !fstring.is_in_format_spec(self.nesting) + { + self.cursor.bump(); + normalized + .push_str(&self.source[TextRange::new(last_offset, self.offset())]); + self.cursor.bump(); // Skip the second `}` + last_offset = self.offset(); + } else { + end_format_spec = fstring.is_in_format_spec(self.nesting); + break; + } + } + _ => { + self.cursor.bump(); + } + } + } + let range = self.token_range(); + + // 
Avoid emitting the empty `FStringMiddle` token for anything other than + // the closing curly braces (`}`). + if range.is_empty() && !end_format_spec { + return Ok(None); + } + + let value = if range.is_empty() { + // Emit an empty `FStringMiddle` token for a special case to disallow + // lambda expressions without parenthesis. For example, in `f"{lambda x:{x}}"` + // the lexer wouldn't have emitted a `FStringMiddle` token. + String::new() + } else if normalized.is_empty() { + self.source[range].to_string() + } else { + normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]); + normalized + }; + let is_raw = fstring.is_raw_string(); + if end_format_spec { + // We need to decrement the format spec depth to avoid going into infinite + // loop where the lexer keeps on emitting an empty `FStringMiddle` token. + // This is because the lexer still thinks that we're in a f-string expression + // but as we've encountered a `}` token, we need to decrement the depth so + // that the lexer can go forward with the next token. + // + // SAFETY: Safe because the function is only called when `self.fstrings` is not empty. + self.fstrings.current_mut().unwrap().end_format_spec(); + } + Ok(Some(Tok::FStringMiddle { value, is_raw })) + } + /// Lex a string literal. fn lex_string(&mut self, kind: StringKind, quote: char) -> Result { #[cfg(debug_assertions)] @@ -512,6 +681,19 @@ impl<'source> Lexer<'source> { } } Some('\r' | '\n') if !triple_quoted => { + if let Some(fstring) = self.fstrings.current() { + // When we are in an f-string, check whether does the initial quote + // matches with f-strings quotes and if it is, then this must be a + // missing '}' token so raise the proper error. 
+ if fstring.quote_char() == quote && !fstring.is_triple_quoted() { + return Err(LexicalError { + error: LexicalErrorType::FStringError( + FStringErrorType::UnclosedLbrace, + ), + location: self.offset() - fstring.quote_size(), + }); + } + } return Err(LexicalError { error: LexicalErrorType::OtherError( "EOL while scanning string literal".to_owned(), @@ -531,6 +713,21 @@ impl<'source> Lexer<'source> { Some(_) => {} None => { + if let Some(fstring) = self.fstrings.current() { + // When we are in an f-string, check whether does the initial quote + // matches with f-strings quotes and if it is, then this must be a + // missing '}' token so raise the proper error. + if fstring.quote_char() == quote + && fstring.is_triple_quoted() == triple_quoted + { + return Err(LexicalError { + error: LexicalErrorType::FStringError( + FStringErrorType::UnclosedLbrace, + ), + location: self.offset() - fstring.quote_size(), + }); + } + } return Err(LexicalError { error: if triple_quoted { LexicalErrorType::Eof @@ -554,8 +751,28 @@ impl<'source> Lexer<'source> { // This is the main entry point. Call this function to retrieve the next token. // This function is used by the iterator implementation. pub fn next_token(&mut self) -> LexResult { + if let Some(fstring) = self.fstrings.current() { + if !fstring.is_in_expression(self.nesting) { + match self.lex_fstring_middle_or_end() { + Ok(Some(tok)) => { + if tok == Tok::FStringEnd { + self.fstrings.pop(); + } + return Ok((tok, self.token_range())); + } + Err(e) => { + // This is to prevent an infinite loop in which the lexer + // continuously returns an error token because the f-string + // remains on the stack. + self.fstrings.pop(); + return Err(e); + } + _ => {} + } + } + } // Return dedent tokens until the current indentation level matches the indentation of the next token. 
- if let Some(indentation) = self.pending_indentation.take() { + else if let Some(indentation) = self.pending_indentation.take() { if let Ok(Ordering::Greater) = self.indentations.current().try_compare(indentation) { self.pending_indentation = Some(indentation); self.indentations.pop(); @@ -861,10 +1078,7 @@ impl<'source> Lexer<'source> { if self.cursor.eat_char('=') { Tok::NotEqual } else { - return Err(LexicalError { - error: LexicalErrorType::UnrecognizedToken { tok: '!' }, - location: self.token_start(), - }); + Tok::Exclamation } } '~' => Tok::Tilde, @@ -889,11 +1103,25 @@ impl<'source> Lexer<'source> { Tok::Lbrace } '}' => { + if let Some(fstring) = self.fstrings.current() { + if fstring.nesting() == self.nesting { + return Err(LexicalError { + error: LexicalErrorType::FStringError(FStringErrorType::SingleRbrace), + location: self.token_start(), + }); + } + } self.nesting = self.nesting.saturating_sub(1); Tok::Rbrace } ':' => { - if self.cursor.eat_char('=') { + if self + .fstrings + .current_mut() + .is_some_and(|fstring| fstring.try_start_format_spec(self.nesting)) + { + Tok::Colon + } else if self.cursor.eat_char('=') { Tok::ColonEqual } else { Tok::Colon @@ -1678,4 +1906,178 @@ def f(arg=%timeit a = b): result => panic!("Expected an error token but found {result:?}"), } } + + #[test] + fn test_empty_fstrings() { + let source = r#"f"" "" F"" f'' '' f"""""" f''''''"#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_prefix() { + let source = r#"f"" F"" rf"" rF"" Rf"" RF"" fr"" Fr"" fR"" FR"""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring() { + let source = r#"f"normal {foo} {{another}} {bar} {{{three}}}""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_parentheses() { + let source = r#"f"{}" f"{{}}" f" {}" f"{{{}}}" f"{{{{}}}}" f" {} {{}} {{{}}} {{{{}}}} ""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_escape() { + let 
source = r#"f"\{x:\"\{x}} \"\"\ + end""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_escape_raw() { + let source = r#"rf"\{x:\"\{x}} \"\"\ + end""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_named_unicode() { + let source = r#"f"\N{BULLET} normal \Nope \N""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_named_unicode_raw() { + let source = r#"rf"\N{BULLET} normal""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_with_named_expression() { + let source = r#"f"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_with_format_spec() { + let source = r#"f"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}}""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_conversion() { + let source = r#"f"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_nested() { + let source = r#"f"foo {f"bar {x + f"{wow}"}"} baz" f'foo {f'bar'} some {f"another"}'"#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_expression_multiline() { + let source = r#"f"first { + x + * + y +} second""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_multiline() { + let source = r#"f""" +hello + world +""" f''' + world +hello +''' f"some {f"""multiline +allowed {x}"""} string""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_comments() { + let source = r#"f""" +# not a comment { # comment { + x +} # not a comment +""""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_with_ipy_escape_command() { + let source = r#"f"foo {!pwd} bar""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_with_lambda_expression() { + let source = r#" +f"{lambda x:{x}}" 
+f"{(lambda x:{x})}" +"# + .trim(); + assert_debug_snapshot!(lex_source(source)); + } + + fn lex_fstring_error(source: &str) -> FStringErrorType { + match lex(source, Mode::Module).find_map(std::result::Result::err) { + Some(err) => match err.error { + LexicalErrorType::FStringError(error) => error, + _ => panic!("Expected FStringError: {err:?}"), + }, + _ => panic!("Expected atleast one FStringError"), + } + } + + #[test] + fn test_fstring_error() { + use FStringErrorType::{ + SingleRbrace, UnclosedLbrace, UnterminatedString, UnterminatedTripleQuotedString, + }; + + assert_eq!(lex_fstring_error("f'}'"), SingleRbrace); + assert_eq!(lex_fstring_error("f'{{}'"), SingleRbrace); + assert_eq!(lex_fstring_error("f'{{}}}'"), SingleRbrace); + assert_eq!(lex_fstring_error("f'foo}'"), SingleRbrace); + assert_eq!(lex_fstring_error(r"f'\u007b}'"), SingleRbrace); + assert_eq!(lex_fstring_error("f'{a:b}}'"), SingleRbrace); + assert_eq!(lex_fstring_error("f'{3:}}>10}'"), SingleRbrace); + + assert_eq!(lex_fstring_error("f'{'"), UnclosedLbrace); + assert_eq!(lex_fstring_error("f'{foo!r'"), UnclosedLbrace); + assert_eq!(lex_fstring_error("f'{foo='"), UnclosedLbrace); + assert_eq!( + lex_fstring_error( + r#"f"{" +"# + ), + UnclosedLbrace + ); + assert_eq!(lex_fstring_error(r#"f"""{""""#), UnclosedLbrace); + + assert_eq!(lex_fstring_error(r#"f""#), UnterminatedString); + assert_eq!(lex_fstring_error(r#"f'"#), UnterminatedString); + + assert_eq!(lex_fstring_error(r#"f""""#), UnterminatedTripleQuotedString); + assert_eq!(lex_fstring_error(r#"f'''"#), UnterminatedTripleQuotedString); + assert_eq!( + lex_fstring_error(r#"f"""""#), + UnterminatedTripleQuotedString + ); + assert_eq!( + lex_fstring_error(r#"f""""""#), + UnterminatedTripleQuotedString + ); + } } diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs index ed5011a1d8507..c026c88e9b7fb 100644 --- a/crates/ruff_python_parser/src/lexer/cursor.rs +++ 
b/crates/ruff_python_parser/src/lexer/cursor.rs @@ -96,6 +96,18 @@ impl<'a> Cursor<'a> { } } + pub(super) fn eat_char3(&mut self, c1: char, c2: char, c3: char) -> bool { + let mut chars = self.chars.clone(); + if chars.next() == Some(c1) && chars.next() == Some(c2) && chars.next() == Some(c3) { + self.bump(); + self.bump(); + self.bump(); + true + } else { + false + } + } + pub(super) fn eat_if(&mut self, mut predicate: F) -> Option where F: FnMut(char) -> bool, diff --git a/crates/ruff_python_parser/src/lexer/fstring.rs b/crates/ruff_python_parser/src/lexer/fstring.rs new file mode 100644 index 0000000000000..4cb12c44d206a --- /dev/null +++ b/crates/ruff_python_parser/src/lexer/fstring.rs @@ -0,0 +1,158 @@ +use bitflags::bitflags; + +use ruff_text_size::TextSize; + +bitflags! { + #[derive(Debug)] + pub(crate) struct FStringContextFlags: u8 { + /// The current f-string is a triple-quoted f-string i.e., the number of + /// opening quotes is 3. If this flag is not set, the number of opening + /// quotes is 1. + const TRIPLE = 1 << 0; + + /// The current f-string is a double-quoted f-string. If this flag is not + /// set, the current f-string is a single-quoted f-string. + const DOUBLE = 1 << 1; + + /// The current f-string is a raw f-string i.e., prefixed with `r`/`R`. + /// If this flag is not set, the current f-string is a normal f-string. + const RAW = 1 << 2; + } +} + +/// The context representing the current f-string that the lexer is in. +#[derive(Debug)] +pub(crate) struct FStringContext { + flags: FStringContextFlags, + + /// The level of nesting for the lexer when it entered the current f-string. + /// The nesting level includes all kinds of parentheses i.e., round, square, + /// and curly. + nesting: u32, + + /// The current depth of format spec for the current f-string. This is because + /// there can be multiple format specs nested for the same f-string. + /// For example, `{a:{b:{c}}}` has 3 format specs. 
+ format_spec_depth: u32, +} + +impl FStringContext { + pub(crate) const fn new(flags: FStringContextFlags, nesting: u32) -> Self { + Self { + flags, + nesting, + format_spec_depth: 0, + } + } + + pub(crate) const fn nesting(&self) -> u32 { + self.nesting + } + + /// Returns the quote character for the current f-string. + pub(crate) const fn quote_char(&self) -> char { + if self.flags.contains(FStringContextFlags::DOUBLE) { + '"' + } else { + '\'' + } + } + + /// Returns the number of quotes for the current f-string. + pub(crate) const fn quote_size(&self) -> TextSize { + if self.is_triple_quoted() { + TextSize::new(3) + } else { + TextSize::new(1) + } + } + + /// Returns the triple quotes for the current f-string if it is a triple-quoted + /// f-string, `None` otherwise. + pub(crate) const fn triple_quotes(&self) -> Option<&'static str> { + if self.is_triple_quoted() { + if self.flags.contains(FStringContextFlags::DOUBLE) { + Some(r#"""""#) + } else { + Some("'''") + } + } else { + None + } + } + + /// Returns `true` if the current f-string is a raw f-string. + pub(crate) const fn is_raw_string(&self) -> bool { + self.flags.contains(FStringContextFlags::RAW) + } + + /// Returns `true` if the current f-string is a triple-quoted f-string. + pub(crate) const fn is_triple_quoted(&self) -> bool { + self.flags.contains(FStringContextFlags::TRIPLE) + } + + /// Calculates the number of open parentheses for the current f-string + /// based on the current level of nesting for the lexer. + const fn open_parentheses_count(&self, current_nesting: u32) -> u32 { + current_nesting.saturating_sub(self.nesting) + } + + /// Returns `true` if the lexer is in a f-string expression i.e., between + /// two curly braces. + pub(crate) const fn is_in_expression(&self, current_nesting: u32) -> bool { + self.open_parentheses_count(current_nesting) > self.format_spec_depth + } + + /// Returns `true` if the lexer is in a f-string format spec i.e., after a colon. 
+ pub(crate) const fn is_in_format_spec(&self, current_nesting: u32) -> bool { + self.format_spec_depth > 0 && !self.is_in_expression(current_nesting) + } + + /// Returns `true` if the context is in a valid position to start format spec + /// i.e., at the same level of nesting as the opening parentheses token. + /// Increments the format spec depth if it is. + /// + /// This assumes that the current character for the lexer is a colon (`:`). + pub(crate) fn try_start_format_spec(&mut self, current_nesting: u32) -> bool { + if self + .open_parentheses_count(current_nesting) + .saturating_sub(self.format_spec_depth) + == 1 + { + self.format_spec_depth += 1; + true + } else { + false + } + } + + /// Decrements the format spec depth unconditionally. + pub(crate) fn end_format_spec(&mut self) { + self.format_spec_depth = self.format_spec_depth.saturating_sub(1); + } +} + +/// The f-strings stack is used to keep track of all the f-strings that the +/// lexer encounters. This is necessary because f-strings can be nested. 
+#[derive(Debug, Default)] +pub(crate) struct FStrings { + stack: Vec, +} + +impl FStrings { + pub(crate) fn push(&mut self, context: FStringContext) { + self.stack.push(context); + } + + pub(crate) fn pop(&mut self) -> Option { + self.stack.pop() + } + + pub(crate) fn current(&self) -> Option<&FStringContext> { + self.stack.last() + } + + pub(crate) fn current_mut(&mut self) -> Option<&mut FStringContext> { + self.stack.last_mut() + } +} diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__empty_fstrings.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__empty_fstrings.snap new file mode 100644 index 0000000000000..6f56991ade231 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__empty_fstrings.snap @@ -0,0 +1,66 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + FStringEnd, + 2..3, + ), + ( + String { + value: "", + kind: String, + triple_quoted: false, + }, + 4..6, + ), + ( + FStringStart, + 7..9, + ), + ( + FStringEnd, + 9..10, + ), + ( + FStringStart, + 11..13, + ), + ( + FStringEnd, + 13..14, + ), + ( + String { + value: "", + kind: String, + triple_quoted: false, + }, + 15..17, + ), + ( + FStringStart, + 18..22, + ), + ( + FStringEnd, + 22..25, + ), + ( + FStringStart, + 26..30, + ), + ( + FStringEnd, + 30..33, + ), + ( + Newline, + 33..33, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring.snap new file mode 100644 index 0000000000000..cc415cb5ccd61 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring.snap @@ -0,0 +1,88 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + FStringMiddle { + value: "normal 
", + is_raw: false, + }, + 2..9, + ), + ( + Lbrace, + 9..10, + ), + ( + Name { + name: "foo", + }, + 10..13, + ), + ( + Rbrace, + 13..14, + ), + ( + FStringMiddle { + value: " {another} ", + is_raw: false, + }, + 14..27, + ), + ( + Lbrace, + 27..28, + ), + ( + Name { + name: "bar", + }, + 28..31, + ), + ( + Rbrace, + 31..32, + ), + ( + FStringMiddle { + value: " {", + is_raw: false, + }, + 32..35, + ), + ( + Lbrace, + 35..36, + ), + ( + Name { + name: "three", + }, + 36..41, + ), + ( + Rbrace, + 41..42, + ), + ( + FStringMiddle { + value: "}", + is_raw: false, + }, + 42..44, + ), + ( + FStringEnd, + 44..45, + ), + ( + Newline, + 45..45, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_comments.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_comments.snap new file mode 100644 index 0000000000000..b0427ad58fe64 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_comments.snap @@ -0,0 +1,60 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..4, + ), + ( + FStringMiddle { + value: "\n# not a comment ", + is_raw: false, + }, + 4..21, + ), + ( + Lbrace, + 21..22, + ), + ( + Comment( + "# comment {", + ), + 23..34, + ), + ( + NonLogicalNewline, + 34..35, + ), + ( + Name { + name: "x", + }, + 39..40, + ), + ( + NonLogicalNewline, + 40..41, + ), + ( + Rbrace, + 41..42, + ), + ( + FStringMiddle { + value: " # not a comment\n", + is_raw: false, + }, + 42..59, + ), + ( + FStringEnd, + 59..62, + ), + ( + Newline, + 62..62, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_conversion.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_conversion.snap new file mode 100644 index 0000000000000..d52a1772889d1 --- /dev/null +++ 
b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_conversion.snap @@ -0,0 +1,116 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + Lbrace, + 2..3, + ), + ( + Name { + name: "x", + }, + 3..4, + ), + ( + Exclamation, + 4..5, + ), + ( + Name { + name: "s", + }, + 5..6, + ), + ( + Rbrace, + 6..7, + ), + ( + FStringMiddle { + value: " ", + is_raw: false, + }, + 7..8, + ), + ( + Lbrace, + 8..9, + ), + ( + Name { + name: "x", + }, + 9..10, + ), + ( + Equal, + 10..11, + ), + ( + Exclamation, + 11..12, + ), + ( + Name { + name: "r", + }, + 12..13, + ), + ( + Rbrace, + 13..14, + ), + ( + FStringMiddle { + value: " ", + is_raw: false, + }, + 14..15, + ), + ( + Lbrace, + 15..16, + ), + ( + Name { + name: "x", + }, + 16..17, + ), + ( + Colon, + 17..18, + ), + ( + FStringMiddle { + value: ".3f!r", + is_raw: false, + }, + 18..23, + ), + ( + Rbrace, + 23..24, + ), + ( + FStringMiddle { + value: " {x!r}", + is_raw: false, + }, + 24..32, + ), + ( + FStringEnd, + 32..33, + ), + ( + Newline, + 33..33, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape.snap new file mode 100644 index 0000000000000..ac64a5c52ddb5 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape.snap @@ -0,0 +1,78 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + FStringMiddle { + value: "\\", + is_raw: false, + }, + 2..3, + ), + ( + Lbrace, + 3..4, + ), + ( + Name { + name: "x", + }, + 4..5, + ), + ( + Colon, + 5..6, + ), + ( + FStringMiddle { + value: "\\\"\\", + is_raw: false, + }, + 6..9, + ), + ( + Lbrace, + 9..10, + ), + ( + Name { + name: "x", + }, + 10..11, + ), + ( + Rbrace, + 11..12, + ), + ( + 
FStringMiddle { + value: "", + is_raw: false, + }, + 12..12, + ), + ( + Rbrace, + 12..13, + ), + ( + FStringMiddle { + value: " \\\"\\\"\\\n end", + is_raw: false, + }, + 13..24, + ), + ( + FStringEnd, + 24..25, + ), + ( + Newline, + 25..25, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape_raw.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape_raw.snap new file mode 100644 index 0000000000000..2f2fa03b5bec9 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape_raw.snap @@ -0,0 +1,78 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..3, + ), + ( + FStringMiddle { + value: "\\", + is_raw: true, + }, + 3..4, + ), + ( + Lbrace, + 4..5, + ), + ( + Name { + name: "x", + }, + 5..6, + ), + ( + Colon, + 6..7, + ), + ( + FStringMiddle { + value: "\\\"\\", + is_raw: true, + }, + 7..10, + ), + ( + Lbrace, + 10..11, + ), + ( + Name { + name: "x", + }, + 11..12, + ), + ( + Rbrace, + 12..13, + ), + ( + FStringMiddle { + value: "", + is_raw: true, + }, + 13..13, + ), + ( + Rbrace, + 13..14, + ), + ( + FStringMiddle { + value: " \\\"\\\"\\\n end", + is_raw: true, + }, + 14..25, + ), + ( + FStringEnd, + 25..26, + ), + ( + Newline, + 26..26, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_expression_multiline.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_expression_multiline.snap new file mode 100644 index 0000000000000..e6f6703a3ca76 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_expression_multiline.snap @@ -0,0 +1,72 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + FStringMiddle { + value: "first ", + is_raw: false, + 
}, + 2..8, + ), + ( + Lbrace, + 8..9, + ), + ( + NonLogicalNewline, + 9..10, + ), + ( + Name { + name: "x", + }, + 14..15, + ), + ( + NonLogicalNewline, + 15..16, + ), + ( + Star, + 24..25, + ), + ( + NonLogicalNewline, + 25..26, + ), + ( + Name { + name: "y", + }, + 38..39, + ), + ( + NonLogicalNewline, + 39..40, + ), + ( + Rbrace, + 40..41, + ), + ( + FStringMiddle { + value: " second", + is_raw: false, + }, + 41..48, + ), + ( + FStringEnd, + 48..49, + ), + ( + Newline, + 49..49, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_multiline.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_multiline.snap new file mode 100644 index 0000000000000..0b89f0e51d83e --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_multiline.snap @@ -0,0 +1,99 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..4, + ), + ( + FStringMiddle { + value: "\nhello\n world\n", + is_raw: false, + }, + 4..21, + ), + ( + FStringEnd, + 21..24, + ), + ( + FStringStart, + 25..29, + ), + ( + FStringMiddle { + value: "\n world\nhello\n", + is_raw: false, + }, + 29..46, + ), + ( + FStringEnd, + 46..49, + ), + ( + FStringStart, + 50..52, + ), + ( + FStringMiddle { + value: "some ", + is_raw: false, + }, + 52..57, + ), + ( + Lbrace, + 57..58, + ), + ( + FStringStart, + 58..62, + ), + ( + FStringMiddle { + value: "multiline\nallowed ", + is_raw: false, + }, + 62..80, + ), + ( + Lbrace, + 80..81, + ), + ( + Name { + name: "x", + }, + 81..82, + ), + ( + Rbrace, + 82..83, + ), + ( + FStringEnd, + 83..86, + ), + ( + Rbrace, + 86..87, + ), + ( + FStringMiddle { + value: " string", + is_raw: false, + }, + 87..94, + ), + ( + FStringEnd, + 94..95, + ), + ( + Newline, + 95..95, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode.snap 
b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode.snap new file mode 100644 index 0000000000000..0a9392ee7d415 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode.snap @@ -0,0 +1,25 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + FStringMiddle { + value: "\\N{BULLET} normal \\Nope \\N", + is_raw: false, + }, + 2..28, + ), + ( + FStringEnd, + 28..29, + ), + ( + Newline, + 29..29, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode_raw.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode_raw.snap new file mode 100644 index 0000000000000..f80b8761c683e --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode_raw.snap @@ -0,0 +1,46 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..3, + ), + ( + FStringMiddle { + value: "\\N", + is_raw: true, + }, + 3..5, + ), + ( + Lbrace, + 5..6, + ), + ( + Name { + name: "BULLET", + }, + 6..12, + ), + ( + Rbrace, + 12..13, + ), + ( + FStringMiddle { + value: " normal", + is_raw: true, + }, + 13..20, + ), + ( + FStringEnd, + 20..21, + ), + ( + Newline, + 21..21, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_nested.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_nested.snap new file mode 100644 index 0000000000000..7a4191cd63172 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_nested.snap @@ -0,0 +1,163 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + FStringMiddle { 
+ value: "foo ", + is_raw: false, + }, + 2..6, + ), + ( + Lbrace, + 6..7, + ), + ( + FStringStart, + 7..9, + ), + ( + FStringMiddle { + value: "bar ", + is_raw: false, + }, + 9..13, + ), + ( + Lbrace, + 13..14, + ), + ( + Name { + name: "x", + }, + 14..15, + ), + ( + Plus, + 16..17, + ), + ( + FStringStart, + 18..20, + ), + ( + Lbrace, + 20..21, + ), + ( + Name { + name: "wow", + }, + 21..24, + ), + ( + Rbrace, + 24..25, + ), + ( + FStringEnd, + 25..26, + ), + ( + Rbrace, + 26..27, + ), + ( + FStringEnd, + 27..28, + ), + ( + Rbrace, + 28..29, + ), + ( + FStringMiddle { + value: " baz", + is_raw: false, + }, + 29..33, + ), + ( + FStringEnd, + 33..34, + ), + ( + FStringStart, + 35..37, + ), + ( + FStringMiddle { + value: "foo ", + is_raw: false, + }, + 37..41, + ), + ( + Lbrace, + 41..42, + ), + ( + FStringStart, + 42..44, + ), + ( + FStringMiddle { + value: "bar", + is_raw: false, + }, + 44..47, + ), + ( + FStringEnd, + 47..48, + ), + ( + Rbrace, + 48..49, + ), + ( + FStringMiddle { + value: " some ", + is_raw: false, + }, + 49..55, + ), + ( + Lbrace, + 55..56, + ), + ( + FStringStart, + 56..58, + ), + ( + FStringMiddle { + value: "another", + is_raw: false, + }, + 58..65, + ), + ( + FStringEnd, + 65..66, + ), + ( + Rbrace, + 66..67, + ), + ( + FStringEnd, + 67..68, + ), + ( + Newline, + 68..68, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_parentheses.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_parentheses.snap new file mode 100644 index 0000000000000..1e4b829ac9db5 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_parentheses.snap @@ -0,0 +1,154 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + Lbrace, + 2..3, + ), + ( + Rbrace, + 3..4, + ), + ( + FStringEnd, + 4..5, + ), + ( + FStringStart, + 6..8, + ), + ( + FStringMiddle { + 
value: "{}", + is_raw: false, + }, + 8..12, + ), + ( + FStringEnd, + 12..13, + ), + ( + FStringStart, + 14..16, + ), + ( + FStringMiddle { + value: " ", + is_raw: false, + }, + 16..17, + ), + ( + Lbrace, + 17..18, + ), + ( + Rbrace, + 18..19, + ), + ( + FStringEnd, + 19..20, + ), + ( + FStringStart, + 21..23, + ), + ( + FStringMiddle { + value: "{", + is_raw: false, + }, + 23..25, + ), + ( + Lbrace, + 25..26, + ), + ( + Rbrace, + 26..27, + ), + ( + FStringMiddle { + value: "}", + is_raw: false, + }, + 27..29, + ), + ( + FStringEnd, + 29..30, + ), + ( + FStringStart, + 31..33, + ), + ( + FStringMiddle { + value: "{{}}", + is_raw: false, + }, + 33..41, + ), + ( + FStringEnd, + 41..42, + ), + ( + FStringStart, + 43..45, + ), + ( + FStringMiddle { + value: " ", + is_raw: false, + }, + 45..46, + ), + ( + Lbrace, + 46..47, + ), + ( + Rbrace, + 47..48, + ), + ( + FStringMiddle { + value: " {} {", + is_raw: false, + }, + 48..56, + ), + ( + Lbrace, + 56..57, + ), + ( + Rbrace, + 57..58, + ), + ( + FStringMiddle { + value: "} {{}} ", + is_raw: false, + }, + 58..71, + ), + ( + FStringEnd, + 71..72, + ), + ( + Newline, + 72..72, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_prefix.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_prefix.snap new file mode 100644 index 0000000000000..36e048ca40cfc --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_prefix.snap @@ -0,0 +1,90 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + FStringEnd, + 2..3, + ), + ( + FStringStart, + 4..6, + ), + ( + FStringEnd, + 6..7, + ), + ( + FStringStart, + 8..11, + ), + ( + FStringEnd, + 11..12, + ), + ( + FStringStart, + 13..16, + ), + ( + FStringEnd, + 16..17, + ), + ( + FStringStart, + 18..21, + ), + ( + FStringEnd, + 21..22, + ), + ( + FStringStart, + 23..26, + ), + ( + 
FStringEnd, + 26..27, + ), + ( + FStringStart, + 28..31, + ), + ( + FStringEnd, + 31..32, + ), + ( + FStringStart, + 33..36, + ), + ( + FStringEnd, + 36..37, + ), + ( + FStringStart, + 38..41, + ), + ( + FStringEnd, + 41..42, + ), + ( + FStringStart, + 43..46, + ), + ( + FStringEnd, + 46..47, + ), + ( + Newline, + 47..47, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_format_spec.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_format_spec.snap new file mode 100644 index 0000000000000..c664f564200db --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_format_spec.snap @@ -0,0 +1,222 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + Lbrace, + 2..3, + ), + ( + Name { + name: "foo", + }, + 3..6, + ), + ( + Colon, + 6..7, + ), + ( + FStringMiddle { + value: "", + is_raw: false, + }, + 7..7, + ), + ( + Rbrace, + 7..8, + ), + ( + FStringMiddle { + value: " ", + is_raw: false, + }, + 8..9, + ), + ( + Lbrace, + 9..10, + ), + ( + Name { + name: "x", + }, + 10..11, + ), + ( + Equal, + 11..12, + ), + ( + Exclamation, + 12..13, + ), + ( + Name { + name: "s", + }, + 13..14, + ), + ( + Colon, + 14..15, + ), + ( + FStringMiddle { + value: ".3f", + is_raw: false, + }, + 15..18, + ), + ( + Rbrace, + 18..19, + ), + ( + FStringMiddle { + value: " ", + is_raw: false, + }, + 19..20, + ), + ( + Lbrace, + 20..21, + ), + ( + Name { + name: "x", + }, + 21..22, + ), + ( + Colon, + 22..23, + ), + ( + FStringMiddle { + value: ".", + is_raw: false, + }, + 23..24, + ), + ( + Lbrace, + 24..25, + ), + ( + Name { + name: "y", + }, + 25..26, + ), + ( + Rbrace, + 26..27, + ), + ( + FStringMiddle { + value: "f", + is_raw: false, + }, + 27..28, + ), + ( + Rbrace, + 28..29, + ), + ( + FStringMiddle { + value: " ", + is_raw: false, + }, + 29..30, + ), + ( + 
Lbrace, + 30..31, + ), + ( + String { + value: "", + kind: String, + triple_quoted: false, + }, + 31..33, + ), + ( + Colon, + 33..34, + ), + ( + FStringMiddle { + value: "*^", + is_raw: false, + }, + 34..36, + ), + ( + Lbrace, + 36..37, + ), + ( + Int { + value: 1, + }, + 37..38, + ), + ( + Colon, + 38..39, + ), + ( + Lbrace, + 39..40, + ), + ( + Int { + value: 1, + }, + 40..41, + ), + ( + Rbrace, + 41..42, + ), + ( + FStringMiddle { + value: "", + is_raw: false, + }, + 42..42, + ), + ( + Rbrace, + 42..43, + ), + ( + FStringMiddle { + value: "", + is_raw: false, + }, + 43..43, + ), + ( + Rbrace, + 43..44, + ), + ( + FStringEnd, + 44..45, + ), + ( + Newline, + 45..45, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_ipy_escape_command.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_ipy_escape_command.snap new file mode 100644 index 0000000000000..89911851aa022 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_ipy_escape_command.snap @@ -0,0 +1,50 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + FStringMiddle { + value: "foo ", + is_raw: false, + }, + 2..6, + ), + ( + Lbrace, + 6..7, + ), + ( + Exclamation, + 7..8, + ), + ( + Name { + name: "pwd", + }, + 8..11, + ), + ( + Rbrace, + 11..12, + ), + ( + FStringMiddle { + value: " bar", + is_raw: false, + }, + 12..16, + ), + ( + FStringEnd, + 16..17, + ), + ( + Newline, + 17..17, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_lambda_expression.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_lambda_expression.snap new file mode 100644 index 0000000000000..9cf32a25ccc28 --- /dev/null +++ 
b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_lambda_expression.snap @@ -0,0 +1,117 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + Lbrace, + 2..3, + ), + ( + Lambda, + 3..9, + ), + ( + Name { + name: "x", + }, + 10..11, + ), + ( + Colon, + 11..12, + ), + ( + Lbrace, + 12..13, + ), + ( + Name { + name: "x", + }, + 13..14, + ), + ( + Rbrace, + 14..15, + ), + ( + FStringMiddle { + value: "", + is_raw: false, + }, + 15..15, + ), + ( + Rbrace, + 15..16, + ), + ( + FStringEnd, + 16..17, + ), + ( + Newline, + 17..18, + ), + ( + FStringStart, + 18..20, + ), + ( + Lbrace, + 20..21, + ), + ( + Lpar, + 21..22, + ), + ( + Lambda, + 22..28, + ), + ( + Name { + name: "x", + }, + 29..30, + ), + ( + Colon, + 30..31, + ), + ( + Lbrace, + 31..32, + ), + ( + Name { + name: "x", + }, + 32..33, + ), + ( + Rbrace, + 33..34, + ), + ( + Rpar, + 34..35, + ), + ( + Rbrace, + 35..36, + ), + ( + FStringEnd, + 36..37, + ), + ( + Newline, + 37..37, + ), +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_named_expression.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_named_expression.snap new file mode 100644 index 0000000000000..aa551e4d73f3a --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_named_expression.snap @@ -0,0 +1,170 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + ( + FStringStart, + 0..2, + ), + ( + Lbrace, + 2..3, + ), + ( + Name { + name: "x", + }, + 3..4, + ), + ( + Colon, + 4..5, + ), + ( + FStringMiddle { + value: "=10", + is_raw: false, + }, + 5..8, + ), + ( + Rbrace, + 8..9, + ), + ( + FStringMiddle { + value: " ", + is_raw: false, + }, + 9..10, + ), + ( + Lbrace, + 10..11, + ), + ( + Lpar, + 11..12, + ), + ( + Name { + name: "x", + }, + 12..13, 
+ ), + ( + ColonEqual, + 13..15, + ), + ( + Int { + value: 10, + }, + 15..17, + ), + ( + Rpar, + 17..18, + ), + ( + Rbrace, + 18..19, + ), + ( + FStringMiddle { + value: " ", + is_raw: false, + }, + 19..20, + ), + ( + Lbrace, + 20..21, + ), + ( + Name { + name: "x", + }, + 21..22, + ), + ( + Comma, + 22..23, + ), + ( + Lbrace, + 23..24, + ), + ( + Name { + name: "y", + }, + 24..25, + ), + ( + ColonEqual, + 25..27, + ), + ( + Int { + value: 10, + }, + 27..29, + ), + ( + Rbrace, + 29..30, + ), + ( + Rbrace, + 30..31, + ), + ( + FStringMiddle { + value: " ", + is_raw: false, + }, + 31..32, + ), + ( + Lbrace, + 32..33, + ), + ( + Lsqb, + 33..34, + ), + ( + Name { + name: "x", + }, + 34..35, + ), + ( + ColonEqual, + 35..37, + ), + ( + Int { + value: 10, + }, + 37..39, + ), + ( + Rsqb, + 39..40, + ), + ( + Rbrace, + 40..41, + ), + ( + FStringEnd, + 41..42, + ), + ( + Newline, + 42..42, + ), +] diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 0524473f1252e..a563ad8ac001e 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -733,6 +733,8 @@ pub enum FStringErrorType { // TODO: Test this case. /// Unterminated string. UnterminatedString, + /// Unterminated triple-quoted string. 
+ UnterminatedTripleQuotedString, } impl std::fmt::Display for FStringErrorType { @@ -740,7 +742,7 @@ impl std::fmt::Display for FStringErrorType { use FStringErrorType::{ EmptyExpression, ExpressionCannotInclude, ExpressionNestedTooDeeply, InvalidConversionFlag, InvalidExpression, MismatchedDelimiter, SingleRbrace, - UnclosedLbrace, Unmatched, UnterminatedString, + UnclosedLbrace, Unmatched, UnterminatedString, UnterminatedTripleQuotedString, }; match self { UnclosedLbrace => write!(f, "expecting '}}'"), @@ -761,6 +763,9 @@ impl std::fmt::Display for FStringErrorType { UnterminatedString => { write!(f, "unterminated string") } + UnterminatedTripleQuotedString => { + write!(f, "unterminated triple-quoted string") + } ExpressionCannotInclude(c) => { if *c == '\\' { write!(f, "f-string expression part cannot include a backslash") diff --git a/crates/ruff_python_parser/src/token.rs b/crates/ruff_python_parser/src/token.rs index 9bec604b8d495..f48f02df1dad1 100644 --- a/crates/ruff_python_parser/src/token.rs +++ b/crates/ruff_python_parser/src/token.rs @@ -44,6 +44,19 @@ pub enum Tok { /// Whether the string is triple quoted. triple_quoted: bool, }, + /// Token value for the start of an f-string. This includes the `f`/`F`/`fr` prefix + /// and the opening quote(s). + FStringStart, + /// Token value that includes the portion of text inside the f-string that's not + /// part of the expression part and isn't an opening or closing brace. + FStringMiddle { + /// The string value. + value: String, + /// Whether the string is raw or not. + is_raw: bool, + }, + /// Token value for the end of an f-string. This includes the closing quote. + FStringEnd, /// Token value for IPython escape commands. These are recognized by the lexer /// only when the mode is [`Mode::Ipython`]. IpyEscapeCommand { @@ -66,6 +79,8 @@ pub enum Tok { EndOfFile, /// Token value for a question mark `?`. This is only used in [`Mode::Ipython`]. Question, + /// Token value for a exclamation mark `!`. 
+ Exclamation, /// Token value for a left parenthesis `(`. Lpar, /// Token value for a right parenthesis `)`. @@ -234,6 +249,9 @@ impl fmt::Display for Tok { let quotes = "\"".repeat(if *triple_quoted { 3 } else { 1 }); write!(f, "{kind}{quotes}{value}{quotes}") } + FStringStart => f.write_str("FStringStart"), + FStringMiddle { value, .. } => f.write_str(value), + FStringEnd => f.write_str("FStringEnd"), IpyEscapeCommand { kind, value } => write!(f, "{kind}{value}"), Newline => f.write_str("Newline"), NonLogicalNewline => f.write_str("NonLogicalNewline"), @@ -243,6 +261,7 @@ impl fmt::Display for Tok { StartExpression => f.write_str("StartExpression"), EndOfFile => f.write_str("EOF"), Question => f.write_str("'?'"), + Exclamation => f.write_str("'!'"), Lpar => f.write_str("'('"), Rpar => f.write_str("')'"), Lsqb => f.write_str("'['"), @@ -450,6 +469,14 @@ pub enum TokenKind { Complex, /// Token value for a string. String, + /// Token value for the start of an f-string. This includes the `f`/`F`/`fr` prefix + /// and the opening quote(s). + FStringStart, + /// Token value that includes the portion of text inside the f-string that's not + /// part of the expression part and isn't an opening or closing brace. + FStringMiddle, + /// Token value for the end of an f-string. This includes the closing quote. + FStringEnd, /// Token value for a IPython escape command. EscapeCommand, /// Token value for a comment. These are filtered out of the token stream prior to parsing. @@ -466,6 +493,8 @@ pub enum TokenKind { EndOfFile, /// Token value for a question mark `?`. Question, + /// Token value for an exclamation mark `!`. + Exclamation, /// Token value for a left parenthesis `(`. Lpar, /// Token value for a right parenthesis `)`. @@ -781,6 +810,9 @@ impl TokenKind { Tok::Float { .. } => TokenKind::Float, Tok::Complex { .. } => TokenKind::Complex, Tok::String { .. } => TokenKind::String, + Tok::FStringStart => TokenKind::FStringStart, + Tok::FStringMiddle { .. 
} => TokenKind::FStringMiddle, + Tok::FStringEnd => TokenKind::FStringEnd, Tok::IpyEscapeCommand { .. } => TokenKind::EscapeCommand, Tok::Comment(_) => TokenKind::Comment, Tok::Newline => TokenKind::Newline, @@ -789,6 +821,7 @@ impl TokenKind { Tok::Dedent => TokenKind::Dedent, Tok::EndOfFile => TokenKind::EndOfFile, Tok::Question => TokenKind::Question, + Tok::Exclamation => TokenKind::Exclamation, Tok::Lpar => TokenKind::Lpar, Tok::Rpar => TokenKind::Rpar, Tok::Lsqb => TokenKind::Lsqb,