From a42b40c7e98ef702f6ff27cc061ad75762912a4e Mon Sep 17 00:00:00 2001 From: Dhruv Manilawala Date: Thu, 17 Aug 2023 20:09:06 +0530 Subject: [PATCH] Add support for the new f-string tokens per PEP 701 --- Cargo.lock | 1 + crates/ruff_python_parser/Cargo.toml | 1 + crates/ruff_python_parser/src/lexer.rs | 394 +++++++++++++++++- crates/ruff_python_parser/src/lexer/cursor.rs | 12 + .../ruff_python_parser/src/lexer/fstring.rs | 133 ++++++ ..._parser__lexer__tests__empty_fstrings.snap | 27 ++ ..._python_parser__lexer__tests__fstring.snap | 40 ++ ...arser__lexer__tests__fstring_comments.snap | 27 ++ ...ser__lexer__tests__fstring_conversion.snap | 50 +++ ..._parser__lexer__tests__fstring_escape.snap | 32 ++ ...ser__lexer__tests__fstring_escape_raw.snap | 32 ++ ...__tests__fstring_expression_multiline.snap | 30 ++ ...rser__lexer__tests__fstring_multiline.snap | 42 ++ ...__lexer__tests__fstring_named_unicode.snap | 13 + ...xer__tests__fstring_named_unicode_raw.snap | 22 + ..._parser__lexer__tests__fstring_nested.snap | 64 +++ ...er__lexer__tests__fstring_parentheses.snap | 61 +++ ..._parser__lexer__tests__fstring_prefix.snap | 27 ++ ...exer__tests__fstring_with_format_spec.snap | 84 ++++ ...ests__fstring_with_ipy_escape_command.snap | 23 + ..._tests__fstring_with_named_expression.snap | 68 +++ crates/ruff_python_parser/src/string.rs | 7 +- crates/ruff_python_parser/src/token.rs | 33 ++ 23 files changed, 1213 insertions(+), 10 deletions(-) create mode 100644 crates/ruff_python_parser/src/lexer/fstring.rs create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__empty_fstrings.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_comments.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_conversion.snap create mode 100644 
crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape_raw.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_expression_multiline.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_multiline.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode_raw.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_nested.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_parentheses.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_prefix.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_format_spec.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_ipy_escape_command.snap create mode 100644 crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_named_expression.snap diff --git a/Cargo.lock b/Cargo.lock index 74123f7491db82..26544c9c53a364 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2384,6 +2384,7 @@ name = "ruff_python_parser" version = "0.0.0" dependencies = [ "anyhow", + "bitflags 2.3.3", "insta", "is-macro", "itertools", diff --git a/crates/ruff_python_parser/Cargo.toml b/crates/ruff_python_parser/Cargo.toml index 02b27577b2e667..c311d34e58c228 100644 --- a/crates/ruff_python_parser/Cargo.toml +++ b/crates/ruff_python_parser/Cargo.toml @@ -18,6 +18,7 @@ ruff_python_ast = { path = "../ruff_python_ast" } ruff_text_size = { path = 
"../ruff_text_size" } anyhow = { workspace = true } +bitflags = { workspace = true } is-macro = { workspace = true } itertools = { workspace = true } lalrpop-util = { version = "0.20.0", default-features = false } diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 1691fc6c49a340..e3c4b641c3a935 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -40,6 +40,7 @@ use unic_emoji_char::is_emoji_presentation; use unic_ucd_ident::{is_xid_continue, is_xid_start}; use crate::lexer::cursor::{Cursor, EOF_CHAR}; +use crate::lexer::fstring::{FStringContext, FStringContextFlags}; use crate::lexer::indentation::{Indentation, Indentations}; use crate::{ soft_keywords::SoftKeywordTransformer, @@ -49,6 +50,7 @@ use crate::{ }; mod cursor; +mod fstring; mod indentation; /// A lexer for Python source code. @@ -65,6 +67,8 @@ pub struct Lexer<'source> { pending_indentation: Option, // Lexer mode. mode: Mode, + + fstring_stack: Vec, } /// Contains a Token along with its `range`. @@ -157,6 +161,7 @@ impl<'source> Lexer<'source> { source: input, cursor: Cursor::new(input), mode, + fstring_stack: vec![], }; // TODO: Handle possible mismatch between BOM and explicit encoding declaration. // spell-checker:ignore feff @@ -168,16 +173,24 @@ impl<'source> Lexer<'source> { /// Lex an identifier. Also used for keywords and string/bytes literals with a prefix. 
fn lex_identifier(&mut self, first: char) -> Result { // Detect potential string like rb'' b'' f'' u'' r'' - match self.cursor.first() { - quote @ ('\'' | '"') => { + match (first, self.cursor.first()) { + ('f' | 'F', quote @ ('\'' | '"')) => { + self.cursor.bump(); + return Ok(self.lex_fstring_start(quote, false)); + } + ('r' | 'R', 'f' | 'F') | ('f' | 'F', 'r' | 'R') if is_quote(self.cursor.second()) => { + self.cursor.bump(); + let quote = self.cursor.bump().unwrap(); + return Ok(self.lex_fstring_start(quote, true)); + } + (_, quote @ ('\'' | '"')) => { if let Ok(string_kind) = StringKind::try_from(first) { self.cursor.bump(); return self.lex_string(string_kind, quote); } } - second @ ('f' | 'F' | 'r' | 'R' | 'b' | 'B') if is_quote(self.cursor.second()) => { + (_, second @ ('r' | 'R' | 'b' | 'B')) if is_quote(self.cursor.second()) => { self.cursor.bump(); - if let Ok(string_kind) = StringKind::try_from([first, second]) { let quote = self.cursor.bump().unwrap(); return self.lex_string(string_kind, quote); @@ -508,6 +521,147 @@ impl<'source> Lexer<'source> { } } + /// Lex a f-string start token. + fn lex_fstring_start(&mut self, quote: char, is_raw_string: bool) -> Tok { + #[cfg(debug_assertions)] + debug_assert_eq!(self.cursor.previous(), quote); + + let mut flags = FStringContextFlags::empty(); + if quote == '"' { + flags |= FStringContextFlags::DOUBLE; + } + if is_raw_string { + flags |= FStringContextFlags::RAW; + } + if self.cursor.eat_char2(quote, quote) { + flags |= FStringContextFlags::TRIPLE; + }; + + self.fstring_stack.push(FStringContext::new(flags)); + Tok::FStringStart + } + + fn lex_fstring_middle_or_end(&mut self) -> Result, LexicalError> { + // SAFETY: Safe because the function is only called when `self.fstring_stack` is not empty. + let context = self.fstring_stack.last().unwrap(); + + // Check if we're at the end of the f-string. 
+ if context.is_triple_quoted() { + let quote_char = context.quote_char(); + if self.cursor.eat_char3(quote_char, quote_char, quote_char) { + return Ok(Some(Tok::FStringEnd)); + } + } else if self.cursor.eat_char(context.quote_char()) { + return Ok(Some(Tok::FStringEnd)); + } + + // The normalized string if the token value is not yet normalized. + // This must remain empty if it's already normalized. Normalization + // is to replace `{{` and `}}` with `{` and `}` respectively. + let mut normalized = String::new(); + + // Tracks the last offset of token value that has been written to `normalized`. + let mut last_offset = self.offset(); + + let mut in_named_unicode = false; + + loop { + match self.cursor.first() { + EOF_CHAR => { + let error = if context.is_triple_quoted() { + FStringErrorType::UnterminatedTripleQuotedString + } else { + FStringErrorType::UnterminatedString + }; + // This is to avoid infinite loop where the lexer keeps returning + // the error token. + self.fstring_stack.pop(); + return Err(LexicalError { + error: LexicalErrorType::FStringError(error), + location: self.offset(), + }); + } + '\n' if !context.is_triple_quoted() => { + // This is to avoid infinite loop where the lexer keeps returning + // the error token. + self.fstring_stack.pop(); + return Err(LexicalError { + error: LexicalErrorType::FStringError(FStringErrorType::UnterminatedString), + location: self.offset(), + }); + } + '\\' => { + self.cursor.bump(); // '\' + if matches!(self.cursor.first(), '{' | '}') { + // Don't consume `{` or `}` as we want them to be consumed as tokens. + break; + } else if !context.is_raw_string() { + if self.cursor.eat_char2('N', '{') { + in_named_unicode = true; + continue; + } + } + // Consume the escaped character. 
+ self.cursor.bump(); + } + quote @ ('\'' | '"') if quote == context.quote_char() => { + if let Some(triple_quotes) = context.triple_quotes() { + if self.cursor.rest().starts_with(triple_quotes) { + break; + } + self.cursor.bump(); + } else { + break; + } + } + '{' => { + if self.cursor.second() == '{' { + self.cursor.bump(); + normalized + .push_str(&self.source[TextRange::new(last_offset, self.offset())]); + self.cursor.bump(); // Skip the second `{` + last_offset = self.offset(); + } else { + break; + } + } + '}' => { + if in_named_unicode { + in_named_unicode = false; + self.cursor.bump(); + } else if self.cursor.second() == '}' && !context.is_in_format_spec() { + self.cursor.bump(); + normalized + .push_str(&self.source[TextRange::new(last_offset, self.offset())]); + self.cursor.bump(); // Skip the second `}` + last_offset = self.offset(); + } else { + break; + } + } + _ => { + self.cursor.bump(); + } + } + } + + let range = self.token_range(); + if range.is_empty() { + return Ok(None); + } + + let value = if normalized.is_empty() { + self.source[range].to_string() + } else { + normalized.push_str(&self.source[TextRange::new(last_offset, self.offset())]); + normalized + }; + Ok(Some(Tok::FStringMiddle { + value, + is_raw: context.is_raw_string(), + })) + } + /// Lex a string literal. fn lex_string(&mut self, kind: StringKind, quote: char) -> Result { #[cfg(debug_assertions)] @@ -529,6 +683,21 @@ impl<'source> Lexer<'source> { } } Some('\r' | '\n') if !triple_quoted => { + if let Some(fstring_context) = self.fstring_stack.last() { + // When we are in an f-string, check whether does the initial quote + // matches with f-strings quotes and if it is, then this must be a + // missing '}' token so raise the proper error. 
+ if fstring_context.quote_char() == quote + && !fstring_context.is_triple_quoted() + { + return Err(LexicalError { + error: LexicalErrorType::FStringError( + FStringErrorType::UnclosedLbrace, + ), + location: self.offset() - fstring_context.quote_size(), + }); + } + } return Err(LexicalError { error: LexicalErrorType::OtherError( "EOL while scanning string literal".to_owned(), @@ -548,6 +717,21 @@ impl<'source> Lexer<'source> { Some(_) => {} None => { + if let Some(fstring_context) = self.fstring_stack.last() { + // When we are in an f-string, check whether does the initial quote + // matches with f-strings quotes and if it is, then this must be a + // missing '}' token so raise the proper error. + if fstring_context.quote_char() == quote + && fstring_context.is_triple_quoted() == triple_quoted + { + return Err(LexicalError { + error: LexicalErrorType::FStringError( + FStringErrorType::UnclosedLbrace, + ), + location: self.offset() - fstring_context.quote_size(), + }); + } + } return Err(LexicalError { error: if triple_quoted { LexicalErrorType::Eof @@ -571,6 +755,18 @@ impl<'source> Lexer<'source> { // This is the main entry point. Call this function to retrieve the next token. // This function is used by the iterator implementation. pub fn next_token(&mut self) -> LexResult { + if let Some(fstring_context) = self.fstring_stack.last() { + if !fstring_context.is_in_expression() { + self.cursor.start_token(); + if let Some(tok) = self.lex_fstring_middle_or_end()? { + if matches!(tok, Tok::FStringEnd) { + self.fstring_stack.pop(); + } + return Ok((tok, self.token_range())); + } + } + } + // Return dedent tokens until the current indentation level matches the indentation of the next token. 
if let Some(indentation) = self.pending_indentation.take() { if let Ok(Ordering::Greater) = self.indentations.current().try_compare(indentation) { @@ -841,39 +1037,66 @@ impl<'source> Lexer<'source> { if self.cursor.eat_char('=') { Tok::NotEqual } else { - return Err(LexicalError { - error: LexicalErrorType::UnrecognizedToken { tok: '!' }, - location: self.token_start(), - }); + Tok::Exclamation } } '~' => Tok::Tilde, '(' => { + if let Some(fstring_context) = self.fstring_stack.last_mut() { + fstring_context.increment_opening_parentheses(); + } self.nesting += 1; Tok::Lpar } ')' => { + if let Some(fstring_context) = self.fstring_stack.last_mut() { + fstring_context.decrement_closing_parentheses(); + } self.nesting = self.nesting.saturating_sub(1); Tok::Rpar } '[' => { + if let Some(fstring_context) = self.fstring_stack.last_mut() { + fstring_context.increment_opening_parentheses(); + } self.nesting += 1; Tok::Lsqb } ']' => { + if let Some(fstring_context) = self.fstring_stack.last_mut() { + fstring_context.decrement_closing_parentheses(); + } self.nesting = self.nesting.saturating_sub(1); Tok::Rsqb } '{' => { + if let Some(fstring_context) = self.fstring_stack.last_mut() { + fstring_context.increment_opening_parentheses(); + } self.nesting += 1; Tok::Lbrace } '}' => { + if let Some(fstring_context) = self.fstring_stack.last_mut() { + if !fstring_context.has_open_parentheses() { + return Err(LexicalError { + error: LexicalErrorType::FStringError(FStringErrorType::SingleRbrace), + location: self.token_start(), + }); + } + fstring_context.decrement_closing_parentheses(); + } self.nesting = self.nesting.saturating_sub(1); Tok::Rbrace } ':' => { - if self.cursor.eat_char('=') { + if self + .fstring_stack + .last_mut() + .is_some_and(FStringContext::try_start_format_spec) + { + Tok::Colon + } else if self.cursor.eat_char('=') { Tok::ColonEqual } else { Tok::Colon @@ -1680,4 +1903,157 @@ def f(arg=%timeit a = b): let source = "[1"; let _ = lex(source, 
Mode::Module).collect::>(); } + + #[test] + fn test_empty_fstrings() { + let source = r#"f"" "" F"" f'' '' f"""""" f''''''"#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_prefix() { + let source = r#"f"" F"" rf"" rF"" Rf"" RF"" fr"" Fr"" fR"" FR"""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring() { + let source = r#"f"normal {foo} {{another}} {bar} {{{three}}}""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_parentheses() { + let source = r#"f"{}" f"{{}}" f" {}" f"{{{}}}" f"{{{{}}}}" f" {} {{}} {{{}}} {{{{}}}} ""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_escape() { + let source = r#"f"\{x:\"\{x}} \"\"\ + end""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_escape_raw() { + let source = r#"rf"\{x:\"\{x}} \"\"\ + end""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_named_unicode() { + let source = r#"f"\N{BULLET} normal \Nope \N""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_named_unicode_raw() { + let source = r#"rf"\N{BULLET} normal""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_with_named_expression() { + let source = r#"f"{x:=10} {(x:=10)} {x,{y:=10}} {[x:=10]}""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_with_format_spec() { + let source = r#"f"{foo:} {x=!s:.3f} {x:.{y}f} {'':*^{1:{1}}}""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_conversion() { + let source = r#"f"{x!s} {x=!r} {x:.3f!r} {{x!r}}""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_nested() { + let source = r#"f"foo {f"bar {x + f"{wow}"}"} baz" f'foo {f'bar'} some {f"another"}'"#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_expression_multiline() { + let source = 
r#"f"first { + x + * + y +} second""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_multiline() { + let source = r#"f""" +hello + world +""" f''' + world +hello +''' f"some {f"""multiline +allowed {x}"""} string""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_comments() { + let source = r#"f""" +# not a comment { # comment { + x +} # not a comment +""""#; + assert_debug_snapshot!(lex_source(source)); + } + + #[test] + fn test_fstring_with_ipy_escape_command() { + let source = r#"f"foo {!pwd} bar""#; + assert_debug_snapshot!(lex_source(source)); + } + + fn lex_fstring_error(source: &str) -> FStringErrorType { + match lex(source, Mode::Module).find_map(std::result::Result::err) { + Some(err) => match err.error { + LexicalErrorType::FStringError(error) => error, + _ => panic!("Expected FStringError: {err:?}"), + }, + _ => panic!("Expected atleast one FStringError"), + } + } + + #[test] + fn test_fstring_error() { + use FStringErrorType::{ + UnclosedLbrace, UnterminatedString, UnterminatedTripleQuotedString, + }; + + assert_eq!(lex_fstring_error(r#"f"{""#), UnclosedLbrace); + assert_eq!(lex_fstring_error(r#"f"{foo!r""#), UnclosedLbrace); + assert_eq!( + lex_fstring_error( + r#"f"{" +"# + ), + UnclosedLbrace + ); + assert_eq!(lex_fstring_error(r#"f"""{""""#), UnclosedLbrace); + assert_eq!(lex_fstring_error(r#"f""#), UnterminatedString); + assert_eq!(lex_fstring_error(r#"f'"#), UnterminatedString); + assert_eq!(lex_fstring_error(r#"f""""#), UnterminatedTripleQuotedString); + assert_eq!(lex_fstring_error(r#"f'''"#), UnterminatedTripleQuotedString); + assert_eq!( + lex_fstring_error(r#"f"""""#), + UnterminatedTripleQuotedString + ); + assert_eq!( + lex_fstring_error(r#"f""""""#), + UnterminatedTripleQuotedString + ); + } } diff --git a/crates/ruff_python_parser/src/lexer/cursor.rs b/crates/ruff_python_parser/src/lexer/cursor.rs index ed5011a1d85075..c026c88e9b7fb1 100644 --- 
a/crates/ruff_python_parser/src/lexer/cursor.rs +++ b/crates/ruff_python_parser/src/lexer/cursor.rs @@ -96,6 +96,18 @@ impl<'a> Cursor<'a> { } } + pub(super) fn eat_char3(&mut self, c1: char, c2: char, c3: char) -> bool { + let mut chars = self.chars.clone(); + if chars.next() == Some(c1) && chars.next() == Some(c2) && chars.next() == Some(c3) { + self.bump(); + self.bump(); + self.bump(); + true + } else { + false + } + } + pub(super) fn eat_if(&mut self, mut predicate: F) -> Option where F: FnMut(char) -> bool, diff --git a/crates/ruff_python_parser/src/lexer/fstring.rs b/crates/ruff_python_parser/src/lexer/fstring.rs new file mode 100644 index 00000000000000..586d252b3d5ca6 --- /dev/null +++ b/crates/ruff_python_parser/src/lexer/fstring.rs @@ -0,0 +1,133 @@ +use bitflags::bitflags; + +use ruff_text_size::TextSize; + +bitflags! { + #[derive(Debug)] + pub(crate) struct FStringContextFlags: u32 { + /// The current f-string is a triple-quoted f-string i.e., the number of + /// opening and closing quotes is 3. If this flag is not set, the number + /// of opening and closing quotes is 1. + const TRIPLE = 1 << 0; + + /// The current f-string is a double-quoted f-string. If this flag is not + /// set, the current f-string is a single-quoted f-string. + const DOUBLE = 1 << 1; + + /// The current f-string is a raw f-string. If this flag is not set, the + /// current f-string is a non-raw f-string. + const RAW = 1 << 2; + } +} + +#[derive(Debug)] +pub(crate) struct FStringContext { + flags: FStringContextFlags, + /// The number of open parentheses for the current f-string. This includes all + /// three types of parentheses: round (`(`), square (`[`), and curly (`{`). + open_parentheses_count: u32, + /// The number of format specs for the current f-string. This is because there + /// can be multiple format specs nested. For example, `{a:{b:{c}}}` has 3 format + /// specs. 
+ format_spec_depth: u32, +} + +impl FStringContext { + pub(crate) fn new(flags: FStringContextFlags) -> Self { + Self { + flags, + open_parentheses_count: 0, + format_spec_depth: 0, + } + } + + /// Returns the quote character for the current f-string. + pub(crate) fn quote_char(&self) -> char { + if self.flags.contains(FStringContextFlags::DOUBLE) { + '"' + } else { + '\'' + } + } + + /// Returns the number of quotes for the current f-string. + pub(crate) fn quote_size(&self) -> TextSize { + if self.is_triple_quoted() { + TextSize::from(3) + } else { + TextSize::from(1) + } + } + + /// Returns the triple quotes for the current f-string if it is a triple-quoted + /// f-string, `None` otherwise. + pub(crate) fn triple_quotes(&self) -> Option<&'static str> { + if self.is_triple_quoted() { + if self.flags.contains(FStringContextFlags::DOUBLE) { + Some(r#"""""#) + } else { + Some("'''") + } + } else { + None + } + } + + /// Returns `true` if the current f-string is a raw f-string. + pub(crate) fn is_raw_string(&self) -> bool { + self.flags.contains(FStringContextFlags::RAW) + } + + /// Returns `true` if the current f-string is a triple-quoted f-string. + pub(crate) fn is_triple_quoted(&self) -> bool { + self.flags.contains(FStringContextFlags::TRIPLE) + } + + /// Returns `true` if the current f-string has open parentheses. + pub(crate) fn has_open_parentheses(&mut self) -> bool { + self.open_parentheses_count > 0 + } + + /// Increments the number of parentheses for the current f-string. + pub(crate) fn increment_opening_parentheses(&mut self) { + self.open_parentheses_count += 1; + } + + /// Decrements the number of parentheses for the current f-string. If the + /// lexer is in a format spec, also decrements the number of format specs. 
+ pub(crate) fn decrement_closing_parentheses(&mut self) { + if self.is_in_format_spec() { + self.format_spec_depth = self.format_spec_depth.saturating_sub(1); + } + self.open_parentheses_count = self.open_parentheses_count.saturating_sub(1); + } + + /// Returns `true` if the lexer is in a f-string expression i.e., between + /// two curly braces. + pub(crate) fn is_in_expression(&self) -> bool { + self.open_parentheses_count > self.format_spec_depth + } + + /// Returns `true` if the lexer is in a f-string format spec i.e., after a colon. + pub(crate) fn is_in_format_spec(&self) -> bool { + self.format_spec_depth > 0 && !self.is_in_expression() + } + + /// Returns `true` if the context is in a valid position to start format spec + /// i.e., at the same level of nesting as the opening parentheses token. + /// Increments the number of format specs if it is. + /// + /// This assumes that the current character for the lexer is a colon (`:`). + pub(crate) fn try_start_format_spec(&mut self) -> bool { + if self + .open_parentheses_count + .saturating_sub(self.format_spec_depth) + == 1 + { + self.format_spec_depth += 1; + true + } else { + false + } + } +} diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__empty_fstrings.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__empty_fstrings.snap new file mode 100644 index 00000000000000..d854312164f550 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__empty_fstrings.snap @@ -0,0 +1,27 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + FStringEnd, + String { + value: "", + kind: String, + triple_quoted: false, + }, + FStringStart, + FStringEnd, + FStringStart, + FStringEnd, + String { + value: "", + kind: String, + triple_quoted: false, + }, + FStringStart, + FStringEnd, + FStringStart, + FStringEnd, + Newline, +] diff --git 
a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring.snap new file mode 100644 index 00000000000000..15eaed4907f48d --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring.snap @@ -0,0 +1,40 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + FStringMiddle { + value: "normal ", + is_raw: false, + }, + Lbrace, + Name { + name: "foo", + }, + Rbrace, + FStringMiddle { + value: " {another} ", + is_raw: false, + }, + Lbrace, + Name { + name: "bar", + }, + Rbrace, + FStringMiddle { + value: " {", + is_raw: false, + }, + Lbrace, + Name { + name: "three", + }, + Rbrace, + FStringMiddle { + value: "}", + is_raw: false, + }, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_comments.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_comments.snap new file mode 100644 index 00000000000000..d7602c5a95e7d6 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_comments.snap @@ -0,0 +1,27 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + FStringMiddle { + value: "\n# not a comment ", + is_raw: false, + }, + Lbrace, + Comment( + "# comment {", + ), + NonLogicalNewline, + Name { + name: "x", + }, + NonLogicalNewline, + Rbrace, + FStringMiddle { + value: " # not a comment\n", + is_raw: false, + }, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_conversion.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_conversion.snap new file mode 100644 index 00000000000000..c65bc4eccd30d8 --- /dev/null +++ 
b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_conversion.snap @@ -0,0 +1,50 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + Lbrace, + Name { + name: "x", + }, + Exclamation, + Name { + name: "s", + }, + Rbrace, + FStringMiddle { + value: " ", + is_raw: false, + }, + Lbrace, + Name { + name: "x", + }, + Equal, + Exclamation, + Name { + name: "r", + }, + Rbrace, + FStringMiddle { + value: " ", + is_raw: false, + }, + Lbrace, + Name { + name: "x", + }, + Colon, + FStringMiddle { + value: ".3f!r", + is_raw: false, + }, + Rbrace, + FStringMiddle { + value: " {x!r}", + is_raw: false, + }, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape.snap new file mode 100644 index 00000000000000..c2bb475599111a --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape.snap @@ -0,0 +1,32 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + FStringMiddle { + value: "\\", + is_raw: false, + }, + Lbrace, + Name { + name: "x", + }, + Colon, + FStringMiddle { + value: "\\\"\\", + is_raw: false, + }, + Lbrace, + Name { + name: "x", + }, + Rbrace, + Rbrace, + FStringMiddle { + value: " \\\"\\\"\\\n end", + is_raw: false, + }, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape_raw.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape_raw.snap new file mode 100644 index 00000000000000..989196d56ccf1d --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_escape_raw.snap @@ -0,0 +1,32 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs 
+expression: lex_source(source) +--- +[ + FStringStart, + FStringMiddle { + value: "\\", + is_raw: true, + }, + Lbrace, + Name { + name: "x", + }, + Colon, + FStringMiddle { + value: "\\\"\\", + is_raw: true, + }, + Lbrace, + Name { + name: "x", + }, + Rbrace, + Rbrace, + FStringMiddle { + value: " \\\"\\\"\\\n end", + is_raw: true, + }, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_expression_multiline.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_expression_multiline.snap new file mode 100644 index 00000000000000..1abd2fca4d7c38 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_expression_multiline.snap @@ -0,0 +1,30 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + FStringMiddle { + value: "first ", + is_raw: false, + }, + Lbrace, + NonLogicalNewline, + Name { + name: "x", + }, + NonLogicalNewline, + Star, + NonLogicalNewline, + Name { + name: "y", + }, + NonLogicalNewline, + Rbrace, + FStringMiddle { + value: " second", + is_raw: false, + }, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_multiline.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_multiline.snap new file mode 100644 index 00000000000000..da37f282676550 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_multiline.snap @@ -0,0 +1,42 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + FStringMiddle { + value: "\nhello\n world\n", + is_raw: false, + }, + FStringEnd, + FStringStart, + FStringMiddle { + value: "\n world\nhello\n", + is_raw: false, + }, + FStringEnd, + FStringStart, + FStringMiddle { + value: "some ", + is_raw: false, + }, + 
Lbrace, + FStringStart, + FStringMiddle { + value: "multiline\nallowed ", + is_raw: false, + }, + Lbrace, + Name { + name: "x", + }, + Rbrace, + FStringEnd, + Rbrace, + FStringMiddle { + value: " string", + is_raw: false, + }, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode.snap new file mode 100644 index 00000000000000..74b6e997a2c862 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode.snap @@ -0,0 +1,13 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + FStringMiddle { + value: "\\N{BULLET} normal \\Nope \\N", + is_raw: false, + }, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode_raw.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode_raw.snap new file mode 100644 index 00000000000000..7196be49f24417 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_named_unicode_raw.snap @@ -0,0 +1,22 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + FStringMiddle { + value: "\\N", + is_raw: true, + }, + Lbrace, + Name { + name: "BULLET", + }, + Rbrace, + FStringMiddle { + value: " normal", + is_raw: true, + }, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_nested.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_nested.snap new file mode 100644 index 00000000000000..1fa039cb4aa28d --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_nested.snap @@ -0,0 
+1,64 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + FStringMiddle { + value: "foo ", + is_raw: false, + }, + Lbrace, + FStringStart, + FStringMiddle { + value: "bar ", + is_raw: false, + }, + Lbrace, + Name { + name: "x", + }, + Plus, + FStringStart, + Lbrace, + Name { + name: "wow", + }, + Rbrace, + FStringEnd, + Rbrace, + FStringEnd, + Rbrace, + FStringMiddle { + value: " baz", + is_raw: false, + }, + FStringEnd, + FStringStart, + FStringMiddle { + value: "foo ", + is_raw: false, + }, + Lbrace, + FStringStart, + FStringMiddle { + value: "bar", + is_raw: false, + }, + FStringEnd, + Rbrace, + FStringMiddle { + value: " some ", + is_raw: false, + }, + Lbrace, + FStringStart, + FStringMiddle { + value: "another", + is_raw: false, + }, + FStringEnd, + Rbrace, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_parentheses.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_parentheses.snap new file mode 100644 index 00000000000000..42d111aa4532af --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_parentheses.snap @@ -0,0 +1,61 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + Lbrace, + Rbrace, + FStringEnd, + FStringStart, + FStringMiddle { + value: "{}", + is_raw: false, + }, + FStringEnd, + FStringStart, + FStringMiddle { + value: " ", + is_raw: false, + }, + Lbrace, + Rbrace, + FStringEnd, + FStringStart, + FStringMiddle { + value: "{", + is_raw: false, + }, + Lbrace, + Rbrace, + FStringMiddle { + value: "}", + is_raw: false, + }, + FStringEnd, + FStringStart, + FStringMiddle { + value: "{{}}", + is_raw: false, + }, + FStringEnd, + FStringStart, + FStringMiddle { + value: " ", + is_raw: false, + }, + Lbrace, + Rbrace, + FStringMiddle { + value: " {} {", + is_raw: false, + }, + 
Lbrace, + Rbrace, + FStringMiddle { + value: "} {{}} ", + is_raw: false, + }, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_prefix.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_prefix.snap new file mode 100644 index 00000000000000..3f249489698a43 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_prefix.snap @@ -0,0 +1,27 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + FStringEnd, + FStringStart, + FStringEnd, + FStringStart, + FStringEnd, + FStringStart, + FStringEnd, + FStringStart, + FStringEnd, + FStringStart, + FStringEnd, + FStringStart, + FStringEnd, + FStringStart, + FStringEnd, + FStringStart, + FStringEnd, + FStringStart, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_format_spec.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_format_spec.snap new file mode 100644 index 00000000000000..b1125413f3f5f2 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_format_spec.snap @@ -0,0 +1,84 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + Lbrace, + Name { + name: "foo", + }, + Colon, + Rbrace, + FStringMiddle { + value: " ", + is_raw: false, + }, + Lbrace, + Name { + name: "x", + }, + Equal, + Exclamation, + Name { + name: "s", + }, + Colon, + FStringMiddle { + value: ".3f", + is_raw: false, + }, + Rbrace, + FStringMiddle { + value: " ", + is_raw: false, + }, + Lbrace, + Name { + name: "x", + }, + Colon, + FStringMiddle { + value: ".", + is_raw: false, + }, + Lbrace, + Name { + name: "y", + }, + Rbrace, + FStringMiddle { + value: "f", + is_raw: false, + }, + Rbrace, + FStringMiddle { + 
value: " ", + is_raw: false, + }, + Lbrace, + String { + value: "", + kind: String, + triple_quoted: false, + }, + Colon, + FStringMiddle { + value: "*^", + is_raw: false, + }, + Lbrace, + Int { + value: 1, + }, + Colon, + Lbrace, + Int { + value: 1, + }, + Rbrace, + Rbrace, + Rbrace, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_ipy_escape_command.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_ipy_escape_command.snap new file mode 100644 index 00000000000000..99982d714fa45f --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_ipy_escape_command.snap @@ -0,0 +1,23 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + FStringMiddle { + value: "foo ", + is_raw: false, + }, + Lbrace, + Exclamation, + Name { + name: "pwd", + }, + Rbrace, + FStringMiddle { + value: " bar", + is_raw: false, + }, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_named_expression.snap b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_named_expression.snap new file mode 100644 index 00000000000000..7308b9e2b824a5 --- /dev/null +++ b/crates/ruff_python_parser/src/snapshots/ruff_python_parser__lexer__tests__fstring_with_named_expression.snap @@ -0,0 +1,68 @@ +--- +source: crates/ruff_python_parser/src/lexer.rs +expression: lex_source(source) +--- +[ + FStringStart, + Lbrace, + Name { + name: "x", + }, + Colon, + FStringMiddle { + value: "=10", + is_raw: false, + }, + Rbrace, + FStringMiddle { + value: " ", + is_raw: false, + }, + Lbrace, + Lpar, + Name { + name: "x", + }, + ColonEqual, + Int { + value: 10, + }, + Rpar, + Rbrace, + FStringMiddle { + value: " ", + is_raw: false, + }, + Lbrace, + Name { + name: "x", + }, + Comma, + Lbrace, + 
Name { + name: "y", + }, + ColonEqual, + Int { + value: 10, + }, + Rbrace, + Rbrace, + FStringMiddle { + value: " ", + is_raw: false, + }, + Lbrace, + Lsqb, + Name { + name: "x", + }, + ColonEqual, + Int { + value: 10, + }, + Rsqb, + Rbrace, + FStringEnd, + Newline, +] diff --git a/crates/ruff_python_parser/src/string.rs b/crates/ruff_python_parser/src/string.rs index 936c4fdef18241..01ef71b83b5fbf 100644 --- a/crates/ruff_python_parser/src/string.rs +++ b/crates/ruff_python_parser/src/string.rs @@ -736,6 +736,8 @@ pub enum FStringErrorType { // TODO: Test this case. /// Unterminated string. UnterminatedString, + /// Unterminated triple-quoted string. + UnterminatedTripleQuotedString, } impl std::fmt::Display for FStringErrorType { @@ -743,7 +745,7 @@ impl std::fmt::Display for FStringErrorType { use FStringErrorType::{ EmptyExpression, ExpressionCannotInclude, ExpressionNestedTooDeeply, InvalidConversionFlag, InvalidExpression, MismatchedDelimiter, SingleRbrace, - UnclosedLbrace, Unmatched, UnterminatedString, + UnclosedLbrace, Unmatched, UnterminatedString, UnterminatedTripleQuotedString, }; match self { UnclosedLbrace => write!(f, "expecting '}}'"), @@ -764,6 +766,9 @@ impl std::fmt::Display for FStringErrorType { UnterminatedString => { write!(f, "unterminated string") } + UnterminatedTripleQuotedString => { + write!(f, "unterminated triple-quoted string") + } ExpressionCannotInclude(c) => { if *c == '\\' { write!(f, "f-string expression part cannot include a backslash") diff --git a/crates/ruff_python_parser/src/token.rs b/crates/ruff_python_parser/src/token.rs index db159a0340e039..1189c0686496cf 100644 --- a/crates/ruff_python_parser/src/token.rs +++ b/crates/ruff_python_parser/src/token.rs @@ -44,6 +44,19 @@ pub enum Tok { /// Whether the string is triple quoted. triple_quoted: bool, }, + /// Token value for the start of an f-string. This includes the `f`/`F`/`fr` prefix + /// and the opening quote(s). 
+ FStringStart, + /// Token value that includes the portion of text inside the f-string that's not + /// part of the expression part and isn't an opening or closing brace. + FStringMiddle { + /// The string value. + value: String, + /// Whether the string is raw or not. + is_raw: bool, + }, + /// Token value for the end of an f-string. This includes the closing quote. + FStringEnd, /// Token value for IPython escape commands. These are recognized by the lexer /// only when the mode is [`Mode::Jupyter`]. IpyEscapeCommand { @@ -66,6 +79,8 @@ pub enum Tok { EndOfFile, /// Token value for a question mark `?`. This is only used in [`Mode::Jupyter`]. Question, + /// Token value for an exclamation mark `!`. + Exclamation, /// Token value for a left parenthesis `(`. Lpar, /// Token value for a right parenthesis `)`. @@ -234,6 +249,9 @@ impl fmt::Display for Tok { let quotes = "\"".repeat(if *triple_quoted { 3 } else { 1 }); write!(f, "{kind}{quotes}{value}{quotes}") } + FStringStart => f.write_str("FStringStart"), + FStringMiddle { value, .. } => f.write_str(value), + FStringEnd => f.write_str("FStringEnd"), IpyEscapeCommand { kind, value } => write!(f, "{kind}{value}"), Newline => f.write_str("Newline"), NonLogicalNewline => f.write_str("NonLogicalNewline"), @@ -243,6 +261,7 @@ StartExpression => f.write_str("StartExpression"), EndOfFile => f.write_str("EOF"), Question => f.write_str("'?'"), + Exclamation => f.write_str("'!'"), Lpar => f.write_str("'('"), Rpar => f.write_str("')'"), Lsqb => f.write_str("'['"), @@ -450,6 +469,14 @@ pub enum TokenKind { Complex, /// Token value for a string. String, + /// Token value for the start of an f-string. This includes the `f`/`F`/`fr` prefix + /// and the opening quote(s). + FStringStart, + /// Token value that includes the portion of text inside the f-string that's not + /// part of the expression part and isn't an opening or closing brace. + FStringMiddle, + /// Token value for the end of an f-string. 
This includes the closing quote. + FStringEnd, /// Token value for an IPython escape command. EscapeCommand, /// Token value for a comment. These are filtered out of the token stream prior to parsing. @@ -466,6 +493,8 @@ pub enum TokenKind { EndOfFile, /// Token value for a question mark `?`. Question, + /// Token value for an exclamation mark `!`. + Exclamation, /// Token value for a left parenthesis `(`. Lpar, /// Token value for a right parenthesis `)`. @@ -781,6 +810,9 @@ impl TokenKind { Tok::Float { .. } => TokenKind::Float, Tok::Complex { .. } => TokenKind::Complex, Tok::String { .. } => TokenKind::String, + Tok::FStringStart => TokenKind::FStringStart, + Tok::FStringMiddle { .. } => TokenKind::FStringMiddle, + Tok::FStringEnd => TokenKind::FStringEnd, Tok::IpyEscapeCommand { .. } => TokenKind::EscapeCommand, Tok::Comment(_) => TokenKind::Comment, Tok::Newline => TokenKind::Newline, @@ -789,6 +821,7 @@ Tok::Dedent => TokenKind::Dedent, Tok::EndOfFile => TokenKind::EndOfFile, Tok::Question => TokenKind::Question, + Tok::Exclamation => TokenKind::Exclamation, Tok::Lpar => TokenKind::Lpar, Tok::Rpar => TokenKind::Rpar, Tok::Lsqb => TokenKind::Lsqb,