From 466e42ca2bea2480ff367e0e26e3967435ac3e30 Mon Sep 17 00:00:00 2001 From: Andrew Gallant Date: Sat, 14 Oct 2023 12:48:09 -0400 Subject: [PATCH] lite: fix stack overflow in NFA compiler This commit fixes a bug where the parser could produce a very deeply nested Hir value beyond the configured nested limit. This was caused by the fact that the Hir can have some of its nested structures added to it without a corresponding recursive call in the parser. For example, repetition operators. This means that even if we don't blow the nest limit in the parser, the Hir itself can still become nested beyond the limit. This in turn will make it possible to unintentionally overflow the stack in subsequent recursion over the Hir value, such as in the Thompson NFA compiler. We fix this by checking the nesting limit both on every recursive parse call and also on the depth of the final Hir value once parsing is finished but before it has returned to the caller. Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608 --- ...zed-fuzz_regex_lite_match-4692452983046144 | Bin 0 -> 5437 bytes regex-lite/src/hir/parse.rs | 60 ++++++++++++++++-- regex-lite/tests/fuzz/mod.rs | 17 +++++ 3 files changed, 72 insertions(+), 5 deletions(-) create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 new file mode 100644 index 0000000000000000000000000000000000000000..184b6ed7019033ef791366a8dfd3287d7d347965 GIT binary patch literal 5437 zcmdPk(umX40wR6=va~WS2!Jy6pj;v;b0XD^Dx+}-aM{z?f>Gl~Lx7YJ;37rwsM^sG z7?~lUOf_$FQN<+cm_ik&j2be+Ltr#5j_?F9>U~HEj8+oxl)wdNjWS3Ifzf~?DY%GK sJQ}J*2Go#K#>4;uYz#;aze`hqWDyORTr@Bfi;-4X#ON% Parser<'a> { /// own routine. impl<'a> Parser<'a> { pub(super) fn parse(&self) -> Result { + let hir = self.parse_inner()?; + // While we also check nesting during parsing, that only checks the + // number of recursive parse calls. It does not necessarily cover + // all possible recursive nestings of the Hir itself. For example, + // repetition operators don't require recursive parse calls. So one + // can stack them arbitrarily without overflowing the stack in the + // *parser*. But then if one recurses over the resulting Hir, a stack + // overflow is possible. So here we check the Hir nesting level + // thoroughly to ensure it isn't nested too deeply. + // + // Note that we do still need the nesting limit check in the parser as + // well, since that will avoid overflowing the stack during parse time + // before the complete Hir value is constructed. + check_hir_nesting(&hir, self.config.nest_limit)?; + Ok(hir) + } + + fn parse_inner(&self) -> Result { let depth = self.increment_depth()?; let mut alternates = vec![]; let mut concat = vec![]; @@ -806,7 +824,7 @@ impl<'a> Parser<'a> { if self.bump_if("?P<") || self.bump_if("?<") { let index = self.next_capture_index()?; let name = Some(Box::from(self.parse_capture_name()?)); - let sub = Box::new(self.parse()?); + let sub = Box::new(self.parse_inner()?); let cap = hir::Capture { index, name, sub }; Ok(Some(Hir::capture(cap))) } else if self.bump_if("?") { @@ -826,11 +844,11 @@ impl<'a> Parser<'a> { } else { assert_eq!(':', self.char()); self.bump(); - self.parse().map(Some) + self.parse_inner().map(Some) } } else { let index = self.next_capture_index()?; - let sub = Box::new(self.parse()?); + let sub = Box::new(self.parse_inner()?); let cap = hir::Capture { index, name: None, sub }; Ok(Some(Hir::capture(cap))) } @@ -1263,6 +1281,38 @@ impl<'a> Parser<'a> { } } +/// This checks the depth of the given `Hir` value, and if it exceeds the given +/// limit, then an error is returned. +fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> { + fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> { + if depth > limit { + return Err(Error::new(ERR_TOO_MUCH_NESTING)); + } + let Some(next_depth) = depth.checked_add(1) else { + return Err(Error::new(ERR_TOO_MUCH_NESTING)); + }; + match *hir.kind() { + HirKind::Empty + | HirKind::Char(_) + | HirKind::Class(_) + | HirKind::Look(_) => Ok(()), + HirKind::Repetition(hir::Repetition { ref sub, .. }) => { + recurse(sub, limit, next_depth) + } + HirKind::Capture(hir::Capture { ref sub, .. }) => { + recurse(sub, limit, next_depth) + } + HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => { + for sub in subs.iter() { + recurse(sub, limit, next_depth)?; + } + Ok(()) + } + } + } + recurse(hir, limit, 0) +} + /// Converts the given Hir to a literal char if the Hir is just a single /// character. Otherwise this returns an error. /// @@ -1344,12 +1394,12 @@ mod tests { use super::*; fn p(pattern: &str) -> Hir { - Parser::new(Config::default(), pattern).parse().unwrap() + Parser::new(Config::default(), pattern).parse_inner().unwrap() } fn perr(pattern: &str) -> String { Parser::new(Config::default(), pattern) - .parse() + .parse_inner() .unwrap_err() .to_string() } diff --git a/regex-lite/tests/fuzz/mod.rs b/regex-lite/tests/fuzz/mod.rs index 6eb37b50b..747aab040 100644 --- a/regex-lite/tests/fuzz/mod.rs +++ b/regex-lite/tests/fuzz/mod.rs @@ -14,6 +14,23 @@ fn captures_wrong_order_min() { let _ = run(data); } +// Simpler regression test from a failure found by OSS-fuzz[1]. This test, +// when it failed, caused a stack overflow. We fixed it by adding another nest +// check on the Hir value itself, since the Hir type can have depth added to +// it without recursive calls in the parser (which is where the existing nest +// check was). +// +// Many thanks to Addison Crump for coming up with this test case[2]. +// +// [1]: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608 +// [2]: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608#c1 +#[test] +fn many_zero_to_many_reps() { + let pat = format!(".{}", "*".repeat(1 << 15)); + let Ok(re) = regex_lite::RegexBuilder::new(&pat).build() else { return }; + re.is_match(""); +} + // This is the fuzz target function. We duplicate it here since this is the // thing we use to interpret the data. It is ultimately what we want to // succeed.