From 466e42ca2bea2480ff367e0e26e3967435ac3e30 Mon Sep 17 00:00:00 2001
From: Andrew Gallant <jamslam@gmail.com>
Date: Sat, 14 Oct 2023 12:48:09 -0400
Subject: [PATCH] lite: fix stack overflow in NFA compiler

This commit fixes a bug where the parser could produce a very deeply
nested Hir value beyond the configured nested limit. This was caused by
the fact that the Hir can have some of its nested structures added to it
without a corresponding recursive call in the parser. For example,
repetition operators. This means that even if we don't blow the nest
limit in the parser, the Hir itself can still become nested beyond the
limit. This in turn will make it possible to unintentionally overflow
the stack in subsequent recursion over the Hir value, such as in the
Thompson NFA compiler.

We fix this by checking the nesting limit both on every recursive parse
call and also on the depth of the final Hir value once parsing is
finished but before it has returned to the caller.

Fixes https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608
---
 ...zed-fuzz_regex_lite_match-4692452983046144 | Bin 0 -> 5437 bytes
 regex-lite/src/hir/parse.rs                   |  60 ++++++++++++++++--
 regex-lite/tests/fuzz/mod.rs                  |  17 +++++
 3 files changed, 72 insertions(+), 5 deletions(-)
 create mode 100644 fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144

diff --git a/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144 b/fuzz/regressions/clusterfuzz-testcase-minimized-fuzz_regex_lite_match-4692452983046144
new file mode 100644
index 0000000000000000000000000000000000000000..184b6ed7019033ef791366a8dfd3287d7d347965
GIT binary patch
literal 5437
zcmdPk(umX40wR6=va~WS2!Jy6pj;v;b0XD^Dx+}-aM{z?f>Gl~Lx7YJ;37rwsM^sG
z7?~lUOf_$FQN<+cm_ik&j2be+Ltr#5j_?F9>U~HEj8+oxl)wdNjWS3Ifzf~?DY%GK
sJQ}J*2Go#K#>4;uYz#;aze`hqWDyORTr@Bfi;-4X#ON%<MN|9A0HC71j{pDw

literal 0
HcmV?d00001

diff --git a/regex-lite/src/hir/parse.rs b/regex-lite/src/hir/parse.rs
index 33bb97a7d..0dcccdd46 100644
--- a/regex-lite/src/hir/parse.rs
+++ b/regex-lite/src/hir/parse.rs
@@ -377,6 +377,24 @@ impl<'a> Parser<'a> {
 /// own routine.
 impl<'a> Parser<'a> {
     pub(super) fn parse(&self) -> Result<Hir, Error> {
+        let hir = self.parse_inner()?;
+        // While we also check nesting during parsing, that only checks the
+        // number of recursive parse calls. It does not necessarily cover
+        // all possible recursive nestings of the Hir itself. For example,
+        // repetition operators don't require recursive parse calls. So one
+        // can stack them arbitrarily without overflowing the stack in the
+        // *parser*. But then if one recurses over the resulting Hir, a stack
+        // overflow is possible. So here we check the Hir nesting level
+        // thoroughly to ensure it isn't nested too deeply.
+        //
+        // Note that we do still need the nesting limit check in the parser as
+        // well, since that will avoid overflowing the stack during parse time
+        // before the complete Hir value is constructed.
+        check_hir_nesting(&hir, self.config.nest_limit)?;
+        Ok(hir)
+    }
+
+    fn parse_inner(&self) -> Result<Hir, Error> {
         let depth = self.increment_depth()?;
         let mut alternates = vec![];
         let mut concat = vec![];
@@ -806,7 +824,7 @@ impl<'a> Parser<'a> {
         if self.bump_if("?P<") || self.bump_if("?<") {
             let index = self.next_capture_index()?;
             let name = Some(Box::from(self.parse_capture_name()?));
-            let sub = Box::new(self.parse()?);
+            let sub = Box::new(self.parse_inner()?);
             let cap = hir::Capture { index, name, sub };
             Ok(Some(Hir::capture(cap)))
         } else if self.bump_if("?") {
@@ -826,11 +844,11 @@ impl<'a> Parser<'a> {
             } else {
                 assert_eq!(':', self.char());
                 self.bump();
-                self.parse().map(Some)
+                self.parse_inner().map(Some)
             }
         } else {
             let index = self.next_capture_index()?;
-            let sub = Box::new(self.parse()?);
+            let sub = Box::new(self.parse_inner()?);
             let cap = hir::Capture { index, name: None, sub };
             Ok(Some(Hir::capture(cap)))
         }
@@ -1263,6 +1281,38 @@ impl<'a> Parser<'a> {
     }
 }
 
+/// This checks the depth of the given `Hir` value, and if it exceeds the given
+/// limit, then an error is returned.
+fn check_hir_nesting(hir: &Hir, limit: u32) -> Result<(), Error> {
+    fn recurse(hir: &Hir, limit: u32, depth: u32) -> Result<(), Error> {
+        if depth > limit {
+            return Err(Error::new(ERR_TOO_MUCH_NESTING));
+        }
+        let Some(next_depth) = depth.checked_add(1) else {
+            return Err(Error::new(ERR_TOO_MUCH_NESTING));
+        };
+        match *hir.kind() {
+            HirKind::Empty
+            | HirKind::Char(_)
+            | HirKind::Class(_)
+            | HirKind::Look(_) => Ok(()),
+            HirKind::Repetition(hir::Repetition { ref sub, .. }) => {
+                recurse(sub, limit, next_depth)
+            }
+            HirKind::Capture(hir::Capture { ref sub, .. }) => {
+                recurse(sub, limit, next_depth)
+            }
+            HirKind::Concat(ref subs) | HirKind::Alternation(ref subs) => {
+                for sub in subs.iter() {
+                    recurse(sub, limit, next_depth)?;
+                }
+                Ok(())
+            }
+        }
+    }
+    recurse(hir, limit, 0)
+}
+
 /// Converts the given Hir to a literal char if the Hir is just a single
 /// character. Otherwise this returns an error.
 ///
@@ -1344,12 +1394,12 @@ mod tests {
     use super::*;
 
     fn p(pattern: &str) -> Hir {
-        Parser::new(Config::default(), pattern).parse().unwrap()
+        Parser::new(Config::default(), pattern).parse_inner().unwrap()
     }
 
     fn perr(pattern: &str) -> String {
         Parser::new(Config::default(), pattern)
-            .parse()
+            .parse_inner()
             .unwrap_err()
             .to_string()
     }
diff --git a/regex-lite/tests/fuzz/mod.rs b/regex-lite/tests/fuzz/mod.rs
index 6eb37b50b..747aab040 100644
--- a/regex-lite/tests/fuzz/mod.rs
+++ b/regex-lite/tests/fuzz/mod.rs
@@ -14,6 +14,23 @@ fn captures_wrong_order_min() {
     let _ = run(data);
 }
 
+// Simpler regression test from a failure found by OSS-fuzz[1]. This test,
+// when it failed, caused a stack overflow. We fixed it by adding another nest
+// check on the Hir value itself, since the Hir type can have depth added to
+// it without recursive calls in the parser (which is where the existing nest
+// check was).
+//
+// Many thanks to Addison Crump for coming up with this test case[2].
+//
+// [1]: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608
+// [2]: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=60608#c1
+#[test]
+fn many_zero_to_many_reps() {
+    let pat = format!(".{}", "*".repeat(1 << 15));
+    let Ok(re) = regex_lite::RegexBuilder::new(&pat).build() else { return };
+    re.is_match("");
+}
+
 // This is the fuzz target function. We duplicate it here since this is the
 // thing we use to interpret the data. It is ultimately what we want to
 // succeed.