From ab47f889c6639a0e59ea5acc063c85c5e459d55d Mon Sep 17 00:00:00 2001
From: Philip Hazel <Philip.Hazel@gmail.com>
Date: Sat, 30 Sep 2023 17:35:47 +0100
Subject: [PATCH] Fix ECMAScript interpretation of \u{ 12} to be literal, not a
 repeated u.

---
 src/pcre2_compile.c  | 38 +++++++++++++++++++++++++++++++++++---
 src/pcre2_internal.h |  8 +++++++-
 testdata/testinput5  |  9 +++++++++
 testdata/testoutput5 | 12 ++++++++++++
 4 files changed, 63 insertions(+), 4 deletions(-)

diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 5d04daad9..f72ddc822 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -1680,7 +1680,9 @@ else
     is set. Otherwise, \u must be followed by exactly four hex digits or, if
     PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
     Otherwise it is a lowercase u letter. This gives some compatibility with
-    ECMAScript (aka JavaScript). */
+    ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
+    allowed. When \u{ is not followed by hex digits and }, a special return is
+    given because otherwise \u{ 12} (for example) would be treated as u{12}. */
 
     case CHAR_u:
     if (!alt_bsux) *errorcodeptr = ERR37; else
@@ -1709,7 +1711,11 @@ else
         if (hptr == ptr + 1 ||   /* No hex digits */
             hptr >= ptrend ||    /* Hit end of input */
             *hptr != CHAR_RIGHT_CURLY_BRACKET)  /* No } terminator */
-          break;         /* Hex escape not recognized */
+          {
+          escape = ESC_ub;    /* Special return */
+          ptr++;              /* Skip { */
+          break;              /* Hex escape not recognized */
+          }
 
         c = cc;          /* Accept the code point */
         ptr = hptr + 1;
@@ -2780,6 +2786,7 @@ int escape;
 int i;
 BOOL inescq = FALSE;
 BOOL inverbname = FALSE;
+BOOL next_is_literal = FALSE;
 BOOL utf = (options & PCRE2_UTF) != 0;
 BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
 BOOL isdupname;
@@ -2875,6 +2882,16 @@ while (ptr < ptrend)
   thisptr = ptr;
   GETCHARINCTEST(c, ptr);
 
+  /* Handle cases where previous processing has determined that the next
+  character is literal. */
+
+  if (next_is_literal)
+    {
+    PARSED_LITERAL(c, parsed_pattern);
+    next_is_literal = FALSE;
+    continue;  /* Next character */
+    }
+
   /* Copy quoted literals until \E, allowing for the possibility of automatic
   callouts, except when processing a (*VERB) "name".  */
 
@@ -2992,6 +3009,11 @@ while (ptr < ptrend)
         *parsed_pattern++ = c;
         break;
 
+        case ESC_ub:
+        *parsed_pattern++ = CHAR_u;
+        PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
+        break;
+
         case ESC_Q:
         inescq = TRUE;
         break;
@@ -3249,6 +3271,16 @@ while (ptr < ptrend)
       *parsed_pattern++ = META_ESCAPE + escape;
       break;
 
+      /* This is a special return that happens only in EXTRA_ALT_BSUX mode,
+      when \u{ is not followed by hex digits and }. It requests two literal
+      characters, u and {. We need this because otherwise \u{ 12} (for example)
+      would be treated as u{12} now that spaces are allowed in quantifiers. */
+
+      case ESC_ub:
+      *parsed_pattern++ = CHAR_u;
+      PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
+      break;
+
       case ESC_X:
 #ifndef SUPPORT_UNICODE
       errorcode = ERR45;   /* Supported only with Unicode support */
@@ -3745,7 +3777,7 @@ while (ptr < ptrend)
           {
           case 0:  /* Escaped character code point is in c */
           char_is_literal = FALSE;
-          goto CLASS_LITERAL;
+          goto CLASS_LITERAL;      /* (a few lines above) */
 
           case ESC_b:
           c = CHAR_BS;    /* \b is backspace in a class */
diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h
index f7c3d519f..8f667114a 100644
--- a/src/pcre2_internal.h
+++ b/src/pcre2_internal.h
@@ -1345,6 +1345,12 @@ mode rather than an escape sequence. It is also used for [^] in JavaScript
 compatibility mode, and for \C in non-utf mode. In non-DOTALL mode, "." behaves
 like \N.
 
+ESC_ub is a special return from check_escape() when, in EXTRA_ALT_BSUX mode,
+\u{ is not followed by hex digits and }, in which case it should mean a
+literal "u" followed by a literal "{". This hack is necessary for cases like
+\u{ 12} because without it, this is interpreted as u{12} now that spaces are
+allowed in quantifiers.
+
 Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in
 check_escape(). There are tests in the code for an escape greater than ESC_b
 and less than ESC_Z to detect the types that may be repeated. These are the
@@ -1354,7 +1360,7 @@ consume a character, that code will have to change. */
 enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
        ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
        ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
-       ESC_E, ESC_Q, ESC_g, ESC_k };
+       ESC_E, ESC_Q, ESC_g, ESC_k, ESC_ub };
 
 
 /********************** Opcode definitions ******************/
diff --git a/testdata/testinput5 b/testdata/testinput5
index e7158ff25..0068cea43 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -824,6 +824,15 @@
 
 /\u{}/extra_alt_bsux
     u{}
+    
+/\u{ 12}/extra_alt_bsux
+    --u{ 12}--
+
+/\u{Q12}/extra_alt_bsux
+    --u{Q12}--
+
+/\u{{3}}/extra_alt_bsux
+    --u{{{}--
 
 /\u/utf,alt_bsux
     \\u
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index 0f7ca1d2c..e752c7691 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -1747,6 +1747,18 @@ No match
 /\u{}/extra_alt_bsux
     u{}
  0: u{}
+    
+/\u{ 12}/extra_alt_bsux
+    --u{ 12}--
+ 0: u{ 12}
+
+/\u{Q12}/extra_alt_bsux
+    --u{Q12}--
+ 0: u{Q12}
+
+/\u{{3}}/extra_alt_bsux
+    --u{{{}--
+ 0: u{{{}
 
 /\u/utf,alt_bsux
     \\u