Skip to content

Commit

Permalink
Fix ECMAScript interpretation of \u{ 12} to be literal, not a repeated u.
Browse files Browse the repository at this point in the history
  • Loading branch information
PhilipHazel committed Sep 30, 2023
1 parent 90991cf commit ab47f88
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 4 deletions.
38 changes: 35 additions & 3 deletions src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -1680,7 +1680,9 @@ else
is set. Otherwise, \u must be followed by exactly four hex digits or, if
PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces.
Otherwise it is a lowercase u letter. This gives some compatibility with
ECMAScript (aka JavaScript). */
ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT
allowed. When \u{ is not followed by hex digits and a closing }, a special
return is given because otherwise \u{ 12} (for example) would be treated as
u{12}. */

case CHAR_u:
if (!alt_bsux) *errorcodeptr = ERR37; else
Expand Down Expand Up @@ -1709,7 +1711,11 @@ else
if (hptr == ptr + 1 || /* No hex digits */
hptr >= ptrend || /* Hit end of input */
*hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */
break; /* Hex escape not recognized */
{
escape = ESC_ub; /* Special return */
ptr++; /* Skip { */
break; /* Hex escape not recognized */
}

c = cc; /* Accept the code point */
ptr = hptr + 1;
Expand Down Expand Up @@ -2780,6 +2786,7 @@ int escape;
int i;
BOOL inescq = FALSE;
BOOL inverbname = FALSE;
BOOL next_is_literal = FALSE;
BOOL utf = (options & PCRE2_UTF) != 0;
BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0;
BOOL isdupname;
Expand Down Expand Up @@ -2875,6 +2882,16 @@ while (ptr < ptrend)
thisptr = ptr;
GETCHARINCTEST(c, ptr);

/* Handle cases where previous processing has determined that the next
character is literal. */

if (next_is_literal)
{
PARSED_LITERAL(c, parsed_pattern);
next_is_literal = FALSE;
continue; /* Next character */
}

/* Copy quoted literals until \E, allowing for the possibility of automatic
callouts, except when processing a (*VERB) "name". */

Expand Down Expand Up @@ -2992,6 +3009,11 @@ while (ptr < ptrend)
*parsed_pattern++ = c;
break;

case ESC_ub:
*parsed_pattern++ = CHAR_u;
PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
break;

case ESC_Q:
inescq = TRUE;
break;
Expand Down Expand Up @@ -3249,6 +3271,16 @@ while (ptr < ptrend)
*parsed_pattern++ = META_ESCAPE + escape;
break;

/* This is a special return that happens only in EXTRA_ALT_BSUX mode,
when \u{ is not followed by hex digits and }. It requests two literal
characters, u and {. This is needed because otherwise \u{ 12} (for example)
would be treated as u{12}, now that spaces are allowed in quantifiers. */

case ESC_ub:
*parsed_pattern++ = CHAR_u;
PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern);
break;

case ESC_X:
#ifndef SUPPORT_UNICODE
errorcode = ERR45; /* Supported only with Unicode support */
Expand Down Expand Up @@ -3745,7 +3777,7 @@ while (ptr < ptrend)
{
case 0: /* Escaped character code point is in c */
char_is_literal = FALSE;
goto CLASS_LITERAL;
goto CLASS_LITERAL; /* (a few lines above) */

case ESC_b:
c = CHAR_BS; /* \b is backspace in a class */
Expand Down
8 changes: 7 additions & 1 deletion src/pcre2_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -1345,6 +1345,12 @@ mode rather than an escape sequence. It is also used for [^] in JavaScript
compatibility mode, and for \C in non-utf mode. In non-DOTALL mode, "." behaves
like \N.
ESC_ub is a special return from check_escape() when, in BSUX mode, \u{ is not
followed by hex digits and }, in which case it should mean a literal "u"
followed by a literal "{". This hack is necessary for cases like \u{ 12}
because without it, this is interpreted as u{12} now that spaces are allowed in
quantifiers.
Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in
check_escape(). There are tests in the code for an escape greater than ESC_b
and less than ESC_Z to detect the types that may be repeated. These are the
Expand All @@ -1354,7 +1360,7 @@ consume a character, that code will have to change. */
enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s,
ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H,
ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z,
ESC_E, ESC_Q, ESC_g, ESC_k };
ESC_E, ESC_Q, ESC_g, ESC_k, ESC_ub };


/********************** Opcode definitions ******************/
Expand Down
9 changes: 9 additions & 0 deletions testdata/testinput5
Original file line number Diff line number Diff line change
Expand Up @@ -824,6 +824,15 @@

/\u{}/extra_alt_bsux
u{}

/\u{ 12}/extra_alt_bsux
--u{ 12}--

/\u{Q12}/extra_alt_bsux
--u{Q12}--

/\u{{3}}/extra_alt_bsux
--u{{{}--

/\u/utf,alt_bsux
\\u
Expand Down
12 changes: 12 additions & 0 deletions testdata/testoutput5
Original file line number Diff line number Diff line change
Expand Up @@ -1747,6 +1747,18 @@ No match
/\u{}/extra_alt_bsux
u{}
0: u{}

/\u{ 12}/extra_alt_bsux
--u{ 12}--
0: u{ 12}

/\u{Q12}/extra_alt_bsux
--u{Q12}--
0: u{Q12}

/\u{{3}}/extra_alt_bsux
--u{{{}--
0: u{{{}

/\u/utf,alt_bsux
\\u
Expand Down

0 comments on commit ab47f88

Please sign in to comment.