From ab47f889c6639a0e59ea5acc063c85c5e459d55d Mon Sep 17 00:00:00 2001 From: Philip Hazel Date: Sat, 30 Sep 2023 17:35:47 +0100 Subject: [PATCH] Fix ECMAScript interpretation of \u{ 12} to be literal, not a repeated u. --- src/pcre2_compile.c | 38 +++++++++++++++++++++++++++++++++++--- src/pcre2_internal.h | 8 +++++++- testdata/testinput5 | 9 +++++++++ testdata/testoutput5 | 12 ++++++++++++ 4 files changed, 63 insertions(+), 4 deletions(-) diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 5d04daad9..f72ddc822 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -1680,7 +1680,9 @@ else is set. Otherwise, \u must be followed by exactly four hex digits or, if PCRE2_EXTRA_ALT_BSUX is set, by any number of hex digits in braces. Otherwise it is a lowercase u letter. This gives some compatibility with - ECMAScript (aka JavaScript). */ + ECMAScript (aka JavaScript). Unlike other braced items, white space is NOT + allowed. When \u{ is not followed by hex digits, a special return is given + because otherwise \u{ 12} (for example) would be treated as u{12}. */ case CHAR_u: if (!alt_bsux) *errorcodeptr = ERR37; else @@ -1709,7 +1711,11 @@ else if (hptr == ptr + 1 || /* No hex digits */ hptr >= ptrend || /* Hit end of input */ *hptr != CHAR_RIGHT_CURLY_BRACKET) /* No } terminator */ - break; /* Hex escape not recognized */ + { + escape = ESC_ub; /* Special return */ + ptr++; /* Skip { */ + break; /* Hex escape not recognized */ + } c = cc; /* Accept the code point */ ptr = hptr + 1; @@ -2780,6 +2786,7 @@ int escape; int i; BOOL inescq = FALSE; BOOL inverbname = FALSE; +BOOL next_is_literal = FALSE; BOOL utf = (options & PCRE2_UTF) != 0; BOOL auto_callout = (options & PCRE2_AUTO_CALLOUT) != 0; BOOL isdupname; @@ -2875,6 +2882,16 @@ while (ptr < ptrend) thisptr = ptr; GETCHARINCTEST(c, ptr); + /* Handle cases where previous processing has determined that the next + character is literal. */ + + if (next_is_literal) + { + PARSED_LITERAL(c, parsed_pattern); + next_is_literal = FALSE; + continue; /* Next character */ + } + /* Copy quoted literals until \E, allowing for the possibility of automatic callouts, except when processing a (*VERB) "name". */ @@ -2992,6 +3009,11 @@ while (ptr < ptrend) *parsed_pattern++ = c; break; + case ESC_ub: + *parsed_pattern++ = CHAR_u; + PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern); + break; + case ESC_Q: inescq = TRUE; break; @@ -3249,6 +3271,16 @@ while (ptr < ptrend) *parsed_pattern++ = META_ESCAPE + escape; break; + /* This is a special return that happens only in EXTRA_ALT_BSUX mode, + when \u{ is not followed by hex digits and }. It requests two literal + characters, u and { and we need this, as otherwise \u{ 12} (for example) + would be treated as u{12} now that spaces are allowed in quantifiers. */ + + case ESC_ub: + *parsed_pattern++ = CHAR_u; + PARSED_LITERAL(CHAR_LEFT_CURLY_BRACKET, parsed_pattern); + break; + case ESC_X: #ifndef SUPPORT_UNICODE errorcode = ERR45; /* Supported only with Unicode support */ @@ -3745,7 +3777,7 @@ while (ptr < ptrend) { case 0: /* Escaped character code point is in c */ char_is_literal = FALSE; - goto CLASS_LITERAL; + goto CLASS_LITERAL; /* (a few lines above) */ case ESC_b: c = CHAR_BS; /* \b is backspace in a class */ diff --git a/src/pcre2_internal.h b/src/pcre2_internal.h index f7c3d519f..8f667114a 100644 --- a/src/pcre2_internal.h +++ b/src/pcre2_internal.h @@ -1345,6 +1345,12 @@ mode rather than an escape sequence. It is also used for [^] in JavaScript compatibility mode, and for \C in non-utf mode. In non-DOTALL mode, "." behaves like \N. +ESC_ub is a special return from check_escape() when, in BSUX mode, \u{ is not +followed by hex digits and }, in which case it should mean a literal "u" +followed by a literal "{". This hack is necessary for cases like \u{ 12} +because without it, this is interpreted as u{12} now that spaces are allowed in +quantifiers. + Negative numbers are used to encode a backreference (\1, \2, \3, etc.) in check_escape(). There are tests in the code for an escape greater than ESC_b and less than ESC_Z to detect the types that may be repeated. These are the @@ -1354,7 +1360,7 @@ consume a character, that code will have to change. */ enum { ESC_A = 1, ESC_G, ESC_K, ESC_B, ESC_b, ESC_D, ESC_d, ESC_S, ESC_s, ESC_W, ESC_w, ESC_N, ESC_dum, ESC_C, ESC_P, ESC_p, ESC_R, ESC_H, ESC_h, ESC_V, ESC_v, ESC_X, ESC_Z, ESC_z, - ESC_E, ESC_Q, ESC_g, ESC_k }; + ESC_E, ESC_Q, ESC_g, ESC_k, ESC_ub }; /********************** Opcode definitions ******************/ diff --git a/testdata/testinput5 b/testdata/testinput5 index e7158ff25..0068cea43 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 @@ -824,6 +824,15 @@ /\u{}/extra_alt_bsux u{} + +/\u{ 12}/extra_alt_bsux + --u{ 12}-- + +/\u{Q12}/extra_alt_bsux + --u{Q12}-- + +/\u{{3}}/extra_alt_bsux + --u{{{}-- /\u/utf,alt_bsux \\u diff --git a/testdata/testoutput5 b/testdata/testoutput5 index 0f7ca1d2c..e752c7691 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -1747,6 +1747,18 @@ No match /\u{}/extra_alt_bsux u{} 0: u{} + +/\u{ 12}/extra_alt_bsux + --u{ 12}-- + 0: u{ 12} + +/\u{Q12}/extra_alt_bsux + --u{Q12}-- + 0: u{Q12} + +/\u{{3}}/extra_alt_bsux + --u{{{}-- + 0: u{{{} /\u/utf,alt_bsux \\u