From 4a37c505d2580c8c31d78c6094908acc60ad06fe Mon Sep 17 00:00:00 2001 From: Nicholas Wilson Date: Fri, 13 Sep 2024 20:47:10 +0100 Subject: [PATCH] First PR! octal handling flags --- src/pcre2.h.in | 2 + src/pcre2_compile.c | 131 ++++++++++++++++++++++++++++-------------- src/pcre2test.c | 2 + testdata/testinput10 | 22 +++++++ testdata/testinput2 | 46 +++++++++++++++ testdata/testinput5 | 13 +++++ testdata/testoutput10 | 29 ++++++++++ testdata/testoutput2 | 64 +++++++++++++++++++++ testdata/testoutput5 | 17 ++++++ 9 files changed, 283 insertions(+), 43 deletions(-) diff --git a/src/pcre2.h.in b/src/pcre2.h.in index a19313c9e..5558d8b7e 100644 --- a/src/pcre2.h.in +++ b/src/pcre2.h.in @@ -159,6 +159,8 @@ D is inspected during pcre2_dfa_match() execution #define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */ #define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */ #define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */ +#define PCRE2_EXTRA_PYTHON_OCTAL 0x00002000u /* C */ +#define PCRE2_EXTRA_NO_BS0 0x00004000u /* C */ /* These are for pcre2_jit_compile(). */ diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c index 26f6c1645..fc2c3fd30 100644 --- a/src/pcre2_compile.c +++ b/src/pcre2_compile.c @@ -517,44 +517,44 @@ in UTF-8 mode. It runs from '0' to 'z'. 
*/ #define UPPER_CASE(c) (c-32) static const short int escapes[] = { - 0, 0, - 0, 0, - 0, 0, - 0, 0, - 0, 0, - CHAR_COLON, CHAR_SEMICOLON, - CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN, - CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK, - CHAR_COMMERCIAL_AT, -ESC_A, - -ESC_B, -ESC_C, - -ESC_D, -ESC_E, - 0, -ESC_G, - -ESC_H, 0, - 0, -ESC_K, - 0, 0, - -ESC_N, 0, - -ESC_P, -ESC_Q, - -ESC_R, -ESC_S, - 0, 0, - -ESC_V, -ESC_W, - -ESC_X, 0, - -ESC_Z, CHAR_LEFT_SQUARE_BRACKET, - CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET, - CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE, - CHAR_GRAVE_ACCENT, CHAR_BEL, - -ESC_b, 0, - -ESC_d, CHAR_ESC, - CHAR_FF, 0, - -ESC_h, 0, - 0, -ESC_k, - 0, 0, - CHAR_LF, 0, - -ESC_p, 0, - CHAR_CR, -ESC_s, - CHAR_HT, 0, - -ESC_v, -ESC_w, - 0, 0, - -ESC_z + /* 0 */ 0, /* 1 */ 0, + /* 2 */ 0, /* 3 */ 0, + /* 4 */ 0, /* 5 */ 0, + /* 6 */ 0, /* 7 */ 0, + /* 8 */ 0, /* 9 */ 0, + /* : */ CHAR_COLON, /* ; */ CHAR_SEMICOLON, + /* < */ CHAR_LESS_THAN_SIGN, /* = */ CHAR_EQUALS_SIGN, + /* > */ CHAR_GREATER_THAN_SIGN, /* ? 
*/ CHAR_QUESTION_MARK, + /* @ */ CHAR_COMMERCIAL_AT, /* A */ -ESC_A, + /* B */ -ESC_B, /* C */ -ESC_C, + /* D */ -ESC_D, /* E */ -ESC_E, + /* F */ 0, /* G */ -ESC_G, + /* H */ -ESC_H, /* I */ 0, + /* J */ 0, /* K */ -ESC_K, + /* L */ 0, /* M */ 0, + /* N */ -ESC_N, /* O */ 0, + /* P */ -ESC_P, /* Q */ -ESC_Q, + /* R */ -ESC_R, /* S */ -ESC_S, + /* T */ 0, /* U */ 0, + /* V */ -ESC_V, /* W */ -ESC_W, + /* X */ -ESC_X, /* Y */ 0, + /* Z */ -ESC_Z, /* [ */ CHAR_LEFT_SQUARE_BRACKET, + /* \ */ CHAR_BACKSLASH, /* ] */ CHAR_RIGHT_SQUARE_BRACKET, + /* ^ */ CHAR_CIRCUMFLEX_ACCENT, /* _ */ CHAR_UNDERSCORE, + /* ` */ CHAR_GRAVE_ACCENT, /* a */ CHAR_BEL, + /* b */ -ESC_b, /* c */ 0, + /* d */ -ESC_d, /* e */ CHAR_ESC, + /* f */ CHAR_FF, /* g */ 0, + /* h */ -ESC_h, /* i */ 0, + /* j */ 0, /* k */ -ESC_k, + /* l */ 0, /* m */ 0, + /* n */ CHAR_LF, /* o */ 0, + /* p */ -ESC_p, /* q */ 0, + /* r */ CHAR_CR, /* s */ -ESC_s, + /* t */ CHAR_HT, /* u */ 0, + /* v */ -ESC_v, /* w */ -ESC_w, + /* x */ 0, /* y */ 0, + /* z */ -ESC_z }; #else @@ -801,7 +801,7 @@ are allowed. */ PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \ PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \ PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \ - PCRE2_EXTRA_ASCII_DIGIT) + PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_PYTHON_OCTAL|PCRE2_EXTRA_NO_BS0) /* Compile time error code numbers. They are given names so that they can more easily be tracked. When a new number is added, the tables called eint1 and @@ -1495,7 +1495,7 @@ else if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE; } -/* Now process the quantifier for real. We know it must be {n} or (n,} or {,m} +/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m} or {n,m}. The only error that read_number() can return is for a number that is too big. If *errorcodeptr is returned as zero it means no number was found. 
*/ @@ -1889,7 +1889,16 @@ else number is less than 10, or if there are that many previous extracting left brackets, it is a back reference. Otherwise, up to three octal digits are read to form an escaped character code. Thus \123 is likely to be octal 123 - (cf \0123, which is octal 012 followed by the literal 3). + (cf \0123, which is octal 012 followed by the literal 3). This is the "Perl + style" of handling ambiguous octal/backreferences such as \12. + + There is an alternative disambiguation strategy, selected by + PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must + have either a leading zero, or exactly three octal digits; otherwise it's + a backreference. The disambiguation is stable, and does not depend on how + many capture groups are defined (it's simply an invalid backreference if + there is no corresponding capture group). Additionally, octal values above + \377 (\xff) are rejected. Inside a character class, \ followed by a digit is always either a literal 8 or 9 or an octal number. */ @@ -1897,8 +1906,37 @@ else case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9: - if (!isclass) + if (isclass) + { + /* Fall through to octal handling; never a backreference inside a class. */ + } + else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0) + { + /* Python-style disambiguation. */ + if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 && + ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7) + { + /* We peeked a three-digit octal, so fall through */ + } + else + { + /* We are at a digit, so the only possible error from read_number() is + a number that is too large. */ + ptr--; /* Back to the digit */ + + if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr)) + { + *errorcodeptr = ERR61; + break; + } + + escape = -s; + break; + } + } + else { + /* Perl-style disambiguation. 
*/ oldptr = ptr; ptr--; /* Back to the digit */ @@ -1935,7 +1973,7 @@ else /* \0 always starts an octal number, but we may drop through to here with a larger first octal digit. The original code used just to take the least significant 8 bits of octal numbers (I think this is what early Perls used - to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode, + to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode, but no more than 3 octal digits. */ case CHAR_0: @@ -1945,6 +1983,13 @@ else #if PCRE2_CODE_UNIT_WIDTH == 8 if (!utf && c > 0xff) *errorcodeptr = ERR51; #endif + if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0 && c > 0xff) + *errorcodeptr = ERR51; + + /* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect + two- or three-character octal escapes \00 and \000, nor \x00. */ + if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1) + *errorcodeptr = ERR3; break; /* \o is a relatively new Perl feature, supporting a more general way of diff --git a/src/pcre2test.c b/src/pcre2test.c index d8f5d6483..f5e6c734d 100644 --- a/src/pcre2test.c +++ b/src/pcre2test.c @@ -730,6 +730,7 @@ static modstruct modlist[] = { { "newline", MOD_CTC, MOD_NL, 0, CO(newline_convention) }, { "no_auto_capture", MOD_PAT, MOD_OPT, PCRE2_NO_AUTO_CAPTURE, PO(options) }, { "no_auto_possess", MOD_PATP, MOD_OPT, PCRE2_NO_AUTO_POSSESS, PO(options) }, + { "no_bs0", MOD_CTC, MOD_OPT, PCRE2_EXTRA_NO_BS0, CO(extra_options) }, { "no_dotstar_anchor", MOD_PAT, MOD_OPT, PCRE2_NO_DOTSTAR_ANCHOR, PO(options) }, { "no_jit", MOD_DATP, MOD_OPT, PCRE2_NO_JIT, DO(options) }, { "no_start_optimize", MOD_PATP, MOD_OPT, PCRE2_NO_START_OPTIMIZE, PO(options) }, @@ -756,6 +757,7 @@ static modstruct modlist[] = { { "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) }, { "pushcopy", MOD_PAT, MOD_CTL, CTL_PUSHCOPY, PO(control) }, { "pushtablescopy", MOD_PAT, MOD_CTL, CTL_PUSHTABLESCOPY, PO(control) }, + { "python_octal", MOD_CTC, MOD_OPT, 
PCRE2_EXTRA_PYTHON_OCTAL, CO(extra_options) }, { "recursion_limit", MOD_CTM, MOD_INT, 0, MO(depth_limit) }, /* Obsolete synonym */ { "regerror_buffsize", MOD_PAT, MOD_INT, 0, PO(regerror_buffsize) }, { "replace", MOD_PND, MOD_STR, REPLACE_MODSIZE, PO(replacement) }, diff --git a/testdata/testinput10 b/testdata/testinput10 index 100a3ad16..c17010e68 100644 --- a/testdata/testinput10 +++ b/testdata/testinput10 @@ -666,4 +666,26 @@ /(..)(*scs:(1)ab$)/match_invalid_utf ab\x80cde +# python_octal + +/\400/ + +/abc/substitute_extended + abc\=replace=\400 + +/\400/python_octal + +/abc/substitute_extended,python_octal + abc\=replace=\400 + +/\400/utf + +/abc/utf,substitute_extended + abc\=replace=\400 + +/\400/utf,python_octal + +/abc/utf,substitute_extended,python_octal + abc\=replace=\400 + # End of testinput10 diff --git a/testdata/testinput2 b/testdata/testinput2 index 8be78ff50..771de4f59 100644 --- a/testdata/testinput2 +++ b/testdata/testinput2 @@ -6429,4 +6429,50 @@ a)"xI # over INT_MAX (used to succeed with \8 being literal 8) /a\8000000000b/ +# -------------- + +# no_bs0 + +/a\0b\x00c\00d/ + a\x{00}b\x{00}c\x{00}d + +/a\0b/no_bs0 + +/b\x00c\00d/no_bs0 + b\x{00}c\x{00}d + +/abc/substitute_extended + abc\=replace=a\0b\x00c\00d + +/abc/substitute_extended,no_bs0 + abc\=replace=a\0b + abc\=replace=b\x00c\00d + +# python_octal + +/\0-\00-\01-\012-\0123-\123-\1234/ + \x00-\x00-\x01-\o{12}-\o{12}3-\o{123}-\o{123}4 + +/\1/ + +/\12/ + \o{12} + +/abc/substitute_extended + abc\=replace=\0-\00-\01-\012-\0123-\123-\1234 + abc\=replace=\1 + abc\=replace=\12 + +/\0-\00-\01-\012-\0123-\123-\1234/python_octal + \x00-\x00-\x01-\o{12}-\o{12}3-\o{123}-\o{123}4 + +/\1/python_octal + +/\12/python_octal + +/abc/substitute_extended,python_octal + abc\=replace=\0-\00-\01-\012-\0123-\123-\1234 + abc\=replace=\1 + abc\=replace=\12 + # End of testinput2 diff --git a/testdata/testinput5 b/testdata/testinput5 index 5aae6ee06..1ffc79320 100644 --- a/testdata/testinput5 +++ b/testdata/testinput5 
@@ -2520,4 +2520,17 @@ /a(?b)c/utf,substitute_extended abc\=replace=>${nämedverylongbutperfectlylegalsoyoushouldnthaveaproblem_٢}< +# python_octal + +/\400/utf + \o{400} + +/\400/utf,python_octal + +/abc/utf,substitute_extended + abc\=replace=\400 + +/abc/utf,substitute_extended,python_octal + abc\=replace=\400 + # End of testinput5 diff --git a/testdata/testoutput10 b/testdata/testoutput10 index 91376c08d..1d1b7f09d 100644 --- a/testdata/testoutput10 +++ b/testdata/testoutput10 @@ -1955,4 +1955,33 @@ No match 0: ab 1: ab +# python_octal + +/\400/ +Failed: error 151 at offset 4: octal value is greater than \377 in 8-bit non-UTF-8 mode + +/abc/substitute_extended + abc\=replace=\400 +Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string + +/\400/python_octal +Failed: error 151 at offset 4: octal value is greater than \377 in 8-bit non-UTF-8 mode + +/abc/substitute_extended,python_octal + abc\=replace=\400 +Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string + +/\400/utf + +/abc/utf,substitute_extended + abc\=replace=\400 + 1: \x{100} + +/\400/utf,python_octal +Failed: error 151 at offset 4: octal value is greater than \377 in 8-bit non-UTF-8 mode + +/abc/utf,substitute_extended,python_octal + abc\=replace=\400 +Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string + # End of testinput10 diff --git a/testdata/testoutput2 b/testdata/testoutput2 index ccf209b5c..46f126b95 100644 --- a/testdata/testoutput2 +++ b/testdata/testoutput2 @@ -19115,6 +19115,70 @@ Failed: error 161 at offset 7: subpattern number is too big /a\8000000000b/ Failed: error 161 at offset 7: subpattern number is too big +# -------------- + +# no_bs0 + +/a\0b\x00c\00d/ + a\x{00}b\x{00}c\x{00}d + 0: a\x00b\x00c\x00d + +/a\0b/no_bs0 +Failed: error 103 at offset 3: unrecognized character follows \ + +/b\x00c\00d/no_bs0 + b\x{00}c\x{00}d + 0: b\x00c\x00d + +/abc/substitute_extended + 
abc\=replace=a\0b\x00c\00d + 1: a\x00b\x00c\x00d + +/abc/substitute_extended,no_bs0 + abc\=replace=a\0b +Failed: error -57 at offset 3 in replacement: bad escape sequence in replacement string + abc\=replace=b\x00c\00d + 1: b\x00c\x00d + +# python_octal + +/\0-\00-\01-\012-\0123-\123-\1234/ + \x00-\x00-\x01-\o{12}-\o{12}3-\o{123}-\o{123}4 + 0: \x00-\x00-\x01-\x0a-\x0a3-S-S4 + +/\1/ +Failed: error 115 at offset 1: reference to non-existent subpattern + +/\12/ + \o{12} + 0: \x0a + +/abc/substitute_extended + abc\=replace=\0-\00-\01-\012-\0123-\123-\1234 + 1: \x00-\x00-\x01-\x0a-\x0a3-S-S4 + abc\=replace=\1 +Failed: error -49 at offset 2 in replacement: unknown substring + abc\=replace=\12 + 1: \x0a + +/\0-\00-\01-\012-\0123-\123-\1234/python_octal + \x00-\x00-\x01-\o{12}-\o{12}3-\o{123}-\o{123}4 + 0: \x00-\x00-\x01-\x0a-\x0a3-S-S4 + +/\1/python_octal +Failed: error 115 at offset 1: reference to non-existent subpattern + +/\12/python_octal +Failed: error 115 at offset 2: reference to non-existent subpattern + +/abc/substitute_extended,python_octal + abc\=replace=\0-\00-\01-\012-\0123-\123-\1234 + 1: \x00-\x00-\x01-\x0a-\x0a3-S-S4 + abc\=replace=\1 +Failed: error -49 at offset 2 in replacement: unknown substring + abc\=replace=\12 +Failed: error -49 at offset 3 in replacement: unknown substring + # End of testinput2 Error -70: PCRE2_ERROR_BADDATA (unknown error number) Error -62: bad serialized data diff --git a/testdata/testoutput5 b/testdata/testoutput5 index dcd387ed6..f71944f9f 100644 --- a/testdata/testoutput5 +++ b/testdata/testoutput5 @@ -5498,4 +5498,21 @@ Failed: error 147 at offset 10: unknown property after \P or \p abc\=replace=>${nämedverylongbutperfectlylegalsoyoushouldnthaveaproblem_٢}< 1: >b< +# python_octal + +/\400/utf + \o{400} + 0: \x{100} + +/\400/utf,python_octal +Failed: error 151 at offset 4: octal value is greater than \377 in 8-bit non-UTF-8 mode + +/abc/utf,substitute_extended + abc\=replace=\400 + 1: \x{100} + 
+/abc/utf,substitute_extended,python_octal + abc\=replace=\400 +Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string + # End of testinput5