Skip to content

Commit

Permalink
First PR! octal handling flags
Browse files Browse the repository at this point in the history
  • Loading branch information
NWilson committed Sep 18, 2024
1 parent 829414f commit 4a37c50
Show file tree
Hide file tree
Showing 9 changed files with 283 additions and 43 deletions.
2 changes: 2 additions & 0 deletions src/pcre2.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,8 @@ D is inspected during pcre2_dfa_match() execution
#define PCRE2_EXTRA_ASCII_BSW 0x00000400u /* C */
#define PCRE2_EXTRA_ASCII_POSIX 0x00000800u /* C */
#define PCRE2_EXTRA_ASCII_DIGIT 0x00001000u /* C */
#define PCRE2_EXTRA_PYTHON_OCTAL 0x00002000u /* C */
#define PCRE2_EXTRA_NO_BS0 0x00004000u /* C */

/* These are for pcre2_jit_compile(). */

Expand Down
131 changes: 88 additions & 43 deletions src/pcre2_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -517,44 +517,44 @@ in UTF-8 mode. It runs from '0' to 'z'. */
#define UPPER_CASE(c) (c-32)

static const short int escapes[] = {
0, 0,
0, 0,
0, 0,
0, 0,
0, 0,
CHAR_COLON, CHAR_SEMICOLON,
CHAR_LESS_THAN_SIGN, CHAR_EQUALS_SIGN,
CHAR_GREATER_THAN_SIGN, CHAR_QUESTION_MARK,
CHAR_COMMERCIAL_AT, -ESC_A,
-ESC_B, -ESC_C,
-ESC_D, -ESC_E,
0, -ESC_G,
-ESC_H, 0,
0, -ESC_K,
0, 0,
-ESC_N, 0,
-ESC_P, -ESC_Q,
-ESC_R, -ESC_S,
0, 0,
-ESC_V, -ESC_W,
-ESC_X, 0,
-ESC_Z, CHAR_LEFT_SQUARE_BRACKET,
CHAR_BACKSLASH, CHAR_RIGHT_SQUARE_BRACKET,
CHAR_CIRCUMFLEX_ACCENT, CHAR_UNDERSCORE,
CHAR_GRAVE_ACCENT, CHAR_BEL,
-ESC_b, 0,
-ESC_d, CHAR_ESC,
CHAR_FF, 0,
-ESC_h, 0,
0, -ESC_k,
0, 0,
CHAR_LF, 0,
-ESC_p, 0,
CHAR_CR, -ESC_s,
CHAR_HT, 0,
-ESC_v, -ESC_w,
0, 0,
-ESC_z
/* 0 */ 0, /* 1 */ 0,
/* 2 */ 0, /* 3 */ 0,
/* 4 */ 0, /* 5 */ 0,
/* 6 */ 0, /* 7 */ 0,
/* 8 */ 0, /* 9 */ 0,
/* : */ CHAR_COLON, /* ; */ CHAR_SEMICOLON,
/* < */ CHAR_LESS_THAN_SIGN, /* = */ CHAR_EQUALS_SIGN,
/* > */ CHAR_GREATER_THAN_SIGN, /* ? */ CHAR_QUESTION_MARK,
/* @ */ CHAR_COMMERCIAL_AT, /* A */ -ESC_A,
/* B */ -ESC_B, /* C */ -ESC_C,
/* D */ -ESC_D, /* E */ -ESC_E,
/* F */ 0, /* G */ -ESC_G,
/* H */ -ESC_H, /* I */ 0,
/* J */ 0, /* K */ -ESC_K,
/* L */ 0, /* M */ 0,
/* N */ -ESC_N, /* O */ 0,
/* P */ -ESC_P, /* Q */ -ESC_Q,
/* R */ -ESC_R, /* S */ -ESC_S,
/* T */ 0, /* U */ 0,
/* V */ -ESC_V, /* W */ -ESC_W,
/* X */ -ESC_X, /* Y */ 0,
/* Z */ -ESC_Z, /* [ */ CHAR_LEFT_SQUARE_BRACKET,
/* \ */ CHAR_BACKSLASH, /* ] */ CHAR_RIGHT_SQUARE_BRACKET,
/* ^ */ CHAR_CIRCUMFLEX_ACCENT, /* _ */ CHAR_UNDERSCORE,
/* ` */ CHAR_GRAVE_ACCENT, /* a */ CHAR_BEL,
/* b */ -ESC_b, /* c */ 0,
/* d */ -ESC_d, /* e */ CHAR_ESC,
/* f */ CHAR_FF, /* g */ 0,
/* h */ -ESC_h, /* i */ 0,
/* j */ 0, /* k */ -ESC_k,
/* l */ 0, /* m */ 0,
/* n */ CHAR_LF, /* o */ 0,
/* p */ -ESC_p, /* q */ 0,
/* r */ CHAR_CR, /* s */ -ESC_s,
/* t */ CHAR_HT, /* u */ 0,
/* v */ -ESC_v, /* w */ -ESC_w,
/* x */ 0, /* y */ 0,
/* z */ -ESC_z
};

#else
Expand Down Expand Up @@ -801,7 +801,7 @@ are allowed. */
PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
PCRE2_EXTRA_ASCII_DIGIT)
PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_PYTHON_OCTAL|PCRE2_EXTRA_NO_BS0)

/* Compile time error code numbers. They are given names so that they can more
easily be tracked. When a new number is added, the tables called eint1 and
Expand Down Expand Up @@ -1495,7 +1495,7 @@ else
if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
}

/* Now process the quantifier for real. We know it must be {n} or (n,} or {,m}
/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}
or {n,m}. The only error that read_number() can return is for a number that is
too big. If *errorcodeptr is returned as zero it means no number was found. */

Expand Down Expand Up @@ -1889,16 +1889,54 @@ else
number is less than 10, or if there are that many previous extracting left
brackets, it is a back reference. Otherwise, up to three octal digits are
read to form an escaped character code. Thus \123 is likely to be octal 123
(cf \0123, which is octal 012 followed by the literal 3).
(cf \0123, which is octal 012 followed by the literal 3). This is the "Perl
style" of handling ambiguous octal/backreferences such as \12.
There is an alternative disambiguation strategy, selected by
PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal must
have either a leading zero, or exactly three octal digits; otherwise it's
a backreference. The disambiguation is stable, and does not depend on how
many capture groups are defined (it's simply an invalid backreference if
there is no corresponding capture group). Additionally, octal values above
\377 (\xff) are rejected.
Inside a character class, \ followed by a digit is always either a literal
8 or 9 or an octal number. */

case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:

if (!isclass)
if (isclass)
{
/* Fall through to octal handling; never a backreference inside a class. */
}
else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)
{
/* Python-style disambiguation. */
if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&
ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
{
/* We peeked a three-digit octal, so fall through */
}
else
{
/* We are at a digit, so the only possible error from read_number() is
a number that is too large. */
ptr--; /* Back to the digit */

if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
{
*errorcodeptr = ERR61;
break;
}

escape = -s;
break;
}
}
else
{
/* Perl-style disambiguation. */
oldptr = ptr;
ptr--; /* Back to the digit */

Expand Down Expand Up @@ -1935,7 +1973,7 @@ else
/* \0 always starts an octal number, but we may drop through to here with a
larger first octal digit. The original code used just to take the least
significant 8 bits of octal numbers (I think this is what early Perls used
to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,
but no more than 3 octal digits. */

case CHAR_0:
Expand All @@ -1945,6 +1983,13 @@ else
#if PCRE2_CODE_UNIT_WIDTH == 8
if (!utf && c > 0xff) *errorcodeptr = ERR51;
#endif
if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0 && c > 0xff)
*errorcodeptr = ERR51;

/* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect
two- or three-digit octal escapes \00 and \000, nor \x00. */
if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)
*errorcodeptr = ERR3;
break;

/* \o is a relatively new Perl feature, supporting a more general way of
Expand Down
2 changes: 2 additions & 0 deletions src/pcre2test.c
Original file line number Diff line number Diff line change
Expand Up @@ -730,6 +730,7 @@ static modstruct modlist[] = {
{ "newline", MOD_CTC, MOD_NL, 0, CO(newline_convention) },
{ "no_auto_capture", MOD_PAT, MOD_OPT, PCRE2_NO_AUTO_CAPTURE, PO(options) },
{ "no_auto_possess", MOD_PATP, MOD_OPT, PCRE2_NO_AUTO_POSSESS, PO(options) },
{ "no_bs0", MOD_CTC, MOD_OPT, PCRE2_EXTRA_NO_BS0, CO(extra_options) },
{ "no_dotstar_anchor", MOD_PAT, MOD_OPT, PCRE2_NO_DOTSTAR_ANCHOR, PO(options) },
{ "no_jit", MOD_DATP, MOD_OPT, PCRE2_NO_JIT, DO(options) },
{ "no_start_optimize", MOD_PATP, MOD_OPT, PCRE2_NO_START_OPTIMIZE, PO(options) },
Expand All @@ -756,6 +757,7 @@ static modstruct modlist[] = {
{ "push", MOD_PAT, MOD_CTL, CTL_PUSH, PO(control) },
{ "pushcopy", MOD_PAT, MOD_CTL, CTL_PUSHCOPY, PO(control) },
{ "pushtablescopy", MOD_PAT, MOD_CTL, CTL_PUSHTABLESCOPY, PO(control) },
{ "python_octal", MOD_CTC, MOD_OPT, PCRE2_EXTRA_PYTHON_OCTAL, CO(extra_options) },
{ "recursion_limit", MOD_CTM, MOD_INT, 0, MO(depth_limit) }, /* Obsolete synonym */
{ "regerror_buffsize", MOD_PAT, MOD_INT, 0, PO(regerror_buffsize) },
{ "replace", MOD_PND, MOD_STR, REPLACE_MODSIZE, PO(replacement) },
Expand Down
22 changes: 22 additions & 0 deletions testdata/testinput10
Original file line number Diff line number Diff line change
Expand Up @@ -666,4 +666,26 @@
/(..)(*scs:(1)ab$)/match_invalid_utf
ab\x80cde

# python_octal

/\400/

/abc/substitute_extended
abc\=replace=\400

/\400/python_octal

/abc/substitute_extended,python_octal
abc\=replace=\400

/\400/utf

/abc/utf,substitute_extended
abc\=replace=\400

/\400/utf,python_octal

/abc/utf,substitute_extended,python_octal
abc\=replace=\400

# End of testinput10
46 changes: 46 additions & 0 deletions testdata/testinput2
Original file line number Diff line number Diff line change
Expand Up @@ -6429,4 +6429,50 @@ a)"xI
# over INT_MAX (used to succeed with \8 being literal 8)
/a\8000000000b/

# --------------

# no_bs0

/a\0b\x00c\00d/
a\x{00}b\x{00}c\x{00}d

/a\0b/no_bs0

/b\x00c\00d/no_bs0
b\x{00}c\x{00}d

/abc/substitute_extended
abc\=replace=a\0b\x00c\00d

/abc/substitute_extended,no_bs0
abc\=replace=a\0b
abc\=replace=b\x00c\00d

# python_octal

/\0-\00-\01-\012-\0123-\123-\1234/
\x00-\x00-\x01-\o{12}-\o{12}3-\o{123}-\o{123}4

/\1/

/\12/
\o{12}

/abc/substitute_extended
abc\=replace=\0-\00-\01-\012-\0123-\123-\1234
abc\=replace=\1
abc\=replace=\12

/\0-\00-\01-\012-\0123-\123-\1234/python_octal
\x00-\x00-\x01-\o{12}-\o{12}3-\o{123}-\o{123}4

/\1/python_octal

/\12/python_octal

/abc/substitute_extended,python_octal
abc\=replace=\0-\00-\01-\012-\0123-\123-\1234
abc\=replace=\1
abc\=replace=\12

# End of testinput2
13 changes: 13 additions & 0 deletions testdata/testinput5
Original file line number Diff line number Diff line change
Expand Up @@ -2520,4 +2520,17 @@
/a(?<nämedverylongbutperfectlylegalsoyoushouldnthaveaproblem_٢>b)c/utf,substitute_extended
abc\=replace=>${nämedverylongbutperfectlylegalsoyoushouldnthaveaproblem_٢}<

# python_octal

/\400/utf
\o{400}

/\400/utf,python_octal

/abc/utf,substitute_extended
abc\=replace=\400

/abc/utf,substitute_extended,python_octal
abc\=replace=\400

# End of testinput5
Expand Down
29 changes: 29 additions & 0 deletions testdata/testoutput10
Original file line number Diff line number Diff line change
Expand Up @@ -1955,4 +1955,33 @@ No match
0: ab
1: ab

# python_octal

/\400/
Failed: error 151 at offset 4: octal value is greater than \377 in 8-bit non-UTF-8 mode

/abc/substitute_extended
abc\=replace=\400
Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string

/\400/python_octal
Failed: error 151 at offset 4: octal value is greater than \377 in 8-bit non-UTF-8 mode

/abc/substitute_extended,python_octal
abc\=replace=\400
Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string

/\400/utf

/abc/utf,substitute_extended
abc\=replace=\400
1: \x{100}

/\400/utf,python_octal
Failed: error 151 at offset 4: octal value is greater than \377 in 8-bit non-UTF-8 mode

/abc/utf,substitute_extended,python_octal
abc\=replace=\400
Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string

# End of testinput10
Loading

0 comments on commit 4a37c50

Please sign in to comment.