From 4a37c505d2580c8c31d78c6094908acc60ad06fe Mon Sep 17 00:00:00 2001
From: Nicholas Wilson <niwilson@microsoft.com>
Date: Fri, 13 Sep 2024 20:47:10 +0100
Subject: [PATCH] First PR! octal handling flags

---
 src/pcre2.h.in        |   2 +
 src/pcre2_compile.c   | 131 ++++++++++++++++++++++++++++--------------
 src/pcre2test.c       |   2 +
 testdata/testinput10  |  22 +++++++
 testdata/testinput2   |  46 +++++++++++++++
 testdata/testinput5   |  13 +++++
 testdata/testoutput10 |  29 ++++++++++
 testdata/testoutput2  |  64 +++++++++++++++++++++
 testdata/testoutput5  |  17 ++++++
 9 files changed, 283 insertions(+), 43 deletions(-)

diff --git a/src/pcre2.h.in b/src/pcre2.h.in
index a19313c9e..5558d8b7e 100644
--- a/src/pcre2.h.in
+++ b/src/pcre2.h.in
@@ -159,6 +159,8 @@ D   is inspected during pcre2_dfa_match() execution
 #define PCRE2_EXTRA_ASCII_BSW                0x00000400u  /* C */
 #define PCRE2_EXTRA_ASCII_POSIX              0x00000800u  /* C */
 #define PCRE2_EXTRA_ASCII_DIGIT              0x00001000u  /* C */
+#define PCRE2_EXTRA_PYTHON_OCTAL             0x00002000u  /* C */
+#define PCRE2_EXTRA_NO_BS0                   0x00004000u  /* C */
 
 /* These are for pcre2_jit_compile(). */
 
diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
index 26f6c1645..fc2c3fd30 100644
--- a/src/pcre2_compile.c
+++ b/src/pcre2_compile.c
@@ -517,44 +517,44 @@ in UTF-8 mode. It runs from '0' to 'z'. */
 #define UPPER_CASE(c)       (c-32)
 
 static const short int escapes[] = {
-     0,                       0,
-     0,                       0,
-     0,                       0,
-     0,                       0,
-     0,                       0,
-     CHAR_COLON,              CHAR_SEMICOLON,
-     CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
-     CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
-     CHAR_COMMERCIAL_AT,      -ESC_A,
-     -ESC_B,                  -ESC_C,
-     -ESC_D,                  -ESC_E,
-     0,                       -ESC_G,
-     -ESC_H,                  0,
-     0,                       -ESC_K,
-     0,                       0,
-     -ESC_N,                  0,
-     -ESC_P,                  -ESC_Q,
-     -ESC_R,                  -ESC_S,
-     0,                       0,
-     -ESC_V,                  -ESC_W,
-     -ESC_X,                  0,
-     -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
-     CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
-     CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
-     CHAR_GRAVE_ACCENT,       CHAR_BEL,
-     -ESC_b,                  0,
-     -ESC_d,                  CHAR_ESC,
-     CHAR_FF,                 0,
-     -ESC_h,                  0,
-     0,                       -ESC_k,
-     0,                       0,
-     CHAR_LF,                 0,
-     -ESC_p,                  0,
-     CHAR_CR,                 -ESC_s,
-     CHAR_HT,                 0,
-     -ESC_v,                  -ESC_w,
-     0,                       0,
-     -ESC_z
+    /* 0 */ 0,                       /* 1 */ 0,
+    /* 2 */ 0,                       /* 3 */ 0,
+    /* 4 */ 0,                       /* 5 */ 0,
+    /* 6 */ 0,                       /* 7 */ 0,
+    /* 8 */ 0,                       /* 9 */ 0,
+    /* : */ CHAR_COLON,              /* ; */ CHAR_SEMICOLON,
+    /* < */ CHAR_LESS_THAN_SIGN,     /* = */ CHAR_EQUALS_SIGN,
+    /* > */ CHAR_GREATER_THAN_SIGN,  /* ? */ CHAR_QUESTION_MARK,
+    /* @ */ CHAR_COMMERCIAL_AT,      /* A */ -ESC_A,
+    /* B */ -ESC_B,                  /* C */ -ESC_C,
+    /* D */ -ESC_D,                  /* E */ -ESC_E,
+    /* F */ 0,                       /* G */ -ESC_G,
+    /* H */ -ESC_H,                  /* I */ 0,
+    /* J */ 0,                       /* K */ -ESC_K,
+    /* L */ 0,                       /* M */ 0,
+    /* N */ -ESC_N,                  /* O */ 0,
+    /* P */ -ESC_P,                  /* Q */ -ESC_Q,
+    /* R */ -ESC_R,                  /* S */ -ESC_S,
+    /* T */ 0,                       /* U */ 0,
+    /* V */ -ESC_V,                  /* W */ -ESC_W,
+    /* X */ -ESC_X,                  /* Y */ 0,
+    /* Z */ -ESC_Z,                  /* [ */ CHAR_LEFT_SQUARE_BRACKET,
+    /* \ */ CHAR_BACKSLASH,          /* ] */ CHAR_RIGHT_SQUARE_BRACKET,
+    /* ^ */ CHAR_CIRCUMFLEX_ACCENT,  /* _ */ CHAR_UNDERSCORE,
+    /* ` */ CHAR_GRAVE_ACCENT,       /* a */ CHAR_BEL,
+    /* b */ -ESC_b,                  /* c */ 0,
+    /* d */ -ESC_d,                  /* e */ CHAR_ESC,
+    /* f */ CHAR_FF,                 /* g */ 0,
+    /* h */ -ESC_h,                  /* i */ 0,
+    /* j */ 0,                       /* k */ -ESC_k,
+    /* l */ 0,                       /* m */ 0,
+    /* n */ CHAR_LF,                 /* o */ 0,
+    /* p */ -ESC_p,                  /* q */ 0,
+    /* r */ CHAR_CR,                 /* s */ -ESC_s,
+    /* t */ CHAR_HT,                 /* u */ 0,
+    /* v */ -ESC_v,                  /* w */ -ESC_w,
+    /* x */ 0,                       /* y */ 0,
+    /* z */ -ESC_z
 };
 
 #else
@@ -801,7 +801,7 @@ are allowed. */
     PCRE2_EXTRA_ESCAPED_CR_IS_LF|PCRE2_EXTRA_ALT_BSUX| \
     PCRE2_EXTRA_ALLOW_LOOKAROUND_BSK|PCRE2_EXTRA_ASCII_BSD| \
     PCRE2_EXTRA_ASCII_BSS|PCRE2_EXTRA_ASCII_BSW|PCRE2_EXTRA_ASCII_POSIX| \
-    PCRE2_EXTRA_ASCII_DIGIT)
+    PCRE2_EXTRA_ASCII_DIGIT|PCRE2_EXTRA_PYTHON_OCTAL|PCRE2_EXTRA_NO_BS0)
 
 /* Compile time error code numbers. They are given names so that they can more
 easily be tracked. When a new number is added, the tables called eint1 and
@@ -1495,7 +1495,7 @@ else
   if (pp >= ptrend || *pp != CHAR_RIGHT_CURLY_BRACKET) return FALSE;
   }
 
-/* Now process the quantifier for real. We know it must be {n} or (n,} or {,m}
+/* Now process the quantifier for real. We know it must be {n} or {n,} or {,m}
 or {n,m}. The only error that read_number() can return is for a number that is
 too big. If *errorcodeptr is returned as zero it means no number was found. */
 
@@ -1889,7 +1889,16 @@ else
     number is less than 10, or if there are that many previous extracting left
     brackets, it is a back reference. Otherwise, up to three octal digits are
     read to form an escaped character code. Thus \123 is likely to be octal 123
-    (cf \0123, which is octal 012 followed by the literal 3).
+    (cf \0123, which is octal 012 followed by the literal 3). This is the "Perl
+    style" of handling ambiguous octal/backreferences such as \12.
+
+    There is an alternative disambiguation strategy, selected by
+    PCRE2_EXTRA_PYTHON_OCTAL, which follows Python's behaviour. An octal escape
+    must have either a leading zero, or three octal digits (\1234 is octal 123
+    then a literal 4); otherwise it's a backreference. The disambiguation is
+    stable, and does not depend on how many capture groups are defined (it's
+    simply an invalid backreference if there is no corresponding capture
+    group). Additionally, octal values above \377 (\xff) are rejected.
 
     Inside a character class, \ followed by a digit is always either a literal
     8 or 9 or an octal number. */
@@ -1897,8 +1906,37 @@ else
     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
 
-    if (!isclass)
+    if (isclass)
+      {
+      /* Fall through to octal handling; never a backreference inside a class. */
+      }
+    else if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0)
+      {
+      /* Python-style disambiguation. */
+      if (ptr[-1] <= CHAR_7 && ptr + 1 < ptrend && ptr[0] >= CHAR_0 &&
+          ptr[0] <= CHAR_7 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
+        {
+        /* We peeked a three-digit octal, so fall through */
+        }
+      else
+        {
+        /* We are at a digit, so the only possible error from read_number() is
+        a number that is too large. */
+        ptr--;   /* Back to the digit */
+
+        if (!read_number(&ptr, ptrend, -1, MAX_GROUP_NUMBER, 0, &s, errorcodeptr))
+          {
+          *errorcodeptr = ERR61;
+          break;
+          }
+
+        escape = -s;
+        break;
+        }
+      }
+    else
       {
+      /* Perl-style disambiguation. */
       oldptr = ptr;
       ptr--;   /* Back to the digit */
 
@@ -1935,7 +1973,7 @@ else
     /* \0 always starts an octal number, but we may drop through to here with a
     larger first octal digit. The original code used just to take the least
     significant 8 bits of octal numbers (I think this is what early Perls used
-    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16-bit mode,
+    to do). Nowadays we allow for larger numbers in UTF-8 mode and 16/32-bit mode,
     but no more than 3 octal digits. */
 
     case CHAR_0:
@@ -1945,6 +1983,13 @@ else
 #if PCRE2_CODE_UNIT_WIDTH == 8
     if (!utf && c > 0xff) *errorcodeptr = ERR51;
 #endif
+    if ((xoptions & PCRE2_EXTRA_PYTHON_OCTAL) != 0 && c > 0xff)
+        *errorcodeptr = ERR51;
+
+    /* PCRE2_EXTRA_NO_BS0 disables the NUL escape '\0' but doesn't affect
+    two- or three-digit octal escapes \00 and \000, nor \x00. */
+    if ((xoptions & PCRE2_EXTRA_NO_BS0) != 0 && c == 0 && i == 1)
+        *errorcodeptr = ERR3;
     break;
 
     /* \o is a relatively new Perl feature, supporting a more general way of
diff --git a/src/pcre2test.c b/src/pcre2test.c
index d8f5d6483..f5e6c734d 100644
--- a/src/pcre2test.c
+++ b/src/pcre2test.c
@@ -730,6 +730,7 @@ static modstruct modlist[] = {
   { "newline",                     MOD_CTC,  MOD_NL,  0,                          CO(newline_convention) },
   { "no_auto_capture",             MOD_PAT,  MOD_OPT, PCRE2_NO_AUTO_CAPTURE,      PO(options) },
   { "no_auto_possess",             MOD_PATP, MOD_OPT, PCRE2_NO_AUTO_POSSESS,      PO(options) },
+  { "no_bs0",                      MOD_CTC,  MOD_OPT, PCRE2_EXTRA_NO_BS0,         CO(extra_options) },
   { "no_dotstar_anchor",           MOD_PAT,  MOD_OPT, PCRE2_NO_DOTSTAR_ANCHOR,    PO(options) },
   { "no_jit",                      MOD_DATP, MOD_OPT, PCRE2_NO_JIT,               DO(options) },
   { "no_start_optimize",           MOD_PATP, MOD_OPT, PCRE2_NO_START_OPTIMIZE,    PO(options) },
@@ -756,6 +757,7 @@ static modstruct modlist[] = {
   { "push",                        MOD_PAT,  MOD_CTL, CTL_PUSH,                   PO(control) },
   { "pushcopy",                    MOD_PAT,  MOD_CTL, CTL_PUSHCOPY,               PO(control) },
   { "pushtablescopy",              MOD_PAT,  MOD_CTL, CTL_PUSHTABLESCOPY,         PO(control) },
+  { "python_octal",                MOD_CTC,  MOD_OPT, PCRE2_EXTRA_PYTHON_OCTAL,   CO(extra_options) },
   { "recursion_limit",             MOD_CTM,  MOD_INT, 0,                          MO(depth_limit) },  /* Obsolete synonym */
   { "regerror_buffsize",           MOD_PAT,  MOD_INT, 0,                          PO(regerror_buffsize) },
   { "replace",                     MOD_PND,  MOD_STR, REPLACE_MODSIZE,            PO(replacement) },
diff --git a/testdata/testinput10 b/testdata/testinput10
index 100a3ad16..c17010e68 100644
--- a/testdata/testinput10
+++ b/testdata/testinput10
@@ -666,4 +666,26 @@
 /(..)(*scs:(1)ab$)/match_invalid_utf
     ab\x80cde         
 
+# python_octal
+
+/\400/
+
+/abc/substitute_extended
+    abc\=replace=\400
+
+/\400/python_octal
+
+/abc/substitute_extended,python_octal
+    abc\=replace=\400
+
+/\400/utf
+
+/abc/utf,substitute_extended
+    abc\=replace=\400
+
+/\400/utf,python_octal
+
+/abc/utf,substitute_extended,python_octal
+    abc\=replace=\400
+
 # End of testinput10
diff --git a/testdata/testinput2 b/testdata/testinput2
index 8be78ff50..771de4f59 100644
--- a/testdata/testinput2
+++ b/testdata/testinput2
@@ -6429,4 +6429,50 @@ a)"xI
 # over INT_MAX (used to succeed with \8 being literal 8)
 /a\8000000000b/
 
+# --------------
+
+# no_bs0
+
+/a\0b\x00c\00d/
+    a\x{00}b\x{00}c\x{00}d
+
+/a\0b/no_bs0
+
+/b\x00c\00d/no_bs0
+    b\x{00}c\x{00}d
+
+/abc/substitute_extended
+    abc\=replace=a\0b\x00c\00d
+
+/abc/substitute_extended,no_bs0
+    abc\=replace=a\0b
+    abc\=replace=b\x00c\00d
+
+# python_octal
+
+/\0-\00-\01-\012-\0123-\123-\1234/
+    \x00-\x00-\x01-\o{12}-\o{12}3-\o{123}-\o{123}4
+
+/\1/
+
+/\12/
+    \o{12}
+
+/abc/substitute_extended
+    abc\=replace=\0-\00-\01-\012-\0123-\123-\1234
+    abc\=replace=\1
+    abc\=replace=\12
+
+/\0-\00-\01-\012-\0123-\123-\1234/python_octal
+    \x00-\x00-\x01-\o{12}-\o{12}3-\o{123}-\o{123}4
+
+/\1/python_octal
+
+/\12/python_octal
+
+/abc/substitute_extended,python_octal
+    abc\=replace=\0-\00-\01-\012-\0123-\123-\1234
+    abc\=replace=\1
+    abc\=replace=\12
+
 # End of testinput2
diff --git a/testdata/testinput5 b/testdata/testinput5
index 5aae6ee06..1ffc79320 100644
--- a/testdata/testinput5
+++ b/testdata/testinput5
@@ -2520,4 +2520,17 @@
 /a(?<nämedverylongbutperfectlylegalsoyoushouldnthaveaproblem_٢>b)c/utf,substitute_extended
     abc\=replace=>${nämedverylongbutperfectlylegalsoyoushouldnthaveaproblem_٢}<
 
+# python_octal
+
+/\400/utf
+    \o{400}
+
+/\400/utf,python_octal
+
+/abc/utf,substitute_extended
+    abc\=replace=\400
+
+/abc/utf,substitute_extended,python_octal
+    abc\=replace=\400
+
 # End of testinput5
diff --git a/testdata/testoutput10 b/testdata/testoutput10
index 91376c08d..1d1b7f09d 100644
--- a/testdata/testoutput10
+++ b/testdata/testoutput10
@@ -1955,4 +1955,33 @@ No match
  0: ab
  1: ab
 
+# python_octal
+
+/\400/
+Failed: error 151 at offset 4: octal value is greater than \377 in 8-bit non-UTF-8 mode
+
+/abc/substitute_extended
+    abc\=replace=\400
+Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string
+
+/\400/python_octal
+Failed: error 151 at offset 4: octal value is greater than \377 in 8-bit non-UTF-8 mode
+
+/abc/substitute_extended,python_octal
+    abc\=replace=\400
+Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string
+
+/\400/utf
+
+/abc/utf,substitute_extended
+    abc\=replace=\400
+ 1: \x{100}
+
+/\400/utf,python_octal
+Failed: error 151 at offset 4: octal value is greater than \377 in 8-bit non-UTF-8 mode
+
+/abc/utf,substitute_extended,python_octal
+    abc\=replace=\400
+Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string
+
 # End of testinput10
diff --git a/testdata/testoutput2 b/testdata/testoutput2
index ccf209b5c..46f126b95 100644
--- a/testdata/testoutput2
+++ b/testdata/testoutput2
@@ -19115,6 +19115,70 @@ Failed: error 161 at offset 7: subpattern number is too big
 /a\8000000000b/
 Failed: error 161 at offset 7: subpattern number is too big
 
+# --------------
+
+# no_bs0
+
+/a\0b\x00c\00d/
+    a\x{00}b\x{00}c\x{00}d
+ 0: a\x00b\x00c\x00d
+
+/a\0b/no_bs0
+Failed: error 103 at offset 3: unrecognized character follows \
+
+/b\x00c\00d/no_bs0
+    b\x{00}c\x{00}d
+ 0: b\x00c\x00d
+
+/abc/substitute_extended
+    abc\=replace=a\0b\x00c\00d
+ 1: a\x00b\x00c\x00d
+
+/abc/substitute_extended,no_bs0
+    abc\=replace=a\0b
+Failed: error -57 at offset 3 in replacement: bad escape sequence in replacement string
+    abc\=replace=b\x00c\00d
+ 1: b\x00c\x00d
+
+# python_octal
+
+/\0-\00-\01-\012-\0123-\123-\1234/
+    \x00-\x00-\x01-\o{12}-\o{12}3-\o{123}-\o{123}4
+ 0: \x00-\x00-\x01-\x0a-\x0a3-S-S4
+
+/\1/
+Failed: error 115 at offset 1: reference to non-existent subpattern
+
+/\12/
+    \o{12}
+ 0: \x0a
+
+/abc/substitute_extended
+    abc\=replace=\0-\00-\01-\012-\0123-\123-\1234
+ 1: \x00-\x00-\x01-\x0a-\x0a3-S-S4
+    abc\=replace=\1
+Failed: error -49 at offset 2 in replacement: unknown substring
+    abc\=replace=\12
+ 1: \x0a
+
+/\0-\00-\01-\012-\0123-\123-\1234/python_octal
+    \x00-\x00-\x01-\o{12}-\o{12}3-\o{123}-\o{123}4
+ 0: \x00-\x00-\x01-\x0a-\x0a3-S-S4
+
+/\1/python_octal
+Failed: error 115 at offset 1: reference to non-existent subpattern
+
+/\12/python_octal
+Failed: error 115 at offset 2: reference to non-existent subpattern
+
+/abc/substitute_extended,python_octal
+    abc\=replace=\0-\00-\01-\012-\0123-\123-\1234
+ 1: \x00-\x00-\x01-\x0a-\x0a3-S-S4
+    abc\=replace=\1
+Failed: error -49 at offset 2 in replacement: unknown substring
+    abc\=replace=\12
+Failed: error -49 at offset 3 in replacement: unknown substring
+
 # End of testinput2
 Error -70: PCRE2_ERROR_BADDATA (unknown error number)
 Error -62: bad serialized data
diff --git a/testdata/testoutput5 b/testdata/testoutput5
index dcd387ed6..f71944f9f 100644
--- a/testdata/testoutput5
+++ b/testdata/testoutput5
@@ -5498,4 +5498,21 @@ Failed: error 147 at offset 10: unknown property after \P or \p
     abc\=replace=>${nämedverylongbutperfectlylegalsoyoushouldnthaveaproblem_٢}<
  1: >b<
 
+# python_octal
+
+/\400/utf
+    \o{400}
+ 0: \x{100}
+
+/\400/utf,python_octal
+Failed: error 151 at offset 4: octal value is greater than \377 in 8-bit non-UTF-8 mode
+
+/abc/utf,substitute_extended
+    abc\=replace=\400
+ 1: \x{100}
+
+/abc/utf,substitute_extended,python_octal
+    abc\=replace=\400
+Failed: error -57 at offset 4 in replacement: bad escape sequence in replacement string
+
 # End of testinput5