Allow restricting [:xdigit:] to ASCII for POSIX compatibility

While at it, do some cleanup to related code and documentation
carenas · Oct 8, 2023 · cecb429 · cecb429
1 parent 2fef163
commit cecb429
Show file tree

Hide file tree

Showing 7 changed files with 93 additions and 37 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -119,7 +119,7 @@ does the same, except for \u{...}, which is recognized only when
 PCRE2_EXTRA_ALT_BSUX is set. This an ECMAScript, non-Perl compatible,
 extension, so PCRE2 follows ECMAScript rather than Perl.
 
-31. Applied pull request #300 bu Carlo, which fixes #261. The bug was that
+31. Applied pull request #300 by Carlo, which fixes #261. The bug was that
 pcre2_match() was not fully resetting all captures that had been set within a
 (possibly recursive) subroutine call such as (?3).
 
@@ -128,8 +128,9 @@ now matches characters whose general categories are L or N or whose particular
 categories are Mn (non-spacing mark) or Pc (combining puntuation). The latter
 includes underscore.
 
-33. Changed the meaning of [:digit:] in UCP mode to match Perl. It now also
-matches the "fullwidth" versions of the hex digits.
+33. Changed the meaning of [:xdigit:] in UCP mode to match Perl. It now also
+matches the "fullwidth" versions of the hex digits. Just like it is done for
+[:digit:], PCRE2_EXTRA_ASCII_DIGIT can be used to keep this class ASCII only.
 
 
 Version 10.42 11-December-2022

diff --git a/doc/pcre2_set_compile_extra_options.3 b/doc/pcre2_set_compile_extra_options.3
@@ -29,8 +29,8 @@ options are:
   PCRE2_EXTRA_ASCII_BSS                \es remains ASCII in UCP mode
   PCRE2_EXTRA_ASCII_BSW                \ew remains ASCII in UCP mode
 .\" JOIN
-  PCRE2_EXTRA_ASCII_DIGIT              [:digit:] POSIX class remains ASCII
-                                         in UCP mode
+  PCRE2_EXTRA_ASCII_DIGIT              [:digit:] and [:xdigit:] POSIX classes
+                                         remain ASCII in UCP mode
 .\" JOIN
   PCRE2_EXTRA_ASCII_POSIX              POSIX classes remain ASCII in
                                          UCP mode

diff --git a/doc/pcre2api.3 b/doc/pcre2api.3
@@ -1971,8 +1971,8 @@ setting.
 .sp
   PCRE2_EXTRA_ASCII_DIGIT
 .sp
-This option forces the POSIX character class [:digit:] to match only ASCII
-digits, even when PCRE2_UCP is set.
+This option forces the POSIX character classes [:digit:] and [:xdigit:] to
+match only ASCII digits, even when PCRE2_UCP is set.
 .sp
   PCRE2_EXTRA_ASCII_POSIX
 .sp

diff --git a/doc/pcre2pattern.3 b/doc/pcre2pattern.3
@@ -1569,14 +1569,16 @@ plus those characters with code points less than 256 that have the S (Symbol)
 property.
 .TP 10
 [:xdigit:]
-In addition to the ASCII hexadecimal digits, this also matches the "fullwidth" 
-versions of those characters, whose Unicode code points start at U+FF10. This 
-is a change that was made in PCRE release 10.43 for Perl compatibility.
+In addition to the ASCII hexadecimal digits, this also matches the "fullwidth"
+versions of those characters, whose Unicode code points start at U+FF10. The
+effect of PCRE2_UCP can be negated by setting the PCRE2_EXTRA_ASCII_DIGIT
+option, just like it does for [:digit:]. This is a change that was made in
+PCRE release 10.43 for Perl compatibility.
 .P
 The other POSIX classes are unchanged by PCRE2_UCP, and match only characters
-with code points less than 256. The effect of PCRE2_UCP on POSIX classes can be
-negated by setting the PCRE2_EXTRA_ASCII_POSIX option, either when calling
-\fBpcre2_compile()\fP or internally within the pattern.
+with code points less than 256. The effect of PCRE2_UCP on all POSIX classes
+can be negated by setting the PCRE2_EXTRA_ASCII_POSIX option, either when
+calling \fBpcre2_compile()\fP or internally within the pattern.
 .
 .
 .SH "COMPATIBILITY FEATURE FOR WORD BOUNDARIES"

diff --git a/src/pcre2_compile.c b/src/pcre2_compile.c
@@ -706,6 +706,7 @@ static const char posix_names[] =
 static const uint8_t posix_name_lengths[] = {
   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
 
+#define PC_DIGIT   7
 #define PC_GRAPH   8
 #define PC_PRINT   9
 #define PC_PUNCT  10
@@ -722,20 +723,20 @@ absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
 remove vertical space characters, 2 => remove underscore. */
 
 static const int posix_class_maps[] = {
-  cbit_word,  cbit_digit, -2,             /* alpha */
-  cbit_lower, -1,          0,             /* lower */
-  cbit_upper, -1,          0,             /* upper */
-  cbit_word,  -1,          2,             /* alnum - word without underscore */
-  cbit_print, cbit_cntrl,  0,             /* ascii */
-  cbit_space, -1,          1,             /* blank - a GNU extension */
-  cbit_cntrl, -1,          0,             /* cntrl */
-  cbit_digit, -1,          0,             /* digit */
-  cbit_graph, -1,          0,             /* graph */
-  cbit_print, -1,          0,             /* print */
-  cbit_punct, -1,          0,             /* punct */
-  cbit_space, -1,          0,             /* space */
-  cbit_word,  -1,          0,             /* word - a Perl extension */
-  cbit_xdigit,-1,          0              /* xdigit */
+  cbit_word,   cbit_digit, -2,            /* alpha */
+  cbit_lower,  -1,          0,            /* lower */
+  cbit_upper,  -1,          0,            /* upper */
+  cbit_word,   -1,          2,            /* alnum - word without underscore */
+  cbit_print,  cbit_cntrl,  0,            /* ascii */
+  cbit_space,  -1,          1,            /* blank - a GNU extension */
+  cbit_cntrl,  -1,          0,            /* cntrl */
+  cbit_digit,  -1,          0,            /* digit */
+  cbit_graph,  -1,          0,            /* graph */
+  cbit_print,  -1,          0,            /* print */
+  cbit_punct,  -1,          0,            /* punct */
+  cbit_space,  -1,          0,            /* space */
+  cbit_word,   -1,          0,            /* word - a Perl extension */
+  cbit_xdigit, -1,          0             /* xdigit */
 };
 
 #ifdef SUPPORT_UNICODE
@@ -3676,7 +3677,8 @@ while (ptr < ptrend)
 #ifdef SUPPORT_UNICODE
         if ((options & PCRE2_UCP) != 0 &&
             (xoptions & PCRE2_EXTRA_ASCII_POSIX) == 0 &&
-            !(posix_class == 7 && (xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0))
+            !((xoptions & PCRE2_EXTRA_ASCII_DIGIT) != 0 &&
+              (posix_class == PC_DIGIT || posix_class == PC_XDIGIT)))
           {
           int ptype = posix_substitutes[2*posix_class];
           int pvalue = posix_substitutes[2*posix_class + 1];
@@ -6028,19 +6030,18 @@ for (;; pptr++)
             *class_uchardata++ = local_negate? XCL_NOTPROP : XCL_PROP;
             *class_uchardata++ = (PCRE2_UCHAR)
               ((posix_class == PC_GRAPH)? PT_PXGRAPH :
-               (posix_class == PC_PRINT)? PT_PXPRINT : 
-               (posix_class == PC_XDIGIT)? PT_PXXDIGIT : PT_PXPUNCT);
+               (posix_class == PC_PRINT)? PT_PXPRINT : PT_PXPUNCT);
             *class_uchardata++ = 0;
             xclass_has_prop = TRUE;
             goto CONTINUE_CLASS;
 
-            /* For the other POSIX classes (ascii, xdigit) we are going to
+            /* For the other POSIX classes (ex: ascii) we are going to
             fall through to the non-UCP case and build a bit map for
             characters with code points less than 256. However, if we are in
             a negated POSIX class, characters with code points greater than
             255 must either all match or all not match, depending on whether
             the whole class is not or is negated. For example, for
-            [[:^ascii:]... they must all match, whereas for [^[:^xdigit:]...
+            [[:^ascii:]... they must all match, whereas for [^[:^ascii:]...
             they must not.
 
             In the special case where there are no xclass items, this is
@@ -6352,11 +6353,11 @@ for (;; pptr++)
     characters > 255 are in or not in the class, so any that were explicitly
     given as well can be ignored.
 
-    In the UCP case, if certain negated POSIX classes ([:^ascii:] or
-    [^:xdigit:]) were present in a class, we either have to match or not match
-    all wide characters (depending on whether the whole class is or is not
-    negated). This requirement is indicated by match_all_or_no_wide_chars being
-    true. We do this by including an explicit range, which works in both cases.
+    In the UCP case, if certain negated POSIX classes (ex: [:^ascii:]) were
+    present in a class, we either have to match or not match all wide
+    characters (depending on whether the whole class is or is not negated).
+    This requirement is indicated by match_all_or_no_wide_chars being true.
+    We do this by including an explicit range, which works in both cases.
     This applies only in UTF and 16-bit and 32-bit non-UTF modes, since there
     cannot be any wide characters in 8-bit non-UTF mode.
 

diff --git a/testdata/testinput5 b/testdata/testinput5
@@ -1234,6 +1234,8 @@
 
 /[[:xdigit:]]/B,ucp
 
+/[[:xdigit:]]/B,ucp,ascii_digit
+
 # Unicode properties for \b and \B
 
 /\b...\B/utf,ucp
@@ -2445,6 +2447,22 @@
 /[[:digit:]]+/utf,ucp,ascii_posix
     123\x{660}456
 
+/^[[:xdigit:]]+$/utf,ucp
+    f0
+    1A
+    d\x{ff10}
+    \x{ff26}8
+\= Expect no match
+    8g\=no_jit
+
+/^[[:xdigit:]]+$/utf,ucp,ascii_digit
+    f0
+    1A
+\= Expect no match
+    d\x{ff10}
+    \x{ff26}8
+    8g
+
 />[[:space:]]+</utf,ucp
     >\x{a0} \x{a0}<
     >\x{a0}\x{a0}\x{a0}<

diff --git a/testdata/testoutput5 b/testdata/testoutput5
@@ -2583,6 +2583,14 @@ No match
         End
 ------------------------------------------------------------------
 
+/[[:xdigit:]]/B,ucp,ascii_digit
+------------------------------------------------------------------
+        Bra
+        [0-9A-Fa-f]
+        Ket
+        End
+------------------------------------------------------------------
+
 # Unicode properties for \b and \B
 
 /\b...\B/utf,ucp
@@ -5384,6 +5392,32 @@ No match
     123\x{660}456
  0: 123
 
+/^[[:xdigit:]]+$/utf,ucp
+    f0
+ 0: f0
+    1A
+ 0: 1A
+    d\x{ff10}
+ 0: d\x{ff10}
+    \x{ff26}8
+ 0: \x{ff26}8
+\= Expect no match
+    8g\=no_jit
+No match
+
+/^[[:xdigit:]]+$/utf,ucp,ascii_digit
+    f0
+ 0: f0
+    1A
+ 0: 1A
+\= Expect no match
+    d\x{ff10}
+No match
+    \x{ff26}8
+No match
+    8g
+No match
+
 />[[:space:]]+</utf,ucp
     >\x{a0} \x{a0}<
  0: >\x{a0} \x{a0}<