Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ucptest updates #199

Merged
merged 2 commits into from
Feb 1, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 18 additions & 92 deletions maint/ucptest.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,13 @@
* A program for testing the Unicode property table *
***************************************************/

/* Copyright (c) University of Cambridge 2008-2022 */
/* Copyright (c) University of Cambridge 2008-2023 */

/* Compile thus:

gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 -o ucptest \
ucptest.c ../src/pcre2_ucd.c ../src/pcre2_tables.c
gcc -DHAVE_CONFIG_H -DPCRE2_CODE_UNIT_WIDTH=8 \
-fvisibility=hidden -o ucptest ucptest.c \
../src/pcre2_ord2utf.c ../src/pcre2_ucd.c ../src/pcre2_tables.c

Add -lreadline or -ledit if PCRE2 was configured with readline or libedit
support in pcre2test.
Expand Down Expand Up @@ -87,7 +88,7 @@ type, gbreak or bidi. The defined values for that property are listed. */
#endif

#ifndef SUPPORT_UNICODE
#define SUPPORT_UNICODE
#error "Unicode support not enabled"
#endif

#include <ctype.h>
Expand Down Expand Up @@ -125,7 +126,6 @@ type, gbreak or bidi. The defined values for that property are listed. */
#define CSS (char **)
#define US (unsigned char *)
#define CUS (const unsigned char *)
#define USS (unsigned char **)

/* -------------------------------------------------------------------*/

Expand Down Expand Up @@ -208,81 +208,6 @@ static const unsigned char *bd_names[] = {
US"WS", US"White space"
};

/* Largest character value that fits in a UTF-8 sequence of 1, 2, ... 6 bytes.
ord2utf8() scans this table to choose the length of the encoding. */
static const unsigned int utf8_table1[] = {
0x0000007f, 0x000007ff, 0x0000ffff, 0x001fffff, 0x03ffffff, 0x7fffffff};

/* Marker bits that ord2utf8() ORs into the first byte of a sequence, indexed
by the number of additional bytes that follow it (0 to 5). */
static const int utf8_table2[] = {
0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};

/* Macro to pick up the remaining bytes of a UTF-8 character, advancing
the pointer. On entry, c must already hold the first byte of the character and
eptr must point at the byte that follows it. On exit, c holds the assembled
value and eptr has been advanced past the 1 to 5 trailing bytes that were
consumed. The length is chosen by testing bits 0x20, 0x10, 0x08 and 0x04 of the
first byte in turn.

Nothing is validated: the macro does not check that c is a lead byte, nor that
the trailing bytes have the 10xxxxxx form. Both arguments are evaluated more
than once and are assigned to, so they must be plain lvalues without side
effects. */

#define GETUTF8INC(c, eptr) \
{ \
if ((c & 0x20u) == 0) \
c = ((c & 0x1fu) << 6) | (*eptr++ & 0x3fu); \
else if ((c & 0x10u) == 0) \
{ \
c = ((c & 0x0fu) << 12) | ((*eptr & 0x3fu) << 6) | (eptr[1] & 0x3fu); \
eptr += 2; \
} \
else if ((c & 0x08u) == 0) \
{ \
c = ((c & 0x07u) << 18) | ((*eptr & 0x3fu) << 12) | \
((eptr[1] & 0x3fu) << 6) | (eptr[2] & 0x3fu); \
eptr += 3; \
} \
else if ((c & 0x04u) == 0) \
{ \
c = ((c & 0x03u) << 24) | ((*eptr & 0x3fu) << 18) | \
((eptr[1] & 0x3fu) << 12) | ((eptr[2] & 0x3fu) << 6) | \
(eptr[3] & 0x3fu); \
eptr += 4; \
} \
else \
{ \
c = ((c & 0x01u) << 30) | ((*eptr & 0x3fu) << 24) | \
((eptr[1] & 0x3fu) << 18) | ((eptr[2] & 0x3fu) << 12) | \
((eptr[3] & 0x3fu) << 6) | (eptr[4] & 0x3fu); \
eptr += 5; \
} \
}



/*************************************************
*       Convert character value to UTF-8         *
*************************************************/

/* Encode an unsigned integer value in the range 0 - 0x7fffffff as a UTF-8
character occupying 1 to 6 bytes. The length is the index of the first entry
in utf8_table1 that is not smaller than the value, plus one. No terminating
zero is written.

Arguments:
  cvalue     the character value
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of bytes placed in the buffer
             0 if input code point is too big (buffer is left untouched)
*/

static size_t
ord2utf8(unsigned int cvalue, unsigned char *buffer)
{
const size_t table_len = sizeof(utf8_table1)/sizeof(utf8_table1[0]);
size_t extra = 0;   /* Number of bytes that follow the first one */
size_t pos;

/* Pick the shortest encoding whose upper limit covers the value. */

while (extra < table_len && cvalue > utf8_table1[extra]) extra++;
if (extra == table_len) return 0;

/* Fill in the trailing bytes from last to first, six bits at a time; what
remains of the value then goes into the first byte with its marker bits. */

for (pos = extra; pos > 0; pos--)
  {
  buffer[pos] = 0x80 | (cvalue & 0x3f);
  cvalue >>= 6;
  }

buffer[0] = utf8_table2[extra] | cvalue;
return extra + 1;
}



/*************************************************
* Test for interaction *
Expand Down Expand Up @@ -357,7 +282,7 @@ return yield;
static void
print_prop(unsigned int c, BOOL is_just_one)
{
int type = UCD_CATEGORY(c);
unsigned int type = UCD_CATEGORY(c);
int fulltype = UCD_CHARTYPE(c);
int script = UCD_SCRIPT(c);
int scriptx = UCD_SCRIPTX(c);
Expand Down Expand Up @@ -473,7 +398,7 @@ printf("U+%04X %s %s: %s, %s, %s", c, bidiclass, typename, fulltypename,

if (is_just_one && (othercase != c || caseset != 0))
{
printf(", U+%04X", othercase);
if (othercase != c) printf(", U+%04X", othercase);
if (caseset != 0)
{
const uint32_t *p = PRIV(ucd_caseless_sets) + caseset - 1;
Expand Down Expand Up @@ -517,8 +442,8 @@ if (bprops != 0)
if (show_character && is_just_one)
{
unsigned char buffer[8];
size_t len = ord2utf8(c, buffer);
printf(", >%.*s<", (int)len, buffer);
int len = (int)PRIV(ord2utf_8)(c, buffer);
printf(", >%.*s<", len, buffer);
}

printf("\n");
Expand Down Expand Up @@ -557,7 +482,6 @@ const char *pad = " ";
while (*s != 0)
{
unsigned int offset = 0;
BOOL scriptx_not = FALSE;

for (t = name; *s != 0 && !isspace(*s); s++) *t++ = *s;
*t = 0;
Expand All @@ -573,6 +497,7 @@ while (*s != 0)
if (strcmp(CS name, "script") == 0 ||
strcmp(CS name, "scriptx") == 0)
{
BOOL scriptx_not = FALSE;
for (t = value; *t != 0; t++) *t = tolower(*t);

if (value[0] == '!')
Expand Down Expand Up @@ -656,7 +581,7 @@ while (*s != 0)

for (i = 0; i < sizeof(type_names)/sizeof(char *); i += 2)
{
if (strcmp(CS (value + offset), CS type_names[i]) == 0)
if (strcmp(CS (value + offset), CCS type_names[i]) == 0)
{
type = i/2;
break;
Expand Down Expand Up @@ -687,7 +612,7 @@ while (*s != 0)

for (i = 0; i < sizeof(gb_names)/sizeof(char *); i += 2)
{
if (strcmp(CS (value + offset), CS gb_names[i]) == 0)
if (strcmp(CS (value + offset), CCS gb_names[i]) == 0)
{
gbreak = i/2;
break;
Expand Down Expand Up @@ -719,7 +644,7 @@ while (*s != 0)
}
for (i = 0; i < sizeof(bd_names)/sizeof(char *); i += 2)
{
if (strcasecmp(CS (value + offset), CS bd_names[i]) == 0)
if (strcasecmp(CS (value + offset), CCS bd_names[i]) == 0)
{
bidiclass = i/2;
break;
Expand Down Expand Up @@ -903,13 +828,14 @@ if (strcmp(CS name, "findprop") == 0)
if (c > 0x7fu)
{
GETCHARINC(c, t);
endptr = t;
}
endptr = t+1;
else endptr = t+1;
}
else
{
if (strncmp(CS t, "U+", 2) == 0) t += 2;
c = strtoul(CS t, CSS(&endptr), 16);
if (memcmp(t, "U+", 2) == 0) t += 2;
c = (uint32_t)strtoul(CS t, CSS(&endptr), 16);
}

if (*endptr != 0 && !isspace(*endptr))
Expand Down Expand Up @@ -1018,7 +944,7 @@ if (argc > first_arg)
char *arg = argv[first_arg];
unsigned char *s = buffer;

if (*arg != '+' && strncmp(arg, "U+", 2) != 0 && !isdigit(*arg))
if (*arg != '+' && memcmp(arg, "U+", 2) != 0 && !isdigit(*arg))
{
while (*arg != 0)
{
Expand Down
1 change: 1 addition & 0 deletions maint/ucptestdata/testinput1
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,6 @@ findprop 32ff
findprop 1f16d

findprop U+10e93 U+10eaa
findprop +á +é U+212A

findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
20 changes: 12 additions & 8 deletions maint/ucptestdata/testoutput1
Original file line number Diff line number Diff line change
Expand Up @@ -300,18 +300,18 @@ U+FFED ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, exte
U+FFEE ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFEF L Control: Unassigned, unknown, Other
findprop fff8 fff9 fffa fffb fffc fffd fffe ffff
U+FFF8 BN Control: Unassigned, unknown, Control, [dash, defaultignorablecodepoint, deprecated, extendedpictographic, joincontrol, lowercase, patternwhitespace, quotationmark, sentenceterminal, softdotted, xidcontinue, xidstart]
U+FFF9 ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
U+FFFA ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
U+FFFB ON Control: Format, common, Control, [changeswhenuppercased, deprecated, emojimodifier, emojipresentation, extender, sentenceterminal, xidcontinue, xidstart]
U+FFF8 BN Control: Unassigned, unknown, Control, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, patternsyntax]
U+FFF9 ON Control: Format, common, Control, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
U+FFFA ON Control: Format, common, Control, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
U+FFFB ON Control: Format, common, Control, [changeswhencasemapped, changeswhentitlecased, emojimodifier, emojimodifierbase, graphemeextend, hexdigit, logicalorderexception, lowercase, math, noncharactercodepoint, patternsyntax, patternwhitespace, prependedconcatenationmark, quotationmark, radical, regionalindicator, sentenceterminal, softdotted, terminalpunctuation, unifiedideograph, uppercase, variationselector, whitespace, xidcontinue, xidstart]
U+FFFC ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFFD ON Symbol: Other symbol, common, Other, [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]
U+FFFE BN Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+FFFF BN Control: Unassigned, unknown, Other, [changeswhenuppercased, deprecated, emojicomponent, emojimodifierbase, extender, quotationmark, sentenceterminal, xidcontinue, xidstart]
U+FFFE BN Control: Unassigned, unknown, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
U+FFFF BN Control: Unassigned, unknown, Other, [changeswhencasemapped, changeswhenuppercased, emojimodifier, emojimodifierbase, joincontrol, logicalorderexception, math, patternwhitespace, prependedconcatenationmark]
findprop 10000 10001 e01ef f0000 100000
U+10000 L Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+10001 L Letter: Other letter, linearb, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+E01EF NSM Mark: Non-spacing mark, inherited, Extend, []
U+E01EF NSM Mark: Non-spacing mark, inherited, Extend, [ascii, alphabetic, cased, emojicomponent]
U+F0000 L Control: Private use, unknown, Other
U+100000 L Control: Private use, unknown, Other

Expand Down Expand Up @@ -391,11 +391,15 @@ findprop 32ff
U+32FF L Symbol: Other symbol, common, Other, [han], [alphabetic, caseignorable, extender, graphemebase, idcontinue, idstart, xidcontinue, xidstart]

findprop 1f16d
U+1F16D ON Symbol: Other symbol, common, Extended Pictographic, [ascii, sentenceterminal, unifiedideograph, whitespace, xidcontinue]
U+1F16D ON Symbol: Other symbol, common, Extended Pictographic, [changeswhenuppercased, deprecated, emojimodifier, emojimodifierbase, extender, quotationmark, regionalindicator, xidcontinue, xidstart]

findprop U+10e93 U+10eaa
U+10E93 R Letter: Other letter, yezidi, Other, [alphabetic, diacritic, graphemebase, idcontinue, xidcontinue]
U+10EAA R Control: Unassigned, unknown, Other
findprop +á +é U+212A
U+00E1 L Letter: Lower case letter, latin, Other, U+00C1, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+00E9 L Letter: Lower case letter, latin, Other, U+00C9, [alphabetic, graphemebase, idcontinue, idstart, xidcontinue]
U+212A L Letter: Upper case letter, latin, Other, U+004B, U+006B, [alphabetic, graphemeextend, idcontinue, xidcontinue]

findprop 0602 202a 202b 202c 2068 2069 202d 202e 2067
U+0602 AN Control: Format, arabic, Prepend, [alphabetic, cased, changeswhencasemapped, changeswhentitlecased, changeswhenuppercased, graphemebase, lowercase]
Expand Down
Loading