Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix script extension support on jit #69

Merged
merged 1 commit into from
Jan 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
82 changes: 65 additions & 17 deletions src/pcre2_jit_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -7413,14 +7413,18 @@ return cc;
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr);

#ifdef SUPPORT_UNICODE
#define XCLASS_SAVE_CHAR 0x01
#define XCLASS_CHAR_SAVED 0x02
#define XCLASS_HAS_TYPE 0x04
#define XCLASS_HAS_SCRIPT 0x08
#define XCLASS_HAS_SCRIPT_EXTENSION 0x10
#define XCLASS_HAS_BIDICO 0x20
#define XCLASS_HAS_BIDICL 0x40
#define XCLASS_SAVE_CHAR 0x001
#define XCLASS_CHAR_SAVED 0x002
#define XCLASS_HAS_TYPE 0x004
#define XCLASS_HAS_SCRIPT 0x008
#define XCLASS_HAS_SCRIPT_EXTENSION 0x010
#define XCLASS_HAS_BIDICO 0x020
#define XCLASS_HAS_BIDICL 0x040
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)
#define XCLASS_SCRIPT_EXTENSION_NOTPROP 0x080
#define XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR 0x100
#define XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0 0x200

#endif /* SUPPORT_UNICODE */

static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
Expand Down Expand Up @@ -7521,6 +7525,11 @@ while (*cc != XCL_END)

case PT_SCX:
unicode_status |= XCLASS_HAS_SCRIPT_EXTENSION;
if (cc[-1] == XCL_NOTPROP)
{
unicode_status |= XCLASS_SCRIPT_EXTENSION_NOTPROP;
break;
}
compares++;

case PT_SC:
Expand Down Expand Up @@ -7679,14 +7688,19 @@ if (unicode_status & XCLASS_NEEDS_UCD)
{
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
cc++;
if (*cc == PT_SC || *cc == PT_SCX)
switch (*cc)
{
case PT_SCX:
if (cc[-1] == XCL_NOTPROP)
break;

case PT_SC:
compares--;
invertcmp = (compares == 0 && list != backtracks);
if (cc[-1] == XCL_NOTPROP)
invertcmp ^= 0x1;
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]);
add_jump(compiler, compares > 0 ? list : backtracks, jump);

add_jump(compiler, compares > 0 ? list : backtracks, CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]));
}
cc += 2;
}
Expand All @@ -7697,6 +7711,27 @@ if (unicode_status & XCLASS_NEEDS_UCD)

if (unicode_status & XCLASS_HAS_SCRIPT_EXTENSION)
{
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);

if (unicode_status & XCLASS_SCRIPT_EXTENSION_NOTPROP)
{
if (unicode_status & (XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL | XCLASS_HAS_TYPE))
{
if (unicode_status & XCLASS_SAVE_CHAR)
{
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, TMP2, 0);
unicode_status |= XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0;
}
else
{
OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP2, 0);
unicode_status |= XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR;
}
}
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
}

while (*cc != XCL_END)
{
if (*cc == XCL_SINGLE)
Expand All @@ -7716,22 +7751,35 @@ if (unicode_status & XCLASS_NEEDS_UCD)
cc++;
if (*cc == PT_SCX)
{
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)));
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));

compares--;
invertcmp = (compares == 0 && list != backtracks);

jump = NULL;
if (cc[-1] == XCL_NOTPROP)
{
jump = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, (int)cc[1]);
if (invertcmp)
{
add_jump(compiler, backtracks, jump);
jump = NULL;
}
invertcmp ^= 0x1;
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
add_jump(compiler, compares > 0 ? list : backtracks, jump);
}

OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));

if (jump != NULL)
JUMPHERE(jump);
}
cc += 2;
}
}

if (unicode_status & XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0)
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
else if (unicode_status & XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR)
OP1(SLJIT_MOV, TMP2, 0, RETURN_ADDR, 0);
cc = ccbegin;
}

Expand Down
2 changes: 2 additions & 0 deletions src/pcre2_jit_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,8 @@ static struct regression_test_case regression_test_cases[] = {
{ MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
{ PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" },
{ MUP, 0, 0, 0 | F_NOMATCH, "[^\\p{Hangul}\\p{Z}]", " " },
{ MUP, 0, 0, 0, "[\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },
{ MUP, 0, 0, 0, "[\\x{a92e}\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },

/* Possible empty brackets. */
{ MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },
Expand Down
8 changes: 0 additions & 8 deletions testdata/testinput4
Original file line number Diff line number Diff line change
Expand Up @@ -1144,8 +1144,6 @@
\= Expect no match
X\x{06e9}

#subject no_jit

/^\P{Katakana}+/utf
\x{3105}
\= Expect no match
Expand All @@ -1157,8 +1155,6 @@
\x{a014}
\x{a4c6}

#subject -no_jit

/^\p{Any}X/utf
AXYZ
\x{1234}XYZ
Expand Down Expand Up @@ -1410,8 +1406,6 @@
\x{2116}
\x{1D183}

#subject no_jit

/^\p{Inherited}/utf
\x{200c}
\= Expect no match
Expand Down Expand Up @@ -1464,8 +1458,6 @@
/\p{sc:katakana}{3,}?/utf
\x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC

#subject -no_jit

/\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf
\x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}====

Expand Down
4 changes: 0 additions & 4 deletions testdata/testinput5
Original file line number Diff line number Diff line change
Expand Up @@ -2035,8 +2035,6 @@
# doesn't recognize all these scripts. In time these three tests can be moved
# to test 4.

#subject no_jit

/^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
(\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
(\p{Zanabazar_Square}+)/x,utf
Expand Down Expand Up @@ -2085,8 +2083,6 @@
\x{655}
\x{1D1AA}

#subject -no_jit

/\N{U+}/

/\N{U+}/utf
Expand Down
8 changes: 0 additions & 8 deletions testdata/testoutput4
Original file line number Diff line number Diff line change
Expand Up @@ -1892,8 +1892,6 @@ No match
X\x{06e9}
No match

#subject no_jit

/^\P{Katakana}+/utf
\x{3105}
0: \x{3105}
Expand All @@ -1910,8 +1908,6 @@ No match
\x{a4c6}
No match

#subject -no_jit

/^\p{Any}X/utf
AXYZ
0: AX
Expand Down Expand Up @@ -2312,8 +2308,6 @@ No match
\x{1D183}
0: \x{1d183}

#subject no_jit

/^\p{Inherited}/utf
\x{200c}
0: \x{200c}
Expand Down Expand Up @@ -2392,8 +2386,6 @@ No match
\x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC
0: \x{30a1}\x{30fa}\x{32d0}

#subject -no_jit

/\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf
\x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}====
0: \x{102a4}\x{aa52}\x{a91d}\x{1c46}\x{10283}\x{1092e}\x{1c6b}\x{a93b}\x{a8bf}\x{1ba0}\x{a50a}
Expand Down
4 changes: 0 additions & 4 deletions testdata/testoutput5
Original file line number Diff line number Diff line change
Expand Up @@ -4599,8 +4599,6 @@ No match
# doesn't recognize all these scripts. In time these three tests can be moved
# to test 4.

#subject no_jit

/^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
(\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
(\p{Zanabazar_Square}+)/x,utf
Expand Down Expand Up @@ -4742,8 +4740,6 @@ Callout 0: last capture = 1
\x{1D1AA}
0: \x{1d1aa}

#subject -no_jit

/\N{U+}/
Failed: error 193 at offset 2: \N{U+dddd} is supported only in Unicode (UTF) mode

Expand Down