Skip to content

Commit

Permalink
Fix script extension support on jit (#69)
Browse files Browse the repository at this point in the history
Co-authored-by: Zoltan Herczeg <hzmester@freemail.hu>
  • Loading branch information
zherczeg and Zoltan Herczeg committed Jan 3, 2022
1 parent c24047f commit 435140a
Show file tree
Hide file tree
Showing 6 changed files with 67 additions and 41 deletions.
82 changes: 65 additions & 17 deletions src/pcre2_jit_compile.c
Original file line number Diff line number Diff line change
Expand Up @@ -7413,14 +7413,18 @@ return cc;
static PCRE2_SPTR compile_char1_matchingpath(compiler_common *common, PCRE2_UCHAR type, PCRE2_SPTR cc, jump_list **backtracks, BOOL check_str_ptr);

#ifdef SUPPORT_UNICODE
/* Bit flags that are OR-ed into and tested against the unicode_status
   variable while an extended class (XCLASS) is compiled. */

/* NOTE(review): the seven defines below are repeated further down with
   three-digit literals of the same numeric value. This looks like the removed
   and added sides of a diff shown without +/- markers - confirm which set is
   meant to remain. */
#define XCLASS_SAVE_CHAR 0x01
#define XCLASS_CHAR_SAVED 0x02
#define XCLASS_HAS_TYPE 0x04
#define XCLASS_HAS_SCRIPT 0x08
#define XCLASS_HAS_SCRIPT_EXTENSION 0x10
#define XCLASS_HAS_BIDICO 0x20
#define XCLASS_HAS_BIDICL 0x40
/* Same flags, written with three hex digits so that they line up with the
   0x080 - 0x200 values defined below. */
#define XCLASS_SAVE_CHAR 0x001
#define XCLASS_CHAR_SAVED 0x002
#define XCLASS_HAS_TYPE 0x004
#define XCLASS_HAS_SCRIPT 0x008
/* Set when the class contains a PT_SCX (script extension) item. */
#define XCLASS_HAS_SCRIPT_EXTENSION 0x010
#define XCLASS_HAS_BIDICO 0x020
#define XCLASS_HAS_BIDICL 0x040
/* Mask combining every XCLASS_HAS_* flag above. */
#define XCLASS_NEEDS_UCD (XCLASS_HAS_TYPE | XCLASS_HAS_SCRIPT | XCLASS_HAS_SCRIPT_EXTENSION | XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL)
/* Set when a PT_SCX item is preceded by XCL_NOTPROP (a negated script
   extension test). */
#define XCLASS_SCRIPT_EXTENSION_NOTPROP 0x080
/* Set after TMP2 has been copied into RETURN_ADDR during the script extension
   checks; TMP2 is copied back from RETURN_ADDR once those checks are emitted. */
#define XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR 0x100
/* Set after TMP2 has been stored into the LOCALS0 slot (relative to SLJIT_SP)
   during the script extension checks; TMP2 is reloaded from that slot once
   those checks are emitted. */
#define XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0 0x200

#endif /* SUPPORT_UNICODE */

static void compile_xclass_matchingpath(compiler_common *common, PCRE2_SPTR cc, jump_list **backtracks)
Expand Down Expand Up @@ -7521,6 +7525,11 @@ while (*cc != XCL_END)

case PT_SCX:
unicode_status |= XCLASS_HAS_SCRIPT_EXTENSION;
if (cc[-1] == XCL_NOTPROP)
{
unicode_status |= XCLASS_SCRIPT_EXTENSION_NOTPROP;
break;
}
compares++;

case PT_SC:
Expand Down Expand Up @@ -7679,14 +7688,19 @@ if (unicode_status & XCLASS_NEEDS_UCD)
{
SLJIT_ASSERT(*cc == XCL_PROP || *cc == XCL_NOTPROP);
cc++;
if (*cc == PT_SC || *cc == PT_SCX)
switch (*cc)
{
case PT_SCX:
if (cc[-1] == XCL_NOTPROP)
break;

case PT_SC:
compares--;
invertcmp = (compares == 0 && list != backtracks);
if (cc[-1] == XCL_NOTPROP)
invertcmp ^= 0x1;
jump = CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]);
add_jump(compiler, compares > 0 ? list : backtracks, jump);

add_jump(compiler, compares > 0 ? list : backtracks, CMP(SLJIT_EQUAL ^ invertcmp, TMP1, 0, SLJIT_IMM, (int)cc[1]));
}
cc += 2;
}
Expand All @@ -7697,6 +7711,27 @@ if (unicode_status & XCLASS_NEEDS_UCD)

if (unicode_status & XCLASS_HAS_SCRIPT_EXTENSION)
{
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);

if (unicode_status & XCLASS_SCRIPT_EXTENSION_NOTPROP)
{
if (unicode_status & (XCLASS_HAS_BIDICO | XCLASS_HAS_BIDICL | XCLASS_HAS_TYPE))
{
if (unicode_status & XCLASS_SAVE_CHAR)
{
OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, TMP2, 0);
unicode_status |= XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0;
}
else
{
OP1(SLJIT_MOV, RETURN_ADDR, 0, TMP2, 0);
unicode_status |= XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR;
}
}
OP1(SLJIT_MOV_U8, TMP2, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, script));
}

while (*cc != XCL_END)
{
if (*cc == XCL_SINGLE)
Expand All @@ -7716,22 +7751,35 @@ if (unicode_status & XCLASS_NEEDS_UCD)
cc++;
if (*cc == PT_SCX)
{
OP1(SLJIT_MOV_U8, TMP1, 0, SLJIT_MEM1(TMP2), (sljit_sw)PRIV(ucd_records) + SLJIT_OFFSETOF(ucd_record, scriptx));
OP2(SLJIT_SHL, TMP1, 0, TMP1, 0, SLJIT_IMM, 2);
OP1(SLJIT_MOV_U32, TMP1, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)));
OP2(SLJIT_AND | SLJIT_SET_Z, SLJIT_UNUSED, 0, TMP1, 0, SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));

compares--;
invertcmp = (compares == 0 && list != backtracks);

jump = NULL;
if (cc[-1] == XCL_NOTPROP)
{
jump = CMP(SLJIT_EQUAL, TMP2, 0, SLJIT_IMM, (int)cc[1]);
if (invertcmp)
{
add_jump(compiler, backtracks, jump);
jump = NULL;
}
invertcmp ^= 0x1;
jump = JUMP(SLJIT_NOT_ZERO ^ invertcmp);
add_jump(compiler, compares > 0 ? list : backtracks, jump);
}

OP2(SLJIT_AND32 | SLJIT_SET_Z, SLJIT_UNUSED, 0, SLJIT_MEM1(TMP1), (sljit_sw)(PRIV(ucd_script_sets) + (cc[1] >> 5)), SLJIT_IMM, (sljit_sw)1 << (cc[1] & 0x1f));
add_jump(compiler, compares > 0 ? list : backtracks, JUMP(SLJIT_NOT_ZERO ^ invertcmp));

if (jump != NULL)
JUMPHERE(jump);
}
cc += 2;
}
}

if (unicode_status & XCLASS_SCRIPT_EXTENSION_RESTORE_LOCALS0)
OP1(SLJIT_MOV, TMP2, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
else if (unicode_status & XCLASS_SCRIPT_EXTENSION_RESTORE_RETURN_ADDR)
OP1(SLJIT_MOV, TMP2, 0, RETURN_ADDR, 0);
cc = ccbegin;
}

Expand Down
2 changes: 2 additions & 0 deletions src/pcre2_jit_test.c
Original file line number Diff line number Diff line change
Expand Up @@ -413,6 +413,8 @@ static struct regression_test_case regression_test_cases[] = {
{ MUP, A, 0, 0 | F_PROPERTY, "[\\P{L&}]{2}[^\xc2\x85-\xc2\x89\\p{Ll}\\p{Lu}]{2}", "\xc3\xa9\xe6\x92\xad.a\xe6\x92\xad|\xc2\x8a#" },
{ PCRE2_UCP, 0, 0, 0 | F_PROPERTY, "[a-b\\s]{2,5}[^a]", "AB baaa" },
{ MUP, 0, 0, 0 | F_NOMATCH, "[^\\p{Hangul}\\p{Z}]", " " },
{ MUP, 0, 0, 0, "[\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },
{ MUP, 0, 0, 0, "[\\x{a92e}\\p{Lu}\\P{Latin}]+", "c\xEA\xA4\xAE,A,b" },

/* Possible empty brackets. */
{ MU, A, 0, 0, "(?:|ab||bc|a)+d", "abcxabcabd" },
Expand Down
8 changes: 0 additions & 8 deletions testdata/testinput4
Original file line number Diff line number Diff line change
Expand Up @@ -1144,8 +1144,6 @@
\= Expect no match
X\x{06e9}

#subject no_jit

/^\P{Katakana}+/utf
\x{3105}
\= Expect no match
Expand All @@ -1157,8 +1155,6 @@
\x{a014}
\x{a4c6}

#subject -no_jit

/^\p{Any}X/utf
AXYZ
\x{1234}XYZ
Expand Down Expand Up @@ -1410,8 +1406,6 @@
\x{2116}
\x{1D183}

#subject no_jit

/^\p{Inherited}/utf
\x{200c}
\= Expect no match
Expand Down Expand Up @@ -1464,8 +1458,6 @@
/\p{sc:katakana}{3,}?/utf
\x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC

#subject -no_jit

/\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf
\x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}====

Expand Down
4 changes: 0 additions & 4 deletions testdata/testinput5
Original file line number Diff line number Diff line change
Expand Up @@ -2035,8 +2035,6 @@
# doesn't recognize all these scripts. In time these three tests can be moved
# to test 4.

#subject no_jit

/^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
(\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
(\p{Zanabazar_Square}+)/x,utf
Expand Down Expand Up @@ -2085,8 +2083,6 @@
\x{655}
\x{1D1AA}

#subject -no_jit

/\N{U+}/

/\N{U+}/utf
Expand Down
8 changes: 0 additions & 8 deletions testdata/testoutput4
Original file line number Diff line number Diff line change
Expand Up @@ -1892,8 +1892,6 @@ No match
X\x{06e9}
No match

#subject no_jit

/^\P{Katakana}+/utf
\x{3105}
0: \x{3105}
Expand All @@ -1910,8 +1908,6 @@ No match
\x{a4c6}
No match

#subject -no_jit

/^\p{Any}X/utf
AXYZ
0: AX
Expand Down Expand Up @@ -2312,8 +2308,6 @@ No match
\x{1D183}
0: \x{1d183}

#subject no_jit

/^\p{Inherited}/utf
\x{200c}
0: \x{200c}
Expand Down Expand Up @@ -2392,8 +2386,6 @@ No match
\x{30a1}\x{30fa}\x{32d0}\x{1b122}\x{ff66}\x{3001}ABC
0: \x{30a1}\x{30fa}\x{32d0}

#subject -no_jit

/\p{Carian}\p{Cham}\p{Kayah_Li}\p{Lepcha}\p{Lycian}\p{Lydian}\p{Ol_Chiki}\p{Rejang}\p{Saurashtra}\p{Sundanese}\p{Vai}/utf
\x{102A4}\x{AA52}\x{A91D}\x{1C46}\x{10283}\x{1092E}\x{1C6B}\x{A93B}\x{A8BF}\x{1BA0}\x{A50A}====
0: \x{102a4}\x{aa52}\x{a91d}\x{1c46}\x{10283}\x{1092e}\x{1c6b}\x{a93b}\x{a8bf}\x{1ba0}\x{a50a}
Expand Down
4 changes: 0 additions & 4 deletions testdata/testoutput5
Original file line number Diff line number Diff line change
Expand Up @@ -4599,8 +4599,6 @@ No match
# doesn't recognize all these scripts. In time these three tests can be moved
# to test 4.

#subject no_jit

/^(\p{Adlam}+)(\p{Bhaiksuki}+)(\p{Marchen}+)(\p{Newa}+)(\p{Osage}+)
(\p{Tangut}+)(\p{Masaram_Gondi}+)(\p{Nushu}+)(\p{Soyombo}+)
(\p{Zanabazar_Square}+)/x,utf
Expand Down Expand Up @@ -4742,8 +4740,6 @@ Callout 0: last capture = 1
\x{1D1AA}
0: \x{1d1aa}

#subject -no_jit

/\N{U+}/
Failed: error 193 at offset 2: \N{U+dddd} is supported only in Unicode (UTF) mode

Expand Down

0 comments on commit 435140a

Please sign in to comment.