Skip to content

Commit

Permalink
Add support for regular expressions containing hexadecimal digits gre…
Browse files Browse the repository at this point in the history
…ater than `0x7f` (#5442)

* Convert hex digits greater than 0x7f to direct unicode char

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* Add integration tests for hex digits

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* Improve tests

Signed-off-by: Anthony Chang <antchang@nvidia.com>
  • Loading branch information
anthony-chang authored May 12, 2022
1 parent 8650a42 commit 15ae542
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 15 deletions.
14 changes: 14 additions & 0 deletions integration_tests/src/main/python/string_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -836,6 +836,20 @@ def test_regexp_extract_idx_0():
'regexp_extract(a, "^([a-d]*)[0-9]*([a-d]*)\\z", 0)'),
conf=_regexp_conf)

def test_regexp_hexadecimal_digits():
    """Exercise rlike, regexp_extract and regexp_replace with hexadecimal escapes.

    The generated column is built from a pattern mixing two-digit hex escapes,
    braced hex escapes and a hex range inside a character class. Every SQL
    expression below is evaluated through assert_gpu_and_cpu_are_equal_collect
    using the shared regexp configuration.
    """
    # Pattern handed to the string generator; covers values on both sides of 0x7f.
    hex_pattern = (
        '[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{10ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]')
    gen = mk_str_gen(hex_pattern)

    # SQL expressions under test, one per hex-escape form of interest.
    expressions = [
        'rlike(a, "\\\\x7f")',
        'rlike(a, "\\\\x80")',
        'rlike(a, "\\\\x{00eeee}")',
        'regexp_extract(a, "([a-d]+)\\\\xa0([a-d]+)", 1)',
        'regexp_replace(a, "\\\\xff", "")',
        'regexp_replace(a, "\\\\x{10ffff}", "")',
    ]

    def run_queries(spark):
        # Build the single-column frame and project all expressions at once.
        return unary_op_df(spark, gen).selectExpr(*expressions)

    assert_gpu_and_cpu_are_equal_collect(run_queries, conf=_regexp_conf)

def test_regexp_whitespace():
gen = mk_str_gen('\u001e[abcd]\t\n{1,3} [0-9]\n {1,3}\x0b\t[abcd]\r\f[0-9]{0,10}')
assert_gpu_and_cpu_are_equal_collect(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -630,11 +630,11 @@ class CudfRegexTranspiler(mode: RegexMode) {
case RegexHexDigit(digits) =>
val codePoint = Integer.parseInt(digits, 16)
if (codePoint >= 128) {
// see https://github.com/NVIDIA/spark-rapids/issues/4866
throw new RegexUnsupportedException(
"cuDF does not support hex digits > 0x7F")
// cuDF only supports 0x00 to 0x7f hexadecimal chars
RegexChar(codePoint.toChar)
} else {
RegexHexDigit(String.format("%02x", Int.box(codePoint)))
}
RegexHexDigit(String.format("%02x", Int.box(codePoint)))

case RegexEscaped(ch) => ch match {
case 'D' =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -150,14 +150,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
"cuDF does not support octal digits 0o177 < n <= 0o377"))
}

// Checks hex-escape patterns whose code point is above 0x7F, in both the
// two-digit form (\x80, \xff, \xFF) and the braced form (\x{ABC}).
// Each pattern is passed to assertUnsupported in RegexFindMode together with
// the expected message text.
// NOTE(review): assertUnsupported is defined elsewhere in this suite; presumably
// it asserts that transpilation is rejected with that message - confirm.
test("cuDF does not support hex digits > 0x7F") {
// see https://github.com/NVIDIA/spark-rapids/issues/4866
val patterns = Seq(raw"\x80", raw"\xff", raw"\xFF", raw"\x{ABC}")
patterns.foreach(pattern =>
assertUnsupported(pattern, RegexFindMode,
"cuDF does not support hex digits > 0x7F"))
}

test("cuDF does not support octal digits in character classes") {
// see https://github.com/NVIDIA/spark-rapids/issues/4862
val patterns = Seq(raw"[\02]", raw"[\012]", raw"[\0177]")
Expand All @@ -184,10 +176,11 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
"\u0007\u003f\u007f", "\u007f", "\u007f2"))
}

test("hex digits < 0x7F - find") {
val patterns = Seq(raw"\x07", raw"\x3f", raw"\x7F", raw"\x7f", raw"\x{7}", raw"\x{0007f}")
test("hex digits - find") {
val patterns = Seq(raw"\x07", raw"\x3f", raw"\x7F", raw"\x7f", raw"\x{7}", raw"\x{0007f}",
raw"\x80", raw"\xff", raw"\x{0008f}", raw"\x{10FFFF}", raw"\x{00eeee}")
assertCpuGpuMatchesRegexpFind(patterns, Seq("", "\u0007", "a\u0007b",
"\u0007\u003f\u007f", "\u007f", "\u007f2"))
"\u0007\u003f\u007f", "\u0080", "a\u00fe\u00ffb", "ab\ueeeecd"))
}

test("string anchors - find") {
Expand Down

0 comments on commit 15ae542

Please sign in to comment.