Skip to content

Commit

Permalink
Revert "Fix regressions related to cuDF changes in handline of end-of…
Browse files Browse the repository at this point in the history
…-line/string anchors (NVIDIA#7211)"

This reverts commit 3398daa.
  • Loading branch information
andygrove committed Jan 27, 2023
1 parent 3398daa commit fcbf7c7
Show file tree
Hide file tree
Showing 5 changed files with 43 additions and 82 deletions.
17 changes: 9 additions & 8 deletions docs/compatibility.md
Original file line number Diff line number Diff line change
Expand Up @@ -437,23 +437,18 @@ disable regular expressions on the GPU, set `spark.rapids.sql.regexp.enabled=fal

These are the known edge cases where running on the GPU will produce different results to the CPU:

- Regular expressions that contain an end of line anchor '$' or end of string anchor '\Z' immediately
- Regular expressions that contain an end of line anchor '$' or end of string anchor '\Z' or '\z' immediately
next to a newline or a repetition that produces zero or more results
([#5610](https://github.com/NVIDIA/spark-rapids/pull/5610))`
- Word and non-word boundaries, `\b` and `\B`
- Line anchor `$` will incorrectly match any of the unicode characters `\u0085`, `\u2028`, or `\u2029` followed by
another line-terminator, such as `\n`. For example, the pattern `TEST$` will match `TEST\u0085\n` on the GPU but
not on the CPU ([#7585](https://github.com/NVIDIA/spark-rapids/issues/7585)).

The following regular expression patterns are not yet supported on the GPU and will fall back to the CPU.

- Line anchor `^` is not supported in some contexts, such as when combined with a choice (`^|a`).
- Line anchor `$` is not supported in some rare contexts.
- Line anchor `$` is not supported by `regexp_replace`, and in some rare contexts.
- String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts.
- String anchor `\z` is not supported
- Patterns containing an end of line or string anchor immediately next to a newline or repetition that produces zero
or more results
- Line anchor `$` and string anchors `\Z` are not supported in patterns containing `\W` or `\D`
- Line anchor `$` and string anchors `\z` and `\Z` are not supported in patterns containing `\W` or `\D`
- Line and string anchors are not supported by `string_split` and `str_to_map`
- Lazy quantifiers, such as `a*?`
- Possessive quantifiers, such as `a*+`
Expand All @@ -462,6 +457,12 @@ The following regular expression patterns are not yet supported on the GPU and w
- Empty groups: `()`
- `regexp_replace` does not support back-references

The following regular expression patterns are known to potentially produce different results on the GPU
vs the CPU.

- Word and non-word boundaries, `\b` and `\B`


Work is ongoing to increase the range of regular expressions that can run on the GPU.

## Timestamps
Expand Down
5 changes: 5 additions & 0 deletions integration_tests/src/main/python/regexp_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,6 +287,7 @@ def test_re_replace_backrefs():
),
conf=_regexp_conf)

@pytest.mark.skip(reason='https://github.com/NVIDIA/spark-rapids/issues/7090')
def test_re_replace_anchors():
gen = mk_str_gen('.{0,2}TEST[\ud720 A]{0,5}TEST[\r\n\u0085\u2028\u2029]?') \
.with_special_case("TEST") \
Expand All @@ -305,7 +306,11 @@ def test_re_replace_anchors():
'REGEXP_REPLACE(a, "\\\\ATEST$", "PROD")',
'REGEXP_REPLACE(a, "^TEST\\\\Z", "PROD")',
'REGEXP_REPLACE(a, "TEST\\\\Z", "PROD")',
'REGEXP_REPLACE(a, "TEST\\\\z", "PROD")',
'REGEXP_REPLACE(a, "\\\\zTEST", "PROD")',
'REGEXP_REPLACE(a, "^TEST$", "PROD")',
'REGEXP_REPLACE(a, "^TEST\\\\z", "PROD")',
'REGEXP_REPLACE(a, "TEST\\\\z", "PROD")',
),
conf=_regexp_conf)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -628,8 +628,6 @@ class RegexParser(pattern: String) {
object RegexParser {
private val regexpChars = Set('\u0000', '\\', '.', '^', '$', '\u0007', '\u001b', '\f')

def parse(pattern: String): RegexAST = new RegexParser(pattern).parse

def isRegExpString(s: String): Boolean = {

def isRegExpString(ast: RegexAST): Boolean = ast match {
Expand Down Expand Up @@ -844,7 +842,10 @@ class CudfRegexTranspiler(mode: RegexMode) {
None
)
} else {
RegexGroup(capture = capture, RegexParser.parse("\r|\u0085|\u2028|\u2029|\r\n"), None)
RegexGroup(capture = capture,
RegexChoice(
RegexCharacterClass(negated = false, characters = terminatorChars),
RegexSequence(ListBuffer(RegexChar('\r'), RegexChar('\n')))), None)
}
}

Expand Down Expand Up @@ -1143,10 +1144,8 @@ class CudfRegexTranspiler(mode: RegexMode) {
case 'z' if mode == RegexSplitMode =>
RegexEscaped('Z')
case 'z' =>
// cuDF does not support "\z" except for in split mode
throw new RegexUnsupportedException(
"\\z is not supported on GPU for find or replace",
regex.position)
// cuDF does not support "\z" but supports "$", which is equivalent
RegexChar('$')
case 'Z' =>
// \Z is really a synonymn for $. It's used in Java to preserve that behavior when
// using modes that change the meaning of $ (such as MULTILINE or UNIX_LINES)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -490,20 +490,11 @@ object GpuToTimestamp extends Arm {
// the string as well which works well for fixed-length formats but if/when we want to
// support variable-length formats (such as timestamps with milliseconds) then we will need
// to use regex instead.
val isTimestamp = withResource(col.matchesRe(fmt.validRegex)) { matches =>
withResource(col.matchesRe(fmt.validRegex)) { matches =>
withResource(col.isTimestamp(strfFormat)) { isTimestamp =>
isTimestamp.and(matches)
}
}
withResource(isTimestamp) { _ =>
withResource(col.getCharLengths) { len =>
withResource(Scalar.fromInt(sparkFormat.length)) { expectedLen =>
withResource(len.equalTo(expectedLen)) { lenMatches =>
lenMatches.and(isTimestamp)
}
}
}
}
case _ =>
// this is the incompatibleDateFormats case where we do not guarantee compatibility with
// Spark and assume that all non-null inputs are valid
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -144,11 +144,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
)
}

test("cuDF does not support \\z") {
assertUnsupported("foo\\z", RegexFindMode,
"\\z is not supported on GPU")
}

test("cuDF does not support positive or negative lookahead") {
val negPatterns = Seq("a(!b)", "a(!b)c?")
negPatterns.foreach(pattern =>
Expand Down Expand Up @@ -244,8 +239,9 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}

test("string anchors - find") {
assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090")
val patterns = Seq("\\Atest", "\\A+test", "\\A{1}test", "\\A{1,}test",
"(\\A)+test", "(\\A){1}test", "(\\A){1,}test")
"(\\A)+test", "(\\A){1}test", "(\\A){1,}test", "test\\z")
assertCpuGpuMatchesRegexpFind(patterns, Seq("", "test", "atest", "testa",
"\ntest", "test\n", "\ntest\n"))
}
Expand Down Expand Up @@ -282,13 +278,15 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}

test("line anchors - replace") {
val patterns = Seq("^test", "test$", "^test$", "test\\Z")
assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090")
val patterns = Seq("^test", "test$", "^test$", "test\\Z", "test\\z")
assertCpuGpuMatchesRegexpReplace(patterns, Seq("", "test", "atest", "testa",
"\ntest", "test\n", "\ntest\n", "\ntest\r\ntest\n"))
}

test("string anchors - replace") {
val patterns = Seq("\\Atest", "test\\Z")
assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090")
val patterns = Seq("\\Atest", "test\\z", "test\\Z")
assertCpuGpuMatchesRegexpReplace(patterns, Seq("", "test", "atest", "testa",
"\ntest", "test\n", "\ntest\n", "\ntest\r\ntest\n"))
}
Expand All @@ -299,9 +297,10 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}

test("line anchor $ - find") {
assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090")
val patterns = Seq("a$", "a$b", "\f$", "$\f")
val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\f", "\f", "\r", "\u0085", "\u2028",
"\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\n\r",
val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\u0085\n", "a\f", "\f", "\r", "\u0085", "\u2028",
"\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\u0085\r", "\u2028\n", "\u2029\n", "\n\r",
"\n\u0085", "\n\u2028", "\n\u2029", "2+|+??wD\n", "a\r\nb")
assertCpuGpuMatchesRegexpFind(patterns, inputs)
val unsupportedPatterns = Seq("[\r\n]?$", "$\r", "\r$",
Expand All @@ -313,9 +312,10 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}

test("string anchor \\Z - find") {
assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090")
val patterns = Seq("a\\Z", "a\\Zb", "a\\Z+", "\f\\Z", "\\Z\f")
val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\f", "\f", "\r", "\u0085", "\u2028",
"\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\n\r",
val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\u0085\n", "a\f", "\f", "\r", "\u0085", "\u2028",
"\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\u0085\r", "\u2028\n", "\u2029\n", "\n\r",
"\n\u0085", "\n\u2028", "\n\u2029", "2+|+??wD\n", "a\r\nb")
assertCpuGpuMatchesRegexpFind(patterns, inputs)
val unsupportedPatterns = Seq("[\r\n]?\\Z", "\\Z\r", "\r\\Z",
Expand Down Expand Up @@ -416,14 +416,14 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
test("transpile complex regex 2") {
val TIMESTAMP_TRUNCATE_REGEX = "^([0-9]{4}-[0-9]{2}-[0-9]{2} " +
"[0-9]{2}:[0-9]{2}:[0-9]{2})" +
"(.[1-9]*(?:0)?[1-9]+)?(.0*[1-9]+)?(?:.0*)?.\\Z"
"(.[1-9]*(?:0)?[1-9]+)?(.0*[1-9]+)?(?:.0*)?.\\z"

// input and output should be identical except for `.` being replaced
// with `[^\n\r\u0085\u2028\u2029]` and `\z` being replaced with `$`
doTranspileTest(TIMESTAMP_TRUNCATE_REGEX,
TIMESTAMP_TRUNCATE_REGEX
.replaceAll("\\.", "[^\n\r\u0085\u2028\u2029]")
.replaceAll("\\\\Z", "(?:\r|\u0085|\u2028|\u2029|\r\n)?\\$"))
.replaceAll("\\\\z", "\\$"))
}

test("transpile \\A repetitions") {
Expand All @@ -434,19 +434,20 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}

test("transpile \\z") {
assertUnsupported("abc\\z", RegexFindMode, "")
doTranspileTest("abc\\z", "abc$")
doTranspileTest("abc\\Z\\z", "abc$")
doTranspileTest("abc$\\z", "abc$")
}

test("transpile $") {
doTranspileTest("a$", "a(?:\r|\u0085|\u2028|\u2029|\r\n)?$")
doTranspileTest("a$", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$")
}

test("transpile \\Z") {
val expected = "a(?:\r|\u0085|\u2028|\u2029|\r\n)?$"
doTranspileTest("a\\Z", expected)
doTranspileTest("a\\Z+", expected)
doTranspileTest("a\\Z{1}", expected)
doTranspileTest("a\\Z{1,}", expected)
doTranspileTest("a\\Z", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$")
doTranspileTest("a\\Z+", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$")
doTranspileTest("a\\Z{1}", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$")
doTranspileTest("a\\Z{1,}", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$")
}

test("transpile predefined character classes") {
Expand Down Expand Up @@ -515,42 +516,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
assertCpuGpuMatchesRegexpReplace(patterns, inputs)
}

test("line anchor find - unicode line separators 0085") {
assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585")
val inputs = Seq("aTEST\u0085", "aTEST\u0085\n", "aTEST\n\u0085")
assertCpuGpuMatchesRegexpFind(Seq("TEST$"), inputs)
}

test("line anchor find - unicode line separators 2028") {
assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585")
val inputs = Seq("aTEST\u2028", "aTEST\u2028\n", "aTEST\n\u2028")
assertCpuGpuMatchesRegexpFind(Seq("TEST$"), inputs)
}

test("line anchor find - unicode line separators 2029") {
assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585")
val inputs = Seq("aTEST\u2029", "aTEST\u2029\n", "aTEST\n\u2029")
assertCpuGpuMatchesRegexpFind(Seq("TEST$"), inputs)
}

test("line anchor replace - unicode line separators 0085") {
assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585")
val inputs = Seq("aTEST\u0085", "aTEST\u0085\n", "aTEST\n\u0085")
assertCpuGpuMatchesRegexpReplace(Seq("TEST$"), inputs)
}

test("line anchor replace - unicode line separators 2028") {
assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585")
val inputs = Seq("aTEST\u2028", "aTEST\u2028\n", "aTEST\n\u2028")
assertCpuGpuMatchesRegexpReplace(Seq("TEST$"), inputs)
}

test("line anchor replace - unicode line separators 2029") {
assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585")
val inputs = Seq("aTEST\u2029", "aTEST\u2029\n", "aTEST\n\u2029")
assertCpuGpuMatchesRegexpReplace(Seq("TEST$"), inputs)
}

test("cuDF does not support some uses of line anchors in regexp_replace") {
Seq("^", "$", "^*", "$*", "^+", "$+", "^|$", "^^|$$").foreach(
pattern =>
Expand Down

0 comments on commit fcbf7c7

Please sign in to comment.