Skip to content

Commit

Permalink
[SPARK-49738][SQL] Endswith bug fix
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?

Fix a bug in the `endswith` string predicate. The same kind of bug is also fixed in the `lowercaseMatchLengthFrom` method in `CollationAwareUTF8String.java`.

### Why are the changes needed?

The expression `select endswith('İo' collate utf8_lcase, 'İo' collate utf8_lcase)`
returns `false`, but it should return `true`.

### Does this PR introduce _any_ user-facing change?

No.

### How was this patch tested?

Added tests in CollationSupportSuite.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #48187 from viktorluc-db/matchBugFix.

Authored-by: viktorluc-db <viktor.lucic@databricks.com>
Signed-off-by: Max Gekk <max.gekk@gmail.com>
  • Loading branch information
viktorluc-db authored and MaxGekk committed Sep 20, 2024
1 parent f3785fa commit f76a9b1
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 4 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ private static int lowercaseMatchLengthFrom(
}
// Compare the characters in the target and pattern strings.
int matchLength = 0, codePointBuffer = -1, targetCodePoint, patternCodePoint;
while (targetIterator.hasNext() && patternIterator.hasNext()) {
while ((targetIterator.hasNext() || codePointBuffer != -1) && patternIterator.hasNext()) {
if (codePointBuffer != -1) {
targetCodePoint = codePointBuffer;
codePointBuffer = -1;
Expand Down Expand Up @@ -211,7 +211,7 @@ private static int lowercaseMatchLengthUntil(
}
// Compare the characters in the target and pattern strings.
int matchLength = 0, codePointBuffer = -1, targetCodePoint, patternCodePoint;
while (targetIterator.hasNext() && patternIterator.hasNext()) {
while ((targetIterator.hasNext() || codePointBuffer != -1) && patternIterator.hasNext()) {
if (codePointBuffer != -1) {
targetCodePoint = codePointBuffer;
codePointBuffer = -1;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,8 @@ public void testStartsWith() throws SparkException {
assertStartsWith("İonic", "Io", "UTF8_LCASE", false);
assertStartsWith("İonic", "i\u0307o", "UTF8_LCASE", true);
assertStartsWith("İonic", "İo", "UTF8_LCASE", true);
assertStartsWith("oİ", "oİ", "UTF8_LCASE", true);
assertStartsWith("oİ", "oi̇", "UTF8_LCASE", true);
// Conditional case mapping (e.g. Greek sigmas).
assertStartsWith("σ", "σ", "UTF8_BINARY", true);
assertStartsWith("σ", "ς", "UTF8_BINARY", false);
Expand Down Expand Up @@ -880,6 +882,8 @@ public void testEndsWith() throws SparkException {
assertEndsWith("the İo", "Io", "UTF8_LCASE", false);
assertEndsWith("the İo", "i\u0307o", "UTF8_LCASE", true);
assertEndsWith("the İo", "İo", "UTF8_LCASE", true);
assertEndsWith("İo", "İo", "UTF8_LCASE", true);
assertEndsWith("İo", "i̇o", "UTF8_LCASE", true);
// Conditional case mapping (e.g. Greek sigmas).
assertEndsWith("σ", "σ", "UTF8_BINARY", true);
assertEndsWith("σ", "ς", "UTF8_BINARY", false);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2213,8 +2213,8 @@ struct<endswith(collate(utf8_binary, utf8_lcase), collate(utf8_lcase, utf8_lcase
false
false
false
false
false
true
true
true
true
true
Expand Down

0 comments on commit f76a9b1

Please sign in to comment.