From f76a9b1135e748649bdb9a2104360f0dc533cc1f Mon Sep 17 00:00:00 2001 From: viktorluc-db Date: Fri, 20 Sep 2024 22:47:30 +0200 Subject: [PATCH] [SPARK-49738][SQL] Endswith bug fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? Bug fix in the "endswith" string predicate. Also fixed the same type of bug in the `lowercaseMatchLengthFrom` method of `CollationAwareUTF8String.java`. ### Why are the changes needed? Expression `select endswith('İo' collate utf8_lcase, 'İo' collate utf8_lcase)` returns `false` but should return `true`. ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? Added tests in CollationSupportSuite. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #48187 from viktorluc-db/matchBugFix. Authored-by: viktorluc-db Signed-off-by: Max Gekk --- .../spark/sql/catalyst/util/CollationAwareUTF8String.java | 4 ++-- .../org/apache/spark/unsafe/types/CollationSupportSuite.java | 4 ++++ .../src/test/resources/sql-tests/results/collations.sql.out | 4 ++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java index 5ed3048fb72b3..fb610a5d96f17 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java @@ -109,7 +109,7 @@ private static int lowercaseMatchLengthFrom( } // Compare the characters in the target and pattern strings. 
int matchLength = 0, codePointBuffer = -1, targetCodePoint, patternCodePoint; - while (targetIterator.hasNext() && patternIterator.hasNext()) { + while ((targetIterator.hasNext() || codePointBuffer != -1) && patternIterator.hasNext()) { if (codePointBuffer != -1) { targetCodePoint = codePointBuffer; codePointBuffer = -1; @@ -211,7 +211,7 @@ private static int lowercaseMatchLengthUntil( } // Compare the characters in the target and pattern strings. int matchLength = 0, codePointBuffer = -1, targetCodePoint, patternCodePoint; - while (targetIterator.hasNext() && patternIterator.hasNext()) { + while ((targetIterator.hasNext() || codePointBuffer != -1) && patternIterator.hasNext()) { if (codePointBuffer != -1) { targetCodePoint = codePointBuffer; codePointBuffer = -1; diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java index 5719303a0dce8..a445cde52ad57 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java @@ -629,6 +629,8 @@ public void testStartsWith() throws SparkException { assertStartsWith("İonic", "Io", "UTF8_LCASE", false); assertStartsWith("İonic", "i\u0307o", "UTF8_LCASE", true); assertStartsWith("İonic", "İo", "UTF8_LCASE", true); + assertStartsWith("oİ", "oİ", "UTF8_LCASE", true); + assertStartsWith("oİ", "oi̇", "UTF8_LCASE", true); // Conditional case mapping (e.g. Greek sigmas). 
assertStartsWith("σ", "σ", "UTF8_BINARY", true); assertStartsWith("σ", "ς", "UTF8_BINARY", false); @@ -880,6 +882,8 @@ public void testEndsWith() throws SparkException { assertEndsWith("the İo", "Io", "UTF8_LCASE", false); assertEndsWith("the İo", "i\u0307o", "UTF8_LCASE", true); assertEndsWith("the İo", "İo", "UTF8_LCASE", true); + assertEndsWith("İo", "İo", "UTF8_LCASE", true); + assertEndsWith("İo", "i̇o", "UTF8_LCASE", true); // Conditional case mapping (e.g. Greek sigmas). assertEndsWith("σ", "σ", "UTF8_BINARY", true); assertEndsWith("σ", "ς", "UTF8_BINARY", false); diff --git a/sql/core/src/test/resources/sql-tests/results/collations.sql.out b/sql/core/src/test/resources/sql-tests/results/collations.sql.out index 5999bf20f6884..9d29a46e5a0ef 100644 --- a/sql/core/src/test/resources/sql-tests/results/collations.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/collations.sql.out @@ -2213,8 +2213,8 @@ struct