-
Notifications
You must be signed in to change notification settings - Fork 230
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Support Ascii function for ascii and latin-1 [databricks] (#10054)
* wip Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Support Ascii function for ascii and latin-1 Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Refine to make it run faster Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Add bytes-based solution and shims Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * empty string to 0 in lower version Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * db test Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * db test Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * shim 342 Signed-off-by: Haoyang Li <haoyangl@nvidia.com> * Add 351 shim --------- Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
- Loading branch information
1 parent
142dead
commit c8febd1
Showing
8 changed files
with
270 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
78 changes: 78 additions & 0 deletions
78
sql-plugin/src/main/spark311/scala/org/apache/spark/sql/rapids/shims/GpuAscii.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
/* | ||
* Copyright (c) 2023, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
/*** spark-rapids-shim-json-lines | ||
{"spark": "311"} | ||
{"spark": "312"} | ||
{"spark": "313"} | ||
{"spark": "320"} | ||
{"spark": "321"} | ||
{"spark": "321cdh"} | ||
{"spark": "322"} | ||
{"spark": "330"} | ||
{"spark": "330cdh"} | ||
spark-rapids-shim-json-lines ***/ | ||
|
||
package org.apache.spark.sql.rapids.shims | ||
|
||
import ai.rapids.cudf.{ColumnVector, DType, Scalar} | ||
import com.nvidia.spark.rapids._ | ||
import com.nvidia.spark.rapids.Arm._ | ||
|
||
import org.apache.spark.sql.catalyst.expressions._ | ||
import org.apache.spark.sql.types._ | ||
|
||
/**
 * GPU implementation of Spark's `Ascii` expression for the pre-3.3.1 shims.
 *
 * Returns the numeric value of the first byte of each input string. Empty
 * strings map to 0 and input nulls propagate to output nulls (via
 * `NullIntolerant`). Bytes above 127 are shifted down by 256 so the result
 * matches the signed-byte value the CPU implementation produces.
 */
case class GpuAscii(child: Expression) extends GpuUnaryExpression with ImplicitCastInputTypes
    with NullIntolerant {

  override def dataType: DataType = IntegerType
  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)

  override def doColumnar(input: GpuColumnVector): ColumnVector = {
    // Map empty strings to "\u0000" so their first byte decodes to ascii 0.
    val isEmpty = withResource(Scalar.fromString("")) { empty =>
      input.getBase.equalTo(empty)
    }
    val normalized = withResource(isEmpty) { _ =>
      withResource(Scalar.fromString('\u0000'.toString)) { nul =>
        isEmpty.ifElse(nul, input.getBase)
      }
    }
    // Explode each string into its raw bytes and keep only the leading byte.
    val byteRows = withResource(normalized) { _ =>
      normalized.asByteList()
    }
    val leadByte = withResource(byteRows) { lists =>
      lists.extractListElement(0)
    }
    val leadByteInt = withResource(leadByte) { _ =>
      leadByte.castTo(DType.INT32)
    }
    withResource(leadByteInt) { _ =>
      // Anything above 127 came from the unsigned byte range; subtract 256 to
      // fold it back into the signed-byte result Spark expects.
      val needsWrap = withResource(Scalar.fromInt(127)) { limit =>
        leadByteInt.greaterThan(limit)
      }
      withResource(needsWrap) { _ =>
        val wrapped = withResource(Scalar.fromInt(256)) { span =>
          leadByteInt.sub(span)
        }
        withResource(wrapped) { _ =>
          needsWrap.ifElse(wrapped, leadByteInt)
        }
      }
    }
  }
}
125 changes: 125 additions & 0 deletions
125
sql-plugin/src/main/spark321db/scala/org/apache/spark/sql/rapids/shims/GpuAscii.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
/* | ||
* Copyright (c) 2023, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
/*** spark-rapids-shim-json-lines | ||
{"spark": "321db"} | ||
{"spark": "323"} | ||
{"spark": "324"} | ||
{"spark": "330db"} | ||
{"spark": "331"} | ||
{"spark": "332"} | ||
{"spark": "332cdh"} | ||
{"spark": "332db"} | ||
{"spark": "333"} | ||
{"spark": "340"} | ||
{"spark": "341"} | ||
{"spark": "341db"} | ||
{"spark": "342"} | ||
{"spark": "350"} | ||
{"spark": "351"} | ||
spark-rapids-shim-json-lines ***/ | ||
|
||
package org.apache.spark.sql.rapids.shims | ||
|
||
import ai.rapids.cudf.{ColumnVector, DType, Scalar} | ||
import com.nvidia.spark.rapids._ | ||
import com.nvidia.spark.rapids.Arm._ | ||
|
||
import org.apache.spark.sql.catalyst.expressions._ | ||
import org.apache.spark.sql.types._ | ||
|
||
/**
 * GPU implementation of Spark's `Ascii` expression for the 3.3.1+/Databricks
 * shims.
 *
 * Returns the numeric value of the first character of each input string,
 * computed from UTF-8 code points. Empty strings map to 0 and input nulls are
 * masked back to nulls at the end. Only ASCII and latin-1 inputs are
 * supported; results for other code points are undefined.
 */
case class GpuAscii(child: Expression) extends GpuUnaryExpression with ImplicitCastInputTypes
    with NullIntolerant {

  override def dataType: DataType = IntegerType
  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)

  private def utf8CodePointsToAscii(codePoints: ColumnVector): ColumnVector = {
    // Only ASCII / latin-1 is handled; code points outside [0, 255] give
    // undefined results. The UTF-8 code points split into three segments:
    //   seg A: 0     <= cp < 128   -> already the ASCII value
    //   seg B: 49792 <= cp < 49856 -> ASCII = cp - 49664
    //   seg C: 50048 <= cp < 50112 -> ASCII = cp - 49856
    // To minimize cuDF calls this is done in two passes:
    //   1. subtract 49664 from everything >= 49792, fixing segments A and B;
    //   2. segment C now sits in [384, 448), so subtract a further 192 from
    //      everything >= 384, fixing segment C as well.
    val inSegBC = withResource(Scalar.fromInt(49792)) { segBStart =>
      codePoints.greaterOrEqualTo(segBStart)
    }
    val fixedAB = withResource(inSegBC) { _ =>
      val shifted = withResource(Scalar.fromInt(49664)) { segBOffset =>
        codePoints.sub(segBOffset)
      }
      withResource(shifted) { _ =>
        inSegBC.ifElse(shifted, codePoints)
      }
    }
    withResource(fixedAB) { _ =>
      val inSegC = withResource(Scalar.fromInt(384)) { segCStart =>
        fixedAB.greaterOrEqualTo(segCStart)
      }
      withResource(inSegC) { _ =>
        val shifted = withResource(Scalar.fromInt(192)) { segCOffset =>
          fixedAB.sub(segCOffset)
        }
        withResource(shifted) { _ =>
          inSegC.ifElse(shifted, fixedAB)
        }
      }
    }
  }

  override def doColumnar(input: GpuColumnVector): ColumnVector = {
    // Map empty strings to "\u0000" so their first code point becomes 0.
    val isEmpty = withResource(Scalar.fromString("")) { empty =>
      input.getBase.equalTo(empty)
    }
    val normalized = withResource(isEmpty) { _ =>
      withResource(Scalar.fromString('\u0000'.toString)) { nul =>
        isEmpty.ifElse(nul, input.getBase)
      }
    }
    // Remember where the nulls are, then temporarily replace them with a
    // placeholder string so codePoints() has something valid to work on.
    val wasNull = closeOnExcept(normalized) { _ =>
      input.getBase.isNull
    }
    withResource(wasNull) { _ =>
      val noNulls = withResource(normalized) { _ =>
        withResource(Scalar.fromString("n")) { placeholder =>
          wasNull.ifElse(placeholder, normalized)
        }
      }
      val firstChar = withResource(noNulls) { _ =>
        noNulls.substring(0, 1)
      }
      val codePoints = withResource(firstChar) { _ =>
        firstChar.codePoints()
      }
      val asciiVals = withResource(codePoints) { _ =>
        utf8CodePointsToAscii(codePoints)
      }
      // Restore the original nulls over the placeholder results.
      withResource(asciiVals) { _ =>
        withResource(Scalar.fromNull(DType.INT32)) { nullInt =>
          wasNull.ifElse(nullInt, asciiVals)
        }
      }
    }
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -57,6 +57,7 @@ ArrayTransform,4 | |
ArrayUnion,4 | ||
ArraysOverlap,4 | ||
ArraysZip,4 | ||
Ascii,4 | ||
Asin,4 | ||
Asinh,4 | ||
AtLeastNNonNulls,4 | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters