diff --git a/docs/configs.md b/docs/configs.md
index 5b202c29228..55b8b986bbe 100644
--- a/docs/configs.md
+++ b/docs/configs.md
@@ -131,7 +131,8 @@ Name | SQL Function(s) | Description | Default Value | Notes
 spark.rapids.sql.expression.Cos|`cos`|Cosine|true|None|
 spark.rapids.sql.expression.Cosh|`cosh`|Hyperbolic cosine|true|None|
 spark.rapids.sql.expression.Cot|`cot`|Cotangent|true|None|
-spark.rapids.sql.expression.CreateNamedStruct|`named_struct`, `struct`|Creates a struct with the given field names and values.|true|None|
+spark.rapids.sql.expression.CreateArray|`array`|Returns an array with the given elements|true|None|
+spark.rapids.sql.expression.CreateNamedStruct|`named_struct`, `struct`|Creates a struct with the given field names and values|true|None|
 spark.rapids.sql.expression.CurrentRow$| |Special boundary for a window frame, indicating stopping at the current row|true|None|
 spark.rapids.sql.expression.DateAdd|`date_add`|Returns the date that is num_days after start_date|true|None|
 spark.rapids.sql.expression.DateDiff|`datediff`|Returns the number of days from startDate to endDate|true|None|
diff --git a/docs/supported_ops.md b/docs/supported_ops.md
index 0214b8d3b79..2eb45dbef5b 100644
--- a/docs/supported_ops.md
+++ b/docs/supported_ops.md
@@ -511,7 +511,7 @@ Accelerator supports are described below.
 S
 S*
 S
-NS
+S*
 S
 NS
 NS
@@ -557,7 +557,7 @@ Accelerator supports are described below.
 S
 S*
 S
-NS
+S*
 S
 NS
 NS
@@ -580,7 +580,7 @@ Accelerator supports are described below.
 S
 S*
 S
-NS
+S*
 S
 NS
 NS
@@ -603,7 +603,7 @@ Accelerator supports are described below.
 S
 S*
 S
-NS
+S*
 S
 NS
 NS
@@ -626,7 +626,7 @@ Accelerator supports are described below.
 S
 S*
 S
-NS
+S*
 S
 NS
 NS
@@ -649,7 +649,7 @@ Accelerator supports are described below.
 S
 S*
 S
-NS
+S*
 S
 NS
 NS
@@ -3417,9 +3417,99 @@ Accelerator support is described below.
 
 
 
+CreateArray
+`array`
+Returns an array with the given elements
+None
+project
+arg
+S
+S
+S
+S
+S
+S
+S
+S
+S*
+S
+S*
+S
+NS
+NS
+NS
+NS
+NS
+NS
+
+
+result
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+PS* (missing nested BINARY, CALENDAR, ARRAY, MAP, STRUCT, UDT)
+
+
+
+
+
+lambda
+arg
+NS
+NS
+NS
+NS
+NS
+NS
+NS
+NS
+NS
+NS
+NS
+NS
+NS
+NS
+NS
+NS
+NS
+NS
+
+
+result
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+NS
+
+
+
+
+
 CreateNamedStruct
 `named_struct`, `struct`
-Creates a struct with the given field names and values.
+Creates a struct with the given field names and values
 None
 project
 name
diff --git a/integration_tests/src/main/python/array_test.py b/integration_tests/src/main/python/array_test.py
index 309de60ae85..06c78654a9e 100644
--- a/integration_tests/src/main/python/array_test.py
+++ b/integration_tests/src/main/python/array_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
+# Copyright (c) 2020-2021, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -46,3 +46,12 @@ def test_nested_array_index(data_gen):
                 'a[1]',
                 'a[3]',
                 'a[50]'))
+
+
+@pytest.mark.parametrize('data_gen', all_basic_gens + [decimal_gen_default, decimal_gen_scale_precision], ids=idfn)
+def test_make_array(data_gen):
+    (s1, s2) = gen_scalars_for_sql(data_gen, 2, force_no_nulls=not isinstance(data_gen, NullGen))
+    assert_gpu_and_cpu_are_equal_collect(
+            lambda spark : binary_op_df(spark, data_gen).selectExpr(
+                'array(a, b)',
+                'array(b, a, null, {}, {})'.format(s1, s2)))
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
index b5d3534e7db..841d6ca1e73 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
@@ -1912,12 +1912,26 @@ object GpuOverrides {
         ("key", TypeSig.lit(TypeEnum.STRING), TypeSig.all)),
       (in, conf, p, r) => new GpuGetMapValueMeta(in, conf, p, r)),
     expr[CreateNamedStruct](
-      "Creates a struct with the given field names and values.",
+      "Creates a struct with the given field names and values",
       CreateNamedStructCheck,
       (in, conf, p, r) => new ExprMeta[CreateNamedStruct](in, conf, p, r) {
         override def convertToGpu(): GpuExpression =
           GpuCreateNamedStruct(childExprs.map(_.convertToGpu()))
       }),
+    expr[CreateArray](
+      "Returns an array with the given elements",
+      ExprChecks.projectNotLambda(
+        TypeSig.ARRAY.nested(TypeSig.numeric + TypeSig.NULL + TypeSig.STRING +
+            TypeSig.BOOLEAN + TypeSig.DATE + TypeSig.TIMESTAMP),
+        TypeSig.ARRAY.nested(TypeSig.all),
+        repeatingParamCheck = Some(RepeatingParamCheck("arg",
+          TypeSig.numeric + TypeSig.NULL + TypeSig.STRING +
+              TypeSig.BOOLEAN + TypeSig.DATE + TypeSig.TIMESTAMP,
+          TypeSig.all))),
+      (in, conf, p, r) => new ExprMeta[CreateArray](in, conf, p, r) {
+        override def convertToGpu(): GpuExpression =
+          GpuCreateArray(childExprs.map(_.convertToGpu()), wrapped.useStringTypeWhenEmpty)
+      }),
     expr[StringLocate](
       "Substring search operator",
       ExprChecks.projectNotLambda(TypeSig.INT, TypeSig.INT,
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/complexTypeCreator.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/complexTypeCreator.scala
index f134ae4577f..9af53ac87c2 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/complexTypeCreator.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/complexTypeCreator.scala
@@ -18,14 +18,71 @@ package org.apache.spark.sql.rapids
 
 import ai.rapids.cudf.ColumnVector
 import com.nvidia.spark.rapids.{GpuColumnVector, GpuExpression, GpuScalar}
-import com.nvidia.spark.rapids.RapidsPluginImplicits.{AutoCloseableArray, ReallyAGpuExpression}
+import com.nvidia.spark.rapids.RapidsPluginImplicits.ReallyAGpuExpression
 
+import org.apache.spark.sql.catalyst.analysis.{TypeCheckResult, TypeCoercion}
 import org.apache.spark.sql.catalyst.analysis.FunctionRegistry.FUNC_ALIAS
-import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.{EmptyRow, Expression, NamedExpression}
-import org.apache.spark.sql.types.{Metadata, StringType, StructField, StructType}
+import org.apache.spark.sql.catalyst.util.TypeUtils
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.types.{ArrayType, DataType, Metadata, NullType, StringType, StructField, StructType}
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
+case class GpuCreateArray(children: Seq[Expression], useStringTypeWhenEmpty: Boolean)
+  extends GpuExpression {
+
+  def this(children: Seq[Expression]) = {
+    this(children, SQLConf.get.getConf(SQLConf.LEGACY_CREATE_EMPTY_COLLECTION_USING_STRING_TYPE))
+  }
+
+  override def foldable: Boolean = children.forall(_.foldable)
+
+  override def stringArgs: Iterator[Any] = super.stringArgs.take(1)
+
+  override def checkInputDataTypes(): TypeCheckResult = {
+    TypeUtils.checkForSameTypeInputExpr(children.map(_.dataType), s"function $prettyName")
+  }
+
+  private val defaultElementType: DataType = {
+    if (useStringTypeWhenEmpty) {
+      StringType
+    } else {
+      NullType
+    }
+  }
+
+  override def dataType: ArrayType = {
+    ArrayType(
+      TypeCoercion.findCommonTypeDifferentOnlyInNullFlags(children.map(_.dataType))
+        .getOrElse(defaultElementType),
+      containsNull = children.exists(_.nullable))
+  }
+
+  override def nullable: Boolean = false
+
+  override def prettyName: String = "array"
+
+  override def columnarEval(batch: ColumnarBatch): Any = {
+    withResource(new Array[ColumnVector](children.size)) { columns =>
+      val numRows = batch.numRows()
+      children.indices.foreach { index =>
+        children(index).columnarEval(batch) match {
+          case cv: GpuColumnVector =>
+            columns(index) = cv.getBase
+          case other =>
+            val dt = dataType.elementType
+            withResource(GpuScalar.from(other, dt)) { scalar =>
+              columns(index) = ColumnVector.fromScalar(scalar, numRows)
+            }
+        }
+      }
+      GpuColumnVector.from(ColumnVector.makeList(numRows,
+        GpuColumnVector.getNonNestedRapidsType(dataType.elementType),
+        columns: _*), dataType)
+    }
+  }
+}
+
 case class GpuCreateNamedStruct(children: Seq[Expression]) extends GpuExpression {
   lazy val (nameExprs, valExprs) = children.grouped(2).map {
     case Seq(name, value) => (name, value)
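
A quick usage sketch of what this change enables (illustrative only, not part of the diff): the SparkSession setup, DataFrame, and column names below are assumed. With the plugin active, array() over the element types marked S in the table above should be planned on the GPU, and setting spark.rapids.sql.expression.CreateArray to false falls back to the CPU implementation.

    # Minimal PySpark sketch, assuming an existing SparkSession `spark`
    # with the RAPIDS Accelerator loaded.
    spark.conf.set("spark.rapids.sql.enabled", "true")

    df = spark.range(4).selectExpr(
        "array(id, id + 1) AS pair",               # LONG elements (S above)
        "array(CAST(id AS STRING), 'x') AS strs")  # STRING elements (S above)

    df.explain()  # the plan should show the projection running on the GPU
    df.show()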