Skip to content

Commit

Permalink
Add in murmur3 support for float, double, date and timestamp (NVIDIA#2017)
Browse files Browse the repository at this point in the history

Signed-off-by: Robert (Bobby) Evans <bobby@apache.org>
  • Loading branch information
revans2 authored Mar 25, 2021
1 parent e95ca04 commit c83a8c6
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 33 deletions.
8 changes: 4 additions & 4 deletions docs/supported_ops.md
Original file line number Diff line number Diff line change
Expand Up @@ -9719,10 +9719,10 @@ Accelerator support is described below.
<td>S</td>
<td>S</td>
<td>S</td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S*</td>
<td>S</td>
<td>S*</td>
<td>S</td>
Expand Down
8 changes: 6 additions & 2 deletions integration_tests/src/main/python/repart_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,10 @@ def test_repartion_df(num_parts, length):
([('a', short_gen)], ['a']),
([('a', int_gen)], ['a']),
([('a', long_gen)], ['a']),
pytest.param(([('a', float_gen)], ['a']), marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/1914')),
pytest.param(([('a', double_gen)], ['a']), marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/1914')),
([('a', float_gen)], ['a']),
([('a', double_gen)], ['a']),
([('a', timestamp_gen)], ['a']),
([('a', date_gen)], ['a']),
([('a', decimal_gen_default)], ['a']),
([('a', decimal_gen_neg_scale)], ['a']),
([('a', decimal_gen_scale_precision)], ['a']),
Expand All @@ -97,6 +99,8 @@ def test_repartion_df(num_parts, length):
([('a', int_gen), ('b', byte_gen)], ['a', 'b']),
([('a', long_gen), ('b', null_gen)], ['a', 'b']),
([('a', byte_gen), ('b', boolean_gen), ('c', short_gen)], ['a', 'b', 'c']),
([('a', float_gen), ('b', double_gen), ('c', short_gen)], ['a', 'b', 'c']),
([('a', timestamp_gen), ('b', date_gen), ('c', int_gen)], ['a', 'b', 'c']),
([('a', short_gen), ('b', string_gen), ('c', int_gen)], ['a', 'b', 'c']),
([('a', decimal_gen_default), ('b', decimal_gen_64bit), ('c', decimal_gen_scale_precision)], ['a', 'b', 'c']),
], ids=idfn)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2312,9 +2312,7 @@ object GpuOverrides {
"Murmur3 hash operator",
ExprChecks.projectNotLambda(TypeSig.INT, TypeSig.INT,
repeatingParamCheck = Some(RepeatingParamCheck("input",
// Floating point values don't work because of -0.0 is not hashed properly
TypeSig.BOOLEAN + TypeSig.BYTE + TypeSig.SHORT + TypeSig.INT + TypeSig.LONG +
TypeSig.STRING + TypeSig.NULL + TypeSig.DECIMAL,
TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL,
TypeSig.all))),
(a, conf, p, r) => new ExprMeta[Murmur3Hash](a, conf, p, r) {
override val childExprs: Seq[BaseExprMeta[_]] = a.children
Expand Down Expand Up @@ -2486,8 +2484,7 @@ object GpuOverrides {
// TODO In 0.5 we should make the checks self documenting, and look more like what
// SparkPlan and Expression support
// https://github.com/NVIDIA/spark-rapids/issues/1915
val sig = TypeSig.BOOLEAN + TypeSig.BYTE + TypeSig.SHORT + TypeSig.INT + TypeSig.LONG +
TypeSig.STRING + TypeSig.NULL + TypeSig.DECIMAL
val sig = TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL
hp.children.foreach { child =>
sig.tagExprParam(this, child, "hash_key")
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,28 +38,7 @@ case class GpuMd5(child: Expression)

object GpuMurmur3Hash extends Arm {
def compute(batch: ColumnarBatch, boundExpr: Seq[Expression], seed: Int = 42): ColumnVector = {
val newExprs = boundExpr.map { expr =>
expr.dataType match {
case ByteType | ShortType =>
GpuCast(expr, IntegerType)
case DoubleType =>
// We have to normalize the NaNs, but not zeros
// however the current cudf code does the wrong thing for -0.0
// https://github.com/NVIDIA/spark-rapids/issues/1914
GpuIf(GpuIsNan(expr), GpuLiteral(Double.NaN, DoubleType), expr)
case FloatType =>
// We have to normalize the NaNs, but not zeros
// however the current cudf code does the wrong thing for -0.0
// https://github.com/NVIDIA/spark-rapids/issues/1914
GpuIf(GpuIsNan(expr), GpuLiteral(Float.NaN, FloatType), expr)
case dt: DecimalType if dt.precision <= DType.DECIMAL64_MAX_PRECISION =>
// For these values it is just hashing it as a long
GpuUnscaledValue(expr)
case _ =>
expr
}
}
withResource(GpuProjectExec.project(batch, newExprs)) { args =>
withResource(GpuProjectExec.project(batch, boundExpr)) { args =>
val bases = GpuColumnVector.extractBases(args)
ColumnVector.spark32BitMurmurHash3(seed, bases.toArray[ColumnView])
}
Expand Down

0 comments on commit c83a8c6

Please sign in to comment.