Skip to content

Commit

Permalink
Add in murmur3 support for float, double, date and timestamp (NVIDIA#2017)
Browse files Browse the repository at this point in the history

Signed-off-by: Robert (Bobby) Evans <bobby@apache.org>
  • Loading branch information
revans2 authored Mar 25, 2021
1 parent e95ca04 commit c83a8c6
Show file tree
Hide file tree
Showing 4 changed files with 13 additions and 33 deletions.
8 changes: 4 additions & 4 deletions docs/supported_ops.md
Original file line number Diff line number Diff line change
Expand Up @@ -9719,10 +9719,10 @@ Accelerator support is described below.
<td>S</td>
<td>S</td>
<td>S</td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td>S</td>
<td>S</td>
<td>S</td>
<td>S*</td>
<td>S</td>
<td>S*</td>
<td>S</td>
Expand Down
8 changes: 6 additions & 2 deletions integration_tests/src/main/python/repart_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,8 +81,10 @@ def test_repartion_df(num_parts, length):
([('a', short_gen)], ['a']),
([('a', int_gen)], ['a']),
([('a', long_gen)], ['a']),
pytest.param(([('a', float_gen)], ['a']), marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/1914')),
pytest.param(([('a', double_gen)], ['a']), marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/1914')),
([('a', float_gen)], ['a']),
([('a', double_gen)], ['a']),
([('a', timestamp_gen)], ['a']),
([('a', date_gen)], ['a']),
([('a', decimal_gen_default)], ['a']),
([('a', decimal_gen_neg_scale)], ['a']),
([('a', decimal_gen_scale_precision)], ['a']),
Expand All @@ -97,6 +99,8 @@ def test_repartion_df(num_parts, length):
([('a', int_gen), ('b', byte_gen)], ['a', 'b']),
([('a', long_gen), ('b', null_gen)], ['a', 'b']),
([('a', byte_gen), ('b', boolean_gen), ('c', short_gen)], ['a', 'b', 'c']),
([('a', float_gen), ('b', double_gen), ('c', short_gen)], ['a', 'b', 'c']),
([('a', timestamp_gen), ('b', date_gen), ('c', int_gen)], ['a', 'b', 'c']),
([('a', short_gen), ('b', string_gen), ('c', int_gen)], ['a', 'b', 'c']),
([('a', decimal_gen_default), ('b', decimal_gen_64bit), ('c', decimal_gen_scale_precision)], ['a', 'b', 'c']),
], ids=idfn)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2312,9 +2312,7 @@ object GpuOverrides {
"Murmur3 hash operator",
ExprChecks.projectNotLambda(TypeSig.INT, TypeSig.INT,
repeatingParamCheck = Some(RepeatingParamCheck("input",
// Floating point values don't work because of -0.0 is not hashed properly
TypeSig.BOOLEAN + TypeSig.BYTE + TypeSig.SHORT + TypeSig.INT + TypeSig.LONG +
TypeSig.STRING + TypeSig.NULL + TypeSig.DECIMAL,
TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL,
TypeSig.all))),
(a, conf, p, r) => new ExprMeta[Murmur3Hash](a, conf, p, r) {
override val childExprs: Seq[BaseExprMeta[_]] = a.children
Expand Down Expand Up @@ -2486,8 +2484,7 @@ object GpuOverrides {
// TODO In 0.5 we should make the checks self documenting, and look more like what
// SparkPlan and Expression support
// https://github.com/NVIDIA/spark-rapids/issues/1915
val sig = TypeSig.BOOLEAN + TypeSig.BYTE + TypeSig.SHORT + TypeSig.INT + TypeSig.LONG +
TypeSig.STRING + TypeSig.NULL + TypeSig.DECIMAL
val sig = TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL
hp.children.foreach { child =>
sig.tagExprParam(this, child, "hash_key")
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,28 +38,7 @@ case class GpuMd5(child: Expression)

object GpuMurmur3Hash extends Arm {
def compute(batch: ColumnarBatch, boundExpr: Seq[Expression], seed: Int = 42): ColumnVector = {
val newExprs = boundExpr.map { expr =>
expr.dataType match {
case ByteType | ShortType =>
GpuCast(expr, IntegerType)
case DoubleType =>
// We have to normalize the NaNs, but not zeros
// however the current cudf code does the wrong thing for -0.0
// https://github.com/NVIDIA/spark-rapids/issues/1914
GpuIf(GpuIsNan(expr), GpuLiteral(Double.NaN, DoubleType), expr)
case FloatType =>
// We have to normalize the NaNs, but not zeros
// however the current cudf code does the wrong thing for -0.0
// https://github.com/NVIDIA/spark-rapids/issues/1914
GpuIf(GpuIsNan(expr), GpuLiteral(Float.NaN, FloatType), expr)
case dt: DecimalType if dt.precision <= DType.DECIMAL64_MAX_PRECISION =>
// For these values it is just hashing it as a long
GpuUnscaledValue(expr)
case _ =>
expr
}
}
withResource(GpuProjectExec.project(batch, newExprs)) { args =>
withResource(GpuProjectExec.project(batch, boundExpr)) { args =>
val bases = GpuColumnVector.extractBases(args)
ColumnVector.spark32BitMurmurHash3(seed, bases.toArray[ColumnView])
}
Expand Down

0 comments on commit c83a8c6

Please sign in to comment.