From 52d95b15a45376f8d879e973fd313da29e3920c0 Mon Sep 17 00:00:00 2001
From: Alfred Xu
Date: Wed, 24 Feb 2021 01:17:09 +0800
Subject: [PATCH] Better float/double cases for casting tests (#1781)

* enhance float/double cases for casting tests

Signed-off-by: sperlingxx

* continue

* code clean

* code clean

* fix typo

* fix typo

* some updates

* fix typo
---
 .../nvidia/spark/rapids/AnsiCastOpSuite.scala |  16 +-
 .../com/nvidia/spark/rapids/CastOpSuite.scala |  62 ++------
 .../com/nvidia/spark/rapids/FuzzerUtils.scala |  16 +-
 .../rapids/SparkQueryCompareTestSuite.scala   | 149 +-----------------
 4 files changed, 38 insertions(+), 205 deletions(-)

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/AnsiCastOpSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/AnsiCastOpSuite.scala
index 4f0206e657a..38f113602e9 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/AnsiCastOpSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/AnsiCastOpSuite.scala
@@ -338,31 +338,31 @@ class AnsiCastOpSuite extends GpuExpressionTestSuite {
     frame => testCastTo(DataTypes.DoubleType)(frame)
   }
 
-  testCastFailsForBadInputs("Test bad cast 1 from strings to floats", badFloatStringsDf,
+  testCastFailsForBadInputs("Test bad cast 1 from strings to floats", invalidFloatStringsDf,
     msg = GpuCast.INVALID_FLOAT_CAST_MSG) {
     frame =>frame.select(col("c0").cast(FloatType))
   }
 
-  testCastFailsForBadInputs("Test bad cast 2 from strings to floats", badFloatStringsDf,
+  testCastFailsForBadInputs("Test bad cast 2 from strings to floats", invalidFloatStringsDf,
     msg = GpuCast.INVALID_FLOAT_CAST_MSG) {
     frame =>frame.select(col("c1").cast(FloatType))
   }
 
-  testCastFailsForBadInputs("Test bad cast 1 from strings to double", badFloatStringsDf,
+  testCastFailsForBadInputs("Test bad cast 1 from strings to double", invalidFloatStringsDf,
     msg = GpuCast.INVALID_FLOAT_CAST_MSG) {
     frame =>frame.select(col("c0").cast(DoubleType))
   }
 
-  testCastFailsForBadInputs("Test bad cast 2 from strings to double", badFloatStringsDf,
+  testCastFailsForBadInputs("Test bad cast 2 from strings to double", invalidFloatStringsDf,
     msg = GpuCast.INVALID_FLOAT_CAST_MSG) {
     frame =>frame.select(col("c1").cast(DoubleType))
   }
 
-  //Currently there is a bug in cudf which doesn't convert one value correctly
+  // Currently there is a bug in cudf which doesn't convert some corner cases correctly
   // The bug is documented here https://github.com/rapidsai/cudf/issues/5225
   ignore("Test cast from strings to double that doesn't match") {
     testSparkResultsAreEqual("Test cast from strings to double that doesn't match",
-      badDoubleStringsDf) {
+      badDoubleStringsDf, conf = sparkConf, maxFloatDiff = 0.0001) {
       frame =>frame.select(
         col("c0").cast(DoubleType))
     }
@@ -372,12 +372,12 @@ class AnsiCastOpSuite extends GpuExpressionTestSuite {
   // Ansi cast from floating point to string
   ///////////////////////////////////////////////////////////////////////////
 
-  ignore("ansi_cast float to string") {
+  test("ansi_cast float to string") {
     testCastToString[Float](DataTypes.FloatType, ansiMode = true,
       comparisonFunc = Some(compareStringifiedFloats))
   }
 
-  ignore("ansi_cast double to string") {
+  test("ansi_cast double to string") {
     testCastToString[Double](DataTypes.DoubleType, ansiMode = true,
       comparisonFunc = Some(compareStringifiedFloats))
   }
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
index d9a4efe1a7e..403fe2aadfb 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
@@ -203,16 +203,6 @@ class CastOpSuite extends GpuExpressionTestSuite {
     }
   }
 
-  private def testCastTo(castTo: DataType)(frame: DataFrame): DataFrame ={
-    frame.withColumn("c1", col("c0").cast(castTo))
-  }
-
-  private def stringDf(str: String)(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    // use more than one value otherwise spark optimizes it out as a literal
-    Seq(str, str).toDF("c0")
-  }
-
   private def castToStringExpectedFun[T]: T => Option[String] = (d: T) => Some(String.valueOf(d))
 
   test("cast byte to string") {
@@ -231,11 +221,11 @@ class CastOpSuite extends GpuExpressionTestSuite {
     testCastToString[Long](DataTypes.LongType)
   }
 
-  ignore("cast float to string") {
+  test("cast float to string") {
     testCastToString[Float](DataTypes.FloatType, comparisonFunc = Some(compareStringifiedFloats))
   }
 
-  ignore("cast double to string") {
+  test("cast double to string") {
    testCastToString[Double](DataTypes.DoubleType, comparisonFunc = Some(compareStringifiedFloats))
   }
 
@@ -298,30 +288,6 @@ class CastOpSuite extends GpuExpressionTestSuite {
       col("doubles").cast(TimestampType))
   }
 
-  ignore("Test cast from double to string") {
-
-    //NOTE that the testSparkResultsAreEqual method isn't adequate in this case because we
-    // need to use a specialized comparison function
-
-    val conf = new SparkConf()
-      .set(RapidsConf.ENABLE_CAST_FLOAT_TO_STRING.key, "true")
-
-    val (cpu, gpu) = runOnCpuAndGpu(doubleDf, frame => frame.select(
-      col("doubles").cast(StringType))
-      .orderBy(col("doubles")), conf)
-
-    val fromCpu = cpu.map(row => row.getAs[String](0))
-    val fromGpu = gpu.map(row => row.getAs[String](0))
-
-    fromCpu.zip(fromGpu).foreach {
-      case (c, g) =>
-        if (!compareStringifiedFloats(c, g)) {
-          fail(s"Running on the GPU and on the CPU did not match: CPU value: $c. " +
-            s"GPU value: $g.")
-        }
-    }
-  }
-
   testSparkResultsAreEqual("Test cast from boolean", booleanDf) {
     frame => frame.select(
       col("bools").cast(IntegerType),
@@ -396,14 +362,6 @@ class CastOpSuite extends GpuExpressionTestSuite {
       col("doubles").cast(TimestampType))
   }
 
-  ignore("Test cast from strings to double that doesn't match") {
-    testSparkResultsAreEqual("Test cast from strings to double that doesn't match",
-      badDoubleStringsDf) {
-      frame =>frame.select(
-        col("doubles").cast(DoubleType))
-    }
-  }
-
   testSparkResultsAreEqual("Test cast from strings to doubles", doublesAsStrings,
     conf = sparkConf, maxFloatDiff = 0.0001) {
     frame => frame.select(
@@ -416,7 +374,7 @@ class CastOpSuite extends GpuExpressionTestSuite {
       col("c0").cast(FloatType))
   }
 
-  testSparkResultsAreEqual("Test bad cast from strings to floats", badFloatStringsDf,
+  testSparkResultsAreEqual("Test bad cast from strings to floats", invalidFloatStringsDf,
     conf = sparkConf, maxFloatDiff = 0.0001) {
     frame =>frame.select(
       col("c0").cast(DoubleType),
@@ -425,6 +383,16 @@ class CastOpSuite extends GpuExpressionTestSuite {
       col("c1").cast(FloatType))
   }
 
+  // Currently there is a bug in cudf which doesn't convert some corner cases correctly
+  // The bug is documented here https://github.com/rapidsai/cudf/issues/5225
+  ignore("Test cast from strings to double that doesn't match") {
+    testSparkResultsAreEqual("Test cast from strings to double that doesn't match",
+      badDoubleStringsDf, conf = sparkConf, maxFloatDiff = 0.0001) {
+      frame => frame.select(
+        col("c0").cast(DoubleType))
+    }
+  }
+
   testSparkResultsAreEqual("ansi_cast string to double exp", exponentsAsStringsDf,
     conf = sparkConf, maxFloatDiff = 0.0001) {
     frame => frame.select(
@@ -720,13 +688,13 @@ object CastOpSuite {
 
   def doublesAsStrings(session: SparkSession): DataFrame = {
     val schema = FuzzerUtils.createSchema(Seq(DoubleType), false)
-    val df = FuzzerUtils.generateDataFrame(session, schema, 100)
+    val df = FuzzerUtils.generateDataFrame(session, schema, 2048)
     df.withColumn("c0", col("c0").cast(StringType))
   }
 
   def floatsAsStrings(session: SparkSession): DataFrame = {
     val schema = FuzzerUtils.createSchema(Seq(FloatType), false)
-    val df = FuzzerUtils.generateDataFrame(session, schema, 100)
+    val df = FuzzerUtils.generateDataFrame(session, schema, 2048)
     df.withColumn("c0", col("c0").cast(StringType))
   }
 
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/FuzzerUtils.scala b/tests/src/test/scala/com/nvidia/spark/rapids/FuzzerUtils.scala
index 8670325a9ab..6bf531a7bdb 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/FuzzerUtils.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/FuzzerUtils.scala
@@ -301,30 +301,34 @@ class EnhancedRandom(protected val r: Random, protected val options: FuzzerOptio
   }
 
   def nextFloat(): Float = {
-    r.nextInt(9) match {
+    r.nextInt(11) match {
       case 0 => Float.NaN
       case 1 => Float.PositiveInfinity
       case 2 => Float.NegativeInfinity
-      case 3 => r.nextFloat() * Float.MinValue
-      case 4 => r.nextFloat() * Float.MaxValue
+      case 3 => Float.MinValue
+      case 4 => Float.MaxValue
       case 5 => 0 - r.nextFloat()
       case 6 => r.nextFloat()
       case 7 => 0f
       case 8 => -0f
+      case 9 => r.nextFloat() * Float.MinValue
+      case 10 => r.nextFloat() * Float.MaxValue
     }
   }
 
   def nextDouble(): Double = {
-    r.nextInt(9) match {
+    r.nextInt(11) match {
       case 0 => Double.NaN
       case 1 => Double.PositiveInfinity
       case 2 => Double.NegativeInfinity
-      case 3 => r.nextDouble() * Double.MinValue
-      case 4 => r.nextDouble() * Double.MaxValue
+      case 3 => Double.MaxValue
+      case 4 => Double.MinValue
       case 5 => 0 - r.nextDouble()
       case 6 => r.nextDouble()
       case 7 => 0d
       case 8 => -0d
+      case 9 => r.nextDouble() * Double.MinValue
+      case 10 => r.nextDouble() * Double.MaxValue
     }
   }
 
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala
index 971985b61d5..c44500bd712 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1131,14 +1131,16 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm {
       "9.8e5").toDF("c0")
   }
 
-  def badFloatStringsDf(session: SparkSession): DataFrame = {
+  def invalidFloatStringsDf(session: SparkSession): DataFrame = {
     import session.sqlContext.implicits._
     Seq(("A", "null"), ("1.3", "43.54")).toDF("c0", "c1")
   }
 
   def badDoubleStringsDf(session: SparkSession): DataFrame = {
     import session.sqlContext.implicits._
-    Seq("1.7976931348623159E308", "-1.7976931348623159E308").toDF("c0")
+    Seq("1.7976931348623159E308", "-1.7976931348623159E308",
+      "1.79769313486231581E308", "-1.79769313486231581E308",
+      "17.9769313486231582E307", "-17.9769313486231582E307").toDF("c0")
   }
 
   def stringsAndLongsDf(session: SparkSession): DataFrame = {
@@ -1240,45 +1242,6 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm {
     ).toDF("longs", "more_longs")
   }
 
-  def smallDoubleDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq(
-      (1.4, 1.134),
-      (2.1, 2.4),
-      (3.0, 3.42),
-      (4.4, 4.5),
-      (5.345, 5.2),
-      (-1.3, 6.0),
-      (-5.14, 0.0)
-    ).toDF("doubles", "more_doubles")
-  }
-
-  def doubleDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq(
-      (24854.55893, 90770.74881),
-      (79946.87288, -15456.4335),
-      (7967.43488, 32213.22119),
-      (-86099.68377, 36223.96138),
-      (63477.14374, 98993.65544),
-      (13763380.78173, 19869268.744),
-      (8677894.99092, 4029109.83562)
-    ).toDF("doubles", "more_doubles")
-  }
-
-  def nonZeroDoubleDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq(
-      (100.3, 1.09),
-      (200.1, 2.12),
-      (300.5, 3.5),
-      (400.0, 4.32),
-      (500.5, 5.0),
-      (-100.1, 6.4),
-      (-500.934, 50.5)
-    ).toDF("doubles", "more_doubles")
-  }
-
   def nanDf(session: SparkSession): DataFrame = {
     import session.sqlContext.implicits._
     Seq[java.lang.Double](
@@ -1386,46 +1349,6 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm {
     ).toDF("doubles", "more_doubles")
   }
 
-  def mixedSingleColumnDoubleDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq[java.lang.Double](
-      Double.PositiveInfinity,
-      Double.NegativeInfinity,
-      0.8435376941d,
-      23.1927672582d,
-      2309.4430349398d,
-      Double.NaN,
-      DOUBLE_POSITIVE_NAN_LOWER_RANGE,
-      DOUBLE_POSITIVE_NAN_UPPER_RANGE,
-      DOUBLE_NEGATIVE_NAN_LOWER_RANGE,
-      DOUBLE_NEGATIVE_NAN_UPPER_RANGE,
-      null,
-      -0.7078783860d,
-      -70.9667587507d,
-      -838600.5867225748d
-    ).toDF("doubles")
-  }
-
-  def mixedSingleColumnFloatDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq[java.lang.Float](
-      Float.PositiveInfinity,
-      Float.NegativeInfinity,
-      0.8435376941f,
-      23.1927672582f,
-      2309.4430349398f,
-      Float.NaN,
-      FLOAT_POSITIVE_NAN_LOWER_RANGE,
-      FLOAT_NEGATIVE_NAN_LOWER_RANGE,
-      FLOAT_POSITIVE_NAN_UPPER_RANGE,
-      FLOAT_NEGATIVE_NAN_UPPER_RANGE,
-      null,
-      -0.7078783860f,
-      -70.9667587507f,
-      -838600.5867225748f
-    ).toDF("floats")
-  }
-
   def mixedFloatDf(session: SparkSession): DataFrame = {
     import session.sqlContext.implicits._
     Seq[(java.lang.Float, java.lang.Float)](
@@ -1532,33 +1455,6 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm {
     ).toDF("ints", "floats")
   }
 
-  def doubleStringsDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq(
-      ("100.23", "1.0"),
-      ("200.65", "2.3"),
-      ("300.12", "3.6"),
-      ("400.43", "4.1"),
-      ("500.09", "5.0009"),
-      ("-100.124", "6.234"),
-      ("-500.13", "0.23"),
-      ("50.65", "50.5")
-    ).toDF("doubles", "more_doubles")
-  }
-
-  def nullableFloatDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq[(java.lang.Float, java.lang.Float)](
-      (100.44f, 1.046f),
-      (200.2f, null),
-      (300.230f, 3.04f),
-      (null, 4.0f),
-      (500.09f, null),
-      (null, 6.10f),
-      (-500.0f, 50.5f)
-    ).toDF("floats", "more_floats")
-  }
-
   def doubleWithNansDf(session: SparkSession): DataFrame = {
     import session.sqlContext.implicits._
     Seq[(java.lang.Double, java.lang.Double)](
@@ -1620,41 +1516,6 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm {
     ).toDF("float", "int")
   }
 
-  def floatWithNansDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq[(java.lang.Float, java.lang.Float)](
-      (100.50f, 1.0f),
-      (200.80f, Float.NaN),
-      (300.30f, 3.0f),
-      (Float.PositiveInfinity, Float.NegativeInfinity),
-      (Float.NegativeInfinity, Float.PositiveInfinity),
-      (Float.NaN, 4.0f),
-      (Float.PositiveInfinity, 4.0f),
-      (Float.NegativeInfinity, 4.0f),
-      (0.0f, 4.0f),
-      (500.0f, Float.NaN),
-      (Float.NaN, 6.0f),
-      (-500.0f, 50.5f),
-      (Float.NegativeInfinity, Float.NaN),
-      (Float.PositiveInfinity, 1.2f),
-      (Float.NaN, 3.2f),
-      (null, null)
-    ).toDF("floats", "more_floats")
-  }
-
-  def nullableStringsDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq[(String, String)](
-      ("100.0", "1.0"),
-      (null, "2.0"),
-      ("300.0", "3.0"),
-      ("400.0", null),
-      ("500.0", "5.0"),
-      ("-100.0", null),
-      ("-500.0", "0.0")
-    ).toDF("strings", "more_strings")
-  }
-
   def nullableStringsIntsDf(session: SparkSession): DataFrame = {
     import session.sqlContext.implicits._
     Seq[(String, Integer)](
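
Note on the new badDoubleStringsDf values, with a standalone sketch for context (the object name below is illustrative, not part of the patch): each added string encodes a value above the midpoint between Double.MaxValue (about 1.7976931348623157E308) and 2^1024, so a round-to-nearest parser such as java.lang.Double.parseDouble, whose semantics Spark's CPU cast follows, overflows them to +/-Infinity rather than clamping to +/-Double.MaxValue. That is the corner case tracked by https://github.com/rapidsai/cudf/issues/5225.

// Standalone Scala sketch: shows what correct CPU-side parsing produces
// for the corner-case strings added to badDoubleStringsDf.
object BadDoubleStringsDemo {
  def main(args: Array[String]): Unit = {
    val cornerCases = Seq(
      "1.7976931348623159E308", "-1.7976931348623159E308",
      "1.79769313486231581E308", "-1.79769313486231581E308",
      "17.9769313486231582E307", "-17.9769313486231582E307")
    println(s"Double.MaxValue = ${Double.MaxValue}")
    // parseDouble rounds to nearest, so values past the overflow midpoint
    // become Infinity/-Infinity instead of raising an error.
    cornerCases.foreach(s => println(s"$s -> ${java.lang.Double.parseDouble(s)}"))
  }
}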
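
The FuzzerUtils change rests on a simple observation: r.nextFloat() * Float.MaxValue essentially never yields exactly Float.MaxValue, so the exact boundaries now get dedicated match arms while the scaled-random arms move to cases 9 and 10. A self-contained sketch of the same pattern (names are illustrative, not the patch's API), showing that each corner case now lands roughly n/11 times in n draws:

import scala.util.Random

object BoundaryAwareDoublesDemo {
  // Same shape as EnhancedRandom.nextDouble after the patch: exact
  // boundaries have their own arms; scaled random values keep theirs.
  def nextDouble(r: Random): Double = r.nextInt(11) match {
    case 0 => Double.NaN
    case 1 => Double.PositiveInfinity
    case 2 => Double.NegativeInfinity
    case 3 => Double.MaxValue   // exact boundary, now guaranteed reachable
    case 4 => Double.MinValue   // exact boundary, now guaranteed reachable
    case 5 => 0 - r.nextDouble()
    case 6 => r.nextDouble()
    case 7 => 0d
    case 8 => -0d
    case 9 => r.nextDouble() * Double.MinValue
    case 10 => r.nextDouble() * Double.MaxValue
  }

  def main(args: Array[String]): Unit = {
    val r = new Random(42L)
    val draws = Seq.fill(2048)(nextDouble(r))
    // Expect about 2048 / 11 = 186 exact hits; the old scaled-only generator
    // would effectively never produce Double.MaxValue exactly. This is also
    // why doublesAsStrings/floatsAsStrings grew from 100 to 2048 rows.
    println(s"exact MaxValue draws: ${draws.count(_ == Double.MaxValue)}")
  }
}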