From 52d95b15a45376f8d879e973fd313da29e3920c0 Mon Sep 17 00:00:00 2001
From: Alfred Xu
Date: Wed, 24 Feb 2021 01:17:09 +0800
Subject: [PATCH] Better float/double cases for casting tests (#1781)

* enhance float/double cases for casting tests

Signed-off-by: sperlingxx

* continue

* code clean

* code clean

* fix typo

* fix typo

* some updates

* fix typo
---
 .../nvidia/spark/rapids/AnsiCastOpSuite.scala |  16 +-
 .../com/nvidia/spark/rapids/CastOpSuite.scala |  62 ++------
 .../com/nvidia/spark/rapids/FuzzerUtils.scala |  16 +-
 .../rapids/SparkQueryCompareTestSuite.scala   | 149 +-----------------
 4 files changed, 38 insertions(+), 205 deletions(-)

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/AnsiCastOpSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/AnsiCastOpSuite.scala
index 4f0206e657a..38f113602e9 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/AnsiCastOpSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/AnsiCastOpSuite.scala
@@ -338,31 +338,31 @@ class AnsiCastOpSuite extends GpuExpressionTestSuite {
     frame => testCastTo(DataTypes.DoubleType)(frame)
   }
 
-  testCastFailsForBadInputs("Test bad cast 1 from strings to floats", badFloatStringsDf,
+  testCastFailsForBadInputs("Test bad cast 1 from strings to floats", invalidFloatStringsDf,
     msg = GpuCast.INVALID_FLOAT_CAST_MSG) {
     frame =>frame.select(col("c0").cast(FloatType))
   }
 
-  testCastFailsForBadInputs("Test bad cast 2 from strings to floats", badFloatStringsDf,
+  testCastFailsForBadInputs("Test bad cast 2 from strings to floats", invalidFloatStringsDf,
     msg = GpuCast.INVALID_FLOAT_CAST_MSG) {
     frame =>frame.select(col("c1").cast(FloatType))
   }
 
-  testCastFailsForBadInputs("Test bad cast 1 from strings to double", badFloatStringsDf,
+  testCastFailsForBadInputs("Test bad cast 1 from strings to double", invalidFloatStringsDf,
     msg = GpuCast.INVALID_FLOAT_CAST_MSG) {
     frame =>frame.select(col("c0").cast(DoubleType))
   }
 
-  testCastFailsForBadInputs("Test bad cast 2 from strings to double", badFloatStringsDf,
+  testCastFailsForBadInputs("Test bad cast 2 from strings to double", invalidFloatStringsDf,
     msg = GpuCast.INVALID_FLOAT_CAST_MSG) {
     frame =>frame.select(col("c1").cast(DoubleType))
   }
 
-  //Currently there is a bug in cudf which doesn't convert one value correctly
+  // Currently there is a bug in cudf which doesn't convert some corner cases correctly
   // The bug is documented here https://github.com/rapidsai/cudf/issues/5225
   ignore("Test cast from strings to double that doesn't match") {
     testSparkResultsAreEqual("Test cast from strings to double that doesn't match",
-      badDoubleStringsDf) {
+      badDoubleStringsDf, conf = sparkConf, maxFloatDiff = 0.0001) {
       frame =>frame.select(
         col("c0").cast(DoubleType))
     }
@@ -372,12 +372,12 @@ class AnsiCastOpSuite extends GpuExpressionTestSuite {
   // Ansi cast from floating point to string
   ///////////////////////////////////////////////////////////////////////////
 
-  ignore("ansi_cast float to string") {
+  test("ansi_cast float to string") {
     testCastToString[Float](DataTypes.FloatType, ansiMode = true,
       comparisonFunc = Some(compareStringifiedFloats))
   }
 
-  ignore("ansi_cast double to string") {
+  test("ansi_cast double to string") {
     testCastToString[Double](DataTypes.DoubleType, ansiMode = true,
       comparisonFunc = Some(compareStringifiedFloats))
   }
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
index d9a4efe1a7e..403fe2aadfb 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
@@ -203,16 +203,6 @@ class CastOpSuite extends GpuExpressionTestSuite {
     }
   }
 
-  private def testCastTo(castTo: DataType)(frame: DataFrame): DataFrame ={
-    frame.withColumn("c1", col("c0").cast(castTo))
-  }
-
-  private def stringDf(str: String)(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    // use more than one value otherwise spark optimizes it out as a literal
-    Seq(str, str).toDF("c0")
-  }
-
   private def castToStringExpectedFun[T]: T => Option[String] = (d: T) => Some(String.valueOf(d))
 
   test("cast byte to string") {
@@ -231,11 +221,11 @@ class CastOpSuite extends GpuExpressionTestSuite {
     testCastToString[Long](DataTypes.LongType)
   }
 
-  ignore("cast float to string") {
+  test("cast float to string") {
     testCastToString[Float](DataTypes.FloatType, comparisonFunc = Some(compareStringifiedFloats))
   }
 
-  ignore("cast double to string") {
+  test("cast double to string") {
    testCastToString[Double](DataTypes.DoubleType, comparisonFunc = Some(compareStringifiedFloats))
   }
 
@@ -298,30 +288,6 @@ class CastOpSuite extends GpuExpressionTestSuite {
       col("doubles").cast(TimestampType))
   }
 
-  ignore("Test cast from double to string") {
-
-    //NOTE that the testSparkResultsAreEqual method isn't adequate in this case because we
-    // need to use a specialized comparison function
-
-    val conf = new SparkConf()
-      .set(RapidsConf.ENABLE_CAST_FLOAT_TO_STRING.key, "true")
-
-    val (cpu, gpu) = runOnCpuAndGpu(doubleDf, frame => frame.select(
-      col("doubles").cast(StringType))
-      .orderBy(col("doubles")), conf)
-
-    val fromCpu = cpu.map(row => row.getAs[String](0))
-    val fromGpu = gpu.map(row => row.getAs[String](0))
-
-    fromCpu.zip(fromGpu).foreach {
-      case (c, g) =>
-        if (!compareStringifiedFloats(c, g)) {
-          fail(s"Running on the GPU and on the CPU did not match: CPU value: $c. " +
-            s"GPU value: $g.")
-        }
-    }
-  }
-
   testSparkResultsAreEqual("Test cast from boolean", booleanDf) {
     frame => frame.select(
       col("bools").cast(IntegerType),
@@ -396,14 +362,6 @@ class CastOpSuite extends GpuExpressionTestSuite {
       col("doubles").cast(TimestampType))
   }
 
-  ignore("Test cast from strings to double that doesn't match") {
-    testSparkResultsAreEqual("Test cast from strings to double that doesn't match",
-      badDoubleStringsDf) {
-      frame =>frame.select(
-        col("doubles").cast(DoubleType))
-    }
-  }
-
   testSparkResultsAreEqual("Test cast from strings to doubles", doublesAsStrings,
     conf = sparkConf, maxFloatDiff = 0.0001) {
     frame => frame.select(
@@ -416,7 +374,7 @@ class CastOpSuite extends GpuExpressionTestSuite {
       col("c0").cast(FloatType))
   }
 
-  testSparkResultsAreEqual("Test bad cast from strings to floats", badFloatStringsDf,
+  testSparkResultsAreEqual("Test bad cast from strings to floats", invalidFloatStringsDf,
     conf = sparkConf, maxFloatDiff = 0.0001) {
     frame =>frame.select(
       col("c0").cast(DoubleType),
@@ -425,6 +383,16 @@ class CastOpSuite extends GpuExpressionTestSuite {
       col("c1").cast(FloatType))
   }
 
+  // Currently there is a bug in cudf which doesn't convert some corner cases correctly
+  // The bug is documented here https://github.com/rapidsai/cudf/issues/5225
+  ignore("Test cast from strings to double that doesn't match") {
+    testSparkResultsAreEqual("Test cast from strings to double that doesn't match",
+      badDoubleStringsDf, conf = sparkConf, maxFloatDiff = 0.0001) {
+      frame => frame.select(
+        col("c0").cast(DoubleType))
+    }
+  }
+
   testSparkResultsAreEqual("ansi_cast string to double exp", exponentsAsStringsDf,
     conf = sparkConf, maxFloatDiff = 0.0001) {
     frame => frame.select(
@@ -720,13 +688,13 @@ object CastOpSuite {
 
   def doublesAsStrings(session: SparkSession): DataFrame = {
     val schema = FuzzerUtils.createSchema(Seq(DoubleType), false)
-    val df = FuzzerUtils.generateDataFrame(session, schema, 100)
+    val df = FuzzerUtils.generateDataFrame(session, schema, 2048)
     df.withColumn("c0", col("c0").cast(StringType))
   }
 
   def floatsAsStrings(session: SparkSession): DataFrame = {
     val schema = FuzzerUtils.createSchema(Seq(FloatType), false)
-    val df = FuzzerUtils.generateDataFrame(session, schema, 100)
+    val df = FuzzerUtils.generateDataFrame(session, schema, 2048)
     df.withColumn("c0", col("c0").cast(StringType))
   }
 
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/FuzzerUtils.scala b/tests/src/test/scala/com/nvidia/spark/rapids/FuzzerUtils.scala
index 8670325a9ab..6bf531a7bdb 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/FuzzerUtils.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/FuzzerUtils.scala
@@ -301,30 +301,34 @@ class EnhancedRandom(protected val r: Random, protected val options: FuzzerOptio
   }
 
   def nextFloat(): Float = {
-    r.nextInt(9) match {
+    r.nextInt(11) match {
       case 0 => Float.NaN
       case 1 => Float.PositiveInfinity
       case 2 => Float.NegativeInfinity
-      case 3 => r.nextFloat() * Float.MinValue
-      case 4 => r.nextFloat() * Float.MaxValue
+      case 3 => Float.MinValue
+      case 4 => Float.MaxValue
       case 5 => 0 - r.nextFloat()
       case 6 => r.nextFloat()
       case 7 => 0f
       case 8 => -0f
+      case 9 => r.nextFloat() * Float.MinValue
+      case 10 => r.nextFloat() * Float.MaxValue
     }
   }
 
   def nextDouble(): Double = {
-    r.nextInt(9) match {
+    r.nextInt(11) match {
       case 0 => Double.NaN
       case 1 => Double.PositiveInfinity
       case 2 => Double.NegativeInfinity
-      case 3 => r.nextDouble() * Double.MinValue
-      case 4 => r.nextDouble() * Double.MaxValue
+      case 3 => Double.MaxValue
+      case 4 => Double.MinValue
       case 5 => 0 - r.nextDouble()
       case 6 => r.nextDouble()
       case 7 => 0d
       case 8 => -0d
+      case 9 => r.nextDouble() * Double.MinValue
+      case 10 => r.nextDouble() * Double.MaxValue
     }
   }
 
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala
index 971985b61d5..c44500bd712 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -1131,14 +1131,16 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm {
       "9.8e5").toDF("c0")
   }
 
-  def badFloatStringsDf(session: SparkSession): DataFrame = {
+  def invalidFloatStringsDf(session: SparkSession): DataFrame = {
     import session.sqlContext.implicits._
     Seq(("A", "null"), ("1.3", "43.54")).toDF("c0", "c1")
   }
 
   def badDoubleStringsDf(session: SparkSession): DataFrame = {
     import session.sqlContext.implicits._
-    Seq("1.7976931348623159E308", "-1.7976931348623159E308").toDF("c0")
+    Seq("1.7976931348623159E308", "-1.7976931348623159E308",
+      "1.79769313486231581E308", "-1.79769313486231581E308",
+      "17.9769313486231582E307", "-17.9769313486231582E307").toDF("c0")
   }
 
   def stringsAndLongsDf(session: SparkSession): DataFrame = {
@@ -1240,45 +1242,6 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm {
     ).toDF("longs", "more_longs")
   }
 
-  def smallDoubleDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq(
-      (1.4, 1.134),
-      (2.1, 2.4),
-      (3.0, 3.42),
-      (4.4, 4.5),
-      (5.345, 5.2),
-      (-1.3, 6.0),
-      (-5.14, 0.0)
-    ).toDF("doubles", "more_doubles")
-  }
-
-  def doubleDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq(
-      (24854.55893, 90770.74881),
-      (79946.87288, -15456.4335),
-      (7967.43488, 32213.22119),
-      (-86099.68377, 36223.96138),
-      (63477.14374, 98993.65544),
-      (13763380.78173, 19869268.744),
-      (8677894.99092, 4029109.83562)
-    ).toDF("doubles", "more_doubles")
-  }
-
-  def nonZeroDoubleDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq(
-      (100.3, 1.09),
-      (200.1, 2.12),
-      (300.5, 3.5),
-      (400.0, 4.32),
-      (500.5, 5.0),
-      (-100.1, 6.4),
-      (-500.934, 50.5)
-    ).toDF("doubles", "more_doubles")
-  }
-
   def nanDf(session: SparkSession): DataFrame = {
     import session.sqlContext.implicits._
     Seq[java.lang.Double](
@@ -1386,46 +1349,6 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm {
     ).toDF("doubles", "more_doubles")
   }
 
-  def mixedSingleColumnDoubleDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq[java.lang.Double](
-      Double.PositiveInfinity,
-      Double.NegativeInfinity,
-      0.8435376941d,
-      23.1927672582d,
-      2309.4430349398d,
-      Double.NaN,
-      DOUBLE_POSITIVE_NAN_LOWER_RANGE,
-      DOUBLE_POSITIVE_NAN_UPPER_RANGE,
-      DOUBLE_NEGATIVE_NAN_LOWER_RANGE,
-      DOUBLE_NEGATIVE_NAN_UPPER_RANGE,
-      null,
-      -0.7078783860d,
-      -70.9667587507d,
-      -838600.5867225748d
-    ).toDF("doubles")
-  }
-
-  def mixedSingleColumnFloatDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq[java.lang.Float](
-      Float.PositiveInfinity,
-      Float.NegativeInfinity,
-      0.8435376941f,
-      23.1927672582f,
-      2309.4430349398f,
-      Float.NaN,
-      FLOAT_POSITIVE_NAN_LOWER_RANGE,
-      FLOAT_NEGATIVE_NAN_LOWER_RANGE,
-      FLOAT_POSITIVE_NAN_UPPER_RANGE,
-      FLOAT_NEGATIVE_NAN_UPPER_RANGE,
-      null,
-      -0.7078783860f,
-      -70.9667587507f,
-      -838600.5867225748f
-    ).toDF("floats")
-  }
-
   def mixedFloatDf(session: SparkSession): DataFrame = {
     import session.sqlContext.implicits._
     Seq[(java.lang.Float, java.lang.Float)](
@@ -1532,33 +1455,6 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm {
     ).toDF("ints", "floats")
   }
 
-  def doubleStringsDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq(
-      ("100.23", "1.0"),
-      ("200.65", "2.3"),
-      ("300.12", "3.6"),
-      ("400.43", "4.1"),
-      ("500.09", "5.0009"),
-      ("-100.124", "6.234"),
-      ("-500.13", "0.23"),
-      ("50.65", "50.5")
-    ).toDF("doubles", "more_doubles")
-  }
-
-  def nullableFloatDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq[(java.lang.Float, java.lang.Float)](
-      (100.44f, 1.046f),
-      (200.2f, null),
-      (300.230f, 3.04f),
-      (null, 4.0f),
-      (500.09f, null),
-      (null, 6.10f),
-      (-500.0f, 50.5f)
-    ).toDF("floats", "more_floats")
-  }
-
   def doubleWithNansDf(session: SparkSession): DataFrame = {
     import session.sqlContext.implicits._
     Seq[(java.lang.Double, java.lang.Double)](
@@ -1620,41 +1516,6 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm {
     ).toDF("float", "int")
   }
 
-  def floatWithNansDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq[(java.lang.Float, java.lang.Float)](
-      (100.50f, 1.0f),
-      (200.80f, Float.NaN),
-      (300.30f, 3.0f),
-      (Float.PositiveInfinity, Float.NegativeInfinity),
-      (Float.NegativeInfinity, Float.PositiveInfinity),
-      (Float.NaN, 4.0f),
-      (Float.PositiveInfinity, 4.0f),
-      (Float.NegativeInfinity, 4.0f),
-      (0.0f, 4.0f),
-      (500.0f, Float.NaN),
-      (Float.NaN, 6.0f),
-      (-500.0f, 50.5f),
-      (Float.NegativeInfinity, Float.NaN),
-      (Float.PositiveInfinity, 1.2f),
-      (Float.NaN, 3.2f),
-      (null, null)
-    ).toDF("floats", "more_floats")
-  }
-
-  def nullableStringsDf(session: SparkSession): DataFrame = {
-    import session.sqlContext.implicits._
-    Seq[(String, String)](
-      ("100.0", "1.0"),
-      (null, "2.0"),
-      ("300.0", "3.0"),
-      ("400.0", null),
-      ("500.0", "5.0"),
-      ("-100.0", null),
-      ("-500.0", "0.0")
-    ).toDF("strings", "more_strings")
-  }
-
   def nullableStringsIntsDf(session: SparkSession): DataFrame = {
     import session.sqlContext.implicits._
     Seq[(String, Integer)](
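
Note on the new badDoubleStringsDf values, with a standalone sketch for context (the object name below is illustrative, not part of the patch): each added string encodes a value above the midpoint between Double.MaxValue (about 1.7976931348623157E308) and 2^1024, so a round-to-nearest parser such as java.lang.Double.parseDouble, whose semantics Spark's CPU cast follows, overflows them to +/-Infinity rather than clamping to +/-Double.MaxValue. That is the corner case tracked by https://github.com/rapidsai/cudf/issues/5225.

// Standalone Scala sketch: shows what correct CPU-side parsing produces
// for the corner-case strings added to badDoubleStringsDf.
object BadDoubleStringsDemo {
  def main(args: Array[String]): Unit = {
    val cornerCases = Seq(
      "1.7976931348623159E308", "-1.7976931348623159E308",
      "1.79769313486231581E308", "-1.79769313486231581E308",
      "17.9769313486231582E307", "-17.9769313486231582E307")
    println(s"Double.MaxValue = ${Double.MaxValue}")
    // parseDouble rounds to nearest, so values past the overflow midpoint
    // become Infinity/-Infinity instead of raising an error.
    cornerCases.foreach(s => println(s"$s -> ${java.lang.Double.parseDouble(s)}"))
  }
}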
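
The FuzzerUtils change rests on a simple observation: r.nextFloat() * Float.MaxValue essentially never yields exactly Float.MaxValue, so the exact boundaries now get dedicated match arms while the scaled-random arms move to cases 9 and 10. A self-contained sketch of the same pattern (names are illustrative, not the patch's API), showing that each corner case now lands roughly n/11 times in n draws:

import scala.util.Random

object BoundaryAwareDoublesDemo {
  // Same shape as EnhancedRandom.nextDouble after the patch: exact
  // boundaries have their own arms; scaled random values keep theirs.
  def nextDouble(r: Random): Double = r.nextInt(11) match {
    case 0 => Double.NaN
    case 1 => Double.PositiveInfinity
    case 2 => Double.NegativeInfinity
    case 3 => Double.MaxValue   // exact boundary, now guaranteed reachable
    case 4 => Double.MinValue   // exact boundary, now guaranteed reachable
    case 5 => 0 - r.nextDouble()
    case 6 => r.nextDouble()
    case 7 => 0d
    case 8 => -0d
    case 9 => r.nextDouble() * Double.MinValue
    case 10 => r.nextDouble() * Double.MaxValue
  }

  def main(args: Array[String]): Unit = {
    val r = new Random(42L)
    val draws = Seq.fill(2048)(nextDouble(r))
    // Expect about 2048 / 11 = 186 exact hits; the old scaled-only generator
    // would effectively never produce Double.MaxValue exactly. This is also
    // why doublesAsStrings/floatsAsStrings grew from 100 to 2048 rows.
    println(s"exact MaxValue draws: ${draws.count(_ == Double.MaxValue)}")
  }
}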