NVIDIA · andygrove · Jul 20, 2021 · Jul 14, 2021 · Jul 14, 2021 · Jul 14, 2021
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala
@@ -175,6 +175,64 @@ object GpuCast extends Arm {
       cv.stringReplaceWithBackrefs(rule.search, rule.replace)
     })
   }
+
+  def sanitizeStringToFloat(input: ColumnVector): ColumnVector = {
+
+    // This regex gets applied after the transformation to normalize use of Inf and is
+    // just strict enough to filter out known edge cases that would result in incorrect
+    // values. We further filter out invalid values using the cuDF isFloat method.
+    val VALID_FLOAT_REGEX =
+      "^" +                         // start of line
+      "[+\\-]?" +                   // optional + or - at start of string
+      "(" +
+        "(" +
+          "(" +
+            "([0-9]+)|" +           // digits, OR
+            "([0-9]*\\.[0-9]+)|" +  // decimal with optional leading and mandatory trailing, OR
+            "([0-9]+\\.[0-9]*)" +   // decimal with mandatory leading and optional trailing
+          ")" +
+          "([eE][+\\-]?[0-9]+)?" +  // exponent
+          "[fFdD]?" +               // floating-point designator
+        ")" +
+        "|Inf" +                    // Infinity
+        "|[nN][aA][nN]" +           // NaN
+      ")" +
+      "$"                           // end of line
+
+    withResource(input.lstrip()) { stripped =>
+      withResource(GpuScalar.from(null, DataTypes.StringType)) { nullString =>
+        // filter out strings containing breaking whitespace
+        val withoutWhitespace = withResource(ColumnVector.fromStrings("\r", "\n")) {
+            verticalWhitespace =>
+          withResource(stripped.contains(verticalWhitespace)) {
+            _.ifElse(nullString, stripped)
+          }
+        }
+        // replace all possible versions of infinity with Inf
+        val inf = withResource(withoutWhitespace) { _ =>
+          withoutWhitespace.stringReplaceWithBackrefs(
+            "([iI][nN][fF])([iI][nN][iI][tT][yY])?", "Inf")
+        }
+        // replace +Inf with Inf because cuDF only supports "Inf" and "-Inf"
+        val infWithoutPlus = withResource(inf) { _ =>
+          withResource(GpuScalar.from("+Inf", DataTypes.StringType)) { search =>
+            withResource(GpuScalar.from("Inf", DataTypes.StringType)) { replace =>
+              inf.stringReplace(search, replace)
+            }
+          }
+        }
+        // filter out any strings that are not valid floating point numbers according
+        // to the regex pattern
+        val floatOrNull = withResource(infWithoutPlus.matchesRe(VALID_FLOAT_REGEX)) {
+            _.ifElse(infWithoutPlus, nullString)
+        }
+        // strip floating-point designator 'f' or 'd' but don't strip the 'f' from 'Inf'
+        withResource(floatOrNull) {
+          _.stringReplaceWithBackrefs("(Inf|-Inf|[nN][aA][nN]|[^InNaAfFdD]*)([fFdD]?)", "\\1")
+        }
+      }
+    }
+  }
 }
 
 /**
@@ -737,59 +795,37 @@ case class GpuCast(
       ansiEnabled: Boolean,
       dType: DType): ColumnVector = {
 
-    // TODO: since cudf doesn't support case-insensitive regex, we have to generate all
-    //  possible strings. But these should cover most of the cases
-    val POS_INF_REGEX = "^[+]?(?:infinity|inf|Infinity|Inf|INF|INFINITY)$"
-    val NEG_INF_REGEX = "^[\\-](?:infinity|inf|Infinity|Inf|INF|INFINITY)$"
-    val NAN_REGEX = "^(?:nan|NaN|NAN)$"
-
-
-//      1. convert the different infinities to "Inf"/"-Inf" which is the only variation cudf
-//         understands
-//      2. identify the nans
-//      3. identify the floats. "nan", "null" and letters are not considered floats
-//      4. if ansi is enabled we want to throw and exception if the string is neither float nor nan
-//      5. convert everything thats not floats to null
-//      6. set the indices where we originally had nans to Float.NaN
-//
-//      NOTE Limitation: "1.7976931348623159E308" and "-1.7976931348623159E308" are not considered
-//      Inf even though spark does
-
-    if (ansiEnabled && input.hasNulls()) {
-      throw new NumberFormatException(GpuCast.INVALID_FLOAT_CAST_MSG)
-    }
-    // First replace different spellings/cases of infinity with Inf and -Infinity with -Inf
-    val posInfReplaced = withResource(input.matchesRe(POS_INF_REGEX)) { containsInf =>
-      withResource(Scalar.fromString("Inf")) { inf =>
-        containsInf.ifElse(inf, input)
-      }
-    }
-    val withPosNegInfinityReplaced = withResource(posInfReplaced) { withPositiveInfinityReplaced =>
-      withResource(withPositiveInfinityReplaced.matchesRe(NEG_INF_REGEX)) { containsNegInf =>
-        withResource(Scalar.fromString("-Inf")) { negInf =>
-          containsNegInf.ifElse(negInf, withPositiveInfinityReplaced)
-        }
+    // 1. convert the different infinities to "Inf"/"-Inf" which is the only variation cudf
+    // understands
+    // 2. identify the nans
+    // 3. identify the floats. "nan", "null" and letters are not considered floats
+    // 4. if ansi is enabled we want to throw and exception if the string is neither float nor nan
+    // 5. convert everything thats not floats to null
+    // 6. set the indices where we originally had nans to Float.NaN
+    //
+    // NOTE Limitation: "1.7976931348623159E308" and "-1.7976931348623159E308" are not considered
+    // Inf even though spark does
+
+    val NAN_REGEX = "^[nN][aA][nN]$"
+
+    withResource(GpuCast.sanitizeStringToFloat(input)) { sanitized =>
+      if (ansiEnabled && sanitized.hasNulls) {
+        throw new NumberFormatException(GpuCast.INVALID_FLOAT_CAST_MSG)
       }
-    }
-    withResource(withPosNegInfinityReplaced) { withPosNegInfinityReplaced =>
       //Now identify the different variations of nans
-      withResource(withPosNegInfinityReplaced.matchesRe(NAN_REGEX)) { isNan =>
+      withResource(sanitized.matchesRe(NAN_REGEX)) { isNan =>
         // now check if the values are floats
-        withResource(withPosNegInfinityReplaced.isFloat()) { isFloat =>
+        withResource(sanitized.isFloat) { isFloat =>
           if (ansiEnabled) {
-            withResource(isNan.not()) { notNan =>
-              withResource(isFloat.not()) { notFloat =>
-                withResource(notFloat.and(notNan)) { notFloatAndNotNan =>
-                  withResource(notFloatAndNotNan.any()) { notNanAndNotFloat =>
-                    if (notNanAndNotFloat.getBoolean()) {
-                      throw new NumberFormatException(GpuCast.INVALID_FLOAT_CAST_MSG)
-                    }
-                  }
+            withResource(isNan.or(isFloat)) { nanOrFloat =>
+              withResource(nanOrFloat.all()) { allNanOrFloat =>
+                if (!allNanOrFloat.getBoolean) {
+                  throw new NumberFormatException(GpuCast.INVALID_FLOAT_CAST_MSG)
                 }
               }
             }
           }
-          withResource(withPosNegInfinityReplaced.castTo(dType)) { casted =>
+          withResource(sanitized.castTo(dType)) { casted =>
             withResource(Scalar.fromNull(dType)) { nulls =>
               withResource(isFloat.ifElse(casted, nulls)) { floatsOnly =>
                 withResource(FloatUtils.getNanScalar(dType)) { nan =>

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
@@ -58,7 +58,7 @@ class CastOpSuite extends GpuExpressionTestSuite {
   }
 
   private val BOOL_CHARS = " \t\r\nFALSEfalseTRUEtrue01yesYESnoNO"
-  private val NUMERIC_CHARS = "inf \t\r\n0123456789.+-eE"
+  private val NUMERIC_CHARS = "infinityINFINITY \t\r\n0123456789.+-eEfFdD"
   private val DATE_CHARS = " \t\r\n0123456789:-/TZ"
 
   test("Cast from string to boolean using random inputs") {
@@ -98,15 +98,16 @@ class CastOpSuite extends GpuExpressionTestSuite {
     testCastStringTo(DataTypes.LongType, generateRandomStrings(Some(NUMERIC_CHARS)))
   }
 
-  ignore("Cast from string to float using random inputs") {
-    // Test ignored due to known issues
-    // https://github.com/NVIDIA/spark-rapids/issues/2900
+  test("Cast from string to float using random inputs") {
     testCastStringTo(DataTypes.FloatType, generateRandomStrings(Some(NUMERIC_CHARS)))
   }
 
-  ignore("Cast from string to double using random inputs") {
-    // Test ignored due to known issues
-    // https://github.com/NVIDIA/spark-rapids/issues/2900
+  test("Cast from string to float using hand-picked values") {
+    testCastStringTo(DataTypes.FloatType, Seq(".", "e", "Infinity", "+Infinity", "-Infinity",
+      "+nAn", "-naN", "Nan"))
+  }
+
+  test("Cast from string to double using random inputs") {
     testCastStringTo(DataTypes.DoubleType, generateRandomStrings(Some(NUMERIC_CHARS)))
   }
 
@@ -139,7 +140,7 @@ class CastOpSuite extends GpuExpressionTestSuite {
       prefix: Option[String] = None): Seq[String] = {
     val randomValueCount = 8192
 
-    val random = new Random(0)
+    val random = new Random()
     val r = new EnhancedRandom(random,
       FuzzerOptions(validChars, maxStringLen))
 
@@ -179,8 +180,11 @@ class CastOpSuite extends GpuExpressionTestSuite {
       assert(cpuRow.getInt(INDEX_ID) === gpuRow.getInt(INDEX_ID))
       val cpuValue = cpuRow.get(INDEX_C1)
       val gpuValue = gpuRow.get(INDEX_C1)
-      if (!compare(cpuValue, gpuValue)) {
+      if (!compare(cpuValue, gpuValue, epsilon = 0.0001)) {
         val inputValue = cpuRow.getString(INDEX_C0)
+          .replace("\r", "\\r")
+          .replace("\t", "\\t")
+          .replace("\n", "\\n")
         fail(s"Mismatch casting string [$inputValue] " +
           s"to $toType. CPU: $cpuValue; GPU: $gpuValue")
       }
@@ -827,6 +831,26 @@ class CastOpSuite extends GpuExpressionTestSuite {
     }
   }
 
+  test("CAST string to float - sanitize step") {
+    val testPairs = Seq(
+      ("\tinf", "Inf"),
+      ("\t+InFinITy", "Inf"),
+      ("\tInFinITy", "Inf"),
+      ("\t-InFinITy", "-Inf"),
+      ("\t61f", "61"),
+      (".8E4f", ".8E4")
+    )
+    val inputs = testPairs.map(_._1)
+    val expected = testPairs.map(_._2)
+    withResource(ColumnVector.fromStrings(inputs: _*)) { v =>
+      withResource(ColumnVector.fromStrings(expected: _*)) { expected =>
+        withResource(GpuCast.sanitizeStringToFloat(v)) { actual =>
+          CudfTestHelper.assertColumnsAreEqual(expected, actual)
+        }
+      }
+    }
+  }
+
   test("CAST string to date - sanitize step") {
     val testPairs = Seq(
       ("2001-1", "2001-01"),