Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improve CAST string to float implementation to handle more edge cases #2933

Merged
merged 20 commits into from
Jul 20, 2021
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 81 additions & 45 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala
Original file line number Diff line number Diff line change
Expand Up @@ -175,6 +175,64 @@ object GpuCast extends Arm {
cv.stringReplaceWithBackrefs(rule.search, rule.replace)
})
}

def sanitizeStringToFloat(input: ColumnVector): ColumnVector = {

// This regex gets applied after the transformation to normalize use of Inf and is
// just strict enough to filter out known edge cases that would result in incorrect
// values. We further filter out invalid values using the cuDF isFloat method.
val VALID_FLOAT_REGEX =
"^" + // start of line
"[+\\-]?" + // optional + or - at start of string
"(" +
"(" +
"(" +
"([0-9]+)|" + // digits, OR
jlowe marked this conversation as resolved.
Show resolved Hide resolved
"([0-9]*\\.[0-9]+)|" + // decimal with optional leading and mandatory trailing, OR
"([0-9]+\\.[0-9]*)" + // decimal with mandatory leading and optional trailing
")" +
"([eE][+\\-]?[0-9]+)?" + // exponent
"[fFdD]?" + // floating-point designator
")" +
"|Inf" + // Infinity
"|[nN][aA][nN]" + // NaN
")" +
"$" // end of line

withResource(input.lstrip()) { stripped =>
withResource(GpuScalar.from(null, DataTypes.StringType)) { nullString =>
// filter out strings containing breaking whitespace
val withoutWhitespace = withResource(ColumnVector.fromStrings("\r", "\n")) {
verticalWhitespace =>
withResource(stripped.contains(verticalWhitespace)) {
_.ifElse(nullString, stripped)
}
}
// replace all possible versions of infinity with Inf
val inf = withResource(withoutWhitespace) { _ =>
withoutWhitespace.stringReplaceWithBackrefs(
"([iI][nN][fF])([iI][nN][iI][tT][yY])?", "Inf")
andygrove marked this conversation as resolved.
Show resolved Hide resolved
}
// replace +Inf with Inf because cuDF only supports "Inf" and "-Inf"
val infWithoutPlus = withResource(inf) { _ =>
withResource(GpuScalar.from("+Inf", DataTypes.StringType)) { search =>
withResource(GpuScalar.from("Inf", DataTypes.StringType)) { replace =>
inf.stringReplace(search, replace)
}
}
}
// filter out any strings that are not valid floating point numbers according
// to the regex pattern
val floatOrNull = withResource(infWithoutPlus.matchesRe(VALID_FLOAT_REGEX)) {
_.ifElse(infWithoutPlus, nullString)
}
// strip floating-point designator 'f' or 'd' but don't strip the 'f' from 'Inf'
withResource(floatOrNull) {
_.stringReplaceWithBackrefs("(Inf|-Inf|[nN][aA][nN]|[^InNaAfFdD]*)([fFdD]?)", "\\1")
jlowe marked this conversation as resolved.
Show resolved Hide resolved
}
}
}
}
}

/**
Expand Down Expand Up @@ -737,59 +795,37 @@ case class GpuCast(
ansiEnabled: Boolean,
dType: DType): ColumnVector = {

// TODO: since cudf doesn't support case-insensitive regex, we have to generate all
// possible strings. But these should cover most of the cases
val POS_INF_REGEX = "^[+]?(?:infinity|inf|Infinity|Inf|INF|INFINITY)$"
val NEG_INF_REGEX = "^[\\-](?:infinity|inf|Infinity|Inf|INF|INFINITY)$"
val NAN_REGEX = "^(?:nan|NaN|NAN)$"


// 1. convert the different infinities to "Inf"/"-Inf" which is the only variation cudf
// understands
// 2. identify the nans
// 3. identify the floats. "nan", "null" and letters are not considered floats
// 4. if ansi is enabled we want to throw and exception if the string is neither float nor nan
// 5. convert everything thats not floats to null
// 6. set the indices where we originally had nans to Float.NaN
//
// NOTE Limitation: "1.7976931348623159E308" and "-1.7976931348623159E308" are not considered
// Inf even though spark does

if (ansiEnabled && input.hasNulls()) {
throw new NumberFormatException(GpuCast.INVALID_FLOAT_CAST_MSG)
}
// First replace different spellings/cases of infinity with Inf and -Infinity with -Inf
val posInfReplaced = withResource(input.matchesRe(POS_INF_REGEX)) { containsInf =>
withResource(Scalar.fromString("Inf")) { inf =>
containsInf.ifElse(inf, input)
}
}
val withPosNegInfinityReplaced = withResource(posInfReplaced) { withPositiveInfinityReplaced =>
withResource(withPositiveInfinityReplaced.matchesRe(NEG_INF_REGEX)) { containsNegInf =>
withResource(Scalar.fromString("-Inf")) { negInf =>
containsNegInf.ifElse(negInf, withPositiveInfinityReplaced)
}
// 1. convert the different infinities to "Inf"/"-Inf" which is the only variation cudf
// understands
// 2. identify the nans
// 3. identify the floats. "nan", "null" and letters are not considered floats
// 4. if ansi is enabled we want to throw and exception if the string is neither float nor nan
// 5. convert everything thats not floats to null
// 6. set the indices where we originally had nans to Float.NaN
//
// NOTE Limitation: "1.7976931348623159E308" and "-1.7976931348623159E308" are not considered
// Inf even though spark does
andygrove marked this conversation as resolved.
Show resolved Hide resolved

val NAN_REGEX = "^[nN][aA][nN]$"

withResource(GpuCast.sanitizeStringToFloat(input)) { sanitized =>
if (ansiEnabled && sanitized.hasNulls) {
throw new NumberFormatException(GpuCast.INVALID_FLOAT_CAST_MSG)
}
}
withResource(withPosNegInfinityReplaced) { withPosNegInfinityReplaced =>
//Now identify the different variations of nans
withResource(withPosNegInfinityReplaced.matchesRe(NAN_REGEX)) { isNan =>
withResource(sanitized.matchesRe(NAN_REGEX)) { isNan =>
// now check if the values are floats
withResource(withPosNegInfinityReplaced.isFloat()) { isFloat =>
withResource(sanitized.isFloat) { isFloat =>
if (ansiEnabled) {
withResource(isNan.not()) { notNan =>
withResource(isFloat.not()) { notFloat =>
withResource(notFloat.and(notNan)) { notFloatAndNotNan =>
withResource(notFloatAndNotNan.any()) { notNanAndNotFloat =>
if (notNanAndNotFloat.getBoolean()) {
throw new NumberFormatException(GpuCast.INVALID_FLOAT_CAST_MSG)
}
}
withResource(isNan.or(isFloat)) { nanOrFloat =>
withResource(nanOrFloat.all()) { allNanOrFloat =>
if (!allNanOrFloat.getBoolean) {
throw new NumberFormatException(GpuCast.INVALID_FLOAT_CAST_MSG)
}
}
}
}
withResource(withPosNegInfinityReplaced.castTo(dType)) { casted =>
withResource(sanitized.castTo(dType)) { casted =>
withResource(Scalar.fromNull(dType)) { nulls =>
withResource(isFloat.ifElse(casted, nulls)) { floatsOnly =>
withResource(FloatUtils.getNanScalar(dType)) { nan =>
Expand Down
42 changes: 33 additions & 9 deletions tests/src/test/scala/com/nvidia/spark/rapids/CastOpSuite.scala
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ class CastOpSuite extends GpuExpressionTestSuite {
}

private val BOOL_CHARS = " \t\r\nFALSEfalseTRUEtrue01yesYESnoNO"
private val NUMERIC_CHARS = "inf \t\r\n0123456789.+-eE"
private val NUMERIC_CHARS = "infinityINFINITY \t\r\n0123456789.+-eEfFdD"
private val DATE_CHARS = " \t\r\n0123456789:-/TZ"

test("Cast from string to boolean using random inputs") {
Expand Down Expand Up @@ -98,15 +98,16 @@ class CastOpSuite extends GpuExpressionTestSuite {
testCastStringTo(DataTypes.LongType, generateRandomStrings(Some(NUMERIC_CHARS)))
}

ignore("Cast from string to float using random inputs") {
// Test ignored due to known issues
// https://github.com/NVIDIA/spark-rapids/issues/2900
test("Cast from string to float using random inputs") {
testCastStringTo(DataTypes.FloatType, generateRandomStrings(Some(NUMERIC_CHARS)))
}

ignore("Cast from string to double using random inputs") {
// Test ignored due to known issues
// https://github.com/NVIDIA/spark-rapids/issues/2900
test("Cast from string to float using hand-picked values") {
testCastStringTo(DataTypes.FloatType, Seq(".", "e", "Infinity", "+Infinity", "-Infinity",
"+nAn", "-naN", "Nan"))
}

test("Cast from string to double using random inputs") {
testCastStringTo(DataTypes.DoubleType, generateRandomStrings(Some(NUMERIC_CHARS)))
}

Expand Down Expand Up @@ -139,7 +140,7 @@ class CastOpSuite extends GpuExpressionTestSuite {
prefix: Option[String] = None): Seq[String] = {
val randomValueCount = 8192

val random = new Random(0)
val random = new Random()
val r = new EnhancedRandom(random,
FuzzerOptions(validChars, maxStringLen))

Expand Down Expand Up @@ -179,8 +180,11 @@ class CastOpSuite extends GpuExpressionTestSuite {
assert(cpuRow.getInt(INDEX_ID) === gpuRow.getInt(INDEX_ID))
val cpuValue = cpuRow.get(INDEX_C1)
val gpuValue = gpuRow.get(INDEX_C1)
if (!compare(cpuValue, gpuValue)) {
if (!compare(cpuValue, gpuValue, epsilon = 0.0001)) {
val inputValue = cpuRow.getString(INDEX_C0)
.replace("\r", "\\r")
.replace("\t", "\\t")
.replace("\n", "\\n")
fail(s"Mismatch casting string [$inputValue] " +
s"to $toType. CPU: $cpuValue; GPU: $gpuValue")
}
Expand Down Expand Up @@ -827,6 +831,26 @@ class CastOpSuite extends GpuExpressionTestSuite {
}
}

test("CAST string to float - sanitize step") {
val testPairs = Seq(
("\tinf", "Inf"),
("\t+InFinITy", "Inf"),
("\tInFinITy", "Inf"),
("\t-InFinITy", "-Inf"),
("\t61f", "61"),
(".8E4f", ".8E4")
)
val inputs = testPairs.map(_._1)
val expected = testPairs.map(_._2)
withResource(ColumnVector.fromStrings(inputs: _*)) { v =>
withResource(ColumnVector.fromStrings(expected: _*)) { expected =>
withResource(GpuCast.sanitizeStringToFloat(v)) { actual =>
CudfTestHelper.assertColumnsAreEqual(expected, actual)
}
}
}
}

test("CAST string to date - sanitize step") {
val testPairs = Seq(
("2001-1", "2001-01"),
Expand Down