Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix regressions in cast from string to date and timestamp #3485

Merged
merged 7 commits into from
Sep 16, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
197 changes: 54 additions & 143 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCast.scala
Original file line number Diff line number Diff line change
Expand Up @@ -126,18 +126,17 @@ class CastExprMeta[INPUT <: CastBase](

object GpuCast extends Arm {

private val DATE_REGEX_YYYY_MM_DD = "\\A\\d{4}\\-\\d{2}\\-\\d{2}([ T](:?[\\r\\n]|.)*)?\\Z"

private val TIMESTAMP_REGEX_YYYY_MM = "\\A\\d{4}\\-\\d{2}[ ]?\\Z"
private val TIMESTAMP_REGEX_YYYY_MM_DD = "\\A\\d{4}\\-\\d{2}\\-\\d{2}[ ]?\\Z"
private val DATE_REGEX_YYYY_MM_DD = "\\A\\d{4}\\-\\d{1,2}\\-\\d{1,2}([ T](:?[\\r\\n]|.)*)?\\Z"
private val DATE_REGEX_YYYY_MM = "\\A\\d{4}\\-\\d{1,2}\\Z"
private val DATE_REGEX_YYYY = "\\A\\d{4}\\Z"

private val TIMESTAMP_REGEX_YYYY_MM_DD = "\\A\\d{4}\\-\\d{1,2}\\-\\d{1,2}[ ]?\\Z"
private val TIMESTAMP_REGEX_YYYY_MM = "\\A\\d{4}\\-\\d{1,2}[ ]?\\Z"
private val TIMESTAMP_REGEX_YYYY = "\\A\\d{4}[ ]?\\Z"
private val TIMESTAMP_REGEX_FULL =
"\\A\\d{4}\\-\\d{1,2}\\-\\d{1,2}[ T]?(\\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z)\\Z"
private val TIMESTAMP_REGEX_NO_DATE = "\\A[T]?(\\d{2}:\\d{2}:\\d{2}\\.\\d{6}Z)\\Z"

/**
* The length of a timestamp with 6 digits for microseconds followed by 'Z', such
* as "2020-01-01T12:34:56.123456Z".
*/
private val FULL_TIMESTAMP_LENGTH = 27

/**
* Regex to match timestamps with or without trailing zeros.
*/
Expand All @@ -150,22 +149,6 @@ object GpuCast extends Arm {

val INVALID_FLOAT_CAST_MSG: String = "At least one value is either null or is an invalid number"

/**
* Sanitization step for CAST string to date. Inserts a single zero before any
* single digit month or day.
*/
def sanitizeStringToDate(input: ColumnVector): ColumnVector = {
val rules = Seq(
// replace yyyy-m with yyyy-mm
RegexReplace(raw"(\A\d{4})-(\d{1})(-|\Z)", raw"\1-0\2\3"),
// replace yyyy-mm-d with yyyy-mm-dd
RegexReplace(raw"(\A\d{4}-\d{2})-(\d{1})([ T]|\Z)", raw"\1-0\2\3")
)
rules.foldLeft(input.incRefCount())((cv, rule) => withResource(cv) { _ =>
cv.stringReplaceWithBackrefs(rule.search, rule.replace)
})
}

def sanitizeStringToFloat(input: ColumnVector, ansiEnabled: Boolean): ColumnVector = {

// This regex gets applied after the transformation to normalize use of Inf and is
Expand Down Expand Up @@ -826,69 +809,42 @@ object GpuCast extends Arm {
}
}

/**
* Parse dates that match the provided length and format. This method does not
* close the `input` ColumnVector.
*
* @param input Input ColumnVector
* @param len The string length to match against
* @param cudfFormat The cuDF timestamp format to match against
* @return ColumnVector containing timestamps for input entries that match both
* the length and format, and null for other entries
*/
def convertFixedLenDateOrNull(
/** This method does not close the `input` ColumnVector. */
def convertDateOrNull(
input: ColumnVector,
len: Int,
regex: String,
cudfFormat: String): ColumnVector = {

withResource(isValidTimestamp(input, len, cudfFormat)) { isValidDate =>
withResource(input.asTimestampDays(cudfFormat)) { asDays =>
withResource(Scalar.fromNull(DType.TIMESTAMP_DAYS)) { nullScalar =>
isValidDate.ifElse(asDays, nullScalar)
}
val isValidDate = withResource(input.matchesRe(regex)) { isMatch =>
withResource(input.isTimestamp(cudfFormat)) { isTimestamp =>
isMatch.and(isTimestamp)
}
}
}

/** This method does not close the `input` ColumnVector. */
def convertVarLenDateOr(
input: ColumnVector,
regex: String,
cudfFormat: String,
orElse: ColumnVector): ColumnVector = {

withResource(orElse) { orElse =>
val isValidDate = withResource(input.matchesRe(regex)) { isMatch =>
withResource(input.isTimestamp(cudfFormat)) { isTimestamp =>
isMatch.and(isTimestamp)
}
}
withResource(isValidDate) { isValidDate =>
withResource(isValidDate) { _ =>
withResource(Scalar.fromNull(DType.TIMESTAMP_DAYS)) { orElse =>
withResource(input.asTimestampDays(cudfFormat)) { asDays =>
isValidDate.ifElse(asDays, orElse)
}
}
}
}

/**
* Parse dates that match the provided length and format. This method does not
* close the `input` ColumnVector.
*
* @param input Input ColumnVector
* @param len The string length to match against
* @param cudfFormat The cuDF timestamp format to match against
* @return ColumnVector containing timestamps for input entries that match both
* the length and format, and null for other entries
*/
def convertFixedLenDateOr(
/** This method does not close the `input` ColumnVector. */
def convertDateOr(
input: ColumnVector,
len: Int,
regex: String,
cudfFormat: String,
orElse: ColumnVector): ColumnVector = {

withResource(orElse) { orElse =>
withResource(isValidTimestamp(input, len, cudfFormat)) { isValidDate =>
val isValidDate = withResource(input.matchesRe(regex)) { isMatch =>
withResource(input.isTimestamp(cudfFormat)) { isTimestamp =>
isMatch.and(isTimestamp)
}
}

withResource(isValidDate) { _ =>
withResource(orElse) { _ =>
withResource(input.asTimestampDays(cudfFormat)) { asDays =>
isValidDate.ifElse(asDays, orElse)
}
Expand Down Expand Up @@ -926,21 +882,18 @@ object GpuCast extends Arm {
* `yyyy-[m]m-[d]d *`
* `yyyy-[m]m-[d]dT*`
*/
private def castStringToDate(input: ColumnVector): ColumnVector = {
private def castStringToDate(sanitizedInput: ColumnVector): ColumnVector = {

val specialDates = DateUtils.specialDatesDays

withResource(sanitizeStringToDate(input)) { sanitizedInput =>

// convert dates that are in valid formats yyyy, yyyy-mm, yyyy-mm-dd
val converted = convertVarLenDateOr(sanitizedInput, DATE_REGEX_YYYY_MM_DD, "%Y-%m-%d",
convertFixedLenDateOr(sanitizedInput, 7, "%Y-%m",
convertFixedLenDateOrNull(sanitizedInput, 4, "%Y")))
// convert dates that are in valid formats yyyy, yyyy-mm, yyyy-mm-dd
val converted = convertDateOr(sanitizedInput, DATE_REGEX_YYYY_MM_DD, "%Y-%m-%d",
convertDateOr(sanitizedInput, DATE_REGEX_YYYY_MM, "%Y-%m",
convertDateOrNull(sanitizedInput, DATE_REGEX_YYYY, "%Y")))

// handle special dates like "epoch", "now", etc.
specialDates.foldLeft(converted)((prev, specialDate) =>
specialDateOr(sanitizedInput, specialDate._1, specialDate._2, prev))
}
// handle special dates like "epoch", "now", etc.
specialDates.foldLeft(converted)((prev, specialDate) =>
specialDateOr(sanitizedInput, specialDate._1, specialDate._2, prev))
}

private def castStringToDateAnsi(input: ColumnVector, ansiMode: Boolean): ColumnVector = {
Expand Down Expand Up @@ -976,26 +929,28 @@ object GpuCast extends Arm {
}
}

/**
* Parse dates that match the the provided regex. This method does not close the `input`
* ColumnVector.
*/
private def convertFixedLenTimestampOrNull(
/** This method does not close the `input` ColumnVector. */
private def convertTimestampOrNull(
input: ColumnVector,
len: Int,
regex: String,
cudfFormat: String): ColumnVector = {

withResource(isValidTimestamp(input, len, cudfFormat)) { isTimestamp =>
withResource(Scalar.fromNull(DType.TIMESTAMP_MICROSECONDS)) { nullScalar =>
withResource(Scalar.fromNull(DType.TIMESTAMP_MICROSECONDS)) { orElse =>
val isValidTimestamp = withResource(input.matchesRe(regex)) { isMatch =>
withResource(input.isTimestamp(cudfFormat)) { isTimestamp =>
isMatch.and(isTimestamp)
}
}
withResource(isValidTimestamp) { isValidTimestamp =>
withResource(input.asTimestampMicroseconds(cudfFormat)) { asDays =>
isTimestamp.ifElse(asDays, nullScalar)
isValidTimestamp.ifElse(asDays, orElse)
}
}
}
}

/** This method does not close the `input` ColumnVector. */
private def convertVarLenTimestampOr(
private def convertTimestampOr(
input: ColumnVector,
regex: String,
cudfFormat: String,
Expand Down Expand Up @@ -1033,19 +988,14 @@ object GpuCast extends Arm {
}

val isValidTimestamp = withResource(isCudfMatch) { isCudfMatch =>
val isValidLength = withResource(Scalar.fromInt(FULL_TIMESTAMP_LENGTH)) { requiredLen =>
withResource(input.getCharLengths) { actualLen =>
requiredLen.equalTo(actualLen)
}
}
withResource(isValidLength) { isValidLength =>
isValidLength.and(isCudfMatch)
withResource(input.matchesRe(TIMESTAMP_REGEX_FULL)) { isRegexMatch =>
isCudfMatch.and(isRegexMatch)
}
}

// we only need to parse with one of the cuDF formats because the parsing code ignores
// the ' ' or 'T' between the date and time components
withResource(isValidTimestamp) { isValidTimestamp =>
withResource(isValidTimestamp) { _ =>
withResource(input.asTimestampMicroseconds(cudfFormat1)) { asDays =>
isValidTimestamp.ifElse(asDays, orElse)
}
Expand All @@ -1063,22 +1013,6 @@ object GpuCast extends Arm {

var sanitizedInput = input.incRefCount()

// replace partial months
sanitizedInput = withResource(sanitizedInput) { cv =>
cv.stringReplaceWithBackrefs("-([0-9])-", "-0\\1-")
}

// replace partial month or day at end of string
sanitizedInput = withResource(sanitizedInput) { cv =>
cv.stringReplaceWithBackrefs("-([0-9])[ ]?\\Z", "-0\\1")
}

// replace partial day in timestamp formats without dates
sanitizedInput = withResource(sanitizedInput) { cv =>
cv.stringReplaceWithBackrefs(
"-([0-9])([ T]\\d{1,2}:\\d{1,2}:\\d{1,2}\\.\\d{6}Z)\\Z", "-0\\1\\2")
}

// prepend today's date to timestamp formats without dates
sanitizedInput = withResource(sanitizedInput) { _ =>
sanitizedInput.stringReplaceWithBackrefs(TIMESTAMP_REGEX_NO_DATE, s"${todayStr}T\\1")
Expand All @@ -1088,9 +1022,9 @@ object GpuCast extends Arm {
// convert dates that are in valid timestamp formats
val converted =
convertFullTimestampOr(sanitizedInput,
convertVarLenTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM_DD, "%Y-%m-%d",
convertVarLenTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM, "%Y-%m",
convertFixedLenTimestampOrNull(sanitizedInput, 4, "%Y"))))
convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM_DD, "%Y-%m-%d",
convertTimestampOr(sanitizedInput, TIMESTAMP_REGEX_YYYY_MM, "%Y-%m",
convertTimestampOrNull(sanitizedInput, TIMESTAMP_REGEX_YYYY, "%Y"))))

// handle special dates like "epoch", "now", etc.
val finalResult = specialDates.foldLeft(converted)((prev, specialDate) =>
Expand All @@ -1107,29 +1041,6 @@ object GpuCast extends Arm {
}
}

/**
* Determine which timestamps are the specified length and also comply with the specified
* cuDF format string.
*
* @param input Input ColumnVector
* @param len The string length to match against
* @param cudfFormat The cuDF timestamp format to match against
* @return ColumnVector containing booleans representing which entries match both
* the length and format
*/
private def isValidTimestamp(input: ColumnVector, len: Int, cudfFormat: String) = {
val isCorrectLength = withResource(Scalar.fromInt(len)) { requiredLen =>
withResource(input.getCharLengths) { actualLen =>
requiredLen.equalTo(actualLen)
}
}
withResource(isCorrectLength) { isCorrectLength =>
withResource(input.isTimestamp(cudfFormat)) { isTimestamp =>
isCorrectLength.and(isTimestamp)
}
}
}

private def castMapToMap(
from: MapType,
to: MapType,
Expand Down
Loading