Skip to content

Commit

Permalink
[SPARK-35780][SQL] Support DATE/TIMESTAMP literals across the full range
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
DATE/TIMESTAMP literals support years 0000 to 9999. However, internally we support a range that is much larger.
We can add or subtract large intervals from a date/timestamp and the system will happily process and display large negative and positive dates.

Since we obviously cannot put this genie back into the bottle the only thing we can do is allow matching DATE/TIMESTAMP literals.

### Why are the changes needed?
make spark more usable and bug fix

### Does this PR introduce _any_ user-facing change?
Yes, after this PR, below SQL will have different results
```sql
select cast('-10000-1-2' as date) as date_col
-- before PR: NULL
-- after PR: -10000-1-2
```

```sql
select cast('2021-4294967297-11' as date) as date_col
-- before PR: 2021-01-11
-- after PR: NULL
```

### How was this patch tested?
newly added test cases

Closes #32959 from linhongliu-db/SPARK-35780-full-range-datetime.

Lead-authored-by: Linhong Liu <linhong.liu@databricks.com>
Co-authored-by: Linhong Liu <67896261+linhongliu-db@users.noreply.github.com>
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
(cherry picked from commit b866457)
Signed-off-by: Wenchen Fan <wenchen@databricks.com>
  • Loading branch information
2 people authored and cloud-fan committed Jul 14, 2021
1 parent dcee7a6 commit c9813f7
Show file tree
Hide file tree
Showing 14 changed files with 548 additions and 116 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -224,12 +224,12 @@ object DateTimeUtils {
* value. The return type is [[Option]] in order to distinguish between 0L and null. The following
* formats are allowed:
*
* `yyyy`
* `yyyy-[m]m`
* `yyyy-[m]m-[d]d`
* `yyyy-[m]m-[d]d `
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `[+-]yyyy*`
* `[+-]yyyy*-[m]m`
* `[+-]yyyy*-[m]m-[d]d`
* `[+-]yyyy*-[m]m-[d]d `
* `[+-]yyyy*-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `[+-]yyyy*-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
*
Expand All @@ -249,17 +249,30 @@ object DateTimeUtils {
* the input string can't be parsed as timestamp, the result timestamp segments are empty.
*/
def parseTimestampString(s: UTF8String): (Array[Int], Option[ZoneId], Boolean) = {
if (s == null) {
def isValidDigits(segment: Int, digits: Int): Boolean = {
// A Long is able to represent a timestamp within [+-]200 thousand years
val maxDigitsYear = 6
// For the nanosecond part, more than 6 digits is allowed, but will be truncated.
segment == 6 || (segment == 0 && digits >= 4 && digits <= maxDigitsYear) ||
(segment != 0 && segment != 6 && digits <= 2)
}
if (s == null || s.trimAll().numBytes() == 0) {
return (Array.empty, None, false)
}
var tz: Option[String] = None
val segments: Array[Int] = Array[Int](1, 1, 1, 0, 0, 0, 0, 0, 0)
var i = 0
var currentSegmentValue = 0
var currentSegmentDigits = 0
val bytes = s.trimAll().getBytes
var j = 0
var digitsMilli = 0
var justTime = false
var yearSign: Option[Int] = None
if (bytes(j) == '-' || bytes(j) == '+') {
yearSign = if (bytes(j) == '-') Some(-1) else Some(1)
j += 1
}
while (j < bytes.length) {
val b = bytes(j)
val parsedValue = b - '0'.toByte
Expand All @@ -269,50 +282,74 @@ object DateTimeUtils {
i += 3
} else if (i < 2) {
if (b == '-') {
if (i == 0 && j != 4) {
// year should have exact four digits
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
} else if (i == 0 && b == ':') {
} else if (i == 0 && b == ':' && yearSign.isEmpty) {
justTime = true
if (!isValidDigits(3, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(3) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i = 4
} else {
return (Array.empty, None, false)
}
} else if (i == 2) {
if (b == ' ' || b == 'T') {
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
} else {
return (Array.empty, None, false)
}
} else if (i == 3 || i == 4) {
if (b == ':') {
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
} else {
return (Array.empty, None, false)
}
} else if (i == 5 || i == 6) {
if (b == '-' || b == '+') {
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
tz = Some(new String(bytes, j, 1))
} else if (b == '.' && i == 5) {
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
} else {
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
tz = Some(new String(bytes, j, bytes.length - j))
j = bytes.length - 1
Expand All @@ -322,8 +359,12 @@ object DateTimeUtils {
}
} else {
if (i < segments.length && (b == ':' || b == ' ')) {
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
} else {
return (Array.empty, None, false)
Expand All @@ -333,61 +374,40 @@ object DateTimeUtils {
if (i == 6) {
digitsMilli += 1
}
currentSegmentValue = currentSegmentValue * 10 + parsedValue
// We will truncate the nanosecond part if there are more than 6 digits, which results
// in loss of precision
if (i != 6 || currentSegmentDigits < 6) {
currentSegmentValue = currentSegmentValue * 10 + parsedValue
}
currentSegmentDigits += 1
}
j += 1
}

segments(i) = currentSegmentValue
if (!justTime && i == 0 && j != 4) {
// year should have exact four digits
if (!isValidDigits(i, currentSegmentDigits)) {
return (Array.empty, None, false)
}
segments(i) = currentSegmentValue

while (digitsMilli < 6) {
segments(6) *= 10
digitsMilli += 1
}

// We are truncating the nanosecond part, which results in loss of precision
while (digitsMilli > 6) {
segments(6) /= 10
digitsMilli -= 1
}
// This step also validates time zone part
val zoneId = tz.map {
case "+" => ZoneOffset.ofHoursMinutes(segments(7), segments(8))
case "-" => ZoneOffset.ofHoursMinutes(-segments(7), -segments(8))
case zoneName: String => getZoneId(zoneName.trim)
}
segments(0) *= yearSign.getOrElse(1)
(segments, zoneId, justTime)
}

/**
* Trims and parses a given UTF8 timestamp string to the corresponding a corresponding [[Long]]
* value. The return type is [[Option]] in order to distinguish between 0L and null. The following
* formats are allowed:
*
* `yyyy`
* `yyyy-[m]m`
* `yyyy-[m]m-[d]d`
* `yyyy-[m]m-[d]d `
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `T[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
*
* where `zone_id` should have one of the forms:
* - Z - Zulu time zone UTC+0
* - +|-[h]h:[m]m
* - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
* - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
* and a suffix in the formats:
* - +|-h[h]
* - +|-hh[:]mm
* - +|-hh:mm:ss
* - +|-hhmmss
* - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
* value. The return type is [[Option]] in order to distinguish between 0L and null. Please
* refer to `parseTimestampString` for the allowed formats
*/
def stringToTimestamp(s: UTF8String, timeZoneId: ZoneId): Option[Long] = {
try {
Expand Down Expand Up @@ -422,30 +442,8 @@ object DateTimeUtils {
* Trims and parses a given UTF8 string to a corresponding [[Long]] value which representing the
* number of microseconds since the epoch. The result is independent of time zones,
* which means that zone ID in the input string will be ignored.
* The return type is [[Option]] in order to distinguish between 0L and null. The following
* formats are allowed:
*
* `yyyy`
* `yyyy-[m]m`
* `yyyy-[m]m-[d]d`
* `yyyy-[m]m-[d]d `
* `yyyy-[m]m-[d]d [h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
* `yyyy-[m]m-[d]dT[h]h:[m]m:[s]s.[ms][ms][ms][us][us][us][zone_id]`
*
* where `zone_id` should have one of the forms:
* - Z - Zulu time zone UTC+0
* - +|-[h]h:[m]m
* - A short id, see https://docs.oracle.com/javase/8/docs/api/java/time/ZoneId.html#SHORT_IDS
* - An id with one of the prefixes UTC+, UTC-, GMT+, GMT-, UT+ or UT-,
* and a suffix in the formats:
* - +|-h[h]
* - +|-hh[:]mm
* - +|-hh:mm:ss
* - +|-hhmmss
* - Region-based zone IDs in the form `area/city`, such as `Europe/Paris`
*
* Note: The input string has to contains year/month/day fields, otherwise Spark can't determine
* the value of timestamp without time zone.
* The return type is [[Option]] in order to distinguish between 0L and null. Please
* refer to `parseTimestampString` for the allowed formats.
*/
def stringToTimestampWithoutTimeZone(s: UTF8String): Option[Long] = {
try {
Expand Down Expand Up @@ -518,44 +516,55 @@ object DateTimeUtils {
* The return type is [[Option]] in order to distinguish between 0 and null. The following
* formats are allowed:
*
* `yyyy`
* `yyyy-[m]m`
* `yyyy-[m]m-[d]d`
* `yyyy-[m]m-[d]d `
* `yyyy-[m]m-[d]d *`
* `yyyy-[m]m-[d]dT*`
* `[+-]yyyy*`
* `[+-]yyyy*-[m]m`
* `[+-]yyyy*-[m]m-[d]d`
* `[+-]yyyy*-[m]m-[d]d `
* `[+-]yyyy*-[m]m-[d]d *`
* `[+-]yyyy*-[m]m-[d]dT*`
*/
def stringToDate(s: UTF8String): Option[Int] = {
if (s == null) {
def isValidDigits(segment: Int, digits: Int): Boolean = {
// An integer is able to represent a date within [+-]5 million years.
var maxDigitsYear = 7
(segment == 0 && digits >= 4 && digits <= maxDigitsYear) || (segment != 0 && digits <= 2)
}
if (s == null || s.trimAll().numBytes() == 0) {
return None
}
val segments: Array[Int] = Array[Int](1, 1, 1)
var sign = 1
var i = 0
var currentSegmentValue = 0
var currentSegmentDigits = 0
val bytes = s.trimAll().getBytes
var j = 0
if (bytes(j) == '-' || bytes(j) == '+') {
sign = if (bytes(j) == '-') -1 else 1
j += 1
}
while (j < bytes.length && (i < 3 && !(bytes(j) == ' ' || bytes(j) == 'T'))) {
val b = bytes(j)
if (i < 2 && b == '-') {
if (i == 0 && j != 4) {
// year should have exact four digits
if (!isValidDigits(i, currentSegmentDigits)) {
return None
}
segments(i) = currentSegmentValue
currentSegmentValue = 0
currentSegmentDigits = 0
i += 1
} else {
val parsedValue = b - '0'.toByte
if (parsedValue < 0 || parsedValue > 9) {
return None
} else {
currentSegmentValue = currentSegmentValue * 10 + parsedValue
currentSegmentDigits += 1
}
}
j += 1
}
if (i == 0 && j != 4) {
// year should have exact four digits
if (!isValidDigits(i, currentSegmentDigits)) {
return None
}
if (i < 2 && j < bytes.length) {
Expand All @@ -564,7 +573,7 @@ object DateTimeUtils {
}
segments(i) = currentSegmentValue
try {
val localDate = LocalDate.of(segments(0), segments(1), segments(2))
val localDate = LocalDate.of(sign * segments(0), segments(1), segments(2))
Some(localDateToDays(localDate))
} catch {
case NonFatal(_) => None
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -392,8 +392,6 @@ abstract class AnsiCastSuiteBase extends CastSuiteBase {
s"Cannot cast $str to DateType.")
}

checkCastWithParseError("12345")
checkCastWithParseError("12345-12-18")
checkCastWithParseError("2015-13-18")
checkCastWithParseError("2015-03-128")
checkCastWithParseError("2015/03/18")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@ abstract class CastSuiteBase extends SparkFunSuite with ExpressionEvalHelper {

test("cast string to date") {
var c = Calendar.getInstance()
c.set(12345, 0, 1, 0, 0, 0)
c.set(Calendar.MILLISECOND, 0)
checkEvaluation(Cast(Literal("12345"), DateType), new Date(c.getTimeInMillis))
c.set(12345, 11, 18, 0, 0, 0)
c.set(Calendar.MILLISECOND, 0)
checkEvaluation(Cast(Literal("12345-12-18"), DateType), new Date(c.getTimeInMillis))
c.set(2015, 0, 1, 0, 0, 0)
c.set(Calendar.MILLISECOND, 0)
checkEvaluation(Cast(Literal("2015"), DateType), new Date(c.getTimeInMillis))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -194,9 +194,11 @@ class HashExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {
// before epoch
checkHiveHashForDateType("1800-01-01", -62091)

// negative year
checkHiveHashForDateType("-1212-01-01", -1162202)

// Invalid input: bad date string. Hive returns 0 for such cases
intercept[NoSuchElementException](checkHiveHashForDateType("0-0-0", 0))
intercept[NoSuchElementException](checkHiveHashForDateType("-1212-01-01", 0))
intercept[NoSuchElementException](checkHiveHashForDateType("2016-99-99", 0))

// Invalid input: Empty string. Hive returns 0 for this case
Expand Down
Loading

0 comments on commit c9813f7

Please sign in to comment.