Update tests that originally restricted the Spark timestamp range (#10085)

Signed-off-by: Navin Kumar <navink@nvidia.com>
NVnavkumar authored Dec 26, 2023
1 parent e09c144 commit c38ef2d
Showing 7 changed files with 18 additions and 15 deletions.
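For context: Spark's supported timestamp range is 0001-01-01 through 9999-12-31, but several tests below had previously been floored at 0001-02-01 to dodge a pySpark conversion error (issue #9747, referenced in the removed FIXME comments). A minimal sketch of what the widened floor buys back (illustrative only, not part of the diff):

from datetime import datetime, timezone

old_start = datetime(1, 2, 1, tzinfo=timezone.utc)  # previous test floor, 0001-02-01
new_start = datetime(1, 1, 1, tzinfo=timezone.utc)  # Spark's minimum timestamp, 0001-01-01
print((old_start - new_start).days)  # 31 -- the extra days now covered at the low end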
8 changes: 4 additions & 4 deletions integration_tests/src/main/python/cast_test.py
@@ -65,7 +65,7 @@ def test_cast_string_date_valid_format():
     # In Spark 3.2.0+ the valid format changed, and we cannot support all of the formats.
     # This provides values that are valid in all of those formats.
     assert_gpu_and_cpu_are_equal_collect(
-        lambda spark : unary_op_df(spark, StringGen(date_start_1_2_1)).select(f.col('a').cast(DateType())),
+        lambda spark : unary_op_df(spark, StringGen(date_start_1_1_1)).select(f.col('a').cast(DateType())),
         conf = {'spark.rapids.sql.hasExtendedYearValues': 'false'})

 invalid_values_string_to_date = ['200', ' 1970A', '1970 A', '1970T', # not conform to "yyyy" after trim
@@ -146,9 +146,9 @@ def test_cast_string_date_non_ansi():
         lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())),
         conf={'spark.rapids.sql.hasExtendedYearValues': 'false'})

-@pytest.mark.parametrize('data_gen', [StringGen(date_start_1_2_1),
-                                      StringGen(date_start_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'),
-                                      StringGen(date_start_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?')
+@pytest.mark.parametrize('data_gen', [StringGen(date_start_1_1_1),
+                                      StringGen(date_start_1_1_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'),
+                                      StringGen(date_start_1_1_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?')
                                       ],
                                      ids=idfn)
 @tz_sensitive_test
6 changes: 6 additions & 0 deletions integration_tests/src/main/python/data_gen.py
@@ -1219,3 +1219,9 @@ def get_25_partitions_df(spark):

 # regexp to generate year from 0002, format is yyyy
 yyyy_start_0002 = '([0-9]{3}[2-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])'
+
+# regexp to generate year from 0001, format is yyyy
+yyyy_start_0001 = '([0-9]{3}[1-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])'
+
+# regexp to generate date from 0001-01-01, format is yyyy-MM-dd
+date_start_1_1_1 = yyyy_start_0001 + '-[0-9]{1,2}-[0-9]{1,2}'
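As a quick sanity check of the new year pattern, a sketch using plain re (not part of the diff; the sample values are illustrative):

import re

yyyy_start_0001 = '([0-9]{3}[1-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])'

for year in ['0000', '0001', '0002', '0010', '1970', '9999']:
    print(year, bool(re.fullmatch(yyyy_start_0001, year)))
# Only '0000' is rejected: the first alternative requires a final digit of 1-9,
# and the second requires a non-zero digit somewhere in the first three positions.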
7 changes: 2 additions & 5 deletions integration_tests/src/main/python/date_time_test.py
@@ -579,8 +579,7 @@ def test_unsupported_fallback_to_date():
 # (-62135510400, 253402214400) is the range of seconds that can be represented by timestamp_seconds
 # considering the influence of time zone.
 ts_float_gen = SetValuesGen(FloatType(), [0.0, -0.0, 1.0, -1.0, 1.234567, -1.234567, 16777215.0, float('inf'), float('-inf'), float('nan')])
-# FIXME: min_val is changed to -62135410400 bypassing "ValueError: year 0 is out of range" from pySpark. It can be fixed after https://github.com/NVIDIA/spark-rapids/issues/9747
-seconds_gens = [LongGen(min_val=-62135410400, max_val=253402214400), IntegerGen(), ShortGen(), ByteGen(),
+seconds_gens = [LongGen(min_val=-62135510400, max_val=253402214400), IntegerGen(), ShortGen(), ByteGen(),
                 DoubleGen(min_exp=0, max_exp=32), ts_float_gen, DecimalGen(16, 6), DecimalGen(13, 3), DecimalGen(10, 0), DecimalGen(7, -3), DecimalGen(6, 6)]
 @pytest.mark.parametrize('data_gen', seconds_gens, ids=idfn)
 @allow_non_gpu(*non_utc_allow)
@@ -615,7 +614,6 @@ def test_timestamp_seconds_decimal_overflow(data_gen):
     conf={},
     error_message='Overflow')

-# FIXME: min_val is changed to -62135410400000 bypassing "ValueError: year 0 is out of range" from pySpark. It can be fixed after https://github.com/NVIDIA/spark-rapids/issues/9747
 millis_gens = [LongGen(min_val=-62135410400000, max_val=253402214400000), IntegerGen(), ShortGen(), ByteGen()]
 @pytest.mark.parametrize('data_gen', millis_gens, ids=idfn)
 @allow_non_gpu(*non_utc_allow)
@@ -630,8 +628,7 @@ def test_timestamp_millis_long_overflow():
     conf={},
     error_message='long overflow')

-# FIXME: min_val is changed to -62135410400 bypassing "ValueError: year 0 is out of range" from pySpark. It can be fixed after https://github.com/NVIDIA/spark-rapids/issues/9747
-micros_gens = [LongGen(min_val=-62135410400000000, max_val=253402214400000000), IntegerGen(), ShortGen(), ByteGen()]
+micros_gens = [LongGen(min_val=-62135510400000000, max_val=253402214400000000), IntegerGen(), ShortGen(), ByteGen()]
 @pytest.mark.parametrize('data_gen', micros_gens, ids=idfn)
 @allow_non_gpu(*non_utc_allow)
 def test_timestamp_micros(data_gen):
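The bounds above line up with plain datetime arithmetic. A minimal sanity-check sketch (not part of the diff; the one-day-headroom reading of the time-zone comment is an inference, not stated in the source):

from datetime import datetime, timezone

EPOCH = datetime(1970, 1, 1, tzinfo=timezone.utc)

def epoch_seconds(dt):
    # Whole seconds between dt and the Unix epoch.
    return int((dt - EPOCH).total_seconds())

print(epoch_seconds(datetime(1, 1, 1, tzinfo=timezone.utc)))      # -62135596800
# The new floor sits exactly one day later, presumably so a value shifted into
# any time zone still renders inside year 1 rather than the invalid year 0:
print(epoch_seconds(datetime(1, 1, 2, tzinfo=timezone.utc)))      # -62135510400
print(epoch_seconds(datetime(9999, 12, 31, tzinfo=timezone.utc))) # 253402214400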
2 changes: 1 addition & 1 deletion integration_tests/src/main/python/delta_lake_write_test.py
@@ -424,7 +424,7 @@ def setup_tables(spark):
@pytest.mark.parametrize("ts_write", ["INT96", "TIMESTAMP_MICROS", "TIMESTAMP_MILLIS"], ids=idfn)
@pytest.mark.skipif(is_before_spark_320(), reason="Delta Lake writes are not supported before Spark 3.2.x")
def test_delta_write_legacy_timestamp(spark_tmp_path, ts_write):
gen = TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc),
gen = TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc),
end=datetime(2000, 1, 1, tzinfo=timezone.utc)).with_special_case(
datetime(1000, 1, 1, tzinfo=timezone.utc), weight=10.0)
data_path = spark_tmp_path + "/DELTA_DATA"
2 changes: 1 addition & 1 deletion integration_tests/src/main/python/fastparquet_compatibility_test.py
@@ -209,7 +209,7 @@ def test_reading_file_written_by_spark_cpu(data_gen, spark_tmp_path):
reason="fastparquet interprets timestamps in UTC timezone, regardless "
"of timezone settings")), # Vanilla case.
pytest.param(TimestampGen(nullable=False,
start=datetime(1, 2, 1, tzinfo=timezone.utc),
start=datetime(1, 1, 1, tzinfo=timezone.utc),
end=pandas_min_datetime),
marks=pytest.mark.xfail(reason="fastparquet reads timestamps preceding 1900 incorrectly.")),
], ids=idfn)
2 changes: 1 addition & 1 deletion integration_tests/src/main/python/json_test.py
@@ -637,7 +637,7 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format
# "yyyy-MM"
"\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?[1-8]{1}[0-9]{3}-[0-3]{1,2}[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"",
# "yyyy"
"\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?" + yyyy_start_0002 + "[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"",
"\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?" + yyyy_start_0001 + "[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"",
# "dd/MM/yyyy"
"\"[0-9]{2}/[0-9]{2}/[1-8]{1}[0-9]{3}\"",
# special constant values
6 changes: 3 additions & 3 deletions integration_tests/src/main/python/parquet_write_test.py
@@ -72,7 +72,7 @@

 parquet_datetime_gen_simple = [DateGen(start=date(1, 1, 1), end=date(2000, 1, 1))
                                .with_special_case(date(1000, 1, 1), weight=10.0),
-                               TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc),
+                               TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc),
                                             end=datetime(2000, 1, 1, tzinfo=timezone.utc))
                                .with_special_case(datetime(1000, 1, 1, tzinfo=timezone.utc), weight=10.0)]
 parquet_datetime_in_struct_gen = [
@@ -289,8 +289,8 @@ def writeParquetUpgradeCatchException(spark, df, data_path, spark_tmp_table_fact

 @pytest.mark.parametrize('ts_write_data_gen',
                          [('INT96', TimestampGen()),
-                          ('TIMESTAMP_MICROS', TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc))),
-                          ('TIMESTAMP_MILLIS', TimestampGen(start=datetime(1, 2, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc)))])
+                          ('TIMESTAMP_MICROS', TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc))),
+                          ('TIMESTAMP_MILLIS', TimestampGen(start=datetime(1, 1, 1, tzinfo=timezone.utc), end=datetime(1899, 12, 31, tzinfo=timezone.utc)))])
 @pytest.mark.parametrize('rebase', ["CORRECTED","EXCEPTION"])
 @allow_non_gpu(*non_utc_allow)
 def test_ts_write_fails_datetime_exception(spark_tmp_path, ts_write_data_gen, spark_tmp_table_factory, rebase):
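For readers unfamiliar with the rebase behavior this test exercises: under Spark's EXCEPTION rebase mode, writing timestamps from before the 1582 Gregorian cutover to Parquet fails with a SparkUpgradeException. A rough standalone sketch, not the test's actual harness (the conf key spark.sql.parquet.datetimeRebaseModeInWrite is Spark's; the session setup, sample value, and output path are illustrative):

from datetime import datetime
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local[1]').getOrCreate()
spark.conf.set('spark.sql.parquet.datetimeRebaseModeInWrite', 'EXCEPTION')

# A pre-1582 timestamp, ambiguous between the Julian and proleptic Gregorian calendars.
df = spark.createDataFrame([(datetime(1400, 1, 1),)], 'ts timestamp')
try:
    df.write.mode('overwrite').parquet('/tmp/rebase_demo')
except Exception as err:  # surfaces as a SparkUpgradeException raised by the write job
    print(type(err).__name__)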
