Skip to content

Commit

Permalink
Fix year 0 is out of range in test_from_json_struct_timestamp (#9972)
Browse files Browse the repository at this point in the history
* Fix year 0 is out of range for test_from_json_struct_timestamp

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* clean up

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* address comments

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

* fix nit and typo

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>

---------

Signed-off-by: Haoyang Li <haoyangl@nvidia.com>
  • Loading branch information
thirtiseven authored Dec 7, 2023
1 parent 2805b95 commit 6c5fbac
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 9 deletions.
10 changes: 4 additions & 6 deletions integration_tests/src/main/python/cast_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,13 +61,11 @@ def test_cast_nested(data_gen, to_type):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, data_gen).select(f.col('a').cast(to_type)))

date_after_1_2_1 = '(0{0,3}1-(0?[2-9]|[1-3][0-9]))|(([0-9]{0,3}[2-9]|[1-9][0-9]{0,2}[01])-[0-3]?[0-9])-[0-5]?[0-9]'

def test_cast_string_date_valid_format():
# In Spark 3.2.0+ the valid formats changed, and we cannot support all of the formats.
# This provides values that are valid in all of those formats.
assert_gpu_and_cpu_are_equal_collect(
lambda spark : unary_op_df(spark, StringGen(date_after_1_2_1)).select(f.col('a').cast(DateType())),
lambda spark : unary_op_df(spark, StringGen(date_start_1_2_1)).select(f.col('a').cast(DateType())),
conf = {'spark.rapids.sql.hasExtendedYearValues': 'false'})

invalid_values_string_to_date = ['200', ' 1970A', '1970 A', '1970T', # not conform to "yyyy" after trim
Expand Down Expand Up @@ -148,9 +146,9 @@ def test_cast_string_date_non_ansi():
lambda spark: spark.createDataFrame(data_rows, "a string").select(f.col('a').cast(DateType())),
conf={'spark.rapids.sql.hasExtendedYearValues': 'false'})

@pytest.mark.parametrize('data_gen', [StringGen(date_after_1_2_1),
StringGen(date_after_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'),
StringGen(date_after_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?')
@pytest.mark.parametrize('data_gen', [StringGen(date_start_1_2_1),
StringGen(date_start_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]'),
StringGen(date_start_1_2_1 + '[ |T][0-3][0-9]:[0-6][0-9]:[0-6][0-9]\.[0-9]{0,6}Z?')
],
ids=idfn)
@allow_non_gpu(*non_utc_allow)
Expand Down
10 changes: 9 additions & 1 deletion integration_tests/src/main/python/data_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1203,4 +1203,12 @@ def get_25_partitions_df(spark):
# This will be deprecated and replaced by a case-specific non-GPU allow list
non_utc_allow = ['ProjectExec', 'FilterExec', 'FileSourceScanExec', 'BatchScanExec', 'CollectLimitExec',
'DeserializeToObjectExec', 'DataWritingCommandExec', 'WriteFilesExec', 'ShuffleExchangeExec',
'ExecutedCommandExec'] if is_not_utc() else []
'ExecutedCommandExec'] if is_not_utc() else []

# Date-related regexps for generating date strings within Python's range limits

# regexp to generate dates starting from 0001-02-01; format is yyyy-MM-dd
date_start_1_2_1 = '(0{0,3}1-(0?[2-9]|[1-3][0-9]))|(([0-9]{0,3}[2-9]|[1-9][0-9]{0,2}[01])-[0-3]?[0-9])-[0-5]?[0-9]'

# regexp to generate years starting from 0002; format is yyyy
yyyy_start_0002 = '([0-9]{3}[2-9]|([1-9][0-9]{2}|0[1-9][0-9]|00[1-9])[0-1])'
3 changes: 1 addition & 2 deletions integration_tests/src/main/python/json_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -637,7 +637,7 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format
# "yyyy-MM"
"\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?[1-8]{1}[0-9]{3}-[0-3]{1,2}[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"",
# "yyyy"
"\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?[0-9]{4}[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"",
"\"[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?" + yyyy_start_0002 + "[ \t\xA0\u1680\u180e\u2000-\u200a\u202f\u205f\u3000]?\"",
# "dd/MM/yyyy"
"\"[0-9]{2}/[0-9]{2}/[1-8]{1}[0-9]{3}\"",
# special constant values
Expand All @@ -664,7 +664,6 @@ def test_from_json_struct_date_fallback_non_default_format(date_gen, date_format
pytest.param("LEGACY", marks=pytest.mark.allow_non_gpu('ProjectExec')),
"CORRECTED"
])
@datagen_overrides(seed=0, reason='https://github.com/NVIDIA/spark-rapids/issues/9747')
@pytest.mark.parametrize('ansi_enabled', [ True, False ])
def test_from_json_struct_timestamp(timestamp_gen, timestamp_format, time_parser_policy, ansi_enabled):
json_string_gen = StringGen(r'{ "a": ' + timestamp_gen + ' }') \
Expand Down

0 comments on commit 6c5fbac

Please sign in to comment.