Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support fine grained timezone checker instead of type based [databricks] #9719

Merged
merged 50 commits into from
Dec 1, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
50 commits
Select commit Hold shift + click to select a range
801752c
Support current_date with timezone
winningsix Nov 15, 2023
36ddb24
Remove type based timezone checker
winningsix Nov 17, 2023
28ba19e
Add timezone checker for expressions
winningsix Nov 17, 2023
3d5d297
Add check for cast
winningsix Nov 17, 2023
b3f85bd
code style fix
winningsix Nov 17, 2023
c7c60d9
Fix premerge fails
winningsix Nov 20, 2023
90d975e
Fix
winningsix Nov 20, 2023
b88a13b
Refine comments
winningsix Nov 21, 2023
3cda255
Refine comments
winningsix Nov 21, 2023
42c7888
Refactor
winningsix Nov 21, 2023
1da1291
Refine comments
winningsix Nov 22, 2023
46dbe60
Typo
winningsix Nov 22, 2023
c954125
Re-enable failed test cases
winningsix Nov 27, 2023
eb32703
Fix inmatch cases
winningsix Nov 27, 2023
f0f6164
Revert before commit
winningsix Nov 27, 2023
e554f4e
Fix
winningsix Nov 27, 2023
d9fe752
Fix
winningsix Nov 28, 2023
78e5804
Comments
winningsix Nov 28, 2023
2ad72c8
Fix
winningsix Nov 28, 2023
7c734c0
Fix failed cases
winningsix Nov 28, 2023
e62ee80
Change xfail to allow_non_gpu
Nov 28, 2023
568f8e2
Fix CSV scan
winningsix Nov 29, 2023
7a70ad0
Merge branch 'branch-23.12' into new_now
winningsix Nov 29, 2023
a743a7a
Fix
Nov 28, 2023
4942496
Fix explain on CPU
winningsix Nov 29, 2023
83171ae
Fix json
winningsix Nov 29, 2023
e9b3b10
Fix json
winningsix Nov 29, 2023
a564891
Fix ORC scan
winningsix Nov 29, 2023
f497490
Fix ORC test
winningsix Nov 29, 2023
63d9e26
skip legacy mode rebase
winningsix Nov 29, 2023
1cbb694
Support check for AnsiCast
winningsix Nov 29, 2023
07b6863
Fix cases
Nov 29, 2023
36cc096
Fix more cases
Nov 29, 2023
3e70b52
Fix
winningsix Nov 29, 2023
51e5017
Refactor
winningsix Nov 29, 2023
07819fb
Fix more cases 3
Nov 29, 2023
d900607
Address comments
winningsix Nov 29, 2023
6e38bcb
Address comments
winningsix Nov 29, 2023
9fbe5b7
Merge branch 'branch-23.12' into new_now
winningsix Nov 29, 2023
dd07316
Fix for 341
winningsix Nov 30, 2023
2e6578e
Fix 341
winningsix Nov 30, 2023
33187b0
Minor fix
winningsix Nov 30, 2023
6dff012
Enable golden configuration
winningsix Nov 30, 2023
63d4394
Merge branch 'branch-24.02' into new_now
winningsix Nov 30, 2023
4d29350
Fix UTC cases
winningsix Nov 30, 2023
950ac3c
Address comments
winningsix Nov 30, 2023
eb0c85e
Address comments
winningsix Dec 1, 2023
78c026e
Merge branch 'branch-24.02' into new_now
winningsix Dec 1, 2023
0267c81
Fix a merge issue
winningsix Dec 1, 2023
216daf3
Minor fix
winningsix Dec 1, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion integration_tests/src/main/python/cache_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,9 @@ def n_fold(spark):

with_cpu_session(n_fold)

# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653
non_utc_orc_save_table_allow = ['DataWritingCommandExec', 'WriteFilesExec'] if is_not_utc() else []

# This test doesn't allow negative scale for Decimals as ` df.write.mode('overwrite').parquet(data_path)`
# writes parquet which doesn't allow negative decimals
@pytest.mark.parametrize('data_gen', [StringGen(), ByteGen(), ShortGen(), IntegerGen(), LongGen(),
Expand All @@ -165,7 +168,7 @@ def n_fold(spark):
@pytest.mark.parametrize('ts_write', ['TIMESTAMP_MICROS', 'TIMESTAMP_MILLIS'])
@pytest.mark.parametrize('enable_vectorized', ['true', 'false'], ids=idfn)
@ignore_order
@allow_non_gpu("SortExec", "ShuffleExchangeExec", "RangePartitioning")
@allow_non_gpu("SortExec", "ShuffleExchangeExec", "RangePartitioning", *non_utc_orc_save_table_allow)
def test_cache_columnar(spark_tmp_path, data_gen, enable_vectorized, ts_write):
data_path_gpu = spark_tmp_path + '/PARQUET_DATA'
def read_parquet_cached(data_path):
Expand Down
7 changes: 4 additions & 3 deletions integration_tests/src/main/python/csv_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,13 +565,14 @@ def test_csv_read_count(spark_tmp_path):
assert_gpu_and_cpu_row_counts_equal(lambda spark: spark.read.csv(data_path),
conf = {'spark.rapids.sql.explain': 'ALL'})

@allow_non_gpu('FileSourceScanExec', 'ProjectExec', 'CollectLimitExec', 'DeserializeToObjectExec', *non_utc_allow)
@allow_non_gpu('FileSourceScanExec', 'ProjectExec', 'CollectLimitExec', 'DeserializeToObjectExec')
@pytest.mark.skipif(is_before_spark_341(), reason='`TIMESTAMP_NTZ` is only supported in PySpark 341+')
@pytest.mark.parametrize('date_format', csv_supported_date_formats)
@pytest.mark.parametrize('ts_part', csv_supported_ts_parts)
@pytest.mark.parametrize("timestamp_type", [
pytest.param('TIMESTAMP_LTZ', marks=pytest.mark.xfail(is_spark_350_or_later(), reason="https://github.com/NVIDIA/spark-rapids/issues/9325")),
"TIMESTAMP_NTZ"])
@pytest.mark.xfail(is_not_utc(), reason='Timezone is not supported for csv format as https://github.com/NVIDIA/spark-rapids/issues/9653.')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this going back to an xfail instead of allow_non_utc?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is because, within the csv_infer_schema_timestamp_ntz method, there is an assertion that checks for class existence, rather than a plain allow_non_utc case. Since we have other test cases covering timezone behavior for CSV, we xfail this one instead of making the original test case too complex.

def test_csv_infer_schema_timestamp_ntz_v1(spark_tmp_path, date_format, ts_part, timestamp_type):
csv_infer_schema_timestamp_ntz(spark_tmp_path, date_format, ts_part, timestamp_type, 'csv', 'FileSourceScanExec')

Expand Down Expand Up @@ -622,9 +623,9 @@ def do_read(spark):
non_exist_classes = cpu_scan_class,
conf = conf)

@allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec', *non_utc_allow)
@allow_non_gpu('FileSourceScanExec', 'CollectLimitExec', 'DeserializeToObjectExec')
@pytest.mark.skipif(is_before_spark_340(), reason='`preferDate` is only supported in Spark 340+')

@pytest.mark.xfail(is_not_utc(), reason='Timezone is not supported for csv format as https://github.com/NVIDIA/spark-rapids/issues/9653.')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here. Why xfail?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same reason as above.

This is because, within the csv_infer_schema_timestamp_ntz method, there is an assertion that checks for class existence, rather than a plain allow_non_utc case. Since we have other test cases covering timezone behavior for CSV, we xfail this one instead of making the original test case too complex.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So then how does #9653 cover that? I don't see CSV or JSON mentioned in there at all, nor any mention of timestamp_ntz. I just want to be 100% sure that we don't end up dropping something and getting data corruption that we missed because of an xfail. I would rather see a complex test, or at a minimum an issue filed to do the test correctly.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Updated by refining the test cases.

def test_csv_prefer_date_with_infer_schema(spark_tmp_path):
# start date ""0001-01-02" required due to: https://github.com/NVIDIA/spark-rapids/issues/5606
data_gens = [byte_gen, short_gen, int_gen, long_gen, boolean_gen, timestamp_gen, DateGen(start=date(1, 1, 2))]
Expand Down
5 changes: 4 additions & 1 deletion integration_tests/src/main/python/data_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -1175,4 +1175,7 @@ def get_25_partitions_df(spark):


# allow non gpu when time zone is non-UTC because of https://github.com/NVIDIA/spark-rapids/issues/9653
non_utc_allow = ['ProjectExec', 'FilterExec', 'FileSourceScanExec', 'BatchScanExec', 'CollectLimitExec', 'DeserializeToObjectExec', 'DataWritingCommandExec', 'WriteFilesExec', 'ShuffleExchangeExec'] if is_not_utc() else []
# This will be deprecated and replaced by case-specific non-GPU allow lists
non_utc_allow = ['ProjectExec', 'FilterExec', 'FileSourceScanExec', 'BatchScanExec', 'CollectLimitExec',
'DeserializeToObjectExec', 'DataWritingCommandExec', 'WriteFilesExec', 'ShuffleExchangeExec',
'ExecutedCommandExec'] if is_not_utc() else []
1 change: 1 addition & 0 deletions integration_tests/src/main/python/json_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -853,6 +853,7 @@ def struct_to_json(spark):
'Etc/UTC',
pytest.param('UTC+07:00', marks=pytest.mark.allow_non_gpu('ProjectExec')),
])
@pytest.mark.skipif(is_not_utc(), reason='Duplicate of the original test case, which is already parameterized by timezone. https://github.com/NVIDIA/spark-rapids/issues/9653.')
def test_structs_to_json_timestamp(spark_tmp_path, data_gen, timestamp_format, timezone):
struct_gen = StructGen([
("b", StructGen([('child', data_gen)], nullable=True)),
Expand Down
4 changes: 3 additions & 1 deletion integration_tests/src/main/python/parquet_write_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -771,10 +771,12 @@ def read_table(spark, path):

func(create_table, read_table, data_path, conf)

non_utc_hive_parquet_write_allow = ['DataWritingCommandExec', 'WriteFilesExec'] if is_not_utc() else []

# Test to avoid regression on a known bug in Spark. For details please visit https://github.com/NVIDIA/spark-rapids/issues/8693
@pytest.mark.parametrize('ts_rebase', [
pytest.param('LEGACY', marks=pytest.mark.skipif(is_not_utc(), reason="LEGACY datetime rebase mode is only supported for UTC timezone")),
'CORRECTED'])
pytest.param('CORRECTED', marks=pytest.mark.allow_non_gpu(*non_utc_hive_parquet_write_allow))])
def test_hive_timestamp_value(spark_tmp_table_factory, spark_tmp_path, ts_rebase):
def func_test(create_table, read_table, data_path, conf):
assert_gpu_and_cpu_writes_are_equal_collect(create_table, read_table, data_path, conf=conf)
Expand Down