Skip to content

Commit

Permalink
Add tests for column names with dots (#8704)
Browse files Browse the repository at this point in the history
Signed-off-by: Jason Lowe <[email protected]>
  • Loading branch information
jlowe authored Jul 13, 2023
1 parent 34d9e1d commit f407178
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 1 deletion.
2 changes: 1 addition & 1 deletion integration_tests/src/main/python/asserts.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def _prep_func_for_compare(func, mode):
if should_sort_on_spark():
def with_sorted(spark):
df = func(spark)
return df.sort(df.columns)
return df.sort([f"`{x}`" for x in df.columns])

sorted_func = with_sorted
else:
Expand Down
19 changes: 19 additions & 0 deletions integration_tests/src/main/python/orc_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -861,3 +861,22 @@ def test_read_case_col_name(spark_tmp_path, read_func, v1_enabled_list, orc_impl
assert_gpu_and_cpu_are_equal_collect(
lambda spark : reader(spark).selectExpr(col_name),
conf=all_confs)

@pytest.mark.parametrize("reader_confs", reader_opt_confs, ids=idfn)
@ignore_order
def test_orc_column_name_with_dots(spark_tmp_path, reader_confs):
    """Read an ORC file whose column names contain dots at every nesting level.

    Checks a full scan and projections that require backtick-quoted dotted
    names, comparing GPU results against the CPU.
    """
    data_path = spark_tmp_path + "/ORC_DATA"
    reader = read_orc_df(data_path)
    # Schema: struct columns whose field names themselves contain dots.
    dotted_gens = [
        ("a.b", StructGen([
            ("c.d.e", StructGen([
                ("f.g", int_gen),
                ("h", string_gen)])),
            ("i.j", long_gen)])),
        ("k", boolean_gen)]
    with_cpu_session(lambda spark: gen_df(spark, dotted_gens).write.orc(data_path))
    # Full-table read first, then dotted-name projections.
    assert_gpu_and_cpu_are_equal_collect(lambda spark: reader(spark), conf=reader_confs)
    for expr in ["`a.b`", "`a.b`.`c.d.e`.`f.g`"]:
        assert_gpu_and_cpu_are_equal_collect(
            lambda spark, e=expr: reader(spark).selectExpr(e),
            conf=reader_confs)
15 changes: 15 additions & 0 deletions integration_tests/src/main/python/orc_write_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -266,3 +266,18 @@ def test_fallback_to_single_writer_from_concurrent_writer(spark_tmp_path):
{"spark.sql.maxConcurrentOutputFileWriters": 10},
{"spark.rapids.sql.concurrentWriterPartitionFlushSize": 64 * 1024 * 1024}
))

@ignore_order
def test_orc_write_column_name_with_dots(spark_tmp_path):
    """Write ORC with dotted (nested) column names on CPU and GPU and compare
    what is read back."""
    data_path = spark_tmp_path + "/ORC_DATA"
    # Struct columns whose field names contain dots at each nesting level.
    dotted_gens = [
        ("a.b", StructGen([
            ("c.d.e", StructGen([
                ("f.g", int_gen),
                ("h", string_gen)])),
            ("i.j", long_gen)])),
        ("k", boolean_gen)]

    def write_func(spark, path):
        # coalesce(1) keeps the output to a single file for a stable compare.
        gen_df(spark, dotted_gens).coalesce(1).write.orc(path)

    def read_func(spark, path):
        return spark.read.orc(path)

    assert_gpu_and_cpu_writes_are_equal_collect(write_func, read_func, data_path)
19 changes: 19 additions & 0 deletions integration_tests/src/main/python/parquet_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -1508,3 +1508,22 @@ def test_read_case_col_name(spark_tmp_path, read_func, v1_enabled_list, reader_c
assert_gpu_and_cpu_are_equal_collect(
lambda spark : reader(spark).selectExpr(col_name),
conf=all_confs)

@pytest.mark.parametrize("reader_confs", reader_opt_confs, ids=idfn)
@ignore_order
def test_parquet_column_name_with_dots(spark_tmp_path, reader_confs):
    """Read a Parquet file whose column names contain dots at every nesting
    level.

    Checks a full scan and projections that require backtick-quoted dotted
    names, comparing GPU results against the CPU.
    """
    data_path = spark_tmp_path + "/PARQUET_DATA"
    reader = read_parquet_df(data_path)
    # Schema: struct columns whose field names themselves contain dots.
    dotted_gens = [
        ("a.b", StructGen([
            ("c.d.e", StructGen([
                ("f.g", int_gen),
                ("h", string_gen)])),
            ("i.j", long_gen)])),
        ("k", boolean_gen)]
    with_cpu_session(lambda spark: gen_df(spark, dotted_gens).write.parquet(data_path))
    # Full-table read first, then dotted-name projections.
    assert_gpu_and_cpu_are_equal_collect(lambda spark: reader(spark), conf=reader_confs)
    for expr in ["`a.b`", "`a.b`.`c.d.e`.`f.g`"]:
        assert_gpu_and_cpu_are_equal_collect(
            lambda spark, e=expr: reader(spark).selectExpr(e),
            conf=reader_confs)
15 changes: 15 additions & 0 deletions integration_tests/src/main/python/parquet_write_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -755,3 +755,18 @@ def test_write_with_planned_write_enabled(spark_tmp_path, planned_write_enabled,
lambda spark, path: spark.read.parquet(path),
data_path,
conf)

@ignore_order
def test_parquet_write_column_name_with_dots(spark_tmp_path):
    """Write Parquet with dotted (nested) column names on CPU and GPU and
    compare what is read back."""
    data_path = spark_tmp_path + "/PARQUET_DATA"
    # Struct columns whose field names contain dots at each nesting level.
    dotted_gens = [
        ("a.b", StructGen([
            ("c.d.e", StructGen([
                ("f.g", int_gen),
                ("h", string_gen)])),
            ("i.j", long_gen)])),
        ("k", boolean_gen)]

    def write_func(spark, path):
        # coalesce(1) keeps the output to a single file for a stable compare.
        gen_df(spark, dotted_gens).coalesce(1).write.parquet(path)

    def read_func(spark, path):
        return spark.read.parquet(path)

    assert_gpu_and_cpu_writes_are_equal_collect(write_func, read_func, data_path)

0 comments on commit f407178

Please sign in to comment.