From 9509cbb5f7a09262e957d3c329c1b23efe765376 Mon Sep 17 00:00:00 2001
From: Raza Jafri
Date: Fri, 28 May 2021 16:42:27 -0700
Subject: [PATCH] Add Struct support for ParquetWriter (#2514)

* updated Parquet writer support docs

Signed-off-by: Raza Jafri

* adding Struct to allow list for ParquetWriter

Signed-off-by: Raza Jafri

* add nested struct test

Signed-off-by: Raza Jafri

* removed duplicate data type from test

Signed-off-by: Raza Jafri

* removed CoalesceExec from allowedOnCpu list because it supports structs now

Signed-off-by: Raza Jafri

* Decimals are supported by CoalesceExec

Signed-off-by: Raza Jafri

Co-authored-by: Raza Jafri
---
 docs/supported_ops.md                                 |  4 ++--
 .../src/main/python/parquet_write_test.py             | 11 +++++------
 .../scala/com/nvidia/spark/rapids/GpuOverrides.scala  |  5 +++--
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/supported_ops.md b/docs/supported_ops.md
index a87c4e10575..7ca8d298242 100644
--- a/docs/supported_ops.md
+++ b/docs/supported_ops.md
@@ -537,7 +537,7 @@ Accelerator supports are described below.
 NS
 NS
 NS
-NS
+PS* (Only supported for Parquet; missing nested NULL, BINARY, CALENDAR, ARRAY, MAP, UDT)
 NS
@@ -20629,7 +20629,7 @@ dates or timestamps, or for a lack of type coercion support.
 NS
 NS
-NS
+PS* (missing nested BINARY, ARRAY, MAP, UDT)
 NS
diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py
index 1840fde374d..329f05d5378 100644
--- a/integration_tests/src/main/python/parquet_write_test.py
+++ b/integration_tests/src/main/python/parquet_write_test.py
@@ -40,13 +40,12 @@
 # we are limiting TimestampGen to avoid overflowing the INT96 value
 # see https://github.com/rapidsai/cudf/issues/8070
 TimestampGen(start=datetime(1677, 9, 22, tzinfo=timezone.utc), end=datetime(2262, 4, 11, tzinfo=timezone.utc))]
-parquet_basic_struct_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(parquet_basic_gen)])
+parquet_struct_gen = [StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(parquet_basic_gen)]),
+                      StructGen([['child0', StructGen([[ 'child1', byte_gen]])]])]

-parquet_write_gens_list = [
-        parquet_basic_gen,
-        pytest.param(parquet_basic_gen + [decimal_gen_default,
-            decimal_gen_scale_precision, decimal_gen_same_scale_precision, decimal_gen_64bit],
-            marks=pytest.mark.allow_non_gpu("CoalesceExec"))]
+parquet_write_gens_list = [parquet_basic_gen + parquet_struct_gen +
+        [decimal_gen_default,
+         decimal_gen_scale_precision, decimal_gen_same_scale_precision, decimal_gen_64bit]]

 parquet_ts_write_options = ['INT96', 'TIMESTAMP_MICROS', 'TIMESTAMP_MILLIS']
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
index 9e4a8cf8aca..8a9f7aabf30 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
@@ -761,7 +761,7 @@ object GpuOverrides {
     (ParquetFormatType, FileFormatChecks(
       cudfRead = (TypeSig.commonCudfTypes + TypeSig.DECIMAL + TypeSig.STRUCT +
         TypeSig.ARRAY + TypeSig.MAP).nested(),
-      cudfWrite = TypeSig.commonCudfTypes + TypeSig.DECIMAL,
+      cudfWrite = (TypeSig.commonCudfTypes + TypeSig.DECIMAL + TypeSig.STRUCT).nested(),
       sparkSig = (TypeSig.atomics + TypeSig.STRUCT + TypeSig.ARRAY + TypeSig.MAP +
         TypeSig.UDT).nested())),
     (OrcFormatType, FileFormatChecks(
@@ -2836,7 +2836,8 @@ object GpuOverrides {
     exec[DataWritingCommandExec](
       "Writing data",
       ExecChecks((TypeSig.commonCudfTypes +
-        TypeSig.DECIMAL.withPsNote(TypeEnum.DECIMAL, "Only supported for Parquet")).nested(),
+        TypeSig.DECIMAL.withPsNote(TypeEnum.DECIMAL, "Only supported for Parquet") +
+        TypeSig.STRUCT.withPsNote(TypeEnum.STRUCT, "Only supported for Parquet")).nested(),
         TypeSig.all),
       (p, conf, parent, r) => new SparkPlanMeta[DataWritingCommandExec](p, conf, parent, r) {
         override val childDataWriteCmds: scala.Seq[DataWritingCommandMeta[_]] =