Skip to content

Commit

Permalink
Add Struct support for ParquetWriter (NVIDIA#2514)
Browse files: browse the repository at this point in the history
* updated Parquet writer support docs

Signed-off-by: Raza Jafri <rjafri@nvidia.com>

* adding Struct to allow list for ParquetWriter

Signed-off-by: Raza Jafri <rjafri@nvidia.com>

* add nested struct test

Signed-off-by: Raza Jafri <rjafri@nvidia.com>

* removed duplicate data type from test

Signed-off-by: Raza Jafri <rjafri@nvidia.com>

* removed CoalesceExec from allowedOnCpu list because it supports structs now

Signed-off-by: Raza Jafri <rjafri@nvidia.com>

* Decimals are supported by CoalesceExec

Signed-off-by: Raza Jafri <rjafri@nvidia.com>

Co-authored-by: Raza Jafri <rjafri@nvidia.com>
  • Loading branch information
razajafri authored May 28, 2021
1 parent 0ad2024 commit 9509cbb
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 10 deletions.
4 changes: 2 additions & 2 deletions docs/supported_ops.md
Original file line number Diff line number Diff line change
Expand Up @@ -537,7 +537,7 @@ Accelerator supports are described below.
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><em>PS* (Only supported for Parquet; missing nested NULL, BINARY, CALENDAR, ARRAY, MAP, UDT)</em></td>
<td><b>NS</b></td>
</tr>
<tr>
Expand Down Expand Up @@ -20629,7 +20629,7 @@ dates or timestamps, or for a lack of type coercion support.
<td> </td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><b>NS</b></td>
<td><em>PS* (missing nested BINARY, ARRAY, MAP, UDT)</em></td>
<td><b>NS</b></td>
</tr>
</table>
11 changes: 5 additions & 6 deletions integration_tests/src/main/python/parquet_write_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,13 +40,12 @@
# we are limiting TimestampGen to avoid overflowing the INT96 value
# see https://github.com/rapidsai/cudf/issues/8070
TimestampGen(start=datetime(1677, 9, 22, tzinfo=timezone.utc), end=datetime(2262, 4, 11, tzinfo=timezone.utc))]
parquet_basic_struct_gen = StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(parquet_basic_gen)])
parquet_struct_gen = [StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(parquet_basic_gen)]),
StructGen([['child0', StructGen([[ 'child1', byte_gen]])]])]

parquet_write_gens_list = [
parquet_basic_gen,
pytest.param(parquet_basic_gen + [decimal_gen_default,
decimal_gen_scale_precision, decimal_gen_same_scale_precision, decimal_gen_64bit],
marks=pytest.mark.allow_non_gpu("CoalesceExec"))]
parquet_write_gens_list = [parquet_basic_gen + parquet_struct_gen +
[decimal_gen_default,
decimal_gen_scale_precision, decimal_gen_same_scale_precision, decimal_gen_64bit]]

parquet_ts_write_options = ['INT96', 'TIMESTAMP_MICROS', 'TIMESTAMP_MILLIS']

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -761,7 +761,7 @@ object GpuOverrides {
(ParquetFormatType, FileFormatChecks(
cudfRead = (TypeSig.commonCudfTypes + TypeSig.DECIMAL + TypeSig.STRUCT + TypeSig.ARRAY +
TypeSig.MAP).nested(),
cudfWrite = TypeSig.commonCudfTypes + TypeSig.DECIMAL,
cudfWrite = (TypeSig.commonCudfTypes + TypeSig.DECIMAL + TypeSig.STRUCT).nested(),
sparkSig = (TypeSig.atomics + TypeSig.STRUCT + TypeSig.ARRAY + TypeSig.MAP +
TypeSig.UDT).nested())),
(OrcFormatType, FileFormatChecks(
Expand Down Expand Up @@ -2836,7 +2836,8 @@ object GpuOverrides {
exec[DataWritingCommandExec](
"Writing data",
ExecChecks((TypeSig.commonCudfTypes +
TypeSig.DECIMAL.withPsNote(TypeEnum.DECIMAL, "Only supported for Parquet")).nested(),
TypeSig.DECIMAL.withPsNote(TypeEnum.DECIMAL, "Only supported for Parquet") +
TypeSig.STRUCT.withPsNote(TypeEnum.STRUCT, "Only supported for Parquet")).nested(),
TypeSig.all),
(p, conf, parent, r) => new SparkPlanMeta[DataWritingCommandExec](p, conf, parent, r) {
override val childDataWriteCmds: scala.Seq[DataWritingCommandMeta[_]] =
Expand Down

0 comments on commit 9509cbb

Please sign in to comment.