UnionExec array and nested array support (#3359)
Signed-off-by: Ryan Lee <ryanlee@nvidia.com>
rwlee authored Sep 3, 2021
1 parent f774e1a commit 13bb3a3
Showing 3 changed files with 29 additions and 14 deletions.
6 changes: 3 additions & 3 deletions docs/supported_ops.md
@@ -438,9 +438,9 @@ Accelerator supports are described below.
<td>S</td>
<td><b>NS</b></td>
<td><b>NS</b></td>
-<td><b>NS</b></td>
-<td><em>PS<br/>max child DECIMAL precision of 18;<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, ARRAY, UDT</em></td>
-<td><em>PS<br/>unionByName will not optionally impute nulls for missing struct fields when the column is a struct and there are non-overlapping fields;<br/>max child DECIMAL precision of 18;<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, ARRAY, UDT</em></td>
+<td><em>PS<br/>max child DECIMAL precision of 18;<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
+<td><em>PS<br/>max child DECIMAL precision of 18;<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
+<td><em>PS<br/>unionByName will not optionally impute nulls for missing struct fields when the column is a struct and there are non-overlapping fields;<br/>max child DECIMAL precision of 18;<br/>UTC is only supported TZ for child TIMESTAMP;<br/>unsupported child types BINARY, CALENDAR, UDT</em></td>
<td><b>NS</b></td>
</tr>
<tr>
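The STRUCT note above refers to Spark's unionByName with allowMissingColumns=True, which fills absent nested struct fields with nulls. A minimal PySpark sketch of that shape, with made-up column and field names, assuming a Spark version that resolves missing nested struct fields:

from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, struct

spark = SparkSession.builder.getOrCreate()

# Two struct columns with non-overlapping fields: left has s.x, right has s.y.
left = spark.range(3).select(struct(lit(1).alias("x")).alias("s"))
right = spark.range(3).select(struct(lit(2).alias("y")).alias("s"))

# Spark fills the missing struct fields with nulls; the PS note above flags
# exactly this non-overlapping-fields case as not handled by the GPU union.
left.unionByName(right, allowMissingColumns=True).printSchema()
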
33 changes: 24 additions & 9 deletions integration_tests/src/main/python/repart_test.py
@@ -43,7 +43,18 @@
map_gens = [simple_string_to_string_map_gen,
MapGen(RepeatSeqGen(IntegerGen(nullable=False), 10), long_gen, max_length=10),
MapGen(BooleanGen(nullable=False), boolean_gen, max_length=2),
-            MapGen(StringGen(pattern='key_[0-9]', nullable=False), simple_string_to_string_map_gen)]
+            MapGen(StringGen(pattern='key_[0-9]', nullable=False), simple_string_to_string_map_gen),
+            MapGen(
+                LongGen(nullable=False),
+                MapGen(
+                    DecimalGen(7, 2, nullable=False),
+                    MapGen(
+                        IntegerGen(nullable=False),
+                        StringGen(pattern='value_[0-9]', nullable=False),
+                        max_length=4),
+                    max_length=7),
+                max_length=5)]

struct_of_maps = StructGen([['child0', BooleanGen()]] + [
['child%d' % (i + 1), gen] for i, gen in enumerate(map_gens)])

@@ -75,27 +86,29 @@ def test_union_struct_missing_children(data_gen):
lambda spark : binary_op_df(spark, left_gen).unionByName(binary_op_df(
spark, right_gen), True))

-@pytest.mark.parametrize('data_gen', all_gen + map_gens +
+@pytest.mark.parametrize('data_gen', all_gen + map_gens + array_gens_sample +
[all_basic_struct_gen,
StructGen([['child0', DecimalGen(7, 2)]]),
nested_struct,
struct_of_maps], ids=idfn)
# This tests union of two DFs of two cols each. The types of the left col and right col are the same
def test_union(data_gen):
assert_gpu_and_cpu_are_equal_collect(
-        lambda spark : binary_op_df(spark, data_gen).union(binary_op_df(spark, data_gen)))
+        lambda spark : binary_op_df(spark, data_gen).union(binary_op_df(spark, data_gen)),
+        conf=allow_negative_scale_of_decimal_conf)

-@pytest.mark.parametrize('data_gen', all_gen + map_gens +
+@pytest.mark.parametrize('data_gen', all_gen + map_gens + array_gens_sample +
[all_basic_struct_gen,
StructGen([['child0', DecimalGen(7, 2)]]),
nested_struct,
struct_of_maps], ids=idfn)
# This tests union of two DFs of two cols each. The types of the left col and right col are the same
def test_unionAll(data_gen):
assert_gpu_and_cpu_are_equal_collect(
-        lambda spark : binary_op_df(spark, data_gen).unionAll(binary_op_df(spark, data_gen)))
+        lambda spark : binary_op_df(spark, data_gen).unionAll(binary_op_df(spark, data_gen)),
+        conf=allow_negative_scale_of_decimal_conf)

-@pytest.mark.parametrize('data_gen', all_gen + map_gens +
+@pytest.mark.parametrize('data_gen', all_gen + map_gens + array_gens_sample +
[all_basic_struct_gen,
pytest.param(all_basic_struct_gen, marks=nested_scalar_mark),
pytest.param(StructGen([[ 'child0', DecimalGen(7, 2)]]), marks=nested_scalar_mark),
@@ -109,16 +122,18 @@ def test_unionAll(data_gen):
def test_union_by_missing_col_name(data_gen):
assert_gpu_and_cpu_are_equal_collect(
lambda spark : binary_op_df(spark, data_gen).withColumnRenamed("a", "x")
-            .unionByName(binary_op_df(spark, data_gen).withColumnRenamed("a", "y"), True))
+            .unionByName(binary_op_df(spark, data_gen).withColumnRenamed("a", "y"), True),
+        conf=allow_negative_scale_of_decimal_conf)

-@pytest.mark.parametrize('data_gen', all_gen + map_gens +
+@pytest.mark.parametrize('data_gen', all_gen + map_gens + array_gens_sample +
[all_basic_struct_gen,
StructGen([['child0', DecimalGen(7, 2)]]),
nested_struct,
struct_of_maps], ids=idfn)
def test_union_by_name(data_gen):
assert_gpu_and_cpu_are_equal_collect(
-        lambda spark : binary_op_df(spark, data_gen).unionByName(binary_op_df(spark, data_gen)))
+        lambda spark : binary_op_df(spark, data_gen).unionByName(binary_op_df(spark, data_gen)),
+        conf=allow_negative_scale_of_decimal_conf)


@pytest.mark.parametrize('data_gen', [
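To make the new array_gens_sample coverage concrete, here is a hand-written PySpark sketch (not part of the test file) of the kind of union the tests now exercise, over an array column and an array nested inside a struct; names and values are illustrative:

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()

# One plain array column plus a struct wrapping a nested array (array of arrays).
rows = [Row(a=[1, 2, 3], b=Row(c=[[1], [2, 3]])),
        Row(a=[4], b=Row(c=[[5, 6]]))]
df = spark.createDataFrame(rows)

# The tests run the same plan on CPU and GPU and compare results; with this
# commit, unions over these array shapes are eligible for GpuUnionExec when
# the RAPIDS Accelerator is enabled.
assert df.union(df).count() == 4
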
@@ -3252,8 +3252,8 @@ object GpuOverrides {
exec[UnionExec](
"The backend for the union operator",
ExecChecks(TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL_64 + TypeSig.MAP +
-        TypeSig.STRUCT.nested(TypeSig.commonCudfTypes + TypeSig.NULL +
-          TypeSig.DECIMAL_64 + TypeSig.STRUCT + TypeSig.MAP)
+        TypeSig.ARRAY + TypeSig.STRUCT.nested(TypeSig.commonCudfTypes + TypeSig.NULL +
+          TypeSig.DECIMAL_64 + TypeSig.STRUCT + TypeSig.MAP + TypeSig.ARRAY)
.withPsNote(TypeEnum.STRUCT,
"unionByName will not optionally impute nulls for missing struct fields " +
"when the column is a struct and there are non-overlapping fields"), TypeSig.all),
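With the widened ExecChecks above, a union whose schema contains arrays (top level or nested inside structs) can now be planned as GpuUnionExec. A rough way to observe the effect when the RAPIDS Accelerator is on the classpath; the config and plugin class names below are the ones commonly documented for the plugin, but treat this as an unverified sketch rather than a tested recipe:

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
         .config("spark.rapids.sql.enabled", "true")
         .getOrCreate())

df = spark.createDataFrame([([1, 2],), ([3],)], "a array<int>")

# Before this change the array column kept the union on the CPU; afterwards
# the physical plan should show GpuUnionExec instead of a CPU fallback.
df.union(df).explain()
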
