From 122856928cc57456e27c02c9494f6f650c7bb502 Mon Sep 17 00:00:00 2001 From: Thomas Li Date: Thu, 13 Jun 2024 18:20:03 +0000 Subject: [PATCH] update following feedback --- python/cudf/cudf/_lib/json.pyx | 13 +- python/cudf/cudf/_lib/pylibcudf/io/types.pyx | 8 +- .../cudf/_lib/pylibcudf/libcudf/io/types.pxd | 1 + .../cudf/cudf/pylibcudf_tests/common/utils.py | 31 ++- python/cudf/cudf/pylibcudf_tests/conftest.py | 2 + .../cudf/cudf/pylibcudf_tests/test_copying.py | 245 ++++++++++++++++-- python/cudf/cudf/pylibcudf_tests/test_json.py | 16 -- 7 files changed, 262 insertions(+), 54 deletions(-) diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index d0e0875feb1..22e34feb547 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -227,13 +227,10 @@ cdef data_type _get_cudf_data_type_from_dtype(object dtype) except *: def _dtype_to_names_list(col): - cdef list child_names = [] if isinstance(col.dtype, cudf.StructDtype): - for child_col, name in zip(col.children, list(col.dtype.fields)): - child_names.append((name, _dtype_to_names_list(child_col))) + return [(name, _dtype_to_names_list(child)) + for name, child in zip(col.dtype.fields, col.children)] elif isinstance(col.dtype, cudf.ListDtype): - for child_col in col.children: - list_child_names = _dtype_to_names_list(child_col) - child_names.append(("", list_child_names)) - - return child_names + return [("", _dtype_to_names_list(child)) + for child in col.children] + return [] diff --git a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx index cd331c57bc7..953109176b9 100644 --- a/python/cudf/cudf/_lib/pylibcudf/io/types.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/io/types.pyx @@ -29,9 +29,9 @@ cdef class TableWithMetadata: Parameters ---------- - tbl: Table + tbl : Table The input table. - column_names: list + column_names : list A list of tuples each containing the name of each column and the names of its child columns (in the same format). e.g. @@ -193,6 +193,10 @@ cdef class SinkInfo: cdef unique_ptr[data_sink] sink cdef vector[string] paths + + if not sinks: + raise ValueError("Need to pass at least one sink") + if isinstance(sinks[0], io.StringIO): data_sinks.reserve(len(sinks)) for s in sinks: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd index 8f9e8276fad..8d87deb1472 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/types.pxd @@ -75,6 +75,7 @@ cdef extern from "cudf/io/types.hpp" \ vector[column_name_info] children cdef cppclass table_metadata: + table_metadata() except + vector[string] column_names map[string, string] user_data diff --git a/python/cudf/cudf/pylibcudf_tests/common/utils.py b/python/cudf/cudf/pylibcudf_tests/common/utils.py index 08d21a9d5de..43a8ee6c2bc 100644 --- a/python/cudf/cudf/pylibcudf_tests/common/utils.py +++ b/python/cudf/cudf/pylibcudf_tests/common/utils.py @@ -133,12 +133,32 @@ def is_string(plc_dtype: plc.DataType): def is_fixed_width(plc_dtype: plc.DataType): return ( is_integer(plc_dtype) + or is_unsigned_integer(plc_dtype) or is_floating(plc_dtype) or is_boolean(plc_dtype) ) +def is_nested_struct(pa_type: pa.DataType): + if isinstance(pa_type, pa.StructType): + for i in range(pa_type.num_fields): + if isinstance(pa_type[i].type, pa.StructType): + return True + return False + + +def is_nested_list(pa_type: pa.DataType): + if isinstance(pa_type, pa.ListType): + return isinstance(pa_type.value_type, pa.ListType) + return False + + def sink_to_str(sink): + """ + Takes a sink (e.g. StringIO/BytesIO, filepath, etc.) + and reads in the contents into a string (str not bytes) + for comparison + """ if isinstance(sink, (str, os.PathLike)): with open(sink, "r") as f: str_result = f.read() @@ -151,8 +171,7 @@ def sink_to_str(sink): return str_result -# TODO: enable uint64, some failing tests -NUMERIC_PA_TYPES = [pa.int64(), pa.float64()] # pa.uint64()] +NUMERIC_PA_TYPES = [pa.int64(), pa.float64(), pa.uint64()] STRING_PA_TYPES = [pa.string()] BOOL_PA_TYPES = [pa.bool_()] LIST_PA_TYPES = [ @@ -187,10 +206,8 @@ def sink_to_str(sink): + BOOL_PA_TYPES # exclude nested list/struct cases # since not all tests work with them yet - + LIST_PA_TYPES[:1] - + DEFAULT_PA_STRUCT_TESTING_TYPES[:1] + + LIST_PA_TYPES # [:1] + + DEFAULT_PA_STRUCT_TESTING_TYPES # [:1] ) -ALL_PA_TYPES = ( - DEFAULT_PA_TYPES + LIST_PA_TYPES[1:] + DEFAULT_PA_STRUCT_TESTING_TYPES[1:] -) +ALL_PA_TYPES = DEFAULT_PA_TYPES # + LIST_PA_TYPES[1:] + DEFAULT_PA_STRUCT_TESTING_TYPES[1:] diff --git a/python/cudf/cudf/pylibcudf_tests/conftest.py b/python/cudf/cudf/pylibcudf_tests/conftest.py index e7d068ba4d2..bedcf39a314 100644 --- a/python/cudf/cudf/pylibcudf_tests/conftest.py +++ b/python/cudf/cudf/pylibcudf_tests/conftest.py @@ -57,6 +57,8 @@ def table_data(request): # plc.io.TableWithMetadata colnames = [] + np.random.seed(42) + for typ in ALL_PA_TYPES: rand_vals = np.random.randint(0, nrows, nrows) child_colnames = [] diff --git a/python/cudf/cudf/pylibcudf_tests/test_copying.py b/python/cudf/cudf/pylibcudf_tests/test_copying.py index da3ca3a6d1e..b4ae3f7c189 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_copying.py +++ b/python/cudf/cudf/pylibcudf_tests/test_copying.py @@ -11,6 +11,8 @@ is_fixed_width, is_floating, is_integer, + is_nested_list, + is_nested_struct, is_string, metadata_from_arrow_array, ) @@ -18,6 +20,8 @@ from cudf._lib import pylibcudf as plc +# TODO: consider moving this to conftest and "pairing" +# it with pa_type, so that they don't get out of sync # TODO: Test nullable data @pytest.fixture(scope="module") def input_column(pa_type): @@ -28,10 +32,27 @@ def input_column(pa_type): elif pa.types.is_boolean(pa_type): pa_array = pa.array([True, True, False], type=pa_type) elif pa.types.is_list(pa_type): - # TODO: Add heterogenous sizes - pa_array = pa.array([[1], [2], [3]], type=pa_type) + if pa_type.value_type == pa.int64(): + pa_array = pa.array([[1], [2, 3], [3]], type=pa_type) + elif ( + isinstance(pa_type.value_type, pa.ListType) + and pa_type.value_type.value_type == pa.int64() + ): + pa_array = pa.array([[[1]], [[2, 3]], [[3]]], type=pa_type) + else: + raise ValueError("Unsupported type " + pa_type.value_type) elif pa.types.is_struct(pa_type): - pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + if not is_nested_struct(pa_type): + pa_array = pa.array([{"v": 1}, {"v": 2}, {"v": 3}], type=pa_type) + else: + pa_array = pa.array( + [ + {"a": 1, "b_struct": {"b": 1.0}}, + {"a": 2, "b_struct": {"b": 2.0}}, + {"a": 3, "b_struct": {"b": 3.0}}, + ], + type=pa_type, + ) else: raise ValueError("Unsupported type") return pa_array, plc.interop.from_arrow(pa_array) @@ -55,13 +76,37 @@ def target_column(pa_type): [False, True, True, False, True, False], type=pa_type ) elif pa.types.is_list(pa_type): - # TODO: Add heterogenous sizes - pa_array = pa.array([[4], [5], [6], [7], [8], [9]], type=pa_type) + if pa_type.value_type == pa.int64(): + pa_array = pa.array( + [[4], [5, 6], [7], [8], [9], [10]], type=pa_type + ) + elif ( + isinstance(pa_type.value_type, pa.ListType) + and pa_type.value_type.value_type == pa.int64() + ): + pa_array = pa.array( + [[[4]], [[5, 6]], [[7]], [[8]], [[9]], [[10]]], type=pa_type + ) + else: + raise ValueError("Unsupported type") elif pa.types.is_struct(pa_type): - pa_array = pa.array( - [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}], - type=pa_type, - ) + if not is_nested_struct(pa_type): + pa_array = pa.array( + [{"v": 4}, {"v": 5}, {"v": 6}, {"v": 7}, {"v": 8}, {"v": 9}], + type=pa_type, + ) + else: + pa_array = pa.array( + [ + {"a": 4, "b_struct": {"b": 4.0}}, + {"a": 5, "b_struct": {"b": 5.0}}, + {"a": 6, "b_struct": {"b": 6.0}}, + {"a": 7, "b_struct": {"b": 7.0}}, + {"a": 8, "b_struct": {"b": 8.0}}, + {"a": 9, "b_struct": {"b": 9.0}}, + ], + type=pa_type, + ) else: raise ValueError("Unsupported type") return pa_array, plc.interop.from_arrow(pa_array) @@ -96,10 +141,22 @@ def source_scalar(pa_type): elif pa.types.is_boolean(pa_type): pa_scalar = pa.scalar(False, type=pa_type) elif pa.types.is_list(pa_type): - # TODO: Longer list? - pa_scalar = pa.scalar([1], type=pa_type) + if pa_type.value_type == pa.int64(): + pa_scalar = pa.scalar([1, 2, 3, 4], type=pa_type) + elif ( + isinstance(pa_type.value_type, pa.ListType) + and pa_type.value_type.value_type == pa.int64() + ): + pa_scalar = pa.scalar([[1, 2, 3, 4]], type=pa_type) + else: + raise ValueError("Unsupported type") elif pa.types.is_struct(pa_type): - pa_scalar = pa.scalar({"v": 1}, type=pa_type) + if not is_nested_struct(pa_type): + pa_scalar = pa.scalar({"v": 1}, type=pa_type) + else: + pa_scalar = pa.scalar( + {"a": 1, "b_struct": {"b": 1.0}}, type=pa_type + ) else: raise ValueError("Unsupported type") return pa_scalar, plc.interop.from_arrow(pa_scalar) @@ -112,9 +169,21 @@ def mask(target_column): return pa_mask, plc.interop.from_arrow(pa_mask) -def test_gather(target_table, index_column): +def test_gather(request, target_table, index_column): pa_target_table, plc_target_table = target_table pa_index_column, plc_index_column = index_column + request.applymarker( + pytest.mark.xfail( + condition=any( + [is_nested_struct(col.type) for col in pa_target_table.columns] + ), + reason="pylibcudf interop fails with nested struct", + ) + ) + if any([is_nested_list(col.type) for col in pa_target_table.columns]): + pytest.skip( + reason="pylibcudf interop fails with memoryerror/segfault", + ) result = plc.copying.gather( plc_target_table, plc_index_column, @@ -168,6 +237,7 @@ def _pyarrow_boolean_mask_scatter_table(source, mask, target_table): def test_scatter_table( + request, source_table, index_column, target_table, @@ -175,6 +245,18 @@ def test_scatter_table( pa_source_table, plc_source_table = source_table pa_index_column, plc_index_column = index_column pa_target_table, plc_target_table = target_table + request.applymarker( + pytest.mark.xfail( + condition=any( + [is_nested_struct(col.type) for col in pa_target_table.columns] + ), + reason="pylibcudf interop fails with nested struct", + ) + ) + if any([is_nested_list(col.type) for col in pa_target_table.columns]): + pytest.skip( + reason="pylibcudf interop fails with memoryerror/segfault", + ) result = plc.copying.scatter( plc_source_table, plc_index_column, @@ -197,7 +279,7 @@ def test_scatter_table( if pa.types.is_list(dtype := pa_target_table[0].type): expected = pa.table( - [pa.array([[4], [1], [2], [3], [8], [9]])] * 3, [""] * 3 + [pa.array([[4], [1], [2, 3], [3], [9], [10]])] * 3, [""] * 3 ) elif pa.types.is_struct(dtype): expected = pa.table( @@ -290,6 +372,7 @@ def test_scatter_table_type_mismatch(source_table, index_column, target_table): def test_scatter_scalars( + request, source_scalar, index_column, target_table, @@ -297,6 +380,18 @@ def test_scatter_scalars( pa_source_scalar, plc_source_scalar = source_scalar pa_index_column, plc_index_column = index_column pa_target_table, plc_target_table = target_table + request.applymarker( + pytest.mark.xfail( + condition=any( + [is_nested_struct(col.type) for col in pa_target_table.columns] + ), + reason="pylibcudf interop fails with nested struct", + ) + ) + if any([is_nested_list(col.type) for col in pa_target_table.columns]): + pytest.skip( + reason="pylibcudf interop fails with memoryerror/segfault", + ) result = plc.copying.scatter( [plc_source_scalar] * plc_target_table.num_columns(), plc_index_column, @@ -566,11 +661,21 @@ def test_shift_type_mismatch(target_column): plc.copying.shift(plc_target_column, 2, fill_value) -def test_slice_column(target_column): +def test_slice_column(request, target_column): pa_target_column, plc_target_column = target_column bounds = list(range(6)) upper_bounds = bounds[1::2] lower_bounds = bounds[::2] + request.applymarker( + pytest.mark.xfail( + condition=is_nested_struct(pa_target_column.type), + reason="pylibcudf interop fails with nested struct", + ) + ) + if is_nested_list(pa_target_column.type): + pytest.skip( + reason="pylibcudf interop fails with memoryerror/segfault", + ) result = plc.copying.slice(plc_target_column, bounds) for lb, ub, slice_ in zip(lower_bounds, upper_bounds, result): assert_column_eq(pa_target_column[lb:ub], slice_) @@ -594,8 +699,20 @@ def test_slice_column_out_of_bounds(target_column): plc.copying.slice(plc_target_column, list(range(2, 8))) -def test_slice_table(target_table): +def test_slice_table(request, target_table): pa_target_table, plc_target_table = target_table + request.applymarker( + pytest.mark.xfail( + condition=any( + [is_nested_struct(col.type) for col in pa_target_table.columns] + ), + reason="pylibcudf interop fails with nested struct", + ) + ) + if any([is_nested_list(col.type) for col in pa_target_table.columns]): + pytest.skip( + reason="pylibcudf interop fails with memoryerror/segfault", + ) bounds = list(range(6)) upper_bounds = bounds[1::2] lower_bounds = bounds[::2] @@ -604,10 +721,20 @@ def test_slice_table(target_table): assert_table_eq(pa_target_table[lb:ub], slice_) -def test_split_column(target_column): +def test_split_column(request, target_column): upper_bounds = [1, 3, 5] lower_bounds = [0] + upper_bounds[:-1] pa_target_column, plc_target_column = target_column + request.applymarker( + pytest.mark.xfail( + condition=is_nested_struct(pa_target_column.type), + reason="pylibcudf interop fails with nested struct", + ) + ) + if is_nested_list(pa_target_column.type): + pytest.skip( + reason="pylibcudf interop fails with memoryerror/segfault", + ) result = plc.copying.split(plc_target_column, upper_bounds) for lb, ub, split in zip(lower_bounds, upper_bounds, result): assert_column_eq(pa_target_column[lb:ub], split) @@ -625,8 +752,21 @@ def test_split_column_out_of_bounds(target_column): plc.copying.split(plc_target_column, list(range(5, 8))) -def test_split_table(target_table): +def test_split_table(request, target_table): pa_target_table, plc_target_table = target_table + request.applymarker( + pytest.mark.xfail( + condition=any( + [is_nested_struct(col.type) for col in pa_target_table.columns] + ), + reason="pylibcudf interop fails with nested struct", + ) + ) + if any([is_nested_list(col.type) for col in pa_target_table.columns]): + pytest.skip( + reason="pylibcudf interop fails with memoryerror/segfault", + ) + upper_bounds = [1, 3, 5] lower_bounds = [0] + upper_bounds[:-1] result = plc.copying.split(plc_target_table, upper_bounds) @@ -634,7 +774,9 @@ def test_split_table(target_table): assert_table_eq(pa_target_table[lb:ub], split) -def test_copy_if_else_column_column(target_column, mask, source_scalar): +def test_copy_if_else_column_column( + request, target_column, mask, source_scalar +): pa_target_column, plc_target_column = target_column pa_source_scalar, _ = source_scalar pa_mask, plc_mask = mask @@ -644,6 +786,17 @@ def test_copy_if_else_column_column(target_column, mask, source_scalar): ) plc_other_column = plc.interop.from_arrow(pa_other_column) + request.applymarker( + pytest.mark.xfail( + condition=is_nested_struct(pa_target_column.type), + reason="pylibcudf interop fails with nested struct", + ) + ) + if is_nested_list(pa_target_column.type): + pytest.skip( + reason="pylibcudf interop fails with memoryerror/segfault", + ) + result = plc.copying.copy_if_else( plc_target_column, plc_other_column, @@ -710,6 +863,7 @@ def test_copy_if_else_wrong_size_mask(target_column): @pytest.mark.parametrize("array_left", [True, False]) def test_copy_if_else_column_scalar( + request, target_column, source_scalar, array_left, @@ -718,6 +872,19 @@ def test_copy_if_else_column_scalar( pa_target_column, plc_target_column = target_column pa_source_scalar, plc_source_scalar = source_scalar pa_mask, plc_mask = mask + + request.applymarker( + pytest.mark.xfail( + condition=is_nested_struct(pa_target_column.type), + reason="pylibcudf interop fails with nested struct", + ) + ) + + if is_nested_list(pa_target_column.type): + pytest.skip( + reason="pylibcudf interop fails with memoryerror/segfault", + ) + args = ( (plc_target_column, plc_source_scalar) if array_left @@ -741,12 +908,25 @@ def test_copy_if_else_column_scalar( def test_boolean_mask_scatter_from_table( + request, source_table, target_table, mask, ): pa_source_table, plc_source_table = source_table pa_target_table, plc_target_table = target_table + request.applymarker( + pytest.mark.xfail( + condition=any( + is_nested_struct(col.type) for col in pa_source_table.columns + ), + reason="pylibcudf interop fails with nested struct", + ) + ) + if any([is_nested_list(col.type) for col in pa_target_table.columns]): + pytest.skip( + reason="pylibcudf interop fails with memoryerror/segfault", + ) pa_mask, plc_mask = mask result = plc.copying.boolean_mask_scatter( @@ -767,7 +947,7 @@ def test_boolean_mask_scatter_from_table( if pa.types.is_list(dtype := pa_target_table[0].type): expected = pa.table( - [pa.array([[1], [5], [2], [7], [3], [9]])] * 3, [""] * 3 + [pa.array([[1], [5, 6], [2, 3], [8], [3], [10]])] * 3, [""] * 3 ) elif pa.types.is_struct(dtype): expected = pa.table( @@ -858,6 +1038,7 @@ def test_boolean_mask_scatter_from_wrong_mask_type(source_table, target_table): def test_boolean_mask_scatter_from_scalars( + request, source_scalar, target_table, mask, @@ -865,6 +1046,18 @@ def test_boolean_mask_scatter_from_scalars( pa_source_scalar, plc_source_scalar = source_scalar pa_target_table, plc_target_table = target_table pa_mask, plc_mask = mask + request.applymarker( + pytest.mark.xfail( + condition=any( + is_nested_struct(col.type) for col in pa_target_table.columns + ), + reason="pylibcudf interop fails with nested struct", + ) + ) + if any([is_nested_list(col.type) for col in pa_target_table.columns]): + pytest.skip( + reason="pylibcudf interop fails with memoryerror/segfault", + ) result = plc.copying.boolean_mask_scatter( [plc_source_scalar] * 3, plc_target_table, @@ -880,9 +1073,19 @@ def test_boolean_mask_scatter_from_scalars( assert_table_eq(expected, result) -def test_get_element(input_column): +def test_get_element(request, input_column): index = 1 pa_input_column, plc_input_column = input_column + request.applymarker( + pytest.mark.xfail( + condition=is_nested_struct(pa_input_column.type), + reason="pylibcudf interop fails with nested struct", + ) + ) + if is_nested_list(pa_input_column.type): + pytest.skip( + reason="pylibcudf interop fails with memoryerror/segfault", + ) result = plc.copying.get_element(plc_input_column, index) assert ( diff --git a/python/cudf/cudf/pylibcudf_tests/test_json.py b/python/cudf/cudf/pylibcudf_tests/test_json.py index 0a7a49991b4..a24424cd2a3 100644 --- a/python/cudf/cudf/pylibcudf_tests/test_json.py +++ b/python/cudf/cudf/pylibcudf_tests/test_json.py @@ -1,7 +1,6 @@ # Copyright (c) 2024, NVIDIA CORPORATION. import io -import pandas as pd import pyarrow as pa import pytest from utils import sink_to_str @@ -23,13 +22,8 @@ def test_write_json_basic(table_data, source_or_sink, lines, rows_per_chunk): plc.io.SinkInfo([sink]), plc_table_w_meta, lines=lines, **kwargs ) - # orient=records (basically what the cudf json writer does, - # doesn't preserve colnames when there are zero rows in table) exp = pa_table.to_pandas() - if len(exp) == 0: - exp = pd.DataFrame() - # Convert everything to string to make # comparisons easier str_result = sink_to_str(sink) @@ -61,13 +55,8 @@ def test_write_json_nulls(na_rep, include_nulls): include_nulls=include_nulls, ) - # orient=records (basically what the cudf json writer does, - # doesn't preserve colnames when there are zero rows in table) exp = pa_tbl.to_pandas() - if len(exp) == 0: - exp = pd.DataFrame() - # Convert everything to string to make # comparisons easier str_result = sink_to_str(sink) @@ -111,13 +100,8 @@ def test_write_json_bool_opts(true_value, false_value): false_value=false_value, ) - # orient=records (basically what the cudf json writer does, - # doesn't preserve colnames when there are zero rows in table) exp = pa_tbl.to_pandas() - if len(exp) == 0: - exp = pd.DataFrame() - # Convert everything to string to make # comparisons easier str_result = sink_to_str(sink)