diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 73724b99589..aa7dca0dad3 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -83,6 +83,9 @@ class json_reader_options {
   // Whether to use the experimental reader
   bool _experimental = false;
 
+  // Whether to keep the quote characters of string values
+  bool _keep_quotes = false;
+
   /**
    * @brief Constructor from source info.
    *
@@ -203,6 +206,13 @@ class json_reader_options {
    */
   bool is_enabled_experimental() const { return _experimental; }
 
+  /**
+   * @brief Whether the experimental reader should keep quotes of string values.
+   *
+   * @returns true if the experimental reader should keep quotes, false otherwise
+   */
+  bool is_enabled_keep_quotes() const { return _keep_quotes; }
+
   /**
    * @brief Set data types for columns to be read.
    *
@@ -258,6 +268,14 @@ class json_reader_options {
    * @param val Boolean value to enable/disable the experimental reader
    */
  void enable_experimental(bool val) { _experimental = val; }
+
+  /**
+   * @brief Set whether the experimental reader should keep quotes of string values.
+   *
+   * @param val Boolean value to indicate whether the experimental reader should keep quotes
+   * of string values
+   */
+  void enable_keep_quotes(bool val) { _keep_quotes = val; }
 };
 
 /**
@@ -377,6 +395,19 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether the experimental reader should keep quotes of string values.
+   *
+   * @param val Boolean value to indicate whether the experimental reader should keep quotes
+   * of string values
+   * @return this for chaining
+   */
+  json_reader_options_builder& keep_quotes(bool val)
+  {
+    options._keep_quotes = val;
+    return *this;
+  }
+
   /**
    * @brief move json_reader_options member once it's built.
    */
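A minimal usage sketch of the new option, relying only on the accessors added above together with the existing `cudf::io::read_json` entry point; the helper name and JSON payload are made up for illustration:

```cpp
#include <cudf/io/json.hpp>

#include <string>

// Hypothetical helper: reads newline-delimited JSON, retaining the quote
// characters around string values in the resulting string columns
cudf::io::table_with_metadata read_json_keeping_quotes(std::string const& json_lines)
{
  // keep_quotes only takes effect in the experimental (nested) reader,
  // so both flags are enabled here
  auto options = cudf::io::json_reader_options::builder(
                   cudf::io::source_info{json_lines.c_str(), json_lines.size()})
                   .lines(true)
                   .experimental(true)
                   .keep_quotes(true)
                   .build();

  // With keep_quotes set, an input value "cat" comes back as the
  // five-character string "\"cat\"" instead of the three-character cat
  return cudf::io::read_json(options);
}
```

The non-builder equivalent is `options.enable_keep_quotes(true)`; `is_enabled_keep_quotes()` is what the reader queries internally.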
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index fca9a3ecc42..dccd6a81e28 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -21,6 +21,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -127,6 +128,7 @@ struct json_column {
   // Following "items" as the default child column's name of a list column
   // Using the struct's field names
   std::map<std::string, json_column> child_columns;
+  std::vector<std::string> column_order;
   // Counting the current number of items in this column
   row_offset_t current_offset = 0;
 
@@ -142,19 +144,7 @@
    *
    * @param up_to_row_offset The row offset up to which to fill with nulls.
    */
-  void null_fill(row_offset_t up_to_row_offset)
-  {
-    // Fill all the rows up to up_to_row_offset with "empty"/null rows
-    validity.resize(word_index(up_to_row_offset) + 1);
-    std::fill_n(std::back_inserter(string_offsets),
-                up_to_row_offset - string_offsets.size(),
-                (string_offsets.size() > 0) ? string_offsets.back() : 0);
-    std::fill_n(std::back_inserter(string_lengths), up_to_row_offset - string_lengths.size(), 0);
-    std::fill_n(std::back_inserter(child_offsets),
-                up_to_row_offset + 1 - child_offsets.size(),
-                (child_offsets.size() > 0) ? child_offsets.back() : 0);
-    current_offset = up_to_row_offset;
-  }
+  void null_fill(row_offset_t up_to_row_offset);
 
   /**
    * @brief Recursively iterates through the tree of columns making sure that all child columns of a
    * struct column have the same row count
    *
    * @param min_row_count The minimum number of rows to be filled.
    */
-  void level_child_cols_recursively(row_offset_t min_row_count)
-  {
-    // Fill this columns with nulls up to the given row count
-    null_fill(min_row_count);
-
-    // If this is a struct column, we need to level all its child columns
-    if (type == json_col_t::StructColumn) {
-      for (auto it = std::begin(child_columns); it != std::end(child_columns); it++) {
-        it->second.level_child_cols_recursively(min_row_count);
-      }
-    }
-    // If this is a list column, we need to make sure that its child column levels its children
-    else if (type == json_col_t::ListColumn) {
-      auto it = std::begin(child_columns);
-      // Make that child column fill its child columns up to its own row count
-      if (it != std::end(child_columns)) {
-        it->second.level_child_cols_recursively(it->second.current_offset);
-      }
-    }
-  }
+  void level_child_cols_recursively(row_offset_t min_row_count);
 
   /**
    * @brief Appends the row at the given index to the column, filling all rows between the column's
@@ -195,42 +166,10 @@ struct json_column {
    * the offsets
    */
   void append_row(uint32_t row_index,
-                  json_col_t const& row_type,
+                  json_col_t row_type,
                   uint32_t string_offset,
                   uint32_t string_end,
-                  uint32_t child_count)
-  {
-    // If, thus far, the column's type couldn't be inferred, we infer it to the given type
-    if (type == json_col_t::Unknown) { type = row_type; }
-
-    // We shouldn't run into this, as we shouldn't be asked to append an "unknown" row type
-    // CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence");
-
-    // Fill all the omitted rows with "empty"/null rows (if needed)
-    null_fill(row_index);
-
-    // Table listing what we intend to use for a given column type and row type combination
-    // col type | row type => {valid, FAIL, null}
-    // -----------------------------------------------
-    // List     | List     => valid
-    // List     | Struct   => FAIL
-    // List     | String   => null
-    // Struct   | List     => FAIL
-    // Struct   | Struct   => valid
-    // Struct   | String   => null
-    // String   | List     => null
-    // String   | Struct   => null
-    // String   | String   => valid
-    bool const is_valid = (type == row_type);
-    if (static_cast<size_type>(validity.size()) < word_index(current_offset))
-      validity.push_back({});
-    set_bit_unsafe(&validity.back(), intra_word_index(current_offset));
-    valid_count += (is_valid) ? 1U : 0U;
-    string_offsets.push_back(string_offset);
-    string_lengths.push_back(string_end - string_offset);
-    child_offsets.push_back((child_offsets.size() > 0) ? child_offsets.back() + child_count : 0);
-    current_offset++;
-  };
+                  uint32_t child_count);
 };
 
 /**
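The new `column_order` member exists because `std::map` iterates its keys in lexicographical order, which loses the order in which fields first appear in the input. A self-contained sketch of the bookkeeping pattern (illustrative only, not cudf code):

```cpp
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main()
{
  // Mirrors json_column: the map owns the children, the vector remembers
  // the order in which child columns were first encountered
  std::map<std::string, int> child_columns;
  std::vector<std::string> column_order;

  for (auto const& name : {"b", "c", "a"}) {
    // emplace returns {iterator, inserted}; record the name only on first insertion
    if (child_columns.emplace(name, 0).second) { column_order.push_back(name); }
  }

  // Iterating the map would print "a b c"; the side vector preserves "b c a"
  for (auto const& name : column_order) { std::cout << name << ' '; }
  std::cout << '\n';
}
```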
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index 4c525caa3c8..de814cb5358 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -18,12 +18,14 @@
 #include 
 #include 
 
-#include 
+#include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -31,10 +33,12 @@
 #include 
 #include 
+#include 
 #include 
 
 #include 
 #include 
+#include 
 #include 
 #include 
@@ -130,9 +134,9 @@ std::array, TT_NUM_STATES> const trans
 // Translation table (i.e., for each transition, what are the symbols that we output)
 std::array<std::array<std::vector<char>, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const translation_table{
   {/* IN_STATE          {       [       }       ]       "       \    OTHER */
-   /* TT_OOS    */ {{{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}},
-   /* TT_STR    */ {{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}},
-   /* TT_ESC    */ {{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}}};
+   /* TT_OOS    */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}}},
+   /* TT_STR    */ {{{}, {}, {}, {}, {}, {}, {}}},
+   /* TT_ESC    */ {{{}, {}, {}, {}, {}, {}, {}}}}};
 
 // The DFA's starting state
 constexpr auto start_state = static_cast<StateT>(TT_OOS);
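The translation-table change above swaps the placeholder output symbol `'x'` for empty output vectors. As I read the table, this makes the DFA's output genuinely sparse: only brackets and braces encountered outside of quoted strings emit a symbol, which is what lets the later stages size the stack-operation buffers by an exact count. An illustrative expectation (hand-derived, not executed):

```cpp
// Illustrative only: behavior of the bracket/brace-filtering DFA after this
// change, for a made-up input. The brackets inside the quoted string are read
// in the TT_STR state, whose translation entries are now all {} ("emit nothing").
//
//   input:           {"a":"[b]"}
//   emitted symbols: {         }
//
// Previously, every non-bracket character emitted the placeholder 'x';
// now the output contains only the stack-altering characters.
```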
@@ -915,14 +919,107 @@ struct JSONToStackOp {
   }
 };
 
+void json_column::null_fill(row_offset_t up_to_row_offset)
+{
+  // Fill all the rows up to up_to_row_offset with "empty"/null rows
+  validity.resize(word_index(up_to_row_offset) + 1);
+  std::fill_n(std::back_inserter(string_offsets),
+              up_to_row_offset - string_offsets.size(),
+              (string_offsets.size() > 0) ? string_offsets.back() : 0);
+  std::fill_n(std::back_inserter(string_lengths), up_to_row_offset - string_lengths.size(), 0);
+  std::fill_n(std::back_inserter(child_offsets),
+              up_to_row_offset + 1 - child_offsets.size(),
+              (child_offsets.size() > 0) ? child_offsets.back() : 0);
+  current_offset = up_to_row_offset;
+}
+
+void json_column::level_child_cols_recursively(row_offset_t min_row_count)
+{
+  // Fill this column with nulls up to the given row count
+  null_fill(min_row_count);
+
+  // If this is a struct column, we need to level all its child columns
+  if (type == json_col_t::StructColumn) {
+    for (auto it = std::begin(child_columns); it != std::end(child_columns); it++) {
+      it->second.level_child_cols_recursively(min_row_count);
+    }
+  }
+  // If this is a list column, we need to make sure that its child column levels its children
+  else if (type == json_col_t::ListColumn) {
+    auto it = std::begin(child_columns);
+    // Make that child column fill its child columns up to its own row count
+    if (it != std::end(child_columns)) {
+      it->second.level_child_cols_recursively(it->second.current_offset);
+    }
+  }
+}
+
+void json_column::append_row(uint32_t row_index,
+                             json_col_t row_type,
+                             uint32_t string_offset,
+                             uint32_t string_end,
+                             uint32_t child_count)
+{
+  // If, thus far, the column's type couldn't be inferred, we infer it to the given type
+  if (type == json_col_t::Unknown) {
+    type = row_type;
+  }
+  // If, at some point within a column, we encounter a nested type (list or struct),
+  // we change that column's type to that respective nested type and invalidate all previous rows
+  else if (type == json_col_t::StringColumn &&
+           (row_type == json_col_t::ListColumn || row_type == json_col_t::StructColumn)) {
+    // Change the column type
+    type = row_type;
+
+    // Invalidate all previous entries, as they were _not_ of the nested type to which we just
+    // converted
+    std::fill_n(validity.begin(), validity.size(), 0);
+    valid_count = 0U;
+  }
+  // If this is a nested column but we're trying to insert either (a) a list node into a struct
+  // column or (b) a struct node into a list column, we fail
+  CUDF_EXPECTS(not((type == json_col_t::ListColumn and row_type == json_col_t::StructColumn) or
+                   (type == json_col_t::StructColumn and row_type == json_col_t::ListColumn)),
+               "A mix of lists and structs within the same column is not supported");
+
+  // We shouldn't run into this, as we shouldn't be asked to append an "unknown" row type
+  CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence");
+
+  // Fill all the omitted rows with "empty"/null rows (if needed)
+  null_fill(row_index);
+
+  // Table listing what we intend to use for a given column type and row type combination
+  // col type | row type => {valid, FAIL, null}
+  // -----------------------------------------------
+  // List     | List     => valid
+  // List     | Struct   => FAIL
+  // List     | String   => null
+  // Struct   | List     => FAIL
+  // Struct   | Struct   => valid
+  // Struct   | String   => null
+  // String   | List     => valid (we switch col type to list, null'ing all previous rows)
+  // String   | Struct   => valid (we switch col type to struct, null'ing all previous rows)
+  // String   | String   => valid
+  bool const is_valid = (type == row_type);
+  if (static_cast<size_type>(validity.size()) < word_index(current_offset)) validity.push_back({});
+  if (is_valid) { set_bit_unsafe(&validity.back(), intra_word_index(current_offset)); }
+  valid_count += (is_valid) ? 1U : 0U;
+  string_offsets.push_back(string_offset);
+  string_lengths.push_back(string_end - string_offset);
+  child_offsets.push_back((child_offsets.size() > 0) ? child_offsets.back() + child_count : 0);
+  current_offset++;
+}
+
 namespace detail {
 
 void get_stack_context(device_span<SymbolT const> json_in,
                        SymbolT* d_top_of_stack,
                        rmm::cuda_stream_view stream)
 {
+  // Range of encapsulating function that comprises:
+  // -> DFA simulation for filtering out brackets and braces inside of quotes
+  // -> Logical stack to infer the stack context
   CUDF_FUNC_RANGE();
 
-  constexpr std::size_t single_item = 1;
-
   // Symbol representing the JSON-root (i.e., we're at nesting level '0')
   constexpr StackSymbolT root_symbol = '_';
   // This can be any stack symbol from the stack alphabet that does not push onto stack
   constexpr StackSymbolT read_symbol = 'x';
 
   // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes)
-  hostdevice_vector<SymbolOffsetT> num_stack_ops(single_item, stream);
+  rmm::device_scalar<SymbolOffsetT> d_num_stack_ops(stream);
 
   // Sequence of stack symbols and their position in the original input (sparse representation)
   rmm::device_uvector<StackSymbolT> stack_ops{json_in.size(), stream};
@@ -953,14 +1050,17 @@ void get_stack_context(device_span<SymbolT const> json_in,
     static_cast<SymbolOffsetT>(json_in.size()),
     stack_ops.data(),
     stack_op_indices.data(),
-    num_stack_ops.device_ptr(),
+    d_num_stack_ops.data(),
     to_stack_op::start_state,
     stream);
 
+  // Copy back the actual number of stack operations
+  auto const num_stack_ops = d_num_stack_ops.value(stream);
+
   // stack operations with indices are converted to top of the stack for each character in the input
   fst::sparse_stack_op_to_top_of_stack(
     stack_ops.data(),
-    device_span<SymbolOffsetT>{stack_op_indices.data(), stack_op_indices.size()},
+    device_span<SymbolOffsetT>{stack_op_indices.data(), num_stack_ops},
     JSONToStackOp{},
     d_top_of_stack,
     root_symbol,
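For context on what `get_stack_context` produces: each input character is annotated with the innermost bracket or brace it lives in, with `'_'` marking the JSON root, and quoted brackets never open a context. A hand-derived sketch; my assumption here is that a push becomes visible at the character after the opening symbol, and a pop at the character after the closing one:

```cpp
// Illustrative only, not executed:
//
//   json_in:       {"a":["x[y]",42]}
//   top_of_stack:  _{{{{{[[[[[[[[[[{
//
// The '[' and ']' inside the quoted string "x[y]" never reach the logical
// stack. d_num_stack_ops reports how many stack-altering symbols were
// emitted (4 here: { [ ] }), and only that prefix of stack_op_indices is
// handed to sparse_stack_op_to_top_of_stack, instead of the full-size span.
```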
@@ -975,7 +1075,9 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
+  // Range of encapsulating function that converts the JSON input into a token stream
   CUDF_FUNC_RANGE();
+
   rmm::device_uvector<PdaTokenT> tokens{json_in.size(), stream, mr};
   rmm::device_uvector<SymbolOffsetT> tokens_indices{json_in.size(), stream, mr};
   rmm::device_scalar<SymbolOffsetT> num_written_tokens{stream, mr};
@@ -1039,6 +1141,8 @@
  * @param[in] input The JSON input in host memory
  * @param[in] d_input The JSON input in device memory
  * @param[in] options Parsing options specifying the parsing behaviour
+ * @param[in] include_quote_char Whether to include the original quote chars around string values,
+ * which allows distinguishing string values from numeric and literal values
  * @param[in] stream The CUDA stream to which kernels are dispatched
  * @param[in] mr Optional, resource with which to allocate
  * @return The columnar representation of the data from the given JSON input
@@ -1048,13 +1152,15 @@ void make_json_column(json_column& root_column,
                       host_span<SymbolT const> input,
                       device_span<SymbolT const> d_input,
                       cudf::io::json_reader_options const& options,
+                      bool include_quote_char,
                       rmm::cuda_stream_view stream,
                       rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
+  // Range of encapsulating function that parses to internal columnar data representation
   CUDF_FUNC_RANGE();
+
   // Default name for a list's child column
   std::string const list_child_name = "element";
-  constexpr bool include_quote_char = false;  // TODO if merge conflict with PR #11574, make it true
 
   // Parse the JSON and get the token stream
   const auto [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr);
@@ -1090,14 +1196,17 @@ void make_json_column(json_column& root_column,
     };
   };
 
-  // Includes quote char for end-of-string token or Skips the quote char for beginning-of-field-name
+  // Depending on whether we want to keep the quotes of string values or not, respectively, we:
+  // (a) strip off the beginning quote included in StringBegin and FieldNameBegin, or
+  // (b) include the end quote excluded from StringEnd, while still stripping off the beginning
+  // quote included in FieldNameBegin
   auto get_token_index = [include_quote_char](PdaTokenT const token,
                                               SymbolOffsetT const token_index) {
     constexpr SymbolOffsetT quote_char_size = 1;
     switch (token) {
-      // Strip off or include quote char for StringBegin
       case token_t::StringBegin: return token_index + (include_quote_char ? 0 : quote_char_size);
-      // Strip off or Include trailing quote char for string values for StringEnd
      case token_t::StringEnd: return token_index + (include_quote_char ? quote_char_size : 0);
      // Strip off quote char included for FieldNameBegin
      case token_t::FieldNameBegin: return token_index + quote_char_size;
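The index arithmetic in `get_token_index` is easiest to see on a concrete value. Assuming the tokenizer reports StringBegin at the opening quote and StringEnd at the closing quote, and that the end offset is treated as exclusive (the offsets below are made up):

```cpp
// Illustrative only:
//
//   buffer:  ... " c a t " ...
//   offset:      10       14
//
//   include_quote_char == false:  begin = 11, end = 14  -> value: cat
//   include_quote_char == true:   begin = 10, end = 15  -> value: "cat"
//
// Field names always have their quotes stripped, which is why FieldNameBegin
// unconditionally adds quote_char_size.
```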
@@ -1187,6 +1296,7 @@ void make_json_column(json_column& root_column,
       if (current_data_path.top().column->child_columns.size() == 0) {
         current_data_path.top().column->child_columns.emplace(std::string{list_child_name},
                                                               json_column{json_col_t::Unknown});
+        current_data_path.top().column->column_order.push_back(list_child_name);
       }
       current_data_path.top().current_selected_col =
         &current_data_path.top().column->child_columns.begin()->second;
@@ -1226,6 +1336,7 @@ void make_json_column(json_column& root_column,
 
     // The field name's column does not exist yet, so we have to append the child column to the
     // struct column
+    struct_col->column_order.push_back(field_name);
     return &struct_col->child_columns.emplace(field_name, json_column{}).first->second;
   };
 
@@ -1419,17 +1530,36 @@ void make_json_column(json_column& root_column,
   root_column.level_child_cols_recursively(root_column.current_offset);
 }
 
+/**
+ * @brief Retrieves the parse_options to be used for type inference and type casting
+ *
+ * @param options The reader options to influence the relevant type inference and type casting
+ * options
+ */
+auto parsing_options(cudf::io::json_reader_options const& options)
+{
+  auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'};
+
+  auto const stream     = cudf::default_stream_value;
+  parse_opts.keepquotes = options.is_enabled_keep_quotes();
+  parse_opts.trie_true  = cudf::detail::create_serialized_trie({"true"}, stream);
+  parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
+  parse_opts.trie_na    = cudf::detail::create_serialized_trie({"", "null"}, stream);
+  return parse_opts;
+}
+
 std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to_cudf_column(
   json_column const& json_col,
   device_span<SymbolT const> d_input,
+  cudf::io::json_reader_options const& options,
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
+  // Range of orchestrating/encapsulating function
   CUDF_FUNC_RANGE();
+
   auto make_validity =
     [stream, mr](json_column const& json_col) -> std::pair<rmm::device_buffer, size_type> {
-    if (json_col.current_offset == json_col.valid_count) { return {rmm::device_buffer{}, 0}; }
-
     return {rmm::device_buffer{json_col.validity.data(),
                                bitmask_allocation_size_bytes(json_col.current_offset),
                                stream,
@@ -1439,29 +1569,58 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
   switch (json_col.type) {
     case json_col_t::StringColumn: {
-      // move string_offsets to GPU and transform to string column
-      auto const col_size      = json_col.string_offsets.size();
-      using char_length_pair_t = thrust::pair<char const*, size_type>;
+      auto const col_size = json_col.string_offsets.size();
       CUDF_EXPECTS(json_col.string_offsets.size() == json_col.string_lengths.size(),
                    "string offset, string length mismatch");
-      rmm::device_uvector<char_length_pair_t> d_string_data(col_size, stream);
+
+      // Move string_offsets and string_lengths to GPU
       rmm::device_uvector<json_column::row_offset_t> d_string_offsets =
         cudf::detail::make_device_uvector_async(json_col.string_offsets, stream);
       rmm::device_uvector<json_column::row_offset_t> d_string_lengths =
         cudf::detail::make_device_uvector_async(json_col.string_lengths, stream);
+
+      // Prepare iterator that returns (string_offset, string_length)-tuples
       auto offset_length_it =
         thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin());
-      thrust::transform(rmm::exec_policy(stream),
-                        offset_length_it,
-                        offset_length_it + col_size,
-                        d_string_data.data(),
-                        [data = d_input.data()] __device__(auto ip) {
-                          return char_length_pair_t{data + thrust::get<0>(ip), thrust::get<1>(ip)};
-                        });
-      auto str_col_ptr                  = make_strings_column(d_string_data, stream, mr);
-      auto [result_bitmask, null_count] = make_validity(json_col);
-      str_col_ptr->set_null_mask(result_bitmask, null_count);
-      return {std::move(str_col_ptr), {{"offsets"}, {"chars"}}};
+
+      // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference
+      auto string_ranges_it =
+        thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) {
+          return thrust::pair<json_column::row_offset_t, std::size_t>{
+            thrust::get<0>(ip), static_cast<std::size_t>(thrust::get<1>(ip))};
+        });
+
+      // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion
+      auto string_spans_it = thrust::make_transform_iterator(
+        offset_length_it, [data = d_input.data()] __device__(auto ip) {
+          return thrust::pair<char const*, std::size_t>{
+            data + thrust::get<0>(ip), static_cast<std::size_t>(thrust::get<1>(ip))};
+        });
+
+      // Infer column type
+      auto target_type = cudf::io::detail::infer_data_type(
+        parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream);
+
+      // Convert strings to the inferred data type
+      auto col = experimental::detail::parse_data(string_spans_it,
+                                                  col_size,
+                                                  target_type,
+                                                  make_validity(json_col).first,
+                                                  parsing_options(options).view(),
+                                                  stream,
+                                                  mr);
+
+      // Reset nullable if we do not have nulls
+      if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); }
+
+      // For string columns return ["offsets", "chars"] schema
+      if (target_type.id() == type_id::STRING) {
+        return {std::move(col), {{"offsets"}, {"chars"}}};
+      }
+      // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema
+      else {
+        return {std::move(col), {}};
+      }
       break;
     }
     case json_col_t::StructColumn: {
@@ -1469,10 +1628,12 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
       std::vector<column_name_info> column_names{};
       size_type num_rows{json_col.current_offset};
       // Create children columns
-      for (auto const& col : json_col.child_columns) {
-        column_names.emplace_back(col.first);
-        auto const& child_col      = col.second;
-        auto [child_column, names] = json_column_to_cudf_column(child_col, d_input, stream, mr);
+      for (auto const& col_name : json_col.column_order) {
+        auto const& col = json_col.child_columns.find(col_name);
+        column_names.emplace_back(col->first);
+        auto const& child_col      = col->second;
+        auto [child_column, names] =
+          json_column_to_cudf_column(child_col, d_input, options, stream, mr);
         CUDF_EXPECTS(num_rows == child_column->size(),
                      "All children columns must have the same size");
         child_columns.push_back(std::move(child_column));
@@ -1496,8 +1657,8 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> json_column_to
       auto offsets_column =
         std::make_unique<column>(data_type{type_id::INT32}, num_rows, d_offsets.release());
       // Create children column
-      auto [child_column, names] =
-        json_column_to_cudf_column(json_col.child_columns.begin()->second, d_input, stream, mr);
+      auto [child_column, names] = json_column_to_cudf_column(
+        json_col.child_columns.begin()->second, d_input, options, stream, mr);
       column_names.back().children = names;
       auto [result_bitmask, null_count] = make_validity(json_col);
       return {make_lists_column(num_rows - 1,
@@ -1521,7 +1682,9 @@ table_with_metadata parse_nested_json(host_span<SymbolT const> input,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
 {
+  // Range of orchestrating/encapsulating function
   CUDF_FUNC_RANGE();
+
   auto const new_line_delimited_json = options.is_enabled_lines();
 
   // Allocate device memory for the JSON input & copy over to device
@@ -1536,6 +1699,10 @@ table_with_metadata parse_nested_json(host_span<SymbolT const> input,
   constexpr uint32_t token_end_offset_zero      = 0;
   constexpr uint32_t node_init_child_count_zero = 0;
 
+  // Whether the tokenizer stage should keep quote characters for string values
+  // If the tokenizer keeps the quote characters, they may be stripped during type casting
+  constexpr bool include_quote_chars = true;
+
   // We initialize the very root node and root column, which represent the JSON document being
   // parsed. That root node is a list node and that root column is a list column. The column has the
   // root node as its only row. The values parsed from the JSON input will be treated as follows:
@@ -1549,7 +1716,8 @@ table_with_metadata parse_nested_json(host_span<SymbolT const> input,
   // Push the root node onto the stack for the data path
   data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero});
 
-  make_json_column(root_column, data_path, input, d_input, options, stream, mr);
+  make_json_column(
+    root_column, data_path, input, d_input, options, include_quote_chars, stream, mr);
 
   // data_root refers to the root column of the data represented by the given JSON string
   auto const& data_root =
@@ -1570,12 +1738,14 @@ table_with_metadata parse_nested_json(host_span<SymbolT const> input,
   std::vector<column_name_info> out_column_names;
 
   // Iterate over the struct's child columns and convert to cudf column
-  for (auto const& [col_name, json_col] : root_struct_col.child_columns) {
+  for (auto const& col_name : root_struct_col.column_order) {
+    auto const& json_col = root_struct_col.child_columns.find(col_name)->second;
     // Insert this columns name into the schema
     out_column_names.emplace_back(col_name);
 
     // Get this JSON column's cudf column and schema info
-    auto [cudf_col, col_name_info] = json_column_to_cudf_column(json_col, d_input, stream, mr);
+    auto [cudf_col, col_name_info] =
+      json_column_to_cudf_column(json_col, d_input, options, stream, mr);
     out_column_names.back().children = std::move(col_name_info);
     out_columns.emplace_back(std::move(cudf_col));
   }
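Taken together, the .cu changes replace "materialize every leaf column as strings" with an infer-then-cast pipeline: tokenize with quotes kept, run `infer_data_type` over the value ranges, then `parse_data` into the inferred type. A hand-worked sketch of the leaf-column results this should yield, derived from the tests below rather than executed here:

```cpp
// Illustrative only:
//
//   tokenized row values        inferred type      resulting column
//   ------------------------    ---------------    ---------------------------
//   1.0    1.1    2.1           FLOAT64            {1.0, 1.1, 2.1}
//   "0.0"  <absent>  "2.0"      STRING             quoted strings, middle null
//   null   null   null          all-null (int8)    fully-null column
//
// Keeping the quotes during tokenization is what lets inference distinguish
// the string "123" from the number 123; unless keep_quotes is enabled, the
// quotes are stripped again when the strings are cast to their final type.
```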
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 232aaa51ef3..7f698774084 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -1006,4 +1006,67 @@ TEST_F(JsonReaderTest, ExperimentalLinesNoOmissions)
   }
 }
 
+TEST_F(JsonReaderTest, TestColumnOrder)
+{
+  std::string const json_string =
+    // Expected order:
+    // root: b, c, a, d
+    // a: 2, 0, 1
+    {R"({"b":"b0"}
+    {"c":"c1","a":{"2":null}}
+    {"d":"d2","a":{"0":"a2.0", "2":"a2.2"}}
+    {"b":"b3","a":{"1":null, "2":"a3.2"}})"};
+
+  std::vector<std::string> const root_col_names{"b", "c", "a", "d"};
+  std::vector<std::string> const a_child_col_names{"2", "0", "1"};
+
+  // Initialize parsing options (reading json lines)
+  cudf::io::json_reader_options json_lines_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{json_string.c_str(), json_string.size()})
+      .lines(true)
+      .experimental(true);
+
+  // Read in data using nested JSON reader
+  cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options);
+
+  // Verify root column order (assert to avoid OOB access)
+  ASSERT_EQ(new_reader_table.metadata.schema_info.size(), root_col_names.size());
+
+  for (std::size_t i = 0; i < root_col_names.size(); i++) {
+    auto const& root_col_name = root_col_names[i];
+    EXPECT_EQ(new_reader_table.metadata.schema_info[i].name, root_col_name);
+  }
+
+  // Verify nested child column order (assert to avoid OOB access)
+  ASSERT_EQ(new_reader_table.metadata.schema_info[2].children.size(), a_child_col_names.size());
+  for (std::size_t i = 0; i < a_child_col_names.size(); i++) {
+    auto const& a_child_col_name = a_child_col_names[i];
+    EXPECT_EQ(new_reader_table.metadata.schema_info[2].children[i].name, a_child_col_name);
+  }
+
+  // Verify data of root columns
+  ASSERT_EQ(root_col_names.size(), new_reader_table.tbl->num_columns());
+  column_wrapper<cudf::string_view> root_col_data_b{{"b0", "", "", "b3"},
+                                                    {true, false, false, true}};
+  column_wrapper<cudf::string_view> root_col_data_c{{"", "c1", "", ""},
+                                                    {false, true, false, false}};
+  column_wrapper<cudf::string_view> root_col_data_d{{"", "", "d2", ""},
+                                                    {false, false, true, false}};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(root_col_data_b, new_reader_table.tbl->get_column(0));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(root_col_data_c, new_reader_table.tbl->get_column(1));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(root_col_data_d, new_reader_table.tbl->get_column(3));
+
+  // Verify data of child columns of column 'a'
+  auto const col_a = new_reader_table.tbl->get_column(2);
+  ASSERT_EQ(a_child_col_names.size(), col_a.num_children());
+  column_wrapper<cudf::string_view> col_a2{{"", "", "a2.2", "a3.2"}, {false, false, true, true}};
+  column_wrapper<cudf::string_view> col_a0{{"", "", "a2.0", ""}, {false, false, true, false}};
+  // col a.1 is inferred as all-null
+  int8_wrapper col_a1{{0, 0, 0, 0}, {false, false, false, false}};
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a2, col_a.child(0));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a0, col_a.child(1));
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a1, col_a.child(2));
+}
+
 CUDF_TEST_PROGRAM_MAIN()
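The expected orders in `TestColumnOrder` follow from first-encounter bookkeeping rather than from key order; walking the four input rows makes the expectations obvious:

```cpp
// row 0: {"b":"b0"}                          root: b
// row 1: {"c":"c1","a":{"2":null}}           root: b, c, a        a: 2
// row 2: {"d":"d2","a":{"0":..., "2":...}}   root: b, c, a, d     a: 2, 0
// row 3: {"b":"b3","a":{"1":null, "2":...}}  root: unchanged      a: 2, 0, 1
//
// A bare std::map traversal would instead have produced a, b, c, d and
// 0, 1, 2, which is exactly what the new column_order member avoids.
```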
R"([{"0":{},"1":[],"2":{}},{"1":[[""],[]],"2":{"2":""}},{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}])"; - // Prepare cuda stream for data transfers & kernels constexpr auto stream = cudf::default_stream_value; // Default parsing options - cudf::io::json_reader_options default_options{}; + cudf::io::json_reader_options options{}; + options.enable_keep_quotes(true); - // Binary parquet data containing the same data as the data represented by the JSON string. - // We could add a dataset to include this file, but we don't want tests in cudf to have data. - const unsigned char parquet_data[] = { - 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x18, 0x15, 0x18, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, - 0x06, 0x15, 0x06, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x21, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x31, 0x15, 0x00, 0x15, 0x24, 0x15, 0x20, 0x2C, 0x15, 0x08, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, - 0x00, 0x00, 0x12, 0x18, 0x03, 0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x05, 0x07, 0x04, 0x2D, 0x00, - 0x01, 0x01, 0x15, 0x00, 0x15, 0x22, 0x15, 0x22, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, 0x06, 0x15, - 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x04, 0x07, 0x00, 0x00, 0x00, 0x57, 0x26, 0x52, - 0x52, 0x3D, 0x2B, 0x49, 0x15, 0x00, 0x15, 0x14, 0x15, 0x14, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, - 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x04, 0x00, 0x00, 0x00, 0x00, 0x15, - 0x00, 0x15, 0x14, 0x15, 0x14, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x03, 0x02, 0x00, 0x00, 0x00, 0x00, 0x15, 0x02, 0x19, 0xCC, 0x48, 0x06, - 0x73, 0x63, 0x68, 0x65, 0x6D, 0x61, 0x15, 0x06, 0x00, 0x35, 0x02, 0x18, 0x01, 0x30, 0x15, 0x02, - 0x00, 0x15, 0x0C, 0x25, 0x02, 0x18, 0x01, 0x61, 0x25, 0x00, 0x00, 0x35, 0x02, 0x18, 0x01, 0x31, - 0x15, 0x02, 0x15, 0x06, 0x00, 0x35, 0x04, 0x18, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x15, 0x02, 0x00, - 0x35, 0x00, 0x18, 0x07, 0x65, 0x6C, 0x65, 0x6D, 0x65, 0x6E, 0x74, 0x15, 0x02, 0x15, 0x06, 0x00, - 0x35, 0x04, 0x18, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x15, 0x02, 0x00, 0x15, 0x0C, 0x25, 0x00, 0x18, - 0x07, 0x65, 0x6C, 0x65, 0x6D, 0x65, 0x6E, 0x74, 0x25, 0x00, 0x00, 0x35, 0x00, 0x18, 0x01, 0x32, - 0x15, 0x06, 0x00, 0x15, 0x0C, 0x25, 0x02, 0x18, 0x01, 0x30, 0x25, 0x00, 0x00, 0x15, 0x0C, 0x25, - 0x02, 0x18, 0x01, 0x31, 0x25, 0x00, 0x00, 0x15, 0x0C, 0x25, 0x02, 0x18, 0x01, 0x32, 0x25, 0x00, - 0x00, 0x16, 0x06, 0x19, 0x1C, 0x19, 0x5C, 0x26, 0x00, 0x1C, 0x15, 0x0C, 0x19, 0x25, 0x00, 0x06, - 0x19, 0x28, 0x01, 0x30, 0x01, 0x61, 0x15, 0x00, 0x16, 0x06, 0x16, 0x3A, 0x16, 0x3A, 0x26, 0x08, - 0x3C, 0x36, 0x04, 0x28, 0x01, 0x31, 0x18, 0x01, 0x31, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, 0x15, - 0x0C, 0x19, 0x25, 0x00, 0x06, 0x19, 0x58, 0x01, 0x31, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x07, 0x65, - 0x6C, 0x65, 0x6D, 0x65, 0x6E, 0x74, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x07, 0x65, 0x6C, 0x65, 0x6D, - 0x65, 0x6E, 0x74, 0x15, 0x02, 0x16, 0x08, 0x16, 0x46, 0x16, 0x42, 0x26, 0x42, 0x3C, 0x36, 0x00, - 0x28, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, 0x15, 0x0C, 0x19, 0x25, 0x00, 0x06, - 0x19, 0x28, 0x01, 0x32, 0x01, 0x30, 0x15, 0x00, 0x16, 0x06, 0x16, 0x44, 0x16, 0x44, 0x26, 0x84, - 0x01, 0x3C, 0x36, 0x04, 0x28, 0x07, 0x57, 0x26, 0x52, 0x52, 0x3D, 0x2B, 0x49, 0x18, 0x07, 0x57, - 0x26, 0x52, 0x52, 0x3D, 0x2B, 0x49, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, 0x15, 0x0C, 0x19, 0x25, - 0x00, 0x06, 0x19, 0x28, 0x01, 0x32, 0x01, 0x31, 0x15, 0x00, 0x16, 0x06, 0x16, 0x36, 0x16, 0x36, - 0x26, 0xC8, 0x01, 0x3C, 0x36, 0x04, 0x28, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, - 0x15, 0x0C, 
-    0x15, 0x0C, 0x19, 0x25, 0x00, 0x06, 0x19, 0x28, 0x01, 0x32, 0x01, 0x32, 0x15, 0x00, 0x16, 0x06,
-    0x16, 0x36, 0x16, 0x36, 0x26, 0xFE, 0x01, 0x3C, 0x36, 0x04, 0x28, 0x00, 0x18, 0x00, 0x00, 0x00,
-    0x00, 0x16, 0xAC, 0x02, 0x16, 0x06, 0x00, 0x19, 0x1C, 0x18, 0x06, 0x70, 0x61, 0x6E, 0x64, 0x61,
-    0x73, 0x18, 0xFE, 0x04, 0x7B, 0x22, 0x69, 0x6E, 0x64, 0x65, 0x78, 0x5F, 0x63, 0x6F, 0x6C, 0x75,
-    0x6D, 0x6E, 0x73, 0x22, 0x3A, 0x20, 0x5B, 0x7B, 0x22, 0x6B, 0x69, 0x6E, 0x64, 0x22, 0x3A, 0x20,
-    0x22, 0x72, 0x61, 0x6E, 0x67, 0x65, 0x22, 0x2C, 0x20, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A,
-    0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x2C, 0x20, 0x22, 0x73, 0x74, 0x61, 0x72, 0x74, 0x22, 0x3A, 0x20,
-    0x30, 0x2C, 0x20, 0x22, 0x73, 0x74, 0x6F, 0x70, 0x22, 0x3A, 0x20, 0x33, 0x2C, 0x20, 0x22, 0x73,
-    0x74, 0x65, 0x70, 0x22, 0x3A, 0x20, 0x31, 0x7D, 0x5D, 0x2C, 0x20, 0x22, 0x63, 0x6F, 0x6C, 0x75,
-    0x6D, 0x6E, 0x5F, 0x69, 0x6E, 0x64, 0x65, 0x78, 0x65, 0x73, 0x22, 0x3A, 0x20, 0x5B, 0x7B, 0x22,
-    0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x2C, 0x20, 0x22, 0x66, 0x69,
-    0x65, 0x6C, 0x64, 0x5F, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x2C,
-    0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, 0x73, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20,
-    0x22, 0x75, 0x6E, 0x69, 0x63, 0x6F, 0x64, 0x65, 0x22, 0x2C, 0x20, 0x22, 0x6E, 0x75, 0x6D, 0x70,
-    0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, 0x63, 0x74,
-    0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3A, 0x20, 0x7B,
-    0x22, 0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E, 0x67, 0x22, 0x3A, 0x20, 0x22, 0x55, 0x54, 0x46,
-    0x2D, 0x38, 0x22, 0x7D, 0x7D, 0x5D, 0x2C, 0x20, 0x22, 0x63, 0x6F, 0x6C, 0x75, 0x6D, 0x6E, 0x73,
-    0x22, 0x3A, 0x20, 0x5B, 0x7B, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x30, 0x22,
-    0x2C, 0x20, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x5F, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20,
-    0x22, 0x30, 0x22, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, 0x73, 0x5F, 0x74, 0x79, 0x70,
-    0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6E,
-    0x75, 0x6D, 0x70, 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A,
-    0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22,
-    0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x7D, 0x2C, 0x20, 0x7B, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22,
-    0x3A, 0x20, 0x22, 0x31, 0x22, 0x2C, 0x20, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x5F, 0x6E, 0x61,
-    0x6D, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x31, 0x22, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61,
-    0x73, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6C, 0x69, 0x73, 0x74, 0x5B, 0x6C,
-    0x69, 0x73, 0x74, 0x5B, 0x75, 0x6E, 0x69, 0x63, 0x6F, 0x64, 0x65, 0x5D, 0x5D, 0x22, 0x2C, 0x20,
-    0x22, 0x6E, 0x75, 0x6D, 0x70, 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F,
-    0x62, 0x6A, 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74,
-    0x61, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x7D, 0x2C, 0x20, 0x7B, 0x22, 0x6E, 0x61, 0x6D,
-    0x65, 0x22, 0x3A, 0x20, 0x22, 0x32, 0x22, 0x2C, 0x20, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x5F,
-    0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x32, 0x22, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E,
-    0x64, 0x61, 0x73, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65,
-    0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6E, 0x75, 0x6D, 0x70, 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65,
-    0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65,
-    0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x7D, 0x5D, 0x2C,
-    0x20, 0x22, 0x63, 0x72, 0x65, 0x61, 0x74, 0x6F, 0x72, 0x22, 0x3A, 0x20, 0x7B, 0x22, 0x6C, 0x69,
-    0x62, 0x72, 0x61, 0x72, 0x79, 0x22, 0x3A, 0x20, 0x22, 0x70, 0x79, 0x61, 0x72, 0x72, 0x6F, 0x77,
-    0x22, 0x2C, 0x20, 0x22, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x22, 0x3A, 0x20, 0x22, 0x38,
-    0x2E, 0x30, 0x2E, 0x31, 0x22, 0x7D, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, 0x73, 0x5F,
-    0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x22, 0x3A, 0x20, 0x22, 0x31, 0x2E, 0x34, 0x2E, 0x33,
-    0x22, 0x7D, 0x00, 0x29, 0x5C, 0x1C, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x1C, 0x00,
-    0x00, 0x1C, 0x00, 0x00, 0x00, 0x0B, 0x04, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31};
-
-  // Read in the data via parquet reader
-  cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder(
-    cudf::io::source_info{reinterpret_cast<char const*>(parquet_data), sizeof(parquet_data)});
-  auto result = cudf::io::read_parquet(read_opts);
-
-  // Read in the data via the JSON parser
+  std::string const input = R"( [{"a":"0.0", "b":1.0}, {"b":1.1}, {"b":2.1, "a":"2.0"}] )";
+
+  // Get the JSON's tree representation
   auto const cudf_table = cuio_json::detail::parse_nested_json(
-    cudf::host_span<SymbolT const>{input.data(), input.size()}, default_options, stream);
+    cudf::host_span<SymbolT const>{input.data(), input.size()}, options, stream);
+
+  auto constexpr expected_col_count = 2;
+  EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count);
+
+  auto expected_col1 =
+    cudf::test::strings_column_wrapper({R"("0.0")", R"()", R"("2.0")"}, {true, false, true});
+  auto expected_col2 = cudf::test::fixed_width_column_wrapper<double>({1.0, 1.1, 2.1});
+  cudf::column_view parsed_col1 = cudf_table.tbl->get_column(0);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1);
+  cudf::column_view parsed_col2 = cudf_table.tbl->get_column(1);
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2);
+}
+
+TEST_F(JsonTest, ExpectFailMixStructAndList)
+{
+  using cuio_json::SymbolT;
+
+  // Prepare cuda stream for data transfers & kernels
+  constexpr auto stream = cudf::default_stream_value;
+
+  // Default parsing options
+  cudf::io::json_reader_options options{};
+  options.enable_keep_quotes(true);
+
+  std::vector<std::string> const inputs_fail{
+    R"( [{"a":[123], "b":1.0}, {"b":1.1}, {"b":2.1, "a":{"0":123}}] )",
+    R"( [{"a":{"0":"foo"}, "b":1.0}, {"b":1.1}, {"b":2.1, "a":[123]}] )",
+    R"( [{"a":{"0":null}, "b":1.0}, {"b":1.1}, {"b":2.1, "a":[123]}] )"};
+
+  std::vector<std::string> const inputs_succeed{
+    R"( [{"a":[123, {"0": 123}], "b":1.0}, {"b":1.1}, {"b":2.1}] )",
+    R"( [{"a":[123, "123"], "b":1.0}, {"b":1.1}, {"b":2.1}] )"};
+
+  for (auto const& input : inputs_fail) {
+    CUDF_EXPECT_THROW_MESSAGE(
+      auto const cudf_table = cuio_json::detail::parse_nested_json(
+        cudf::host_span<SymbolT const>{input.data(), input.size()}, options, stream),
+      "A mix of lists and structs within the same column is not supported");
+  }
 
-  // Verify that the data read via parquet matches the data read via JSON
-  CUDF_TEST_EXPECT_TABLES_EQUAL(cudf_table.tbl->view(), result.tbl->view());
+  for (auto const& input : inputs_succeed) {
+    CUDF_EXPECT_NO_THROW(
+      auto const cudf_table = cuio_json::detail::parse_nested_json(
+        cudf::host_span<SymbolT const>{input.data(), input.size()}, options, stream));
+  }
 
-  // Verify that the schema read via parquet matches the schema read via JSON
-  cudf::test::expect_metadata_equal(cudf_table.metadata, result.metadata);
 }
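The pass/fail split in `ExpectFailMixStructAndList` traces back to the two branches added to `append_row`; a short walkthrough of one failing and one succeeding input:

```cpp
// fail:    {"a":[123]} followed by {"a":{"0":123}}
//          Column "a" is already a ListColumn, so appending a StructColumn row
//          is exactly the List/Struct combination that append_row's
//          CUDF_EXPECTS rejects (and symmetrically for Struct then List).
//
// succeed: {"a":[123, {"0": 123}]}
//          Both items feed the list's single child column: it starts out as a
//          string column for 123, then the new String -> Struct promotion
//          path converts it to a struct and invalidates the earlier row.
```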
diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py
index f3d9180d44d..f6ca4691669 100644
--- a/python/cudf/cudf/tests/test_json.py
+++ b/python/cudf/cudf/tests/test_json.py
@@ -615,6 +615,48 @@ def test_json_nested_lines(data):
     )
     bytes.seek(0)
     pdf = pd.read_json(bytes, orient="records", lines=True)
-    # In the second test-case:
-    # Pandas omits "f1" in first row, so we have to enforce a common schema
-    assert df.to_arrow().equals(pa.Table.from_pandas(pdf))
+    # In the second test-case we need to take a detour via pyarrow:
+    # pandas omits "f1" in the first row, so we have to enforce a common
+    # schema, such that pandas has the f1 member with null.
+    # Also, pyarrow may choose a different ordering of a nested column's
+    # children, even though the key-value pairs are correct.
+    pa_table_pdf = pa.Table.from_pandas(
+        pdf, schema=df.to_arrow().schema, safe=False
+    )
+    assert df.to_arrow().equals(pa_table_pdf)
+
+
+def test_json_nested_data():
+    json_str = (
+        '[{"0":{},"2":{}},{"1":[[""],[]],"2":{"2":""}},'
+        '{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}]'
+    )
+    df = cudf.read_json(
+        StringIO(json_str), engine="cudf_experimental", orient="records"
+    )
+    pdf = pd.read_json(StringIO(json_str), orient="records")
+    pdf.columns = pdf.columns.astype("str")
+    pa_table_pdf = pa.Table.from_pandas(
+        pdf, schema=df.to_arrow().schema, safe=False
+    )
+    assert df.to_arrow().equals(pa_table_pdf)
+
+
+def test_json_types_data():
+    # 0:<0:string,1:float>
+    # 1:list<int>
+    # 2:<0:bool>
+    json_str = (
+        '[{"0":null,"2":{}},'
+        '{"1":[123],"0":{"0":"foo","1":123.4},"2":{"0":false}},'
+        '{"0":{},"1":[],"2":{"0":null}}]'
+    )
+    df = cudf.read_json(
+        StringIO(json_str), engine="cudf_experimental", orient="records"
+    )
+    pdf = pd.read_json(StringIO(json_str), orient="records")
+    pdf.columns = pdf.columns.astype("str")
+    pa_table_pdf = pa.Table.from_pandas(
+        pdf, schema=df.to_arrow().schema, safe=False
+    )
+    assert df.to_arrow().equals(pa_table_pdf)