Skip to content

Commit

Permalink
Support casting of Map type to string in JSON reader (#14936)
Browse files Browse the repository at this point in the history
Addresses part of #14288
Depends on  #14939 (mixed type ignore nulls fix)

In the input schema, if a struct column is given as STRING type, it's forced to be a STRING column.
This could be used to support map type in spark JSON reader. (Force a map type to be a STRING, and use different parser to extract this string column as key, value columns)
To enable this forcing, mixed type as string should be enabled in json_reader_options.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Andy Grove (https://github.com/andygrove)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Shruti Shivakumar (https://github.com/shrshi)
  - Bradley Dice (https://github.com/bdice)

URL: #14936
  • Loading branch information
karthikeyann authored Mar 11, 2024
1 parent b08dd9b commit 6a03827
Show file tree
Hide file tree
Showing 6 changed files with 235 additions and 9 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -384,6 +384,7 @@ add_library(
src/io/json/read_json.cu
src/io/json/legacy/json_gpu.cu
src/io/json/legacy/reader_impl.cu
src/io/json/parser_features.cpp
src/io/json/write_json.cu
src/io/orc/aggregate_orc_metadata.cpp
src/io/orc/dict_enc.cu
Expand Down
2 changes: 2 additions & 0 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -333,6 +333,7 @@ class json_reader_options {

/**
* @brief Set whether to parse mixed types as a string column.
* Also enables forcing to read a struct as string column using schema.
*
* @param val Boolean value to enable/disable parsing mixed types as a string column
*/
Expand Down Expand Up @@ -491,6 +492,7 @@ class json_reader_options_builder {

/**
* @brief Set whether to parse mixed types as a string column.
* Also enables forcing to read a struct as string column using schema.
*
* @param val Boolean value to enable/disable parsing mixed types as a string column
* @return this for chaining
Expand Down
37 changes: 28 additions & 9 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -496,15 +496,16 @@ void make_device_json_column(device_span<SymbolT const> input,
rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin());

NodeIndexT const row_array_parent_col_id = [&]() {
if (!is_array_of_arrays) return parent_node_sentinel;
auto const list_node_index = is_enabled_lines ? 0 : 1;
NodeIndexT value;
CUDF_CUDA_TRY(cudaMemcpyAsync(&value,
col_ids.data() + list_node_index,
sizeof(NodeIndexT),
cudaMemcpyDefault,
stream.value()));
stream.synchronize();
NodeIndexT value = parent_node_sentinel;
if (!col_ids.empty()) {
auto const list_node_index = is_enabled_lines ? 0 : 1;
CUDF_CUDA_TRY(cudaMemcpyAsync(&value,
col_ids.data() + list_node_index,
sizeof(NodeIndexT),
cudaMemcpyDefault,
stream.value()));
stream.synchronize();
}
return value;
}();

Expand Down Expand Up @@ -592,6 +593,12 @@ void make_device_json_column(device_span<SymbolT const> input,
col.column_order.clear();
};

path_from_tree tree_path{column_categories,
column_parent_ids,
column_names,
is_array_of_arrays,
row_array_parent_col_id};

// 2. generate nested columns tree and its device_memory
// reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order.
auto h_range_col_id_it =
Expand Down Expand Up @@ -642,6 +649,7 @@ void make_device_json_column(device_span<SymbolT const> input,
ignore_vals[this_col_id] = 1;
continue;
}

// If the child is already found,
// replace if this column is a nested column and the existing was a value column
// ignore this column if this column is a value column and the existing was a nested column
Expand Down Expand Up @@ -701,6 +709,17 @@ void make_device_json_column(device_span<SymbolT const> input,
"A mix of lists and structs within the same column is not supported");
}
}
if (is_enabled_mixed_types_as_string) {
// get path of this column, check if it is a struct forced as string, and enforce it
auto nt = tree_path.get_path(this_col_id);
std::optional<data_type> user_dt = get_path_data_type(nt, options);
if (column_categories[this_col_id] == NC_STRUCT and user_dt.has_value() and
user_dt.value().id() == type_id::STRING) {
is_mixed_type_column[this_col_id] = 1;
column_categories[this_col_id] = NC_STR;
}
}

CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name);
// move into parent
device_json_column col(stream, mr);
Expand Down
26 changes: 26 additions & 0 deletions cpp/src/io/json/nested_json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -309,6 +309,32 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> input,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Get the path data type of a column by path if present in input schema
*
* @param path path of the column
* @param options json reader options which holds schema
* @return data type of the column if present
*/
std::optional<data_type> get_path_data_type(
host_span<std::pair<std::string, cudf::io::json::NodeT>> path,
cudf::io::json_reader_options const& options);

/**
* @brief Helper class to get path of a column by column id from reduced column tree
*
*/
struct path_from_tree {
host_span<NodeT const> column_categories;
host_span<NodeIndexT const> column_parent_ids;
host_span<std::string const> column_names;
bool is_array_of_arrays;
NodeIndexT const row_array_parent_col_id;

using path_rep = std::pair<std::string, cudf::io::json::NodeT>;
std::vector<path_rep> get_path(NodeIndexT this_col_id);
};

/**
* @brief Parses the given JSON string and generates table from the given input.
*
Expand Down
126 changes: 126 additions & 0 deletions cpp/src/io/json/parser_features.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "nested_json.hpp"

#include <cudf/detail/utilities/visitor_overload.hpp>

#include <optional>
#include <string>
#include <vector>

namespace cudf::io::json::detail {

std::optional<schema_element> child_schema_element(std::string const& col_name,
cudf::io::json_reader_options const& options)
{
return std::visit(
cudf::detail::visitor_overload{
[col_name](std::vector<data_type> const& user_dtypes) -> std::optional<schema_element> {
auto column_index = atol(col_name.data());
return (static_cast<std::size_t>(column_index) < user_dtypes.size())
? std::optional<schema_element>{{user_dtypes[column_index]}}
: std::optional<schema_element>{};
},
[col_name](
std::map<std::string, data_type> const& user_dtypes) -> std::optional<schema_element> {
return (user_dtypes.find(col_name) != std::end(user_dtypes))
? std::optional<schema_element>{{user_dtypes.find(col_name)->second}}
: std::optional<schema_element>{};
},
[col_name](
std::map<std::string, schema_element> const& user_dtypes) -> std::optional<schema_element> {
return (user_dtypes.find(col_name) != std::end(user_dtypes))
? user_dtypes.find(col_name)->second
: std::optional<schema_element>{};
}},
options.get_dtypes());
}

// example schema and its path.
// "a": int {"a", int}
// "a": [ int ] {"a", list}, {"element", int}
// "a": { "b": int} {"a", struct}, {"b", int}
// "a": [ {"b": int }] {"a", list}, {"element", struct}, {"b", int}
// "a": [ null] {"a", list}, {"element", str}
// back() is root.
// front() is leaf.
std::optional<data_type> get_path_data_type(
host_span<std::pair<std::string, cudf::io::json::NodeT>> path, schema_element const& root)
{
if (path.empty() || path.size() == 1) {
return root.type;
} else {
if (path.back().second == NC_STRUCT && root.type.id() == type_id::STRUCT) {
auto const child_name = path.first(path.size() - 1).back().first;
auto const child_schema_it = root.child_types.find(child_name);
return (child_schema_it != std::end(root.child_types))
? get_path_data_type(path.first(path.size() - 1), child_schema_it->second)
: std::optional<data_type>{};
} else if (path.back().second == NC_LIST && root.type.id() == type_id::LIST) {
auto const child_schema_it = root.child_types.find(list_child_name);
return (child_schema_it != std::end(root.child_types))
? get_path_data_type(path.first(path.size() - 1), child_schema_it->second)
: std::optional<data_type>{};
}
return std::optional<data_type>{};
}
}

std::optional<data_type> get_path_data_type(
host_span<std::pair<std::string, cudf::io::json::NodeT>> path,
cudf::io::json_reader_options const& options)
{
if (path.empty()) return {};
std::optional<schema_element> col_schema = child_schema_element(path.back().first, options);
// check if it has value, then do recursive call and return.
if (col_schema.has_value()) {
return get_path_data_type(path, col_schema.value());
} else {
return {};
}
}

// idea: write a memoizer using template and lambda?, then call recursively.
std::vector<path_from_tree::path_rep> path_from_tree::get_path(NodeIndexT this_col_id)
{
std::vector<path_rep> path;
// TODO Need to stop at row root. so, how to find row root?
while (this_col_id != parent_node_sentinel) {
auto type = column_categories[this_col_id];
std::string name = "";
// TODO make this ifelse into a separate lambda function, along with parent_col_id.
auto parent_col_id = column_parent_ids[this_col_id];
if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) {
if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) {
name = column_names[this_col_id];
} else {
name = list_child_name;
}
} else if (column_categories[parent_col_id] == NC_FN) {
auto field_name_col_id = parent_col_id;
parent_col_id = column_parent_ids[parent_col_id];
name = column_names[field_name_col_id];
}
// "name": type/schema
path.emplace_back(name, type);
this_col_id = parent_col_id;
if (this_col_id == row_array_parent_col_id) return path;
}
return {};
}

} // namespace cudf::io::json::detail
52 changes: 52 additions & 0 deletions cpp/tests/io/json_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2239,4 +2239,56 @@ TEST_F(JsonReaderTest, MixedTypes)
expected_list);
}

TEST_F(JsonReaderTest, MapTypes)
{
using cudf::type_id;
// Testing function for mixed types in JSON (for spark json reader)
auto test_fn = [](std::string_view json_string, bool lines, std::vector<type_id> types) {
std::map<std::string, cudf::io::schema_element> dtype_schema{
{"foo1", {data_type{type_id::STRING}}}, // list won't be a string
{"foo2", {data_type{type_id::STRING}}}, // struct forced as a string
{"1", {data_type{type_id::STRING}}},
{"2", {data_type{type_id::STRING}}},
{"bar", {dtype<int32_t>()}},
};

cudf::io::json_reader_options in_options =
cudf::io::json_reader_options::builder(
cudf::io::source_info{json_string.data(), json_string.size()})
.dtypes(dtype_schema)
.mixed_types_as_string(true)
.lines(lines);

cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
EXPECT_EQ(result.tbl->num_columns(), types.size());
int i = 0;
for (auto& col : result.tbl->view()) {
EXPECT_EQ(col.type().id(), types[i]) << "column[" << i << "].type";
i++;
}
std::cout << "\n";
};

// json
test_fn(R"([{ "foo1": [1,2,3], "bar": 123 },
{ "foo2": { "a": 1 }, "bar": 456 }])",
false,
{type_id::LIST, type_id::INT32, type_id::STRING});
// jsonl
test_fn(R"( { "foo1": [1,2,3], "bar": 123 }
{ "foo2": { "a": 1 }, "bar": 456 })",
true,
{type_id::LIST, type_id::INT32, type_id::STRING});
// jsonl-array
test_fn(R"([123, [1,2,3]]
[456, null, { "a": 1 }])",
true,
{type_id::INT64, type_id::LIST, type_id::STRING});
// json-array
test_fn(R"([[[1,2,3], null, 123],
[null, { "a": 1 }, 456 ]])",
false,
{type_id::LIST, type_id::STRING, type_id::STRING});
}

CUDF_TEST_PROGRAM_MAIN()

0 comments on commit 6a03827

Please sign in to comment.