Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add JSON option to prune columns #14996

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ class json_reader_options {
bool _lines = false;
// Parse mixed types as a string column
bool _mixed_types_as_string = false;
// Use dtypes as filter instead of type inference suggestion
bool _use_dtypes_as_filter = false;
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved

// Bytes to skip from the start
size_t _byte_range_offset = 0;
Expand Down Expand Up @@ -240,6 +242,13 @@ class json_reader_options {
*/
bool is_enabled_mixed_types_as_string() const { return _mixed_types_as_string; }

/**
 * @brief Whether to use dtypes as filter instead of type inference suggestion.
 *
 * When enabled, only the columns named in the user-supplied dtypes are
 * materialized (i.e. dtypes prune columns); when disabled, dtypes merely
 * guide type inference.
 *
 * @return `true` if dtypes is used as filter
 */
[[nodiscard]] bool is_enabled_use_dtypes_as_filter() const { return _use_dtypes_as_filter; }

/**
* @brief Whether to parse dates as DD/MM versus MM/DD.
*
Expand Down Expand Up @@ -339,6 +348,13 @@ class json_reader_options {
*/
void enable_mixed_types_as_string(bool val) { _mixed_types_as_string = val; }

/**
 * @brief Enables or disables treating the user-specified dtypes as a strict
 * column filter rather than as type inference suggestions.
 *
 * @param val Boolean value to enable/disable dtypes use as filter
 */
void enable_use_dtypes_as_filter(bool val)
{
  _use_dtypes_as_filter = val;
}

/**
* @brief Set whether to parse dates as DD/MM versus MM/DD.
*
Expand Down Expand Up @@ -503,6 +519,18 @@ class json_reader_options_builder {
return *this;
}

/**
 * @brief Set whether the user-provided dtypes act as a column filter instead
 * of type inference suggestions.
 *
 * @param val Boolean value to enable/disable dtypes use as filter
 * @return this for chaining
 */
json_reader_options_builder& use_dtypes_as_filter(bool val)
{
  // Delegate to the options setter so the builder and setter stay in sync.
  options.enable_use_dtypes_as_filter(val);
  return *this;
}

/**
* @brief Set whether to parse dates as DD/MM versus MM/DD.
*
Expand Down
74 changes: 45 additions & 29 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -887,6 +887,7 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
device_json_column& json_col,
device_span<SymbolT const> d_input,
cudf::io::parse_options const& options,
bool use_dtypes_as_filter,
std::optional<schema_element> schema,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
Expand Down Expand Up @@ -977,13 +978,16 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
for (auto const& col_name : json_col.column_order) {
auto const& col = json_col.child_columns.find(col_name);
column_names.emplace_back(col->first);
auto& child_col = col->second;
auto [child_column, names] = device_json_column_to_cudf_column(
child_col, d_input, options, get_child_schema(col_name), stream, mr);
CUDF_EXPECTS(num_rows == child_column->size(),
"All children columns must have the same size");
child_columns.push_back(std::move(child_column));
column_names.back().children = names;
auto& child_col = col->second;
auto child_schema_element = get_child_schema(col_name);
if (!use_dtypes_as_filter or child_schema_element.has_value()) {
auto [child_column, names] = device_json_column_to_cudf_column(
child_col, d_input, options, use_dtypes_as_filter, child_schema_element, stream, mr);
CUDF_EXPECTS(num_rows == child_column->size(),
"All children columns must have the same size");
child_columns.push_back(std::move(child_column));
column_names.back().children = names;
}
}
auto [result_bitmask, null_count] = make_validity(json_col);
// The null_mask is set after creation of struct column is to skip the superimpose_nulls and
Expand All @@ -1006,8 +1010,12 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
rmm::device_buffer{},
0);
// Create children column
auto child_schema_element = json_col.child_columns.empty()
? std::optional<schema_element>{}
: get_child_schema(json_col.child_columns.begin()->first);
auto [child_column, names] =
json_col.child_columns.empty()
json_col.child_columns.empty() or
(use_dtypes_as_filter and !child_schema_element.has_value())
? std::pair<std::unique_ptr<column>,
// EMPTY type could not be used because gather throws an exception on EMPTY type.
std::vector<column_name_info>>{std::make_unique<column>(
Expand All @@ -1017,13 +1025,13 @@ std::pair<std::unique_ptr<column>, std::vector<column_name_info>> device_json_co
rmm::device_buffer{},
0),
std::vector<column_name_info>{}}
: device_json_column_to_cudf_column(
json_col.child_columns.begin()->second,
d_input,
options,
get_child_schema(json_col.child_columns.begin()->first),
stream,
mr);
: device_json_column_to_cudf_column(json_col.child_columns.begin()->second,
d_input,
options,
use_dtypes_as_filter,
child_schema_element,
stream,
mr);
column_names.back().children = names;
auto [result_bitmask, null_count] = make_validity(json_col);
auto ret_col = make_lists_column(num_rows,
Expand Down Expand Up @@ -1135,8 +1143,6 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
size_type column_index = 0;
for (auto const& col_name : root_struct_col.column_order) {
auto& json_col = root_struct_col.child_columns.find(col_name)->second;
// Insert this column's name into the schema
out_column_names.emplace_back(col_name);

std::optional<schema_element> child_schema_element = std::visit(
cudf::detail::visitor_overload{
Expand Down Expand Up @@ -1179,18 +1185,28 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
debug_schema_print(child_schema_element);
#endif

// Get this JSON column's cudf column and schema info, (modifies json_col)
auto [cudf_col, col_name_info] = device_json_column_to_cudf_column(
json_col, d_input, parse_opt, child_schema_element, stream, mr);
// TODO: RangeIndex as DataFrame.columns names for array of arrays
// if (is_array_of_arrays) {
// col_name_info.back().name = "";
// }

out_column_names.back().children = std::move(col_name_info);
out_columns.emplace_back(std::move(cudf_col));

column_index++;
if (!options.is_enabled_use_dtypes_as_filter() or child_schema_element.has_value()) {
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
// Get this JSON column's cudf column and schema info, (modifies json_col)
auto [cudf_col, col_name_info] =
device_json_column_to_cudf_column(json_col,
d_input,
parse_opt,
options.is_enabled_use_dtypes_as_filter(),
child_schema_element,
stream,
mr);
// Insert this column's name into the schema
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
out_column_names.emplace_back(col_name);
// TODO: RangeIndex as DataFrame.columns names for array of arrays
// if (is_array_of_arrays) {
// col_name_info.back().name = "";
// }

out_column_names.back().children = std::move(col_name_info);
out_columns.emplace_back(std::move(cudf_col));

column_index++;
}
}

return table_with_metadata{std::make_unique<table>(std::move(out_columns)), {out_column_names}};
Expand Down
5 changes: 5 additions & 0 deletions python/cudf/cudf/_lib/cpp/io/json.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ cdef extern from "cudf/io/json.hpp" \
size_type get_byte_range_size() except +
bool is_enabled_lines() except +
bool is_enabled_mixed_types_as_string() except +
bool is_enabled_use_dtypes_as_filter() except +
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
bool is_enabled_dayfirst() except +
bool is_enabled_experimental() except +

Expand All @@ -41,6 +42,7 @@ cdef extern from "cudf/io/json.hpp" \
void set_byte_range_size(size_type size) except +
void enable_lines(bool val) except +
void enable_mixed_types_as_string(bool val) except +
void enable_use_dtypes_as_filter(bool val) except +
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
void enable_dayfirst(bool val) except +
void enable_experimental(bool val) except +
void enable_keep_quotes(bool val) except +
Expand Down Expand Up @@ -79,6 +81,9 @@ cdef extern from "cudf/io/json.hpp" \
json_reader_options_builder& mixed_types_as_string(
bool val
) except +
json_reader_options_builder& use_dtypes_as_filter(
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
bool val
) except +
json_reader_options_builder& dayfirst(
bool val
) except +
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/_lib/json.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ cpdef read_json(object filepaths_or_buffers,
object byte_range,
bool legacy,
bool keep_quotes,
bool mixed_types_as_string):
bool mixed_types_as_string,
bool use_dtypes_as_filter):
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
"""
Cython function to call into libcudf API, see `read_json`.

Expand Down Expand Up @@ -128,6 +129,7 @@ cpdef read_json(object filepaths_or_buffers,

opts.enable_keep_quotes(keep_quotes)
opts.enable_mixed_types_as_string(mixed_types_as_string)
opts.enable_use_dtypes_as_filter(use_dtypes_as_filter)
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
# Read JSON
cdef cudf_io_types.table_with_metadata c_result

Expand Down
2 changes: 2 additions & 0 deletions python/cudf/cudf/io/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ def read_json(
keep_quotes=False,
storage_options=None,
mixed_types_as_string=False,
use_dtypes_as_filter=False,
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
*args,
**kwargs,
):
Expand Down Expand Up @@ -120,6 +121,7 @@ def read_json(
engine == "cudf_legacy",
keep_quotes,
mixed_types_as_string,
use_dtypes_as_filter,
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
)
else:
warnings.warn(
Expand Down
Loading