Skip to content

Commit

Permalink
Handle missing fields as nulls in get_json_object() (#10970)
Browse files Browse the repository at this point in the history
Addresses: #10196 

Previously, `get_json_object()` ignored fields in a JSONPath expression that are missing from the JSON string. This PR adds an option to return these missing fields as null instead.

Authors:
  - Srikar Vanavasam (https://github.com/SrikarVanavasam)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - MithunR (https://github.com/mythrocks)
  - David Wendt (https://github.com/davidwendt)

URL: #10970
  • Loading branch information
SrikarVanavasam authored Jun 21, 2022
1 parent 13d02a7 commit 40ec190
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 13 deletions.
39 changes: 39 additions & 0 deletions cpp/include/cudf/strings/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ class get_json_object_options {
// individual string values are returned with quotes stripped.
bool strip_quotes_from_single_strings = true;

// Whether to return nulls when an object does not contain the requested field.
bool missing_fields_as_nulls = false;

public:
/**
* @brief Default constructor.
Expand Down Expand Up @@ -84,6 +87,32 @@ class get_json_object_options {
return strip_quotes_from_single_strings;
}

/**
* @brief Whether a field not contained by an object is to be interpreted as null.
*
* When set to true, if an object is queried for a field it does not contain, a null is returned
* in place of the missing value. When set to false (the default), the missing field contributes
* nothing to the output.
*
* @code{.pseudo}
*
* With missing_fields_as_nulls OFF:
* Input = {"a" : [{"x": "1", "y": "2"}, {"x": "3"}]}
* Query = $.a[*].y
* Output = ["2"]
*
* With missing_fields_as_nulls ON:
* Input = {"a" : [{"x": "1", "y": "2"}, {"x": "3"}]}
* Query = $.a[*].y
* Output = ["2", null]
*
* @endcode
*
* @return true if missing fields are interpreted as null, false if they are omitted.
*/
[[nodiscard]] CUDF_HOST_DEVICE inline bool get_missing_fields_as_nulls() const
{
return missing_fields_as_nulls;
}

/**
* @brief Set whether single-quotes for strings are allowed.
*
Expand All @@ -103,6 +132,16 @@ class get_json_object_options {
{
strip_quotes_from_single_strings = _strip_quotes_from_single_strings;
}

/**
* @brief Set whether missing fields are interpreted as null.
*
* @param _missing_fields_as_nulls true to return a null when an object is queried for a field
* it does not contain; false to leave such fields out of the output.
*/
void set_missing_fields_as_nulls(bool _missing_fields_as_nulls)
{
missing_fields_as_nulls = _missing_fields_as_nulls;
}
};

/**
Expand Down
28 changes: 16 additions & 12 deletions cpp/src/strings/json/json_path.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@ namespace detail {

namespace {

// debug accessibility

// change to "\n" and 1 to make output more readable
#define DEBUG_NEWLINE
constexpr int DEBUG_NEWLINE_LEN = 0;
Expand All @@ -61,9 +59,10 @@ constexpr int DEBUG_NEWLINE_LEN = 0;
* or you get nothing back (parse_result::EMPTY)
*/
enum class parse_result {
// NOTE(review): ERROR/SUCCESS/EMPTY appear twice below. This looks like the removed and the
// added side of a diff hunk shown together rather than one compilable enum -- confirm against
// the actual file before relying on the enumerator list or its order.
ERROR, // failure
SUCCESS, // success
EMPTY, // success, but no data
ERROR, // failure
SUCCESS, // success
MISSING_FIELD, // success, but the field is missing; substituted for EMPTY while searching for a
               // matching element when get_missing_fields_as_nulls() is enabled
EMPTY, // success, but no data
};

/**
Expand Down Expand Up @@ -325,16 +324,18 @@ class json_state : private parser {
}
// loop until we find a match or there's nothing left
do {
// wildcard matches anything
if (name.size_bytes() == 1 && name.data()[0] == '*') {
return parse_result::SUCCESS;
} else if (cur_el_name == name) {
return parse_result::SUCCESS;
}

// next
parse_result result = next_element_internal(false);
if (result != parse_result::SUCCESS) { return result; }
if (result != parse_result::SUCCESS) {
return options.get_missing_fields_as_nulls() && result == parse_result::EMPTY
? parse_result::MISSING_FIELD
: result;
}
} while (true);

return parse_result::ERROR;
Expand Down Expand Up @@ -727,7 +728,6 @@ __device__ parse_result parse_json_path(json_state& j_state,
int element_count = 0;
while (pop_context(ctx)) {
path_operator op = *ctx.commands;

switch (op.type) {
// whatever the first object is
case path_operator_type::ROOT:
Expand All @@ -745,6 +745,12 @@ __device__ parse_result parse_json_path(json_state& j_state,
PARSE_TRY(ctx.j_state.next_matching_element(op.name, true));
if (last_result == parse_result::SUCCESS) {
push_context(ctx.j_state, ctx.commands + 1, ctx.list_element);
} else if (last_result == parse_result::MISSING_FIELD) {
if (ctx.list_element && element_count > 0) {
output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN});
}
output.add_output({"null", 4});
element_count++;
}
}
} break;
Expand Down Expand Up @@ -980,9 +986,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c

constexpr int block_size = 512;
cudf::detail::grid_1d const grid{col.size(), block_size};

auto cdv = column_device_view::create(col.parent(), stream);

// preprocess sizes (returned in the offsets buffer)
get_json_object_kernel<block_size>
<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
Expand Down Expand Up @@ -1014,6 +1018,7 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
// compute results
cudf::mutable_column_view chars_view(*chars);
rmm::device_scalar<size_type> d_valid_count{0, stream};

get_json_object_kernel<block_size>
<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
*cdv,
Expand All @@ -1023,7 +1028,6 @@ std::unique_ptr<cudf::column> get_json_object(cudf::strings_column_view const& c
static_cast<bitmask_type*>(validity.data()),
d_valid_count.data(),
options);

return make_strings_column(col.size(),
std::move(offsets),
std::move(chars),
Expand Down
47 changes: 46 additions & 1 deletion cpp/tests/strings/json_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
* Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -967,4 +967,49 @@ TEST_F(JsonPathTests, EscapeSequences)
// clang-format on
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
}
}

TEST_F(JsonPathTests, MissingFieldsAsNulls)
{
  // Single JSON document under test: "tup" holds four objects, of which only the first and
  // third carry an "array" field and only the fourth carries an "a" field.
  std::string input_string{
    // clang-format off
    "{"
    "\"tup\":"
    "["
    "{\"id\":\"1\",\"array\":[1,2]},"
    "{\"id\":\"2\"},"
    "{\"id\":\"3\",\"array\":[3,4]},"
    "{\"id\":\"4\", \"a\": {\"x\": \"5\", \"y\": \"6\"}}"
    "]"
    "}"
    // clang-format on
  };

  // Evaluates `path` against a one-row column twice -- first with missing_fields_as_nulls
  // disabled, then enabled -- and compares each result with the matching expectation.
  // `off_row_valid` is the validity of the expected row when the option is disabled; the row
  // expected with the option enabled is always valid.
  auto check_path = [&input_string](auto const& path,
                                    auto const& expected_when_off,
                                    auto const& expected_when_on,
                                    bool off_row_valid = true) {
    cudf::test::strings_column_wrapper json_column{input_string};
    cudf::strings::get_json_object_options opts;

    // Applies the requested mode to the shared options object and runs the query.
    auto const run_query = [&](bool as_nulls) {
      opts.set_missing_fields_as_nulls(as_nulls);
      return cudf::strings::get_json_object(cudf::strings_column_view(json_column), {path}, opts);
    };

    // Default behavior: missing fields are not reported.
    auto const off_result = run_query(false);
    cudf::test::strings_column_wrapper off_expected({expected_when_off}, {off_row_valid});
    CUDF_TEST_EXPECT_COLUMNS_EQUAL(off_expected, *off_result);

    // Missing fields reported as null.
    auto const on_result = run_query(true);
    cudf::test::strings_column_wrapper on_expected({expected_when_on}, {1});
    CUDF_TEST_EXPECT_COLUMNS_EQUAL(on_expected, *on_result);
  };

  check_path("$.tup[1].array", "", "null", false);
  check_path("$.tup[*].array", "[[1,2],[3,4]]", "[[1,2],null,[3,4],null]");
  check_path("$.x[*].array", "", "null", false);
  check_path("$.tup[*].a.x", "[\"5\"]", "[null,null,null,\"5\"]");
}

0 comments on commit 40ec190

Please sign in to comment.