From 40ec1903e8cfa894950b3e2a91ca05bfcb7fdb63 Mon Sep 17 00:00:00 2001 From: Srikar Vanavasam Date: Tue, 21 Jun 2022 10:31:16 -0500 Subject: [PATCH] Handle missing fields as nulls in get_json_object() (#10970) Addresses: #10196 Previously, `get_json_object()` ignored fields in a JsonPath expression that are missing in the json string. This PR adds the option to return these missing fields as null instead. Authors: - Srikar Vanavasam (https://github.com/SrikarVanavasam) Approvers: - Nghia Truong (https://github.com/ttnghia) - MithunR (https://github.com/mythrocks) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/10970 --- cpp/include/cudf/strings/json.hpp | 39 +++++++++++++++++++++++++ cpp/src/strings/json/json_path.cu | 28 ++++++++++-------- cpp/tests/strings/json_tests.cpp | 47 ++++++++++++++++++++++++++++++- 3 files changed, 101 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/strings/json.hpp b/cpp/include/cudf/strings/json.hpp index 2b66bcb807e..4efbe938fc7 100644 --- a/cpp/include/cudf/strings/json.hpp +++ b/cpp/include/cudf/strings/json.hpp @@ -38,6 +38,9 @@ class get_json_object_options { // individual string values are returned with quotes stripped. bool strip_quotes_from_single_strings = true; + // Whether to return nulls when an object does not contain the requested field. + bool missing_fields_as_nulls = false; + public: /** * @brief Default constructor. @@ -84,6 +87,32 @@ class get_json_object_options { return strip_quotes_from_single_strings; } + /** + * @brief Whether a field not contained by an object is to be interpreted as null. + * + * When set to true, if an object is queried for a field it does not contain, a null is returned. + * + * @code{.pseudo} + * + * With missing_fields_as_nulls OFF: + * Input = {"a" : [{"x": "1", "y": "2"}, {"x": "3"}]} + * Query = $.a[*].y + * Output = ["2"] + * + * With missing_fields_as_nulls ON: + * Input = {"a" : [{"x": "1", "y": "2"}, {"x": "3"}]} + * Query = $.a[*].y + * Output = ["2", null] + * + * @endcode + * + * @return true if missing fields are interpreted as null. + */ + [[nodiscard]] CUDF_HOST_DEVICE inline bool get_missing_fields_as_nulls() const + { + return missing_fields_as_nulls; + } + /** * @brief Set whether single-quotes for strings are allowed. * @@ -103,6 +132,16 @@ class get_json_object_options { { strip_quotes_from_single_strings = _strip_quotes_from_single_strings; } + + /** + * @brief Set whether missing fields are interpreted as null. + * + * @param _missing_fields_as_nulls bool indicating desired behavior. + */ + void set_missing_fields_as_nulls(bool _missing_fields_as_nulls) + { + missing_fields_as_nulls = _missing_fields_as_nulls; + } }; /** diff --git a/cpp/src/strings/json/json_path.cu b/cpp/src/strings/json/json_path.cu index f4a2f908e30..5590109f9e9 100644 --- a/cpp/src/strings/json/json_path.cu +++ b/cpp/src/strings/json/json_path.cu @@ -45,8 +45,6 @@ namespace detail { namespace { -// debug accessibility - // change to "\n" and 1 to make output more readable #define DEBUG_NEWLINE constexpr int DEBUG_NEWLINE_LEN = 0; @@ -61,9 +59,10 @@ constexpr int DEBUG_NEWLINE_LEN = 0; * or you get nothing back (parse_result::EMPTY) */ enum class parse_result { - ERROR, // failure - SUCCESS, // success - EMPTY, // success, but no data + ERROR, // failure + SUCCESS, // success + MISSING_FIELD, // success, but the field is missing + EMPTY, // success, but no data }; /** @@ -325,16 +324,18 @@ class json_state : private parser { } // loop until we find a match or there's nothing left do { - // wildcard matches anything if (name.size_bytes() == 1 && name.data()[0] == '*') { return parse_result::SUCCESS; } else if (cur_el_name == name) { return parse_result::SUCCESS; } - // next parse_result result = next_element_internal(false); - if (result != parse_result::SUCCESS) { return result; } + if (result != parse_result::SUCCESS) { + return options.get_missing_fields_as_nulls() && result == parse_result::EMPTY + ? parse_result::MISSING_FIELD + : result; + } } while (true); return parse_result::ERROR; @@ -727,7 +728,6 @@ __device__ parse_result parse_json_path(json_state& j_state, int element_count = 0; while (pop_context(ctx)) { path_operator op = *ctx.commands; - switch (op.type) { // whatever the first object is case path_operator_type::ROOT: @@ -745,6 +745,12 @@ __device__ parse_result parse_json_path(json_state& j_state, PARSE_TRY(ctx.j_state.next_matching_element(op.name, true)); if (last_result == parse_result::SUCCESS) { push_context(ctx.j_state, ctx.commands + 1, ctx.list_element); + } else if (last_result == parse_result::MISSING_FIELD) { + if (ctx.list_element && element_count > 0) { + output.add_output({"," DEBUG_NEWLINE, 1 + DEBUG_NEWLINE_LEN}); + } + output.add_output({"null", 4}); + element_count++; } } } break; @@ -980,9 +986,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c constexpr int block_size = 512; cudf::detail::grid_1d const grid{col.size(), block_size}; - auto cdv = column_device_view::create(col.parent(), stream); - // preprocess sizes (returned in the offsets buffer) get_json_object_kernel <<>>( @@ -1014,6 +1018,7 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c // compute results cudf::mutable_column_view chars_view(*chars); rmm::device_scalar d_valid_count{0, stream}; + get_json_object_kernel <<>>( *cdv, @@ -1023,7 +1028,6 @@ std::unique_ptr get_json_object(cudf::strings_column_view const& c static_cast(validity.data()), d_valid_count.data(), options); - return make_strings_column(col.size(), std::move(offsets), std::move(chars), diff --git a/cpp/tests/strings/json_tests.cpp b/cpp/tests/strings/json_tests.cpp index 2dfe50d2ef5..c533eed48df 100644 --- a/cpp/tests/strings/json_tests.cpp +++ b/cpp/tests/strings/json_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2021-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -967,4 +967,49 @@ TEST_F(JsonPathTests, EscapeSequences) // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected); } +} + +TEST_F(JsonPathTests, MissingFieldsAsNulls) +{ + std::string input_string{ + // clang-format off + "{" + "\"tup\":" + "[" + "{\"id\":\"1\",\"array\":[1,2]}," + "{\"id\":\"2\"}," + "{\"id\":\"3\",\"array\":[3,4]}," + "{\"id\":\"4\", \"a\": {\"x\": \"5\", \"y\": \"6\"}}" + "]" + "}" + // clang-format on + }; + auto do_test = [&input_string](auto const& json_path_string, + auto const& default_output, + auto const& missing_fields_output, + bool default_valid = true) { + cudf::test::strings_column_wrapper input{input_string}; + cudf::strings::get_json_object_options options; + + // Test default behavior + options.set_missing_fields_as_nulls(false); + auto const default_result = + cudf::strings::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); + cudf::test::strings_column_wrapper default_expected({default_output}, {default_valid}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(default_expected, *default_result); + + // Test with missing fields as null + options.set_missing_fields_as_nulls(true); + auto const missing_fields_result = + cudf::strings::get_json_object(cudf::strings_column_view(input), {json_path_string}, options); + cudf::test::strings_column_wrapper missing_fields_expected({missing_fields_output}, {1}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(missing_fields_expected, *missing_fields_result); + }; + + do_test("$.tup[1].array", "", "null", false); + do_test("$.tup[*].array", "[[1,2],[3,4]]", "[[1,2],null,[3,4],null]"); + do_test("$.x[*].array", "", "null", false); + do_test("$.tup[*].a.x", "[\"5\"]", "[null,null,null,\"5\"]"); } \ No newline at end of file