Skip to content

Commit

Permalink
JSON single quote normalization API (#14729)
Browse files Browse the repository at this point in the history
The goal of this PR is to address [10004](#10004) by supporting parsing of JSON files containing single quotes for field/value strings. This is a follow-up work to the POC [PR 14545](#14545)

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Andy Grove (https://github.com/andygrove)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Elias Stehle (https://github.com/elstehle)
  - Robert (Bobby) Evans (https://github.com/revans2)

URL: #14729
  • Loading branch information
shrshi authored Jan 24, 2024
1 parent 67a36a9 commit f800f5a
Show file tree
Hide file tree
Showing 14 changed files with 401 additions and 194 deletions.
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -375,6 +375,7 @@ add_library(
src/io/functions.cpp
src/io/json/byte_range_info.cu
src/io/json/json_column.cu
src/io/json/json_quote_normalization.cu
src/io/json/json_tree.cu
src/io/json/nested_json_gpu.cu
src/io/json/read_json.cu
Expand Down
14 changes: 13 additions & 1 deletion cpp/include/cudf/io/detail/json.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -51,4 +51,16 @@ void write_json(data_sink* sink,
json_writer_options const& options,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

/**
* @brief Normalize single quotes to double quotes using FST
*
* @param inbuf Input device buffer
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource to use for device memory allocation
*/
rmm::device_uvector<char> normalize_single_quotes(rmm::device_uvector<char>&& inbuf,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace cudf::io::json::detail
31 changes: 31 additions & 0 deletions cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ class json_reader_options {
// Whether to keep the quote characters of string values
bool _keep_quotes = false;

// Normalize single quotes
bool _normalize_single_quotes = false;

// Whether to recover after an invalid JSON line
json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;

Expand Down Expand Up @@ -255,6 +258,13 @@ class json_reader_options {
*/
bool is_enabled_keep_quotes() const { return _keep_quotes; }

/**
* @brief Whether the reader should normalize single quotes around strings
*
* @returns true if the reader should normalize single quotes, false otherwise
*/
bool is_enabled_normalize_single_quotes() const { return _normalize_single_quotes; }

/**
* @brief Queries the JSON reader's behavior on invalid JSON lines.
*
Expand Down Expand Up @@ -340,6 +350,14 @@ class json_reader_options {
*/
void enable_keep_quotes(bool val) { _keep_quotes = val; }

/**
* @brief Set whether the reader should enable normalization of single quotes around strings.
*
* @param val Boolean value to indicate whether the reader should normalize single quotes around
* strings
*/
void enable_normalize_single_quotes(bool val) { _normalize_single_quotes = val; }

/**
* @brief Specifies the JSON reader's behavior on invalid JSON lines.
*
Expand Down Expand Up @@ -502,6 +520,19 @@ class json_reader_options_builder {
return *this;
}

/**
* @brief Set whether the reader should normalize single quotes around strings
*
* @param val Boolean value to indicate whether the reader should normalize single quotes
* of strings
* @return this for chaining
*/
json_reader_options_builder& normalize_single_quotes(bool val)
{
options._normalize_single_quotes = val;
return *this;
}

/**
* @brief Specifies the JSON reader's behavior on invalid JSON lines.
*
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -15,19 +15,13 @@
*/

#include <io/fst/lookup_tables.cuh>
#include <io/utilities/hostdevice_vector.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/cudf_gtest.hpp>
#include <cudf_test/testing_main.hpp>

#include <cudf/scalar/scalar_factories.hpp>
#include <cudf/strings/repeat_strings.hpp>
#include <cudf/io/detail/json.hpp>
#include <cudf/types.hpp>

#include <rmm/cuda_stream.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_scalar.hpp>
#include <rmm/device_uvector.hpp>

#include <thrust/iterator/discard_iterator.h>
Expand All @@ -36,17 +30,16 @@
#include <string>
#include <vector>

namespace {
namespace cudf::io::json {

using SymbolT = char;
using StateT = char;
using SymbolOffsetT = uint32_t;

// Type used to represent the atomic symbol type used within the finite-state machine
// TODO: type aliasing to be declared in a common header for better maintainability and
// pre-empt future bugs
using SymbolT = char;
using StateT = char;
namespace normalize_quotes {

// Type sufficiently large to index symbols within the input and output (may be unsigned)
using SymbolOffsetT = uint32_t;
enum class dfa_states : char { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
enum class dfa_states : StateT { TT_OOS = 0U, TT_DQS, TT_SQS, TT_DEC, TT_SEC, TT_NUM_STATES };
enum class dfa_symbol_group_id : uint32_t {
DOUBLE_QUOTE_CHAR, ///< Quote character SG: "
SINGLE_QUOTE_CHAR, ///< Quote character SG: '
Expand All @@ -62,7 +55,7 @@ constexpr auto TT_DQS = dfa_states::TT_DQS;
constexpr auto TT_SQS = dfa_states::TT_SQS;
constexpr auto TT_DEC = dfa_states::TT_DEC;
constexpr auto TT_SEC = dfa_states::TT_SEC;
constexpr auto TT_NUM_STATES = static_cast<char>(dfa_states::TT_NUM_STATES);
constexpr auto TT_NUM_STATES = static_cast<StateT>(dfa_states::TT_NUM_STATES);
constexpr auto NUM_SYMBOL_GROUPS = static_cast<uint32_t>(dfa_symbol_group_id::NUM_SYMBOL_GROUPS);

// The i-th string representing all the characters of a symbol group
Expand All @@ -80,7 +73,7 @@ std::array<std::array<dfa_states, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const qna_s
}};

// The DFA's starting state
constexpr char start_state = static_cast<char>(TT_OOS);
constexpr auto start_state = static_cast<StateT>(TT_OOS);

struct TransduceToNormalizedQuotes {
/**
Expand Down Expand Up @@ -112,7 +105,7 @@ struct TransduceToNormalizedQuotes {
// SEC | Sigma\{'} -> {\*}

// Whether this transition translates to the escape sequence: \"
const bool outputs_escape_sequence =
bool const outputs_escape_sequence =
(state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
(match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
// Case when a double quote needs to be replaced by the escape sequence: \"
Expand Down Expand Up @@ -156,19 +149,19 @@ struct TransduceToNormalizedQuotes {
SymbolT const read_symbol) const
{
// Whether this transition translates to the escape sequence: \"
const bool sqs_outputs_escape_sequence =
bool const sqs_outputs_escape_sequence =
(state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
(match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::DOUBLE_QUOTE_CHAR));
// Number of characters to output on this transition
if (sqs_outputs_escape_sequence) { return 2; }
// Whether this transition translates to the escape sequence \<s> or unescaped '
const bool sec_outputs_escape_sequence =
bool const sec_outputs_escape_sequence =
(state_id == static_cast<StateT>(dfa_states::TT_SEC)) &&
(match_id != static_cast<SymbolGroupT>(dfa_symbol_group_id::SINGLE_QUOTE_CHAR));
// Number of characters to output on this transition
if (sec_outputs_escape_sequence) { return 2; }
// Whether this transition translates to no output <nop>
const bool sqs_outputs_nop =
bool const sqs_outputs_nop =
(state_id == static_cast<StateT>(dfa_states::TT_SQS)) &&
(match_id == static_cast<SymbolGroupT>(dfa_symbol_group_id::ESCAPE_CHAR));
// Number of characters to output on this transition
Expand All @@ -177,156 +170,33 @@ struct TransduceToNormalizedQuotes {
}
};

} // namespace
} // namespace normalize_quotes

// Base test fixture for tests
struct FstTest : public cudf::test::BaseFixture {};
namespace detail {

void run_test(std::string& input, std::string& output)
rmm::device_uvector<SymbolT> normalize_single_quotes(rmm::device_uvector<SymbolT>&& inbuf,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
// Prepare cuda stream for data transfers & kernels
rmm::cuda_stream stream{};
rmm::cuda_stream_view stream_view(stream);

auto parser = cudf::io::fst::detail::make_fst(
cudf::io::fst::detail::make_symbol_group_lut(qna_sgs),
cudf::io::fst::detail::make_transition_table(qna_state_tt),
cudf::io::fst::detail::make_translation_functor(TransduceToNormalizedQuotes{}),
auto parser = fst::detail::make_fst(
fst::detail::make_symbol_group_lut(normalize_quotes::qna_sgs),
fst::detail::make_transition_table(normalize_quotes::qna_state_tt),
fst::detail::make_translation_functor(normalize_quotes::TransduceToNormalizedQuotes{}),
stream);

auto d_input_scalar = cudf::make_string_scalar(input, stream_view);
auto& d_input = static_cast<cudf::scalar_type_t<std::string>&>(*d_input_scalar);

// Prepare input & output buffers
constexpr std::size_t single_item = 1;
cudf::detail::hostdevice_vector<SymbolT> output_gpu(input.size() * 2, stream_view);
cudf::detail::hostdevice_vector<SymbolOffsetT> output_gpu_size(single_item, stream_view);

// Allocate device-side temporary storage & run algorithm
parser.Transduce(d_input.data(),
static_cast<SymbolOffsetT>(d_input.size()),
output_gpu.device_ptr(),
rmm::device_uvector<SymbolT> outbuf(inbuf.size() * 2, stream, mr);
rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
parser.Transduce(inbuf.data(),
static_cast<SymbolOffsetT>(inbuf.size()),
outbuf.data(),
thrust::make_discard_iterator(),
output_gpu_size.device_ptr(),
start_state,
stream_view);

// Async copy results from device to host
output_gpu.device_to_host_async(stream_view);
output_gpu_size.device_to_host_async(stream_view);

// Make sure results have been copied back to host
stream.synchronize();

// Verify results
ASSERT_EQ(output_gpu_size[0], output.size());
CUDF_TEST_EXPECT_VECTOR_EQUAL(output_gpu, output, output.size());
}

TEST_F(FstTest, GroundTruth_QuoteNormalization1)
{
std::string input = R"({"A":'TEST"'})";
std::string output = R"({"A":"TEST\""})";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization2)
{
std::string input = R"({'A':"TEST'"} ['OTHER STUFF'])";
std::string output = R"({"A":"TEST'"} ["OTHER STUFF"])";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization3)
{
std::string input = R"(['{"A": "B"}',"{'A': 'B'}"])";
std::string output = R"(["{\"A\": \"B\"}","{'A': 'B'}"])";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization4)
{
std::string input = R"({"ain't ain't a word and you ain't supposed to say it":'"""""""""""'})";
std::string output =
R"({"ain't ain't a word and you ain't supposed to say it":"\"\"\"\"\"\"\"\"\"\"\""})";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization5)
{
std::string input = R"({"\"'\"'\"'\"'":'"\'"\'"\'"\'"'})";
std::string output = R"({"\"'\"'\"'\"'":"\"'\"'\"'\"'\""})";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization6)
{
std::string input = R"([{"ABC':'CBA":'XYZ":"ZXY'}])";
std::string output = R"([{"ABC':'CBA":"XYZ\":\"ZXY"}])";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization7)
{
std::string input = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])";
std::string output = R"(["\t","\\t","\\","\\\'\"\\\\","\n","\b"])";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization8)
{
std::string input = R"(['\t','\\t','\\','\\\"\'\\\\','\n','\b','\u0012'])";
std::string output = R"(["\t","\\t","\\","\\\"'\\\\","\n","\b","\u0012"])";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid1)
{
std::string input = R"(["THIS IS A TEST'])";
std::string output = R"(["THIS IS A TEST'])";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid2)
{
std::string input = R"(['THIS IS A TEST"])";
std::string output = R"(["THIS IS A TEST\"])";
run_test(input, output);
}
outbuf_size.data(),
normalize_quotes::start_state,
stream);

TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid3)
{
std::string input = R"({"MORE TEST'N":'RESUL})";
std::string output = R"({"MORE TEST'N":"RESUL})";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid4)
{
std::string input = R"({"NUMBER":100'0,'STRING':'SOMETHING'})";
std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid5)
{
std::string input = R"({'NUMBER':100"0,"STRING":"SOMETHING"})";
std::string output = R"({"NUMBER":100"0,"STRING":"SOMETHING"})";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid6)
{
std::string input = R"({'a':'\\''})";
std::string output = R"({"a":"\\""})";
run_test(input, output);
}

TEST_F(FstTest, GroundTruth_QuoteNormalization_Invalid7)
{
std::string input = R"(}'a': 'b'{)";
std::string output = R"(}"a": "b"{)";
run_test(input, output);
outbuf.resize(outbuf_size.value(stream), stream);
return outbuf;
}

CUDF_TEST_PROGRAM_MAIN()
} // namespace detail
} // namespace cudf::io::json
Loading

0 comments on commit f800f5a

Please sign in to comment.