Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Introduce benchmark suite for JSON reader options #15124

Merged
merged 22 commits into from
Apr 9, 2024
Merged
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
b2a7c45
simple json lines benchmark - no fancy data generation
shrshi Feb 23, 2024
bc42243
style fixes
shrshi Feb 23, 2024
f2dd994
added lines; table size as param
shrshi Feb 28, 2024
2ae3633
Merge branch 'branch-24.04' into json-benchmark
shrshi Feb 28, 2024
ea4d9b7
Merge branch 'branch-24.04' into json-benchmark
shrshi Mar 4, 2024
56317bd
Merge branch 'branch-24.04' into json-benchmark
shrshi Mar 6, 2024
a1bc431
Merge branch 'branch-24.04' into json-benchmark
shrshi Mar 7, 2024
5c0de81
partial work commit
shrshi Mar 8, 2024
72b070d
adding whitespace normalization and lines axes
shrshi Mar 8, 2024
e04db8b
separated out the lines benchmark
shrshi Mar 8, 2024
b470334
Merge branch 'json-benchmark' of github.com:shrshi/cudf into json-ben…
shrshi Mar 8, 2024
dc799c2
Merge branch 'branch-24.04' into json-benchmark
shrshi Mar 8, 2024
f926172
skipping some param configs
shrshi Mar 11, 2024
6c75aee
Merge branch 'json-benchmark' of github.com:shrshi/cudf into json-ben…
shrshi Mar 11, 2024
14435e0
Merge branch 'branch-24.04' into json-benchmark
shrshi Mar 11, 2024
5f331e2
partially addressing PR reviews
shrshi Mar 11, 2024
6c3604b
Merge branch 'branch-24.04' into json-benchmark
shrshi Mar 19, 2024
5661a4a
Update cpp/benchmarks/io/json/json_reader_option.cpp
shrshi Mar 19, 2024
12cfaba
Merge branch 'branch-24.04' into json-benchmark
shrshi Mar 19, 2024
300753c
Merge branch 'json-benchmark' of github.com:shrshi/cudf into json-ben…
shrshi Mar 19, 2024
3a79368
formatting fix
shrshi Mar 19, 2024
66552a5
Merge branch 'branch-24.06' into json-benchmark
ttnghia Apr 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cpp/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -330,6 +330,7 @@ ConfigureNVBench(
ConfigureBench(JSON_BENCH json/json.cu)
ConfigureNVBench(FST_NVBENCH io/fst.cu)
ConfigureNVBench(JSON_READER_NVBENCH io/json/nested_json.cpp io/json/json_reader_input.cpp)
ConfigureNVBench(JSON_READER_OPTION io/json/json_reader_option.cpp)
ConfigureNVBench(JSON_WRITER_NVBENCH io/json/json_writer.cpp)

# ##################################################################################################
Expand Down
199 changes: 199 additions & 0 deletions cpp/benchmarks/io/json/json_reader_option.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,199 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>
#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/io/cuio_common.hpp>
#include <benchmarks/io/nvbench_helpers.hpp>

#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/io/json.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <nvbench/nvbench.cuh>

// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
// run on most GPUs, but large enough to allow highest throughput
constexpr size_t data_size = 512 << 20;
constexpr cudf::size_type num_cols = 64;

// Benchmarks cudf::io::read_json with the `lines` reader option toggled (regular JSON document
// vs JSON lines input). A randomly generated table covering all supported column types is
// serialized to a host buffer, and reading it back is timed.
template <json_lines JsonLines>
void BM_json_read_options(nvbench::state& state, nvbench::type_list<nvbench::enum_type<JsonLines>>)
{
  constexpr bool lines_enabled = JsonLines == json_lines::YES;

  cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
  // Mix of all supported column types so every parsing path is exercised.
  auto const dtypes = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
                                         static_cast<int32_t>(data_type::FLOAT),
                                         static_cast<int32_t>(data_type::DECIMAL),
                                         static_cast<int32_t>(data_type::STRING),
                                         static_cast<int32_t>(data_type::LIST),
                                         static_cast<int32_t>(data_type::STRUCT)});

  auto const input_table = create_random_table(
    cycle_dtypes(dtypes, num_cols), table_size_bytes{data_size}, data_profile_builder());
  auto const input_view = input_table->view();

  // Serialize the table into the host buffer that the timed reads will consume.
  cudf::io::json_writer_options const write_opts =
    cudf::io::json_writer_options::builder(source_sink.make_sink_info(), input_view)
      .lines(lines_enabled)
      .na_rep("null")
      .rows_per_chunk(100'000);
  cudf::io::write_json(write_opts);

  cudf::io::json_reader_options read_opts =
    cudf::io::json_reader_options::builder(source_sink.make_source_info()).lines(lines_enabled);

  auto stats_logger = cudf::memory_stats_logger();
  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
  state.exec(
    nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
      try_drop_l3_cache();  // start each measurement with a cold cache
      cudf::size_type rows_read = 0;
      cudf::size_type cols_read = 0;
      timer.start();
      auto const result = cudf::io::read_json(read_opts);
      rows_read         = result.tbl->num_rows();
      cols_read         = result.tbl->num_columns();
      timer.stop();
      // Sanity-check that the whole table round-tripped through the writer/reader pair.
      CUDF_EXPECTS(rows_read == input_view.num_rows(), "Benchmark did not read the entire table");
      CUDF_EXPECTS(cols_read == num_cols, "Unexpected number of columns");
    });

  auto const elapsed_time   = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
  auto const data_processed = data_size * num_cols / input_view.num_columns();
  state.add_element_count(static_cast<double>(data_processed) / elapsed_time, "bytes_per_second");
  state.add_buffer_size(
    stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
  state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
}

template <row_selection RowSelection,
normalize_single_quotes NormalizeSingleQuotes,
normalize_whitespace NormalizeWhitespace,
mixed_types_as_string MixedTypesAsString,
recovery_mode RecoveryMode>
Comment on lines +81 to +85
Copy link
Contributor

@ttnghia ttnghia Mar 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh no this is too many template params. Why not using run time parameter instead? That would reduce compile time a lot.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I followed the design for benchmarking reader options in orc and parquet. Would we have to modify those benchmarks as well to maintain similar design?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh no this is too many template params. Why not using run time parameter instead?

This makes results more readable and the build time for benchmarks doesn't really matter IMO.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah that's fine to me 👍

void BM_jsonlines_read_options(nvbench::state& state,
nvbench::type_list<nvbench::enum_type<RowSelection>,
nvbench::enum_type<NormalizeSingleQuotes>,
nvbench::enum_type<NormalizeWhitespace>,
nvbench::enum_type<MixedTypesAsString>,
nvbench::enum_type<RecoveryMode>>)
{
constexpr auto normalize_single_quotes_bool =
NormalizeSingleQuotes == normalize_single_quotes::YES;
constexpr auto normalize_whitespace_bool = NormalizeWhitespace == normalize_whitespace::YES;
constexpr auto mixed_types_as_string_bool = MixedTypesAsString == mixed_types_as_string::YES;
constexpr auto recovery_mode_enum = RecoveryMode == recovery_mode::RECOVER_WITH_NULL
? cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL
: cudf::io::json_recovery_mode_t::FAIL;
size_t const num_chunks = state.get_int64("num_chunks");
if (num_chunks > 1 && RowSelection == row_selection::ALL) {
state.skip(
"No point running the same benchmark multiple times for different num_chunks when all rows "
"are being selected anyway");
return;
}

cuio_source_sink_pair source_sink(io_type::HOST_BUFFER);
auto const data_types = get_type_or_group({static_cast<int32_t>(data_type::INTEGRAL),
static_cast<int32_t>(data_type::FLOAT),
static_cast<int32_t>(data_type::DECIMAL),
static_cast<int32_t>(data_type::STRING),
static_cast<int32_t>(data_type::LIST),
static_cast<int32_t>(data_type::STRUCT)});

auto const tbl = create_random_table(
cycle_dtypes(data_types, num_cols), table_size_bytes{data_size}, data_profile_builder());
auto const view = tbl->view();
cudf::io::json_writer_options const write_opts =
cudf::io::json_writer_options::builder(source_sink.make_sink_info(), view)
.lines(true)
Copy link
Contributor

@GregoryKimball GregoryKimball Feb 28, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While we are testing the options, I would recommend making lines into an nvbench enum axis as well. Although since some of the options require lines to be true, maybe benchmarking lines true/false should be a separate benchmark.

nvbench::enum_type_list<row_selection::ALL, row_selection::BYTE_RANGE>, lines=True
nvbench::enum_type_list<normalize_single_quotes::NO, normalize_single_quotes::YES> line=True/False
nvbench::enum_type_list<mixed_types_as_string::NO, mixed_types_as_string::YES> line=True/False
nvbench::enum_type_list<recovery_mode::RECOVER_WITH_NULL, recovery_mode::FAIL>)) lines=True

After thinking through the options, I don't think we need to test normalize_single_quotes and mixed_types_as_string with lines=false. It still might be useful to add a lines true/false benchmark without any additional options. If others agree then that could be a follow-on PR.

.na_rep("null")
.rows_per_chunk(100'000);
cudf::io::write_json(write_opts);

cudf::io::json_reader_options read_options =
cudf::io::json_reader_options::builder(source_sink.make_source_info())
.lines(true)
.normalize_single_quotes(normalize_single_quotes_bool)
.normalize_whitespace(normalize_whitespace_bool)
.mixed_types_as_string(mixed_types_as_string_bool)
.recovery_mode(recovery_mode_enum);

size_t const chunk_size = cudf::util::div_rounding_up_safe(source_sink.size(), num_chunks);
auto mem_stats_logger = cudf::memory_stats_logger();
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
state.exec(
nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) {
try_drop_l3_cache();
cudf::size_type num_rows_read = 0;
cudf::size_type num_cols_read = 0;
timer.start();
switch (RowSelection) {
vuule marked this conversation as resolved.
Show resolved Hide resolved
case row_selection::ALL: {
auto const result = cudf::io::read_json(read_options);
num_rows_read = result.tbl->num_rows();
num_cols_read = result.tbl->num_columns();
break;
}
case row_selection::BYTE_RANGE: {
for (uint64_t chunk = 0; chunk < num_chunks; chunk++) {
read_options.set_byte_range_offset(chunk * chunk_size);
read_options.set_byte_range_size(chunk_size);
auto const result = cudf::io::read_json(read_options);
num_rows_read += result.tbl->num_rows();
num_cols_read = result.tbl->num_columns();
if (num_cols_read)
vuule marked this conversation as resolved.
Show resolved Hide resolved
CUDF_EXPECTS(num_cols_read == num_cols, "Unexpected number of columns");
}
break;
}
default: CUDF_FAIL("Unsupported row selection method");
}
timer.stop();
CUDF_EXPECTS(num_rows_read == view.num_rows(), "Benchmark did not read the entire table");
});

auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
auto const data_processed = data_size * num_cols / view.num_columns();
state.add_element_count(static_cast<double>(data_processed) / elapsed_time, "bytes_per_second");
state.add_buffer_size(
mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
}

// JSON lines reader benchmark: sweeps every combination of the JSON-lines-specific reader
// options, plus an int64 axis for the number of byte-range chunks (1..5). Configurations with
// row_selection::ALL and num_chunks > 1 are skipped inside the benchmark body.
NVBENCH_BENCH_TYPES(
  BM_jsonlines_read_options,
  NVBENCH_TYPE_AXES(
    nvbench::enum_type_list<row_selection::ALL, row_selection::BYTE_RANGE>,
    nvbench::enum_type_list<normalize_single_quotes::NO, normalize_single_quotes::YES>,
    nvbench::enum_type_list<normalize_whitespace::NO, normalize_whitespace::YES>,
    nvbench::enum_type_list<mixed_types_as_string::NO, mixed_types_as_string::YES>,
    nvbench::enum_type_list<recovery_mode::RECOVER_WITH_NULL, recovery_mode::FAIL>))
  .set_name("jsonlines_reader")
  .set_type_axes_names({"row_selection",
                        "normalize_single_quotes",
                        "normalize_whitespace",
                        "mixed_types_as_string",
                        "recovery_mode"})
  .set_min_samples(6)
  .add_int64_axis("num_chunks", nvbench::range(1, 5, 1));

// Plain JSON reader benchmark: only toggles the `lines` option (JSON document vs JSON lines).
NVBENCH_BENCH_TYPES(BM_json_read_options,
                    NVBENCH_TYPE_AXES(nvbench::enum_type_list<json_lines::YES, json_lines::NO>))
  .set_name("json_reader")
  .set_type_axes_names({"json_lines"})
  .set_min_samples(6);
67 changes: 66 additions & 1 deletion cpp/benchmarks/io/nvbench_helpers.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -169,3 +169,68 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
}
},
[](auto) { return std::string{}; })

// Type-axis enums for the JSON reader-option benchmarks. Each maps to a boolean (or two-way)
// reader option; NVBENCH_DECLARE_ENUM_TYPE_STRINGS below provides the axis labels.

// Whether the input is read as JSON lines (one record per line) or a regular JSON document.
enum class json_lines : bool { YES, NO };

// Whether single-quoted strings are normalized to double-quoted JSON strings before parsing.
enum class normalize_single_quotes : bool { YES, NO };

// Whether unquoted whitespace is normalized before parsing.
enum class normalize_whitespace : bool { YES, NO };

// Whether columns with mixed types are read as strings instead of erroring.
enum class mixed_types_as_string : bool { YES, NO };

// How invalid JSON lines are handled: fail the read, or replace the record with null.
enum class recovery_mode : bool { FAIL, RECOVER_WITH_NULL };

NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
  json_lines,
  [](auto value) {
    switch (value) {
      case json_lines::YES: return "YES";
      case json_lines::NO: return "NO";
      default: return "Unknown";
    }
  },
  [](auto) { return std::string{}; })

NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
  normalize_single_quotes,
  [](auto value) {
    switch (value) {
      case normalize_single_quotes::YES: return "YES";
      case normalize_single_quotes::NO: return "NO";
      default: return "Unknown";
    }
  },
  [](auto) { return std::string{}; })

NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
  normalize_whitespace,
  [](auto value) {
    switch (value) {
      case normalize_whitespace::YES: return "YES";
      case normalize_whitespace::NO: return "NO";
      default: return "Unknown";
    }
  },
  [](auto) { return std::string{}; })

NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
  mixed_types_as_string,
  [](auto value) {
    switch (value) {
      case mixed_types_as_string::YES: return "YES";
      case mixed_types_as_string::NO: return "NO";
      default: return "Unknown";
    }
  },
  [](auto) { return std::string{}; })

NVBENCH_DECLARE_ENUM_TYPE_STRINGS(
  recovery_mode,
  [](auto value) {
    switch (value) {
      case recovery_mode::FAIL: return "FAIL";
      case recovery_mode::RECOVER_WITH_NULL: return "RECOVER_WITH_NULL";
      default: return "Unknown";
    }
  },
  [](auto) { return std::string{}; })
Loading