Skip to content

Commit

Permalink
WGS84 to UTM benchmark (C++) (#1216)
Browse files Browse the repository at this point in the history
Closes #1215

Depends on #1191 

New benchmark for C++ API for WGS84->UTM transform.

The results below were measured on a single CPU core for PROJ (ignore the CPU count that gbench reports in its header) and on a single H100 80GB GPU. The machine was a DGX H100.

```
(rapids) coder ➜ ~/cuspatial $ CUDA_VISIBLE_DEVICES=1 cpp/build/latest/cuproj/benchmarks/WGS_TO_UTM_BENCH 
2023-08-02T00:08:52+00:00
Running cpp/build/latest/cuproj/benchmarks/WGS_TO_UTM_BENCH
Run on (224 X 3800 MHz CPU s)
CPU Caches:
  L1 Data 48 KiB (x112)
  L1 Instruction 32 KiB (x112)
  L2 Unified 2048 KiB (x112)
  L3 Unified 107520 KiB (x2)
Load Average: 1.70, 6.21, 14.43
---------------------------------------------------------------------------------------------------------------------
Benchmark                                                           Time             CPU   Iterations UserCounters...
---------------------------------------------------------------------------------------------------------------------
proj_utm_benchmark/forward_double/100                           0.013 ms        0.013 ms        57756 items_per_second=7.95314M/s
proj_utm_benchmark/forward_double/1000                          0.114 ms        0.114 ms         6118 items_per_second=8.73369M/s
proj_utm_benchmark/forward_double/10000                          1.24 ms         1.24 ms          588 items_per_second=8.07697M/s
proj_utm_benchmark/forward_double/100000                         12.0 ms         12.0 ms           58 items_per_second=8.35593M/s
proj_utm_benchmark/forward_double/1000000                         120 ms          120 ms            6 items_per_second=8.36301M/s
proj_utm_benchmark/forward_double/10000000                       1213 ms         1213 ms            1 items_per_second=8.24563M/s
proj_utm_benchmark/forward_double/100000000                     11977 ms        11976 ms            1 items_per_second=8.35038M/s
proj_utm_benchmark/forward_double/1000000000                   119680 ms       119677 ms            1 items_per_second=8.35579M/s
cuproj_utm_benchmark/forward_float/100/manual_time              0.011 ms        0.040 ms        66283 items_per_second=9.47605M/s
cuproj_utm_benchmark/forward_float/1000/manual_time             0.012 ms        0.041 ms        56799 items_per_second=81.2019M/s
cuproj_utm_benchmark/forward_float/10000/manual_time            0.013 ms        0.042 ms        55571 items_per_second=793.482M/s
cuproj_utm_benchmark/forward_float/100000/manual_time           0.013 ms        0.042 ms        53048 items_per_second=7.5779G/s
cuproj_utm_benchmark/forward_float/1000000/manual_time          0.027 ms        0.056 ms        25842 items_per_second=36.9063G/s
cuproj_utm_benchmark/forward_float/10000000/manual_time         0.170 ms        0.198 ms         4130 items_per_second=58.8554G/s
cuproj_utm_benchmark/forward_float/100000000/manual_time         1.60 ms         1.62 ms          439 items_per_second=62.6581G/s
cuproj_utm_benchmark/forward_float/1000000000/manual_time        15.9 ms         15.9 ms           44 items_per_second=63.0518G/s
cuproj_utm_benchmark/forward_double/100/manual_time             0.012 ms        0.041 ms        57960 items_per_second=8.30297M/s
cuproj_utm_benchmark/forward_double/1000/manual_time            0.015 ms        0.044 ms        47605 items_per_second=68.0791M/s
cuproj_utm_benchmark/forward_double/10000/manual_time           0.015 ms        0.044 ms        47353 items_per_second=676.864M/s
cuproj_utm_benchmark/forward_double/100000/manual_time          0.016 ms        0.045 ms        43394 items_per_second=6.19684G/s
cuproj_utm_benchmark/forward_double/1000000/manual_time         0.042 ms        0.070 ms        16621 items_per_second=23.7599G/s
cuproj_utm_benchmark/forward_double/10000000/manual_time        0.304 ms        0.332 ms         2302 items_per_second=32.863G/s
cuproj_utm_benchmark/forward_double/100000000/manual_time        2.93 ms         2.96 ms          240 items_per_second=34.087G/s
cuproj_utm_benchmark/forward_double/1000000000/manual_time       29.3 ms         29.3 ms           24 items_per_second=34.13G/s
```

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Michael Wang (https://github.com/isVoid)

URL: #1216
  • Loading branch information
harrism authored Aug 2, 2023
1 parent fb240fe commit cb7ce8e
Show file tree
Hide file tree
Showing 7 changed files with 312 additions and 47 deletions.
54 changes: 32 additions & 22 deletions cpp/cuproj/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,32 +24,31 @@ add_library(cuproj_benchmark_common OBJECT
target_compile_features(cuproj_benchmark_common PUBLIC cxx_std_17 cuda_std_17)

target_link_libraries(cuproj_benchmark_common
PUBLIC benchmark::benchmark
cudf::cudftestutil
cuproj)
PUBLIC benchmark::benchmark cudf::cudftestutil cuproj)

target_compile_options(cuproj_benchmark_common
PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUPROJ_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${CUPROJ_CUDA_FLAGS}>")
PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUPROJ_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${CUPROJ_CUDA_FLAGS}>")

target_include_directories(cuproj_benchmark_common
PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
"$<BUILD_INTERFACE:${CUPROJ_SOURCE_DIR}>"
"$<BUILD_INTERFACE:${CUPROJ_SOURCE_DIR}/src>")
PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
"$<BUILD_INTERFACE:${CUPROJ_SOURCE_DIR}>"
"$<BUILD_INTERFACE:${CUPROJ_SOURCE_DIR}/src>"
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include>")

function(ConfigureBench CMAKE_BENCH_NAME)
add_executable(${CMAKE_BENCH_NAME} ${ARGN})
set_target_properties(${CMAKE_BENCH_NAME}
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUPROJ_BINARY_DIR}/benchmarks>"
INSTALL_RPATH "\$ORIGIN/../../../lib"
)
target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE benchmark::benchmark_main cuproj_benchmark_common)
install(
TARGETS ${CMAKE_BENCH_NAME}
COMPONENT benchmark
DESTINATION bin/benchmarks/libcuproj
EXCLUDE_FROM_ALL
add_executable(${CMAKE_BENCH_NAME} ${ARGN})
set_target_properties(${CMAKE_BENCH_NAME}
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUPROJ_BINARY_DIR}/benchmarks>"
INSTALL_RPATH "\$ORIGIN/../../../lib"
)
target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE benchmark::benchmark_main cuproj_benchmark_common PROJ::proj)
install(
TARGETS ${CMAKE_BENCH_NAME}
COMPONENT benchmark
DESTINATION bin/benchmarks/libcuproj
EXCLUDE_FROM_ALL
)
endfunction()

# This function takes in a benchmark name and benchmark source for nvbench benchmarks and handles
Expand All @@ -61,8 +60,20 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUPROJ_BINARY_DIR}/benchmarks>"
INSTALL_RPATH "\$ORIGIN/../../../lib"
)
target_compile_features(${CMAKE_BENCH_NAME} PUBLIC cxx_std_17 cuda_std_17)

target_compile_options(${CMAKE_BENCH_NAME}
PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUPROJ_CXX_FLAGS}>"
"$<$<COMPILE_LANGUAGE:CUDA>:${CUPROJ_CUDA_FLAGS}>")

target_include_directories(${CMAKE_BENCH_NAME}
PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
"$<BUILD_INTERFACE:${CUPROJ_SOURCE_DIR}>"
"$<BUILD_INTERFACE:${CUPROJ_SOURCE_DIR}/src>"
"$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include>")

target_link_libraries(
${CMAKE_BENCH_NAME} PRIVATE cuproj_benchmark_common nvbench::main
${CMAKE_BENCH_NAME} PRIVATE cuproj nvbench::main
)
install(
TARGETS ${CMAKE_BENCH_NAME}
Expand All @@ -76,5 +87,4 @@ endfunction()
### benchmark sources #############################################################################
###################################################################################################

ConfigureBench(TEST_BENCH
test.cu)
ConfigureBench(WGS_TO_UTM_BENCH wgs_to_utm_bench.cu)
91 changes: 91 additions & 0 deletions cpp/cuproj/benchmarks/fixture/benchmark_fixture.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmark/benchmark.h>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/owning_wrapper.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

namespace cuspatial {

namespace {
// memory resource factory helpers
inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }

inline auto make_pool()
{
return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda());
}
} // namespace

/**
* @brief Google Benchmark fixture for libcuspatial benchmarks
*
* libcuspatial benchmarks should use a fixture derived from this fixture class to
* ensure that the RAPIDS Memory Manager pool mode is used in benchmarks, which
* eliminates memory allocation / deallocation performance overhead from the
* benchmark.
*
* The SetUp and TearDown methods of this fixture initialize RMM into pool mode
* and finalize it, respectively. These methods are called automatically by
* Google Benchmark
*
* Example:
*
* template <class T>
* class my_benchmark : public cuspatial::benchmark {
* public:
* using TypeParam = T;
* };
*
* Then:
*
* BENCHMARK_TEMPLATE_DEFINE_F(my_benchmark, my_test_name, int)
* (::benchmark::State& state) {
* for (auto _ : state) {
* // benchmark stuff
* }
* }
*
* BENCHMARK_REGISTER_F(my_benchmark, my_test_name)->Range(128, 512);
*/
class benchmark : public ::benchmark::Fixture {
public:
virtual void SetUp(const ::benchmark::State& state) override
{
mr = make_pool();
rmm::mr::set_current_device_resource(mr.get()); // set default resource to pool
}

virtual void TearDown(const ::benchmark::State& state) override
{
// reset default resource to the initial resource
rmm::mr::set_current_device_resource(nullptr);
mr.reset();
}

// eliminate partial override warnings (see benchmark/benchmark.h)
void SetUp(::benchmark::State& st) override { SetUp(const_cast<const ::benchmark::State&>(st)); }
void TearDown(::benchmark::State& st) override
{
TearDown(const_cast<const ::benchmark::State&>(st));
}

std::shared_ptr<rmm::mr::device_memory_resource> mr;
};

}; // namespace cuspatial
Empty file removed cpp/cuproj/benchmarks/test.cu
Empty file.
135 changes: 135 additions & 0 deletions cpp/cuproj/benchmarks/wgs_to_utm_bench.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cuproj/projection_factories.hpp>
#include <cuspatial/geometry/vec_2d.hpp>

#include <cuproj_test/convert_coordinates.hpp>
#include <cuproj_test/coordinate_generator.cuh>

#include <benchmarks/fixture/benchmark_fixture.hpp>
#include <benchmarks/synchronization/synchronization.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_vector.hpp>

#include <thrust/host_vector.h>

#include <type_traits>

template <typename T>
using coordinate = typename cuspatial::vec_2d<T>;

// CRS identifiers shared by the PROJ and cuproj benchmarks.
// EPSG:4326 is WGS84 latitude/longitude; EPSG:32756 is WGS 84 / UTM zone 56S.
// constexpr so the pointers themselves cannot be reseated at runtime.
static constexpr char const* epsg_src = "EPSG:4326";
static constexpr char const* epsg_dst = "EPSG:32756";

template <typename T>
auto make_input(std::size_t grid_side)
{
// Sydney Harbour
coordinate<T> min_corner{-33.9, 151.2};
coordinate<T> max_corner{-33.7, 151.3};

auto input = cuproj_test::make_grid_array<coordinate<T>, rmm::device_vector<coordinate<T>>>(
min_corner, max_corner, grid_side, grid_side);

return input;
}

/**
 * @brief Benchmarks the cuproj forward transform from epsg_src to epsg_dst.
 *
 * The requested point count is state.range(0). The input is a square grid, so
 * the side length is floor(sqrt(requested)) and the number of points actually
 * transformed can be smaller than requested (e.g. 1000 requested -> side 31).
 * Throughput is therefore reported from the real input size.
 *
 * @tparam T Scalar type of the coordinates (e.g. float or double)
 * @param state Google Benchmark state; range(0) is the requested number of points
 */
template <typename T>
static void cuproj_wgs_to_utm_benchmark(benchmark::State& state)
{
  auto const num_points = state.range(0);

  auto const grid_side{static_cast<std::size_t>(sqrt(num_points))};

  auto input = make_input<T>(grid_side);

  rmm::device_vector<coordinate<T>> output(input.size());

  // The projection is built once, outside the timed region
  auto proj = cuproj::make_projection<coordinate<T>>(epsg_src, epsg_dst);

  for (auto _ : state) {
    // Times each iteration for the manual-time registration.
    // NOTE(review): the `true` argument presumably requests an L2 cache flush — confirm
    // against benchmarks/synchronization/synchronization.hpp
    cuda_event_timer raii(state, true);
    proj.transform(input.begin(),
                   input.end(),
                   output.begin(),
                   cuproj::direction::FORWARD,
                   rmm::cuda_stream_default);
  }

  // Count the points that were actually transformed per iteration, not the requested
  // count, so items_per_second is not overstated for non-square sizes.
  using count_type = decltype(state.iterations());
  state.SetItemsProcessed(static_cast<count_type>(input.size()) * state.iterations());
}

/**
 * @brief Benchmarks the PROJ (CPU) forward transform from epsg_src to epsg_dst in double
 * precision, as the baseline for the cuproj benchmark.
 *
 * The input grid is generated on the device with make_input and copied to the host.
 * The grid side is floor(sqrt(requested)), so the number of points transformed can be
 * smaller than state.range(0); throughput is reported from the real input size.
 *
 * The PROJ transformation object and context are destroyed before returning. If the
 * transformation cannot be created, the benchmark is reported as an error and skipped.
 *
 * @param state Google Benchmark state; range(0) is the requested number of points
 */
void proj_wgs_to_utm_benchmark(benchmark::State& state)
{
  using T               = double;
  auto const num_points = state.range(0);

  auto const grid_side{static_cast<std::size_t>(sqrt(num_points))};

  auto d_input = make_input<T>(grid_side);
  auto input   = thrust::host_vector<coordinate<T>>(d_input);

  std::vector<PJ_COORD> pj_input(input.size());

  PJ_CONTEXT* C = proj_context_create();
  PJ* P         = proj_create_crs_to_crs(C, epsg_src, epsg_dst, nullptr);

  if (P == nullptr) {
    // Without a valid transformation there is nothing to measure
    state.SkipWithError("proj_create_crs_to_crs failed");
    proj_context_destroy(C);
    return;
  }

  for (auto _ : state) {
    // proj_trans_array overwrites pj_input in place, so refill it from the original
    // coordinates before every iteration, outside the timed region.
    state.PauseTiming();
    cuproj_test::convert_coordinates(input, pj_input);
    state.ResumeTiming();
    proj_trans_array(P, PJ_FWD, pj_input.size(), pj_input.data());
  }

  // Count the points that were actually transformed per iteration, not the requested count
  using count_type = decltype(state.iterations());
  state.SetItemsProcessed(static_cast<count_type>(pj_input.size()) * state.iterations());

  // Release the PROJ objects; the original leaked both on every benchmark run
  proj_destroy(P);
  proj_context_destroy(C);
}

// Fixture for the PROJ (CPU) baseline benchmark. It derives directly from the plain
// Google Benchmark fixture and adds no setup or teardown of its own.
class proj_utm_benchmark : public ::benchmark::Fixture {};

// Smallest and largest number of points benchmarked. Adjust these for GPUs/CPUs
// with more or less memory.
// In double precision each (x,y) point takes 16 bytes, doubled for input plus output:
// 10^8 points -> 3.2GB+, 10^9 points -> 32GB+.
// An H100 80GB has plenty of memory for 10^9 points.
constexpr int range_min{100};
constexpr int range_max{100'000'000};

// Define and register the PROJ (CPU) double-precision forward benchmark.
// Sizes run from range_min to range_max in multiples of 10 and times are reported in
// milliseconds. RangeMultiplier is set before Range so that it applies to the range.
BENCHMARK_DEFINE_F(proj_utm_benchmark, forward_double)(::benchmark::State& state)
{
proj_wgs_to_utm_benchmark(state);
}
BENCHMARK_REGISTER_F(proj_utm_benchmark, forward_double)
->RangeMultiplier(10)
->Range(range_min, range_max)
->Unit(benchmark::kMillisecond);

// Fixture for the cuproj (GPU) benchmarks. It derives from cuspatial::benchmark and adds
// nothing of its own.
class cuproj_utm_benchmark : public cuspatial::benchmark {};

// Defines and registers one cuproj benchmark called `name` that runs
// cuproj_wgs_to_utm_benchmark<type>. Sizes run from range_min to range_max in multiples
// of 10, times are reported in milliseconds, and UseManualTime() is requested because the
// benchmark body times itself with cuda_event_timer.
#define UTM_CUPROJ_BENCHMARK_DEFINE(name, type) \
BENCHMARK_DEFINE_F(cuproj_utm_benchmark, name)(::benchmark::State & state) \
{ \
cuproj_wgs_to_utm_benchmark<type>(state); \
} \
BENCHMARK_REGISTER_F(cuproj_utm_benchmark, name) \
->RangeMultiplier(10) \
->Range(range_min, range_max) \
->UseManualTime() \
->Unit(benchmark::kMillisecond);

// Single- and double-precision instantiations of the cuproj benchmark
UTM_CUPROJ_BENCHMARK_DEFINE(forward_float, float);
UTM_CUPROJ_BENCHMARK_DEFINE(forward_double, double);
49 changes: 49 additions & 0 deletions cpp/cuproj/include/cuproj_test/convert_coordinates.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <proj.h>

#include <algorithm>
#include <type_traits>

namespace cuproj_test {

// Copies coordinates between a vector of x-y structs and a vector of PJ_COORD, in
// either direction. Exactly one of the two element types must be PJ_COORD; any other
// combination is rejected at compile time.
//
// Results are written starting at out.begin() and `out` is never resized, so it must
// already hold at least in.size() elements.
template <typename InVector, typename OutVector>
void convert_coordinates(InVector const& in, OutVector& out)
{
  using src_coord = typename InVector::value_type;
  using dst_coord = typename OutVector::value_type;

  constexpr bool src_is_proj = std::is_same_v<src_coord, PJ_COORD>;
  constexpr bool dst_is_proj = std::is_same_v<dst_coord, PJ_COORD>;

  static_assert((dst_is_proj != src_is_proj), "Invalid coordinate vector conversion");

  if constexpr (src_is_proj) {
    // PJ_COORD -> x-y struct: take the xy members and cast to the output scalar type
    using scalar = typename dst_coord::value_type;
    std::transform(in.begin(), in.end(), out.begin(), [](auto const& pj) {
      return dst_coord{static_cast<scalar>(pj.xy.x), static_cast<scalar>(pj.xy.y)};
    });
  } else if constexpr (dst_is_proj) {
    // x-y struct -> PJ_COORD: the two remaining components are set to zero
    std::transform(in.begin(), in.end(), out.begin(), [](auto const& xy) {
      return PJ_COORD{xy.x, xy.y, 0, 0};
    });
  }
}

} // namespace cuproj_test
1 change: 1 addition & 0 deletions cpp/cuproj/include/cuproj_test/coordinate_generator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ struct grid_generator {
}
};

// Create a Vector containing a grid of coordinates between the min and max corners
template <typename Coord, typename Vector>
auto make_grid_array(Coord const& min_corner,
Coord const& max_corner,
Expand Down
Loading

0 comments on commit cb7ce8e

Please sign in to comment.