WGS84 to UTM benchmark (C++) (#1216)

Closes #1215 Depends on #1191 New benchmark for C++ API for WGS84->UTM transform. Below results were run on a single CPU core for Proj (ignore gbench's claims to the contrary), and on a single H100 80GB GPU. Machine was a DGX H100. ``` (rapids) coder ➜ ~/cuspatial $ CUDA_VISIBLE_DEVICES=1 cpp/build/latest/cuproj/benchmarks/WGS_TO_UTM_BENCH 2023-08-02T00:08:52+00:00 Running cpp/build/latest/cuproj/benchmarks/WGS_TO_UTM_BENCH Run on (224 X 3800 MHz CPU s) CPU Caches: L1 Data 48 KiB (x112) L1 Instruction 32 KiB (x112) L2 Unified 2048 KiB (x112) L3 Unified 107520 KiB (x2) Load Average: 1.70, 6.21, 14.43 --------------------------------------------------------------------------------------------------------------------- Benchmark Time CPU Iterations UserCounters... --------------------------------------------------------------------------------------------------------------------- proj_utm_benchmark/forward_double/100 0.013 ms 0.013 ms 57756 items_per_second=7.95314M/s proj_utm_benchmark/forward_double/1000 0.114 ms 0.114 ms 6118 items_per_second=8.73369M/s proj_utm_benchmark/forward_double/10000 1.24 ms 1.24 ms 588 items_per_second=8.07697M/s proj_utm_benchmark/forward_double/100000 12.0 ms 12.0 ms 58 items_per_second=8.35593M/s proj_utm_benchmark/forward_double/1000000 120 ms 120 ms 6 items_per_second=8.36301M/s proj_utm_benchmark/forward_double/10000000 1213 ms 1213 ms 1 items_per_second=8.24563M/s proj_utm_benchmark/forward_double/100000000 11977 ms 11976 ms 1 items_per_second=8.35038M/s proj_utm_benchmark/forward_double/1000000000 119680 ms 119677 ms 1 items_per_second=8.35579M/s cuproj_utm_benchmark/forward_float/100/manual_time 0.011 ms 0.040 ms 66283 items_per_second=9.47605M/s cuproj_utm_benchmark/forward_float/1000/manual_time 0.012 ms 0.041 ms 56799 items_per_second=81.2019M/s cuproj_utm_benchmark/forward_float/10000/manual_time 0.013 ms 0.042 ms 55571 items_per_second=793.482M/s cuproj_utm_benchmark/forward_float/100000/manual_time 0.013 ms 0.042 ms 53048 
items_per_second=7.5779G/s cuproj_utm_benchmark/forward_float/1000000/manual_time 0.027 ms 0.056 ms 25842 items_per_second=36.9063G/s cuproj_utm_benchmark/forward_float/10000000/manual_time 0.170 ms 0.198 ms 4130 items_per_second=58.8554G/s cuproj_utm_benchmark/forward_float/100000000/manual_time 1.60 ms 1.62 ms 439 items_per_second=62.6581G/s cuproj_utm_benchmark/forward_float/1000000000/manual_time 15.9 ms 15.9 ms 44 items_per_second=63.0518G/s cuproj_utm_benchmark/forward_double/100/manual_time 0.012 ms 0.041 ms 57960 items_per_second=8.30297M/s cuproj_utm_benchmark/forward_double/1000/manual_time 0.015 ms 0.044 ms 47605 items_per_second=68.0791M/s cuproj_utm_benchmark/forward_double/10000/manual_time 0.015 ms 0.044 ms 47353 items_per_second=676.864M/s cuproj_utm_benchmark/forward_double/100000/manual_time 0.016 ms 0.045 ms 43394 items_per_second=6.19684G/s cuproj_utm_benchmark/forward_double/1000000/manual_time 0.042 ms 0.070 ms 16621 items_per_second=23.7599G/s cuproj_utm_benchmark/forward_double/10000000/manual_time 0.304 ms 0.332 ms 2302 items_per_second=32.863G/s cuproj_utm_benchmark/forward_double/100000000/manual_time 2.93 ms 2.96 ms 240 items_per_second=34.087G/s cuproj_utm_benchmark/forward_double/1000000000/manual_time 29.3 ms 29.3 ms 24 items_per_second=34.13G/s Authors: - Mark Harris (https://github.com/harrism) Approvers: - Michael Wang (https://github.com/isVoid) URL: #1216
rapidsai · Aug 2, 2023 · cb7ce8e · cb7ce8e
1 parent fb240fe
commit cb7ce8e
Show file tree

Hide file tree

Showing 7 changed files with 312 additions and 47 deletions.
diff --git a/cpp/cuproj/benchmarks/CMakeLists.txt b/cpp/cuproj/benchmarks/CMakeLists.txt
@@ -24,32 +24,31 @@ add_library(cuproj_benchmark_common OBJECT
 target_compile_features(cuproj_benchmark_common PUBLIC cxx_std_17 cuda_std_17)
 
 target_link_libraries(cuproj_benchmark_common
-    PUBLIC benchmark::benchmark
-           cudf::cudftestutil
-           cuproj)
+  PUBLIC benchmark::benchmark cudf::cudftestutil cuproj)
 
 target_compile_options(cuproj_benchmark_common
-             PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUPROJ_CXX_FLAGS}>"
-                    "$<$<COMPILE_LANGUAGE:CUDA>:${CUPROJ_CUDA_FLAGS}>")
+  PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUPROJ_CXX_FLAGS}>"
+         "$<$<COMPILE_LANGUAGE:CUDA>:${CUPROJ_CUDA_FLAGS}>")
 
 target_include_directories(cuproj_benchmark_common
-             PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
-                    "$<BUILD_INTERFACE:${CUPROJ_SOURCE_DIR}>"
-                    "$<BUILD_INTERFACE:${CUPROJ_SOURCE_DIR}/src>")
+  PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
+         "$<BUILD_INTERFACE:${CUPROJ_SOURCE_DIR}>"
+         "$<BUILD_INTERFACE:${CUPROJ_SOURCE_DIR}/src>"
+         "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include>")
 
 function(ConfigureBench CMAKE_BENCH_NAME)
-    add_executable(${CMAKE_BENCH_NAME} ${ARGN})
-    set_target_properties(${CMAKE_BENCH_NAME}
-        PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUPROJ_BINARY_DIR}/benchmarks>"
-                   INSTALL_RPATH "\$ORIGIN/../../../lib"
-        )
-    target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE benchmark::benchmark_main cuproj_benchmark_common)
-    install(
-        TARGETS ${CMAKE_BENCH_NAME}
-        COMPONENT benchmark
-        DESTINATION bin/benchmarks/libcuproj
-        EXCLUDE_FROM_ALL
+  add_executable(${CMAKE_BENCH_NAME} ${ARGN})
+  set_target_properties(${CMAKE_BENCH_NAME}
+    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUPROJ_BINARY_DIR}/benchmarks>"
+               INSTALL_RPATH "\$ORIGIN/../../../lib"
     )
+  target_link_libraries(${CMAKE_BENCH_NAME} PRIVATE benchmark::benchmark_main cuproj_benchmark_common PROJ::proj)
+  install(
+    TARGETS ${CMAKE_BENCH_NAME}
+    COMPONENT benchmark
+    DESTINATION bin/benchmarks/libcuproj
+    EXCLUDE_FROM_ALL
+  )
 endfunction()
 
 # This function takes in a benchmark name and benchmark source for nvbench benchmarks and handles
@@ -61,8 +60,20 @@ function(ConfigureNVBench CMAKE_BENCH_NAME)
     PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUPROJ_BINARY_DIR}/benchmarks>"
                INSTALL_RPATH "\$ORIGIN/../../../lib"
   )
+  target_compile_features(${CMAKE_BENCH_NAME} PUBLIC cxx_std_17 cuda_std_17)
+
+  target_compile_options(${CMAKE_BENCH_NAME}
+    PUBLIC "$<$<COMPILE_LANGUAGE:CXX>:${CUPROJ_CXX_FLAGS}>"
+           "$<$<COMPILE_LANGUAGE:CUDA>:${CUPROJ_CUDA_FLAGS}>")
+
+  target_include_directories(${CMAKE_BENCH_NAME}
+    PUBLIC "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>"
+           "$<BUILD_INTERFACE:${CUPROJ_SOURCE_DIR}>"
+           "$<BUILD_INTERFACE:${CUPROJ_SOURCE_DIR}/src>"
+           "$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../../include>")
+
   target_link_libraries(
-    ${CMAKE_BENCH_NAME} PRIVATE cuproj_benchmark_common nvbench::main
+    ${CMAKE_BENCH_NAME} PRIVATE cuproj nvbench::main
   )
   install(
     TARGETS ${CMAKE_BENCH_NAME}
@@ -76,5 +87,4 @@ endfunction()
 ### benchmark sources #############################################################################
 ###################################################################################################
 
-ConfigureBench(TEST_BENCH
-    test.cu)
+ConfigureBench(WGS_TO_UTM_BENCH wgs_to_utm_bench.cu)
diff --git a/cpp/cuproj/benchmarks/fixture/benchmark_fixture.hpp b/cpp/cuproj/benchmarks/fixture/benchmark_fixture.hpp
@@ -0,0 +1,91 @@
+/*
+ * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <benchmark/benchmark.h>
+#include <rmm/mr/device/cuda_memory_resource.hpp>
+#include <rmm/mr/device/owning_wrapper.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/mr/device/pool_memory_resource.hpp>
+
+#include <memory>
+
+namespace cuspatial {
+
+namespace {
+
+/// Factory helper: a shared CUDA memory resource.
+inline auto make_cuda()
+{
+  return std::make_shared<rmm::mr::cuda_memory_resource>();
+}
+
+/// Factory helper: a pool memory resource that owns a CUDA memory resource as its upstream.
+inline auto make_pool()
+{
+  auto upstream = make_cuda();
+  return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(upstream);
+}
+
+}  // namespace
+
+/**
+ * @brief Base Google Benchmark fixture that runs benchmarks on an RMM pool memory resource.
+ *
+ * Deriving a benchmark fixture from this class makes device allocations go through a pool,
+ * so that memory allocation / deallocation overhead is kept out of the measurements.
+ *
+ * Google Benchmark calls SetUp and TearDown automatically: SetUp creates the pool and
+ * installs it as the current device resource, and TearDown restores the initial resource
+ * and releases the pool.
+ *
+ * Example:
+ *
+ * template <class T>
+ * class my_benchmark : public cuspatial::benchmark {
+ * public:
+ *   using TypeParam = T;
+ * };
+ *
+ * Then:
+ *
+ * BENCHMARK_TEMPLATE_DEFINE_F(my_benchmark, my_test_name, int)
+ *   (::benchmark::State& state) {
+ *     for (auto _ : state) {
+ *       // benchmark stuff
+ *     }
+ * }
+ *
+ * BENCHMARK_REGISTER_F(my_benchmark, my_test_name)->Range(128, 512);
+ */
+class benchmark : public ::benchmark::Fixture {
+ public:
+  /// Create the pool resource and make it the current device resource.
+  void SetUp(const ::benchmark::State&) override
+  {
+    mr = make_pool();
+    rmm::mr::set_current_device_resource(mr.get());
+  }
+
+  /// Reset the current device resource to the initial resource, then release the pool.
+  void TearDown(const ::benchmark::State&) override
+  {
+    rmm::mr::set_current_device_resource(nullptr);
+    mr.reset();
+  }
+
+  // Forward the non-const overloads to the const ones; overriding both avoids
+  // partial override warnings (see benchmark/benchmark.h).
+  void SetUp(::benchmark::State& st) override
+  {
+    SetUp(static_cast<const ::benchmark::State&>(st));
+  }
+  void TearDown(::benchmark::State& st) override
+  {
+    TearDown(static_cast<const ::benchmark::State&>(st));
+  }
+
+  /// Pool resource owned by the fixture between SetUp and TearDown.
+  std::shared_ptr<rmm::mr::device_memory_resource> mr;
+};
+
+};  // namespace cuspatial
diff --git a/cpp/cuproj/benchmarks/test.cu b/cpp/cuproj/benchmarks/test.cu
diff --git a/cpp/cuproj/benchmarks/wgs_to_utm_bench.cu b/cpp/cuproj/benchmarks/wgs_to_utm_bench.cu
@@ -0,0 +1,135 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cuproj/projection_factories.hpp>
+#include <cuspatial/geometry/vec_2d.hpp>
+
+#include <cuproj_test/convert_coordinates.hpp>
+#include <cuproj_test/coordinate_generator.cuh>
+
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+#include <benchmarks/synchronization/synchronization.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_vector.hpp>
+
+#include <thrust/host_vector.h>
+
+#include <type_traits>
+
+template <typename T>
+using coordinate = typename cuspatial::vec_2d<T>;
+
+static char const* epsg_src = "EPSG:4326";
+static char const* epsg_dst = "EPSG:32756";
+
+/**
+ * @brief Build the benchmark input: a device vector holding a grid of coordinates.
+ *
+ * The grid spans a fixed bounding box around Sydney Harbour and is requested from
+ * `cuproj_test::make_grid_array` with `grid_side` as both grid dimensions.
+ *
+ * @tparam T Floating-point type of the coordinate components.
+ * @param grid_side Size passed for each of the two grid dimensions.
+ * @return An `rmm::device_vector<coordinate<T>>` containing the grid.
+ */
+template <typename T>
+auto make_input(std::size_t grid_side)
+{
+  using device_coordinates = rmm::device_vector<coordinate<T>>;
+
+  // Corners of the bounding box around Sydney Harbour
+  coordinate<T> const lower{-33.9, 151.2};
+  coordinate<T> const upper{-33.7, 151.3};
+
+  return cuproj_test::make_grid_array<coordinate<T>, device_coordinates>(
+    lower, upper, grid_side, grid_side);
+}
+
+/**
+ * @brief Benchmark body for the cuproj forward transform from `epsg_src` to `epsg_dst`.
+ *
+ * Input and output vectors and the projection object are created once, outside the
+ * measured loop; each iteration performs one `transform` call over the whole input.
+ *
+ * @tparam T Floating-point type of the coordinate components.
+ * @param state Google Benchmark state; `state.range(0)` is the requested number of points.
+ */
+template <typename T>
+static void cuproj_wgs_to_utm_benchmark(benchmark::State& state)
+{
+  auto const point_count = state.range(0);
+
+  // The input is a square grid, so use the integer square root as the side length.
+  auto const side = static_cast<std::size_t>(sqrt(point_count));
+
+  auto src = make_input<T>(side);
+  auto dst = rmm::device_vector<coordinate<T>>(src.size());
+
+  auto projection = cuproj::make_projection<coordinate<T>>(epsg_src, epsg_dst);
+
+  for (auto _ : state) {
+    // NOTE(review): cuda_event_timer is declared in synchronization.hpp (not shown here);
+    // presumably it reports the elapsed GPU time of this scope to `state` -- confirm.
+    cuda_event_timer timer(state, true);
+    projection.transform(
+      src.begin(), src.end(), dst.begin(), cuproj::direction::FORWARD, rmm::cuda_stream_default);
+  }
+
+  state.SetItemsProcessed(point_count * state.iterations());
+}
+
+/**
+ * @brief CPU baseline: benchmark PROJ's `proj_trans_array` for `epsg_src` -> `epsg_dst`.
+ *
+ * The input grid comes from make_input(), is copied to the host, and is converted to
+ * PJ_COORD before every timed call. The conversion is excluded from the measurement with
+ * PauseTiming() / ResumeTiming(); it is repeated each iteration because `proj_trans_array`
+ * is handed only `pj_input`, which it overwrites with the transformed coordinates.
+ *
+ * If the PROJ transformation cannot be created, the benchmark is skipped with an error.
+ *
+ * @param state Google Benchmark state; `state.range(0)` is the requested number of points.
+ */
+void proj_wgs_to_utm_benchmark(benchmark::State& state)
+{
+  using T               = double;
+  auto const num_points = state.range(0);
+
+  auto const grid_side{static_cast<std::size_t>(sqrt(num_points))};
+
+  auto d_input = make_input<T>(grid_side);
+  auto input   = thrust::host_vector<coordinate<T>>(d_input);
+
+  std::vector<PJ_COORD> pj_input(input.size());
+
+  PJ_CONTEXT* C = proj_context_create();
+  PJ* P         = proj_create_crs_to_crs(C, epsg_src, epsg_dst, nullptr);
+
+  if (P == nullptr) {
+    // Do not hand a null transformation to proj_trans_array; report and stop instead.
+    proj_context_destroy(C);
+    state.SkipWithError("proj_create_crs_to_crs failed");
+    return;
+  }
+
+  for (auto _ : state) {
+    state.PauseTiming();
+    cuproj_test::convert_coordinates(input, pj_input);
+    state.ResumeTiming();
+    proj_trans_array(P, PJ_FWD, pj_input.size(), pj_input.data());
+  }
+
+  // NOTE(review): the item count uses the requested num_points, while the grid is built
+  // from the truncated square root -- confirm whether input.size() should be used instead.
+  state.SetItemsProcessed(num_points * state.iterations());
+
+  // Release the PROJ transformation and context so they do not leak across benchmark runs.
+  proj_destroy(P);
+  proj_context_destroy(C);
+}
+
+class proj_utm_benchmark : public ::benchmark::Fixture {};  // fixture for the CPU PROJ baseline
+
+// Edit these for GPUs/CPUs with larger or smaller memory.
+// For double precision, it's 16 bytes per (x,y) point, x2 for input and output
+// 10^8 points -> 3.2GB+, 10^9 points -> 32GB+
+// H100 80GB is plenty for 10^9 points
+// Bounds of the point-count sweep given to Range() by both registrations below.
+constexpr int range_min = 100;
+constexpr int range_max = 100'000'000;
+// CPU baseline: point counts step by 10x from range_min to range_max, reported in milliseconds.
+BENCHMARK_DEFINE_F(proj_utm_benchmark, forward_double)(::benchmark::State& state)
+{
+  proj_wgs_to_utm_benchmark(state);
+}
+BENCHMARK_REGISTER_F(proj_utm_benchmark, forward_double)
+  ->RangeMultiplier(10)
+  ->Range(range_min, range_max)
+  ->Unit(benchmark::kMillisecond);
+// GPU benchmarks derive from cuspatial::benchmark (declared in fixture/benchmark_fixture.hpp).
+class cuproj_utm_benchmark : public cuspatial::benchmark {};
+// Defines and registers cuproj_wgs_to_utm_benchmark<type> as `name`, with manual timing enabled.
+#define UTM_CUPROJ_BENCHMARK_DEFINE(name, type)                              \
+  BENCHMARK_DEFINE_F(cuproj_utm_benchmark, name)(::benchmark::State & state) \
+  {                                                                          \
+    cuproj_wgs_to_utm_benchmark<type>(state);                                \
+  }                                                                          \
+  BENCHMARK_REGISTER_F(cuproj_utm_benchmark, name)                           \
+    ->RangeMultiplier(10)                                                    \
+    ->Range(range_min, range_max)                                            \
+    ->UseManualTime()                                                        \
+    ->Unit(benchmark::kMillisecond);
+// One GPU benchmark per coordinate precision.
+UTM_CUPROJ_BENCHMARK_DEFINE(forward_float, float);
+UTM_CUPROJ_BENCHMARK_DEFINE(forward_double, double);
diff --git a/cpp/cuproj/include/cuproj_test/convert_coordinates.hpp b/cpp/cuproj/include/cuproj_test/convert_coordinates.hpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <proj.h>
+
+#include <algorithm>
+#include <type_traits>
+
+namespace cuproj_test {
+
+/**
+ * @brief Convert coordinates from an x-y struct to a PJ_COORD struct or vice versa.
+ *
+ * Exactly one of the two containers must hold PJ_COORD elements; any other combination is
+ * rejected at compile time. Elements are written starting at `out.begin()`; `out` is not
+ * resized, so it must already have room for `in.size()` elements.
+ *
+ * @tparam InVector  Container type of the source coordinates.
+ * @tparam OutVector Container type of the destination coordinates.
+ * @param in  Source coordinates.
+ * @param out Destination container that receives the converted coordinates.
+ */
+template <typename InVector, typename OutVector>
+void convert_coordinates(InVector const& in, OutVector& out)
+{
+  using src_type = typename InVector::value_type;
+  using dst_type = typename OutVector::value_type;
+
+  constexpr bool src_is_proj = std::is_same_v<src_type, PJ_COORD>;
+  constexpr bool dst_is_proj = std::is_same_v<dst_type, PJ_COORD>;
+
+  static_assert((dst_is_proj != src_is_proj), "Invalid coordinate vector conversion");
+
+  if constexpr (src_is_proj) {
+    // PJ_COORD -> x-y struct: read the xy view and cast to the destination component type.
+    using T = typename dst_type::value_type;
+    std::transform(in.begin(), in.end(), out.begin(), [](auto const& c) {
+      return dst_type{static_cast<T>(c.xy.x), static_cast<T>(c.xy.y)};
+    });
+  } else {
+    // x-y struct -> PJ_COORD: the static_assert above guarantees the destination is PJ_COORD.
+    // The two remaining components are set to zero.
+    std::transform(
+      in.begin(), in.end(), out.begin(), [](auto const& c) { return PJ_COORD{c.x, c.y, 0, 0}; });
+  }
+}
+
+}  // namespace cuproj_test
diff --git a/cpp/cuproj/include/cuproj_test/coordinate_generator.cuh b/cpp/cuproj/include/cuproj_test/coordinate_generator.cuh
@@ -48,6 +48,7 @@ struct grid_generator {
   }
 };
 
+// Create a Vector containing a grid of coordinates between the min and max corners
 template <typename Coord, typename Vector>
 auto make_grid_array(Coord const& min_corner,
                      Coord const& max_corner,