From 6d4a017fbf728cec329c60150afc752da58d3313 Mon Sep 17 00:00:00 2001
From: Tobias Ribizel
Date: Wed, 3 Nov 2021 21:26:16 +0100
Subject: [PATCH] review updates

Co-authored-by: Yuhsiang Tsai
Co-authored-by: Terry Cojean
---
 BENCHMARKING.md                      |   4 +-
 benchmark/CMakeLists.txt             |  24 +--
 benchmark/utils/cuda_linops.cu       | 311 +++++++++++++--------------
 benchmark/utils/formats.hpp          | 127 ++++++-----
 benchmark/utils/hip_linops.hip.cpp   | 128 +++++------
 benchmark/utils/sparselib_linops.hpp |  57 +++--
 benchmark/utils/timer.hpp            |   8 +-
 7 files changed, 334 insertions(+), 325 deletions(-)

diff --git a/BENCHMARKING.md b/BENCHMARKING.md
index bc419f57386..f62540264cd 100644
--- a/BENCHMARKING.md
+++ b/BENCHMARKING.md
@@ -287,12 +287,12 @@ The supported environment variables are described in the following list:
 * `PRECONDS={jacobi,ic,ilu,paric,parict,parilu,parilut,ic-isai,ilu-isai,paric-isai,parict-isai,parilu-isai,parilut-isai,none}`
   the preconditioners to use for either `solver` or `preconditioner` benchmarks.
   Multiple options can be passed to this variable. Default is `none`.
-* `FORMATS={csr,coo,ell,hybrid,sellp,hybridxx,cusp_xx,hipsp_xx}` the matrix
+* `FORMATS={csr,coo,ell,hybrid,sellp,hybridxx,cusparse_xx,hipsparse_xx}` the matrix
   formats to benchmark for the `spmv` phase of the benchmark. Run
   `${ginkgo_build_dir}/benchmark/spmv/spmv --help` for a full list. If needed,
   multiple options for hybrid with different optimization parameters are
   available. Depending on the libraries available at build time, vendor
-  library formats (cuSPARSE with `cusp_` prefix or hipSPARSE with `hipsp_`
+  library formats (cuSPARSE with `cusparse_` prefix or hipSPARSE with `hipsparse_`
   prefix) can be used as well. Multiple options can be passed. The default is
   `csr,coo,ell,hybrid,sellp`.
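As an illustration (not part of the patch, and assuming the `run_all_benchmarks.sh` driver and the `EXECUTOR` variable documented elsewhere in BENCHMARKING.md), a run restricted to the renamed vendor formats could be launched as

    EXECUTOR=cuda FORMATS="csr,cusparse_csr,cusparse_gcoo" ./run_all_benchmarks.sh

with the corresponding `hipsparse_*` values taking the place of `cusparse_*` on AMD hardware.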
* `SOLVERS={bicgstab,bicg,cg,cgs,fcg,gmres,cb_gmres_{keep,reduce1,reduce2,integer,ireduce1,ireduce2},lower_trs,upper_trs}` diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 80d9838a29a..af8d38e3eb3 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -10,18 +10,16 @@ function(ginkgo_benchmark_add_tuning_maybe name) endif() endfunction() -function(ginkgo_benchmark_cusp_linops type def) +function(ginkgo_benchmark_cusparse_linops type def) add_library(cusparse_linops_${type} utils/cuda_linops.cu) # make the dependency public to catch issues target_compile_definitions(cusparse_linops_${type} PUBLIC ${def}) target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE}) target_include_directories(cusparse_linops_${type} SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS}) - if(CMAKE_CUDA_COMPILER_VERSION GREATER_EQUAL "9.2") - target_compile_definitions(cusparse_linops_${type} PRIVATE ALLOWMP=1) - endif() + target_compile_definitions(cusparse_linops_${type} PRIVATE ALLOWMP=1) endfunction() -function(ginkgo_benchmark_hipsp_linops type def) +function(ginkgo_benchmark_hipsparse_linops type def) add_library(hipsparse_linops_${type} utils/hip_linops.hip.cpp) target_compile_definitions(hipsparse_linops_${type} PUBLIC ${def}) EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS) @@ -99,19 +97,19 @@ endfunction(ginkgo_add_typed_benchmark_executables) if (GINKGO_BUILD_CUDA) enable_language(CUDA) - ginkgo_benchmark_cusp_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION) - ginkgo_benchmark_cusp_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION) - ginkgo_benchmark_cusp_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) - ginkgo_benchmark_cusp_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) + ginkgo_benchmark_cusparse_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION) + ginkgo_benchmark_cusparse_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION) + ginkgo_benchmark_cusparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) + ginkgo_benchmark_cusparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) add_library(cuda_timer utils/cuda_timer.cu) target_link_libraries(cuda_timer ginkgo ${CUDA_RUNTIME_LIBS}) target_include_directories(cuda_timer SYSTEM PRIVATE ${CUDA_INCLUDE_DIRS}) endif() if (GINKGO_BUILD_HIP) - ginkgo_benchmark_hipsp_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION) - ginkgo_benchmark_hipsp_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION) - ginkgo_benchmark_hipsp_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) - ginkgo_benchmark_hipsp_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) + ginkgo_benchmark_hipsparse_linops(d GKO_BENCHMARK_USE_DOUBLE_PRECISION) + ginkgo_benchmark_hipsparse_linops(s GKO_BENCHMARK_USE_SINGLE_PRECISION) + ginkgo_benchmark_hipsparse_linops(z GKO_BENCHMARK_USE_DOUBLE_COMPLEX_PRECISION) + ginkgo_benchmark_hipsparse_linops(c GKO_BENCHMARK_USE_SINGLE_COMPLEX_PRECISION) add_library(hip_timer utils/hip_timer.hip.cpp) EXECUTE_PROCESS(COMMAND ${HIP_PATH}/bin/hipconfig --cpp_config OUTPUT_VARIABLE HIP_CXX_FLAGS) set_target_properties(hip_timer PROPERTIES COMPILE_FLAGS ${HIP_CXX_FLAGS}) diff --git a/benchmark/utils/cuda_linops.cu b/benchmark/utils/cuda_linops.cu index 1977f0e382e..5e2cf680183 100644 --- a/benchmark/utils/cuda_linops.cu +++ b/benchmark/utils/cuda_linops.cu @@ -49,27 +49,27 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "cuda/base/types.hpp" -class cusp_csr {}; -class cusp_csrmp {}; -class cusp_csrmm {}; -class cusp_hybrid {}; -class cusp_coo {}; -class cusp_ell {}; -class cusp_gcsr {}; -class cusp_gcoo {}; -class cusp_csrex {}; -class cusp_gcsr2 {}; +class cusparse_csr {}; +class cusparse_csrmp {}; +class cusparse_csrmm {}; +class cusparse_hybrid {}; +class cusparse_coo {}; +class cusparse_ell {}; +class cusparse_gcsr {}; +class cusparse_gcoo {}; +class cusparse_csrex {}; +class cusparse_gcsr2 {}; namespace detail { -class CuspBase : public gko::LinOp { +class CusparseBase : public gko::LinOp { public: cusparseMatDescr_t get_descr() const { return this->descr_.get(); } - // Return shared pointer not plain pointer such that CuspGenericSpMV uses - // gko::Array to allocate buffer. + // Return shared pointer not plain pointer such that CusparseGenericSpMV + // uses gko::Array to allocate buffer. std::shared_ptr get_gpu_exec() const { return gpu_exec_; @@ -82,8 +82,8 @@ protected: GKO_NOT_IMPLEMENTED; } - CuspBase(std::shared_ptr exec, - const gko::dim<2>& size = gko::dim<2>{}) + CusparseBase(std::shared_ptr exec, + const gko::dim<2>& size = gko::dim<2>{}) : gko::LinOp(exec, size) { gpu_exec_ = std::dynamic_pointer_cast(exec); @@ -93,11 +93,11 @@ protected: this->initialize_descr(); } - ~CuspBase() = default; + ~CusparseBase() = default; - CuspBase(const CuspBase& other) = delete; + CusparseBase(const CusparseBase& other) = delete; - CuspBase& operator=(const CuspBase& other) + CusparseBase& operator=(const CusparseBase& other) { if (this != &other) { gko::LinOp::operator=(other); @@ -127,17 +127,18 @@ private: }; -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +#if CUDA_VERSION < 11000 template -class CuspCsrmp - : public gko::EnableLinOp, CuspBase>, +class CusparseCsrmp + : public gko::EnableLinOp, + CusparseBase>, public gko::ReadableFromMatrixData, - public gko::EnableCreateMethod> { - friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + public gko::EnableCreateMethod> { + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; public: using csr = gko::matrix::Csr; @@ -173,9 +174,9 @@ protected: &scalars.get_const_data()[1], dx); } - CuspCsrmp(std::shared_ptr exec, - const gko::dim<2>& size = gko::dim<2>{}) - : gko::EnableLinOp(exec, size), + CusparseCsrmp(std::shared_ptr exec, + const gko::dim<2>& size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) @@ -192,12 +193,12 @@ private: template -class CuspCsr - : public gko::EnableLinOp, CuspBase>, - public gko::EnableCreateMethod>, +class CusparseCsr + : public gko::EnableLinOp, CusparseBase>, + public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { - friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; public: using csr = gko::matrix::Csr; @@ -233,9 +234,9 @@ protected: &scalars.get_const_data()[1], dx); } - CuspCsr(std::shared_ptr exec, - const gko::dim<2>& size = gko::dim<2>{}) - : gko::EnableLinOp(exec, size), + CusparseCsr(std::shared_ptr exec, + const gko::dim<2>& size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) @@ -252,12 +253,13 @@ private: template -class CuspCsrmm - : public gko::EnableLinOp, CuspBase>, - public gko::EnableCreateMethod>, +class 
CusparseCsrmm + : public gko::EnableLinOp, + CusparseBase>, + public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { - friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; public: using csr = gko::matrix::Csr; @@ -294,9 +296,9 @@ protected: dense_x->get_size()[0]); } - CuspCsrmm(std::shared_ptr exec, - const gko::dim<2>& size = gko::dim<2>{}) - : gko::EnableLinOp(exec, size), + CusparseCsrmm(std::shared_ptr exec, + const gko::dim<2>& size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) @@ -311,17 +313,18 @@ private: }; -#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +#endif // CUDA_VERSION < 11000 template -class CuspCsrEx - : public gko::EnableLinOp, CuspBase>, - public gko::EnableCreateMethod>, +class CusparseCsrEx + : public gko::EnableLinOp, + CusparseBase>, + public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { - friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; public: using csr = gko::matrix::Csr; @@ -338,9 +341,9 @@ public: return csr_->get_num_stored_elements(); } - CuspCsrEx(const CuspCsrEx& other) = delete; + CusparseCsrEx(const CusparseCsrEx& other) = delete; - CuspCsrEx& operator=(const CuspCsrEx& other) = default; + CusparseCsrEx& operator=(const CusparseCsrEx& other) = default; protected: void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override @@ -378,9 +381,9 @@ protected: } - CuspCsrEx(std::shared_ptr exec, - const gko::dim<2>& size = gko::dim<2>{}) - : gko::EnableLinOp(exec, size), + CusparseCsrEx(std::shared_ptr exec, + const gko::dim<2>& size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), trans_(CUSPARSE_OPERATION_NON_TRANSPOSE), @@ -399,21 +402,22 @@ private: }; -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +#if CUDA_VERSION < 11000 template -class CuspHybrid +class CusparseHybrid : public gko::EnableLinOp< - CuspHybrid, CuspBase>, + CusparseHybrid, + CusparseBase>, public gko::EnableCreateMethod< - CuspHybrid>, + CusparseHybrid>, public gko::ReadableFromMatrixData { - friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; public: using csr = gko::matrix::Csr; @@ -435,21 +439,21 @@ public: Threshold, Partition); } - ~CuspHybrid() override + ~CusparseHybrid() override { const auto id = this->get_gpu_exec()->get_device_id(); try { gko::cuda::device_guard g{id}; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyHybMat(hyb_)); } catch (const std::exception& e) { - std::cerr << "Error when unallocating CuspHybrid hyb_ matrix: " + std::cerr << "Error when unallocating CusparseHybrid hyb_ matrix: " << e.what() << std::endl; } } - CuspHybrid(const CuspHybrid& other) = delete; + CusparseHybrid(const CusparseHybrid& other) = delete; - CuspHybrid& operator=(const CuspHybrid& other) = default; + CusparseHybrid& operator=(const CusparseHybrid& other) = default; protected: void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override @@ -467,9 +471,9 @@ protected: &scalars.get_const_data()[1], dx); } - CuspHybrid(std::shared_ptr exec, - const gko::dim<2>& size = gko::dim<2>{}) - : gko::EnableLinOp(exec, size), + 
CusparseHybrid(std::shared_ptr exec, + const gko::dim<2>& size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) { const auto id = this->get_gpu_exec()->get_device_id(); @@ -486,20 +490,19 @@ private: }; -#endif // defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +#endif // CUDA_VERSION < 11000 -#if defined(CUDA_VERSION) && \ - (CUDA_VERSION >= 11000 || \ - ((CUDA_VERSION >= 10020) && !(defined(_WIN32) || defined(__CYGWIN__)))) +#if CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10020) && !(defined(_WIN32) || defined(__CYGWIN__))) template -void cusp_generic_spmv(std::shared_ptr gpu_exec, - const cusparseSpMatDescr_t mat, - const gko::Array& scalars, - const gko::LinOp* b, gko::LinOp* x, - cusparseOperation_t trans, cusparseSpMVAlg_t alg) +void cusparse_generic_spmv(std::shared_ptr gpu_exec, + const cusparseSpMatDescr_t mat, + const gko::Array& scalars, + const gko::LinOp* b, gko::LinOp* x, + cusparseOperation_t trans, cusparseSpMVAlg_t alg) { cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type(); using gko::kernels::cuda::as_culibs_type; @@ -536,13 +539,14 @@ void cusp_generic_spmv(std::shared_ptr gpu_exec, template -class CuspGenericCsr - : public gko::EnableLinOp, - CuspBase>, - public gko::EnableCreateMethod>, +class CusparseGenericCsr + : public gko::EnableLinOp, + CusparseBase>, + public gko::EnableCreateMethod< + CusparseGenericCsr>, public gko::ReadableFromMatrixData { - friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; public: using csr = gko::matrix::Csr; @@ -570,32 +574,33 @@ public: return csr_->get_num_stored_elements(); } - ~CuspGenericCsr() override + ~CusparseGenericCsr() override { const auto id = this->get_gpu_exec()->get_device_id(); try { gko::cuda::device_guard g{id}; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_)); } catch (const std::exception& e) { - std::cerr << "Error when unallocating CuspGenericCsr mat_ matrix: " - << e.what() << std::endl; + std::cerr + << "Error when unallocating CusparseGenericCsr mat_ matrix: " + << e.what() << std::endl; } } - CuspGenericCsr(const CuspGenericCsr& other) = delete; + CusparseGenericCsr(const CusparseGenericCsr& other) = delete; - CuspGenericCsr& operator=(const CuspGenericCsr& other) = default; + CusparseGenericCsr& operator=(const CusparseGenericCsr& other) = default; protected: void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override { - cusp_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_, - Alg); + cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_, + Alg); } - CuspGenericCsr(std::shared_ptr exec, - const gko::dim<2>& size = gko::dim<2>{}) - : gko::EnableLinOp(exec, size), + CusparseGenericCsr(std::shared_ptr exec, + const gko::dim<2>& size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) @@ -613,12 +618,13 @@ private: template -class CuspGenericCoo - : public gko::EnableLinOp, CuspBase>, - public gko::EnableCreateMethod>, +class CusparseGenericCoo + : public gko::EnableLinOp, + CusparseBase>, + public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { - friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; public: using coo = gko::matrix::Coo; @@ -646,32 +652,33 @@ public: return 
coo_->get_num_stored_elements(); } - ~CuspGenericCoo() override + ~CusparseGenericCoo() override { const auto id = this->get_gpu_exec()->get_device_id(); try { gko::cuda::device_guard g{id}; GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_)); } catch (const std::exception& e) { - std::cerr << "Error when unallocating CuspGenericCoo mat_ matrix: " - << e.what() << std::endl; + std::cerr + << "Error when unallocating CusparseGenericCoo mat_ matrix: " + << e.what() << std::endl; } } - CuspGenericCoo(const CuspGenericCoo& other) = delete; + CusparseGenericCoo(const CusparseGenericCoo& other) = delete; - CuspGenericCoo& operator=(const CuspGenericCoo& other) = default; + CusparseGenericCoo& operator=(const CusparseGenericCoo& other) = default; protected: void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override { - cusp_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_, - CUSPARSE_MV_ALG_DEFAULT); + cusparse_generic_spmv(this->get_gpu_exec(), mat_, scalars, b, x, trans_, + CUSPARSE_MV_ALG_DEFAULT); } - CuspGenericCoo(std::shared_ptr exec, - const gko::dim<2>& size = gko::dim<2>{}) - : gko::EnableLinOp(exec, size), + CusparseGenericCoo(std::shared_ptr exec, + const gko::dim<2>& size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), coo_(std::move(coo::create(exec))), trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) {} @@ -686,66 +693,58 @@ private: }; -#endif // defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || ((CUDA_VERSION >= - // 10020) && !(defined(_WIN32) || defined(__CYGWIN__)))) +#endif // CUDA_VERSION >= 11000 || ((CUDA_VERSION >= 10020) && + // !(defined(_WIN32) || defined(__CYGWIN__))) } // namespace detail -// Some shortcuts +IMPL_CREATE_SPARSELIB_LINOP(cusparse_csrex, detail::CusparseCsrEx) +#if CUDA_VERSION < 11000 +IMPL_CREATE_SPARSELIB_LINOP(cusparse_csr, detail::CusparseCsr); +IMPL_CREATE_SPARSELIB_LINOP(cusparse_csrmp, + detail::CusparseCsrmp); +IMPL_CREATE_SPARSELIB_LINOP(cusparse_csrmm, + detail::CusparseCsrmm); +#else // CUDA_VERSION >= 11000 +IMPL_CREATE_SPARSELIB_LINOP(cusparse_csr, + detail::CusparseGenericCsr); +STUB_CREATE_SPARSELIB_LINOP(cusparse_csrmp); +STUB_CREATE_SPARSELIB_LINOP(cusparse_csrmm); +#endif // CUDA_VERSION >= 11000 -#define IMPL_CREATE_SPARSELIB_LINOP(_type, ...) 
\ - template <> \ - std::unique_ptr create_sparselib_linop<_type>( \ - std::shared_ptr exec) \ - { \ - return __VA_ARGS__::create(exec); \ - } -#define STUB_CREATE_SPARSELIB_LINOP(_type) \ - template <> \ - std::unique_ptr create_sparselib_linop<_type>( \ - std::shared_ptr exec) GKO_NOT_IMPLEMENTED; - -IMPL_CREATE_SPARSELIB_LINOP(cusp_csrex, detail::CuspCsrEx) - -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) -IMPL_CREATE_SPARSELIB_LINOP(cusp_csr, detail::CuspCsr) -IMPL_CREATE_SPARSELIB_LINOP(cusp_csrmp, detail::CuspCsrmp) -IMPL_CREATE_SPARSELIB_LINOP(cusp_csrmm, detail::CuspCsrmm) -#else -STUB_CREATE_SPARSELIB_LINOP(cusp_csr) -STUB_CREATE_SPARSELIB_LINOP(cusp_csrmp) -STUB_CREATE_SPARSELIB_LINOP(cusp_csrmm) -#endif // not (defined(CUDA_VERSION) && (CUDA_VERSION < 11000)) - - -#if defined(CUDA_VERSION) && \ - (CUDA_VERSION >= 11000 || \ - ((CUDA_VERSION >= 10020) && !(defined(_WIN32) || defined(__CYGWIN__)))) -IMPL_CREATE_SPARSELIB_LINOP(cusp_gcsr, detail::CuspGenericCsr) +#if CUDA_VERSION >= 11000 || \ + ((CUDA_VERSION >= 10020) && !(defined(_WIN32) || defined(__CYGWIN__))) +IMPL_CREATE_SPARSELIB_LINOP(cusparse_gcsr, + detail::CusparseGenericCsr); IMPL_CREATE_SPARSELIB_LINOP( - cusp_gcsr2, detail::CuspGenericCsr) -IMPL_CREATE_SPARSELIB_LINOP(cusp_gcoo, detail::CuspGenericCoo) + cusparse_gcsr2, + detail::CusparseGenericCsr); +IMPL_CREATE_SPARSELIB_LINOP(cusparse_gcoo, + detail::CusparseGenericCoo); #else -STUB_CREATE_SPARSELIB_LINOP(cusp_gcsr) -STUB_CREATE_SPARSELIB_LINOP(cusp_gcsr2) -STUB_CREATE_SPARSELIB_LINOP(cusp_gcoo) -#endif // not (defined(CUDA_VERSION) && (CUDA_VERSION >= 11000 || - // ((CUDA_VERSION >= 10020) && !(defined(_WIN32) || - // defined(__CYGWIN__))))) +STUB_CREATE_SPARSELIB_LINOP(cusparse_gcsr); +STUB_CREATE_SPARSELIB_LINOP(cusparse_gcsr2); +STUB_CREATE_SPARSELIB_LINOP(cusparse_gcoo); +#endif // CUDA_VERSION < 11000 && ((CUDA_VERSION < 10020) || (defined(_WIN32) + // && defined(__CYGWIN__)))) -#if defined(CUDA_VERSION) && (CUDA_VERSION < 11000) +#if CUDA_VERSION < 11000 IMPL_CREATE_SPARSELIB_LINOP( - cusp_coo, detail::CuspHybrid) + cusparse_coo, + detail::CusparseHybrid); IMPL_CREATE_SPARSELIB_LINOP( - cusp_ell, detail::CuspHybrid) -IMPL_CREATE_SPARSELIB_LINOP(cusp_hybrid, detail::CuspHybrid) -#else -STUB_CREATE_SPARSELIB_LINOP(cusp_coo) -STUB_CREATE_SPARSELIB_LINOP(cusp_ell) -STUB_CREATE_SPARSELIB_LINOP(cusp_hybrid) -#endif // not (defined(CUDA_VERSION) && (CUDA_VERSION < 11000)) + cusparse_ell, + detail::CusparseHybrid); +IMPL_CREATE_SPARSELIB_LINOP(cusparse_hybrid, + detail::CusparseHybrid); +#else // CUDA_VERSION >= 11000 +IMPL_CREATE_SPARSELIB_LINOP(cusparse_coo, + detail::CusparseGenericCoo); +STUB_CREATE_SPARSELIB_LINOP(cusparse_ell); +STUB_CREATE_SPARSELIB_LINOP(cusparse_hybrid); +#endif // CUDA_VERSION >= 11000 diff --git a/benchmark/utils/formats.hpp b/benchmark/utils/formats.hpp index 55aa003b361..b1a4a3583f8 100644 --- a/benchmark/utils/formats.hpp +++ b/benchmark/utils/formats.hpp @@ -58,64 +58,73 @@ std::string available_format = "hybrid60, hybrid80, hybridlimit0, hybridlimit25, hybridlimit33, " "hybridminstorage" #ifdef HAS_CUDA - ", cusp_csr, cusp_csrex, cusp_coo" - ", cusp_csrmp, cusp_csrmm, cusp_ell, cusp_hybrid" - ", cusp_gcsr, cusp_gcsr2, cusp_gcoo" + ", cusparse_csr, cusparse_csrex, cusparse_coo" + ", cusparse_csrmp, cusparse_csrmm, cusparse_ell, cusparse_hybrid" + ", cusparse_gcsr, cusparse_gcsr2, cusparse_gcoo" #endif // HAS_CUDA #ifdef HAS_HIP - ", hipsp_csr, hipsp_csrmm, hipsp_coo, hipsp_ell, hipsp_hybrid" + ", hipsparse_csr, hipsparse_csrmm, 
hipsparse_coo, hipsparse_ell, " + "hipsparse_hybrid" #endif // HAS_HIP ".\n"; std::string format_description = - "coo: Coordinate storage. The CUDA kernel uses the load-balancing approach " - "suggested in Flegar et al.: Overcoming Load Imbalance for Irregular " - "Sparse Matrices.\n" - "csr: Compressed Sparse Row storage. Ginkgo implementation with automatic " - "strategy.\n" + "coo: Coordinate storage. The GPU kernels use the load-balancing " + "approach\n" + " suggested in Flegar et al.: Overcoming Load Imbalance for\n" + " Irregular Sparse Matrices.\n" + "csr: Compressed Sparse Row storage. Ginkgo implementation with\n" + " automatic strategy.\n" "csrc: Ginkgo's CSR implementation with automatic stategy.\n" "csri: Ginkgo's CSR implementation with inbalance strategy.\n" "csrm: Ginkgo's CSR implementation with merge_path strategy.\n" "csrs: Ginkgo's CSR implementation with sparselib strategy.\n" - "ell: Ellpack format according to Bell and Garland: Efficient Sparse " - "Matrix-Vector Multiplication on CUDA.\n" - "ell-mixed: Mixed Precision Ellpack format according to Bell and Garland: " - "Efficient Sparse Matrix-Vector Multiplication on CUDA.\n" + "ell: Ellpack format according to Bell and Garland: Efficient Sparse\n" + " Matrix-Vector Multiplication on CUDA.\n" + "ell-mixed: Mixed Precision Ellpack format according to Bell and Garland:\n" + " Efficient Sparse Matrix-Vector Multiplication on CUDA.\n" "sellp: Sliced Ellpack uses a default block size of 32.\n" - "hybrid: Hybrid uses ell and coo to represent the matrix.\n" - "hybrid0, hybrid25, hybrid33, hybrid40, hybrid60, hybrid80: Hybrid uses " - "the row distribution to decide the partition.\n" - "hybridlimit0, hybridlimit25, hybrid33: Add the upper bound on the ell " - "part of hybrid0, hybrid25, hybrid33.\n" - "hybridminstorage: Hybrid uses the minimal storage to store the matrix." + "hybrid: Hybrid uses ELL and COO to represent the matrix.\n" + "hybrid0, hybrid25, hybrid33, hybrid40, hybrid60, hybrid80:\n" + " Use 0%, 25%, ... quantiles of the row length distribution\n" + " to choose number of entries stored in the ELL part.\n" + "hybridlimit0, hybridlimit25, hybrid33: Similar to hybrid0\n" + " but with an additional absolute limit on the number of entries\n" + " per row stored in ELL.\n" + "hybridminstorage: Use the minimal storage to store the matrix." #ifdef HAS_CUDA "\n" - "cusp_coo: use cusparseXhybmv with a CUSPARSE_HYB_PARTITION_USER " - "partition.\n" - "cusp_csr: benchmark CuSPARSE with the cusparseXcsrmv function.\n" - "cusp_ell: use cusparseXhybmv with CUSPARSE_HYB_PARTITION_MAX partition.\n" - "cusp_csrmp: benchmark CuSPARSE with the cusparseXcsrmv_mp function.\n" - "cusp_csrmm: benchmark CuSPARSE with the cusparseXcsrmv_mm function.\n" - "cusp_hybrid: benchmark CuSPARSE spmv with cusparseXhybmv and an automatic " - "partition.\n" - "cusp_csrex: benchmark CuSPARSE with the cusparseXcsrmvEx function." 
- "\n" - "cusp_gcsr: benchmark CuSPARSE with the generic csr with default " - "algorithm.\n" - "cusp_gcsr2: benchmark CuSPARSE with the generic csr with " - "CUSPARSE_CSRMV_ALG2.\n" - "cusp_gcoo: benchmark CuSPARSE with the generic coo with default " - "algorithm.\n" + "cusparse_coo: cuSPARSE COO SpMV, using cusparseXhybmv with \n" + " CUSPARSE_HYB_PARTITION_USER for CUDA < 10.2, or\n" + " the Generic API otherwise\n" + "cusparse_csr: cuSPARSE CSR SpMV, using cusparseXcsrmv for CUDA < 10.2,\n" + " or the Generic API with default algorithm otherwise\n" + "cusparse_csrex: cuSPARSE CSR SpMV using cusparseXcsrmvEx\n" + "cusparse_ell: cuSPARSE ELL SpMV using cusparseXhybmv with\n" + " CUSPARSE_HYB_PARTITION_MAX, available for CUDA < 11.0\n" + "cusparse_csrmp: cuSPARSE CSR SpMV using cusparseXcsrmv_mp,\n" + " available for CUDA < 11.0\n" + "cusparse_csrmm: cuSPARSE CSR SpMV using cusparseXcsrmv_mm,\n" + " available for CUDA < 11.0\n" + "cusparse_hybrid: cuSPARSE Hybrid SpMV using cusparseXhybmv\n" + " with an automatic partition, available for CUDA < 11.0\n" + "cusparse_gcsr: cuSPARSE CSR SpMV using Generic API with default\n" + " algorithm, available for CUDA >= 10.2\n" + "cusparse_gcsr2: cuSPARSE CSR SpMV using Generic API with\n" + " CUSPARSE_CSRMV_ALG2, available for CUDA >= 10.2\n" + "cusparse_gcoo: cuSPARSE Generic API with default COO SpMV,\n" + " available for CUDA >= 10.2\n" #endif // HAS_CUDA #ifdef HAS_HIP "\n" - "hipsp_csr: benchmark HipSPARSE with the hipsparseXcsrmv function.\n" - "hipsp_csrmm: benchmark HipSPARSE with the hipsparseXcsrmv_mm function.\n" - "hipsp_hybrid: benchmark HipSPARSE spmv with hipsparseXhybmv and an " - "automatic partition.\n" - "hipsp_coo: use hipsparseXhybmv with a HIPSPARSE_HYB_PARTITION_USER " - "partition.\n" - "hipsp_ell: use hipsparseXhybmv with HIPSPARSE_HYB_PARTITION_MAX partition." 
+ "hipsparse_csr: hipSPARSE CSR SpMV using hipsparseXcsrmv\n" + "hipsparse_csrmm: hipSPARSE CSR SpMV using hipsparseXcsrmv_mm\n" + "hipsparse_hybrid: hipSPARSE CSR SpMV using hipsparseXhybmv\n" + " with an automatic partition\n" + "hipsparse_coo: hipSPARSE CSR SpMV using hipsparseXhybmv\n" + " with HIPSPARSE_HYB_PARTITION_USER\n" + "hipsparse_ell: hipSPARSE CSR SpMV using hipsparseXhybmv\n" + " with HIPSPARSE_HYB_PARTITION_MAX" #endif // HAS_HIP ; @@ -299,25 +308,25 @@ const std::map( return mat; }}, #ifdef HAS_CUDA - {"cusp_csr", read_splib_matrix_from_data}, - {"cusp_csrmp", read_splib_matrix_from_data}, - {"cusp_csrmm", read_splib_matrix_from_data}, - {"cusp_hybrid", read_splib_matrix_from_data}, - {"cusp_coo", read_splib_matrix_from_data}, - {"cusp_ell", read_splib_matrix_from_data}, - {"cusp_csr", read_splib_matrix_from_data}, - {"cusp_coo", read_splib_matrix_from_data}, - {"cusp_csrex", read_splib_matrix_from_data}, - {"cusp_gcsr", read_splib_matrix_from_data}, - {"cusp_gcsr2", read_splib_matrix_from_data}, - {"cusp_gcoo", read_splib_matrix_from_data}, + {"cusparse_csr", read_splib_matrix_from_data}, + {"cusparse_csrmp", read_splib_matrix_from_data}, + {"cusparse_csrmm", read_splib_matrix_from_data}, + {"cusparse_hybrid", read_splib_matrix_from_data}, + {"cusparse_coo", read_splib_matrix_from_data}, + {"cusparse_ell", read_splib_matrix_from_data}, + {"cusparse_csr", read_splib_matrix_from_data}, + {"cusparse_coo", read_splib_matrix_from_data}, + {"cusparse_csrex", read_splib_matrix_from_data}, + {"cusparse_gcsr", read_splib_matrix_from_data}, + {"cusparse_gcsr2", read_splib_matrix_from_data}, + {"cusparse_gcoo", read_splib_matrix_from_data}, #endif // HAS_CUDA #ifdef HAS_HIP - {"hipsp_csr", read_splib_matrix_from_data}, - {"hipsp_csrmm", read_splib_matrix_from_data}, - {"hipsp_hybrid", read_splib_matrix_from_data}, - {"hipsp_coo", read_splib_matrix_from_data}, - {"hipsp_ell", read_splib_matrix_from_data}, + {"hipsparse_csr", read_splib_matrix_from_data}, + {"hipsparse_csrmm", read_splib_matrix_from_data}, + {"hipsparse_hybrid", read_splib_matrix_from_data}, + {"hipsparse_coo", read_splib_matrix_from_data}, + {"hipsparse_ell", read_splib_matrix_from_data}, #endif // HAS_HIP {"hybrid", read_matrix_from_data}, {"hybrid0", diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp index 707b383e9ec..2b4418a2dcf 100644 --- a/benchmark/utils/hip_linops.hip.cpp +++ b/benchmark/utils/hip_linops.hip.cpp @@ -45,11 +45,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "hip/base/hipsparse_bindings.hip.hpp" -class hipsp_csr {}; -class hipsp_csrmm {}; -class hipsp_hybrid {}; -class hipsp_coo {}; -class hipsp_ell {}; +class hipsparse_csr {}; +class hipsparse_csrmm {}; +class hipsparse_hybrid {}; +class hipsparse_coo {}; +class hipsparse_ell {}; namespace detail { @@ -58,7 +58,7 @@ namespace detail { struct hipsparseMatDescr; -class HipspBase : public gko::LinOp { +class HipsparseBase : public gko::LinOp { public: hipsparseMatDescr_t get_descr() const { return this->descr_.get(); } @@ -71,8 +71,8 @@ class HipspBase : public gko::LinOp { GKO_NOT_IMPLEMENTED; } - HipspBase(std::shared_ptr exec, - const gko::dim<2>& size = gko::dim<2>{}) + HipsparseBase(std::shared_ptr exec, + const gko::dim<2>& size = gko::dim<2>{}) : gko::LinOp(exec, size) { gpu_exec_ = std::dynamic_pointer_cast(exec); @@ -82,11 +82,11 @@ class HipspBase : public gko::LinOp { this->initialize_descr(); } - ~HipspBase() = default; + ~HipsparseBase() = default; - HipspBase(const HipspBase& other) = delete; + HipsparseBase(const HipsparseBase& other) = delete; - HipspBase& operator=(const HipspBase& other) + HipsparseBase& operator=(const HipsparseBase& other) { if (this != &other) { gko::LinOp::operator=(other); @@ -119,12 +119,13 @@ class HipspBase : public gko::LinOp { template -class HipspCsr - : public gko::EnableLinOp, HipspBase>, - public gko::EnableCreateMethod>, +class HipsparseCsr + : public gko::EnableLinOp, + HipsparseBase>, + public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { - friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; public: using csr = gko::matrix::Csr; @@ -160,9 +161,9 @@ class HipspCsr &scalars.get_const_data()[1], dx); } - HipspCsr(std::shared_ptr exec, - const gko::dim<2>& size = gko::dim<2>{}) - : gko::EnableLinOp(exec, size), + HipsparseCsr(std::shared_ptr exec, + const gko::dim<2>& size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) @@ -179,12 +180,13 @@ class HipspCsr template -class HipspCsrmm - : public gko::EnableLinOp, HipspBase>, - public gko::EnableCreateMethod>, +class HipsparseCsrmm + : public gko::EnableLinOp, + HipsparseBase>, + public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { - friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; public: using csr = gko::matrix::Csr; @@ -221,9 +223,9 @@ class HipspCsrmm dense_x->get_size()[0]); } - HipspCsrmm(std::shared_ptr exec, - const gko::dim<2>& size = gko::dim<2>{}) - : gko::EnableLinOp(exec, size), + HipsparseCsrmm(std::shared_ptr exec, + const gko::dim<2>& size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), csr_(std::move( csr::create(exec, std::make_shared()))), trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) @@ -242,14 +244,15 @@ template -class HipspHybrid +class HipsparseHybrid : public gko::EnableLinOp< - HipspHybrid, HipspBase>, + HipsparseHybrid, + HipsparseBase>, public gko::EnableCreateMethod< - HipspHybrid>, + HipsparseHybrid>, public gko::ReadableFromMatrixData { - friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::EnableCreateMethod; + friend class gko::EnablePolymorphicObject; public: using csr = gko::matrix::Csr; @@ -271,21 +274,21 @@ class HipspHybrid Threshold, 
Partition); } - ~HipspHybrid() override + ~HipsparseHybrid() override { const auto id = this->get_gpu_exec()->get_device_id(); try { gko::hip::device_guard g{id}; GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseDestroyHybMat(hyb_)); } catch (const std::exception& e) { - std::cerr << "Error when unallocating HipspHybrid hyb_ matrix: " + std::cerr << "Error when unallocating HipsparseHybrid hyb_ matrix: " << e.what() << std::endl; } } - HipspHybrid(const HipspHybrid& other) = delete; + HipsparseHybrid(const HipsparseHybrid& other) = delete; - HipspHybrid& operator=(const HipspHybrid& other) = default; + HipsparseHybrid& operator=(const HipsparseHybrid& other) = default; protected: void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override @@ -303,9 +306,9 @@ class HipspHybrid &scalars.get_const_data()[1], dx); } - HipspHybrid(std::shared_ptr exec, - const gko::dim<2>& size = gko::dim<2>{}) - : gko::EnableLinOp(exec, size), + HipsparseHybrid(std::shared_ptr exec, + const gko::dim<2>& size = gko::dim<2>{}) + : gko::EnableLinOp(exec, size), trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) { const auto id = this->get_gpu_exec()->get_device_id(); @@ -325,39 +328,14 @@ class HipspHybrid } // namespace detail -template <> -std::unique_ptr create_sparselib_linop( - std::shared_ptr exec) -{ - return detail::HipspCsr::create(exec); -} - -template <> -std::unique_ptr create_sparselib_linop( - std::shared_ptr exec) -{ - return detail::HipspCsrmm::create(exec); -} - -template <> -std::unique_ptr create_sparselib_linop( - std::shared_ptr exec) -{ - return detail::HipspHybrid::create(exec); -} - -template <> -std::unique_ptr create_sparselib_linop( - std::shared_ptr exec) -{ - return detail::HipspHybrid::create(exec); -} - -template <> -std::unique_ptr create_sparselib_linop( - std::shared_ptr exec) -{ - return detail::HipspHybrid::create(exec); -} +IMPL_CREATE_SPARSELIB_LINOP(hipsparse_csr, detail::HipsparseCsr); +IMPL_CREATE_SPARSELIB_LINOP(hipsparse_csrmm, + detail::HipsparseCsrmm); +IMPL_CREATE_SPARSELIB_LINOP( + hipsparse_coo, + detail::HipsparseHybrid); +IMPL_CREATE_SPARSELIB_LINOP( + hipsparse_ell, + detail::HipsparseHybrid); +IMPL_CREATE_SPARSELIB_LINOP(hipsparse_hybrid, + detail::HipsparseHybrid); diff --git a/benchmark/utils/sparselib_linops.hpp b/benchmark/utils/sparselib_linops.hpp index a953fb5b4f2..7949a92d2b5 100644 --- a/benchmark/utils/sparselib_linops.hpp +++ b/benchmark/utils/sparselib_linops.hpp @@ -33,32 +33,51 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef GKO_BENCHMARK_UTILS_SPARSELIB_LINOPS_HPP_ #define GKO_BENCHMARK_UTILS_SPARSELIB_LINOPS_HPP_ + #include +#include #include -#include "ginkgo/core/base/exception_helpers.hpp" -class cusp_csr; -class cusp_csrmp; -class cusp_csrmm; -class cusp_hybrid; -class cusp_coo; -class cusp_ell; -class cusp_gcsr; -class cusp_gcoo; -class cusp_csrex; -class cusp_gcsr; -class cusp_gcsr2; -class cusp_gcoo; +#define IMPL_CREATE_SPARSELIB_LINOP(_type, ...) 
\ + template <> \ + std::unique_ptr create_sparselib_linop<_type>( \ + std::shared_ptr exec) \ + { \ + return __VA_ARGS__::create(exec); \ + } \ + static_assert(true, \ + "This assert is used to counter the false positive extra " \ + "semi-colon warnings") + + +#define STUB_CREATE_SPARSELIB_LINOP(_type) \ + template <> \ + std::unique_ptr create_sparselib_linop<_type>( \ + std::shared_ptr exec) GKO_NOT_IMPLEMENTED + + +class cusparse_csr; +class cusparse_csrmp; +class cusparse_csrmm; +class cusparse_hybrid; +class cusparse_coo; +class cusparse_ell; +class cusparse_gcsr; +class cusparse_gcoo; +class cusparse_csrex; +class cusparse_gcsr; +class cusparse_gcsr2; +class cusparse_gcoo; -class hipsp_csr; -class hipsp_csrmm; -class hipsp_hybrid; -class hipsp_coo; -class hipsp_ell; +class hipsparse_csr; +class hipsparse_csrmm; +class hipsparse_hybrid; +class hipsparse_coo; +class hipsparse_ell; template @@ -66,4 +85,4 @@ std::unique_ptr create_sparselib_linop( std::shared_ptr exec); -#endif // GKO_BENCHMARK_UTILS_SPARSELIB_LINOPS_HPP_ \ No newline at end of file +#endif // GKO_BENCHMARK_UTILS_SPARSELIB_LINOPS_HPP_ diff --git a/benchmark/utils/timer.hpp b/benchmark/utils/timer.hpp index 548bc91a510..31406b4d3de 100644 --- a/benchmark/utils/timer.hpp +++ b/benchmark/utils/timer.hpp @@ -34,12 +34,18 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_BENCHMARK_UTILS_TIMER_HPP_ -#include "benchmark/utils/timer_impl.hpp" +#include + + +#include #include +#include "benchmark/utils/timer_impl.hpp" + + // Command-line arguments DEFINE_bool(gpu_timer, false, "use gpu timer based on event. It is valid only when "
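To make the wrapper pattern behind these renames easier to follow, here is a minimal, illustrative sketch (not taken from the patch) of how benchmark code obtains one of the vendor SpMV operators through `create_sparselib_linop`. The template arguments are not visible in this excerpt, so the sketch assumes the executor parameter is `std::shared_ptr<const gko::Executor>`, and it assumes linking against the per-precision `cusparse_linops_*` libraries built in benchmark/CMakeLists.txt.

// Illustrative usage sketch, assuming the declaration in
// benchmark/utils/sparselib_linops.hpp reads roughly:
//   template <typename OpTag>
//   std::unique_ptr<gko::LinOp> create_sparselib_linop(
//       std::shared_ptr<const gko::Executor> exec);
#include <memory>
#include <utility>

#include <ginkgo/ginkgo.hpp>

#include "benchmark/utils/sparselib_linops.hpp"

std::unique_ptr<gko::LinOp> make_cusparse_csr_spmv(
    std::shared_ptr<const gko::Executor> exec)
{
    // The empty tag class cusparse_csr selects the specialization generated
    // by IMPL_CREATE_SPARSELIB_LINOP in cuda_linops.cu; tags whose backend is
    // unavailable for the current CUDA version are generated by
    // STUB_CREATE_SPARSELIB_LINOP instead and throw gko::NotImplemented.
    return create_sparselib_linop<cusparse_csr>(std::move(exec));
}

The returned `gko::LinOp` is then filled with matrix data and benchmarked through the usual `apply()` interface, which mirrors what the `read_splib_matrix_from_data` entries in formats.hpp do.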