diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ab76637eab0..da7afaf6b55 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -341,38 +341,6 @@ build/cuda102/nompi/intel/cuda/debug/static: CUDA_ARCH: 35 # cuda 11.0 and friends on HoreKa with tests -build/cuda110/mvapich2/gcc/cuda/debug/shared: - extends: - - .build_template - - .default_variables - - .full_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 - variables: - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_MPI: "ON" - BUILD_TYPE: "Debug" - FAST_TESTS: "ON" - CUDA_ARCH: 80 - USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}" - KEEP_CONTAINER: "ON" - USE_SLURM: 0 - -test/cuda110/mvapich2/gcc/cuda/debug/shared: - extends: - - .horeka_test_template - - .default_variables - - .full_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 - variables: - USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}" - SLURM_PARTITION: "accelerated" - SLURM_GRES: "gpu:1" - SLURM_TIME: "00:45:00" - dependencies: null - needs: [ "build/cuda110/mvapich2/gcc/cuda/debug/shared" ] - - build/cuda110/nompi/clang/cuda/release/static: extends: - .build_template @@ -533,13 +501,15 @@ build/amd/openmpi/clang/rocm502/release/shared: extends: - .build_and_test_template - .default_variables - - .quick_test_condition - - .use_gko-rocm502-openmpi-gnu11-llvm11 + - .full_test_condition + - .use_gko-rocm502-openmpi-gnu11-llvm11-multi-gpu variables: C_COMPILER: "clang" CXX_COMPILER: "clang++" BUILD_OMP: "ON" BUILD_HIP: "ON" + BUILD_MPI: "ON" + MPI_AS_ROOT: "ON" RUN_EXAMPLES: "ON" BUILD_TYPE: "Release" @@ -834,7 +804,6 @@ iwyu: variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_CUDA: "HIP" EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_IWYU=ON' allow_failure: yes diff --git a/.gitlab/image.yml b/.gitlab/image.yml index 04aabfebcdb..0f8128ea2f1 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -84,6 +84,12 @@ - amdci - gpu +.use_gko-rocm502-openmpi-gnu11-llvm11-multi-gpu: + image: ginkgohub/rocm:502-openmpi-gnu11-llvm11 + tags: 
+ - private_ci + - nla-gpu + .use_gko-oneapi-cpu: image: ginkgohub/oneapi:latest tags: diff --git a/CMakeLists.txt b/CMakeLists.txt index 9376cef03aa..23cac48d3c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,8 @@ option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ bac option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON) option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." ON) option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF) +option(GINKGO_FORCE_GPU_AWARE_MPI "Assert that the MPI library is GPU aware. This forces Ginkgo to assume that GPU aware functionality is available (OFF (default) or ON), but may fail + catastrophically in case the MPI implementation is not GPU Aware, and GPU aware functionality has been forced" OFF) # load executor-specific configuration if(GINKGO_BUILD_CUDA) @@ -107,10 +109,10 @@ include(cmake/build_type_helpers.cmake) include(cmake/build_helpers.cmake) include(cmake/install_helpers.cmake) -if (MSVC) +if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj") endif() -if (MINGW OR CYGWIN) +if(MINGW OR CYGWIN) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wa,-mbig-obj") endif() @@ -204,8 +206,31 @@ else() message(STATUS "HWLOC is being forcibly switched off") endif() +set(GINKGO_HAVE_GPU_AWARE_MPI OFF) +set(GINKGO_FORCE_SPMV_BLOCKING_COMM OFF) if(GINKGO_BUILD_MPI) find_package(MPI REQUIRED) + if(GINKGO_FORCE_GPU_AWARE_MPI) + set(GINKGO_HAVE_GPU_AWARE_MPI ON) + else() + set(GINKGO_HAVE_GPU_AWARE_MPI OFF) + endif() + + try_run(uses_openmpi gko_result_unused + ${PROJECT_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/cmake/openmpi_test.cpp + LINK_LIBRARIES MPI::MPI_CXX + RUN_OUTPUT_VARIABLE openmpi_version + ) + if(uses_openmpi) + if(openmpi_version VERSION_LESS "4.1") + message(WARNING + "OpenMPI v4.0.x has a bug that forces us to use blocking communication in our distributed " + "matrix class. 
To enable faster, non-blocking communication, consider updating your OpenMPI version or " + "switch to a different vendor.") + set(GINKGO_FORCE_SPMV_BLOCKING_COMM ON) + endif() + endif() endif() # Try to find the third party packages before using our subdirectories @@ -241,21 +266,21 @@ add_subdirectory(common) # Import list of unified kernel source files if(GINKGO_BUILD_CUDA) add_subdirectory(cuda) # High-performance kernels for NVIDIA GPUs endif() -if (GINKGO_BUILD_REFERENCE) +if(GINKGO_BUILD_REFERENCE) add_subdirectory(reference) # Reference kernel implementations endif() if(GINKGO_BUILD_HIP) add_subdirectory(hip) # High-performance kernels for AMD or NVIDIA GPUs endif() -if (GINKGO_BUILD_DPCPP) +if(GINKGO_BUILD_DPCPP) add_subdirectory(dpcpp) # High-performance DPC++ kernels endif() -if (GINKGO_BUILD_OMP) +if(GINKGO_BUILD_OMP) add_subdirectory(omp) # High-performance omp kernels endif() add_subdirectory(core) # Core Ginkgo types and top-level functions add_subdirectory(include) # Public API self-contained check -if (GINKGO_BUILD_TESTS) +if(GINKGO_BUILD_TESTS) add_subdirectory(test) # Tests running on all executors endif() @@ -323,7 +348,7 @@ endif() configure_file(${Ginkgo_SOURCE_DIR}/cmake/ginkgo.pc.in ${Ginkgo_BINARY_DIR}/ginkgo.pc.in @ONLY) file(GENERATE OUTPUT ${Ginkgo_BINARY_DIR}/ginkgo_$.pc - INPUT ${Ginkgo_BINARY_DIR}/ginkgo.pc.in) + INPUT ${Ginkgo_BINARY_DIR}/ginkgo.pc.in) # WINDOWS NVCC has " inside the string, add escape character # to avoid config problem. 
@@ -356,7 +381,7 @@ endif() file(MAKE_DIRECTORY "${GINKGO_TEST_INSTALL_BIN_DIR}") file(MAKE_DIRECTORY "${GINKGO_TEST_EXPORTBUILD_BIN_DIR}") set(TOOLSET "") -if (NOT "${CMAKE_GENERATOR_TOOLSET}" STREQUAL "") +if(NOT "${CMAKE_GENERATOR_TOOLSET}" STREQUAL "") set(TOOLSET "-T${CMAKE_GENERATOR_TOOLSET}") endif() add_custom_target(test_install diff --git a/benchmark/utils/cuda_linops.cu b/benchmark/utils/cuda_linops.cu index 9eeb309017e..502ccb89c7c 100644 --- a/benchmark/utils/cuda_linops.cu +++ b/benchmark/utils/cuda_linops.cu @@ -44,7 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/sparselib_linops.hpp" #include "benchmark/utils/types.hpp" #include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/device_guard.hpp" #include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/types.hpp" @@ -102,12 +101,12 @@ protected: void initialize_descr() { - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto exec = this->get_gpu_exec(); + auto guard = exec->get_scoped_device_id_guard(); this->descr_ = handle_manager( gko::kernels::cuda::cusparse::create_mat_descr(), - [id](cusparseMatDescr_t descr) { - gko::cuda::device_guard g{id}; + [exec](cusparseMatDescr_t descr) { + auto guard = exec->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::destroy(descr); }); } @@ -130,7 +129,7 @@ class CusparseCsrmp public gko::ReadableFromMatrixData, public gko::EnableCreateMethod> { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -166,8 +165,7 @@ protected: auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmv_mp( 
this->get_gpu_exec()->get_cusparse_handle(), trans_, this->get_size()[0], this->get_size()[1], @@ -205,7 +203,7 @@ class CusparseCsr public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -241,8 +239,7 @@ protected: auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmv( this->get_gpu_exec()->get_cusparse_handle(), trans_, this->get_size()[0], this->get_size()[1], @@ -281,7 +278,7 @@ class CusparseCsrmm public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -317,8 +314,7 @@ protected: auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmm( this->get_gpu_exec()->get_cusparse_handle(), trans_, this->get_size()[0], dense_b->get_size()[1], this->get_size()[1], @@ -361,7 +357,7 @@ class CusparseCsrEx public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -404,8 +400,7 @@ protected: ValueType beta = gko::zero(); gko::size_type buffer_size = 0; - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); auto handle = 
this->get_gpu_exec()->get_cusparse_handle(); // This function seems to require the pointer mode to be set to HOST. // Ginkgo use pointer mode DEVICE by default, so we change this @@ -468,7 +463,7 @@ class CusparseHybrid CusparseHybrid>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -492,8 +487,7 @@ public: t_csr->read(data); this->set_size(t_csr->get_size()); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::csr2hyb( this->get_gpu_exec()->get_cusparse_handle(), this->get_size()[0], this->get_size()[1], this->get_descr(), t_csr->get_const_values(), @@ -503,9 +497,8 @@ public: ~CusparseHybrid() override { - const auto id = this->get_gpu_exec()->get_device_id(); try { - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyHybMat(hyb_)); } catch (const std::exception& e) { std::cerr << "Error when unallocating CusparseHybrid hyb_ matrix: " @@ -525,8 +518,7 @@ protected: auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmv( this->get_gpu_exec()->get_cusparse_handle(), trans_, &scalars.get_const_data()[0], this->get_descr(), hyb_, db, @@ -542,8 +534,7 @@ protected: : gko::EnableLinOp(exec, size), trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) { - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateHybMat(&hyb_)); } @@ -576,8 +567,7 @@ void 
cusparse_generic_spmv(std::shared_ptr gpu_exec, auto dense_x = gko::as>(x); auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = gpu_exec->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = gpu_exec->get_scoped_device_id_guard(); cusparseDnVecDescr_t vecb, vecx; GKO_ASSERT_NO_CUSPARSE_ERRORS( cusparseCreateDnVec(&vecx, dense_x->get_num_stored_elements(), @@ -612,7 +602,7 @@ class CusparseGenericCsr CusparseGenericCsr>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -653,9 +643,8 @@ public: ~CusparseGenericCsr() override { - const auto id = this->get_gpu_exec()->get_device_id(); try { - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_)); } catch (const std::exception& e) { std::cerr @@ -705,7 +694,7 @@ class CusparseGenericCoo public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using coo = gko::matrix::Coo; @@ -746,9 +735,8 @@ public: ~CusparseGenericCoo() override { - const auto id = this->get_gpu_exec()->get_device_id(); try { - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_)); } catch (const std::exception& e) { std::cerr diff --git a/benchmark/utils/cuda_timer.cu b/benchmark/utils/cuda_timer.cu index c4222dcaa73..3ccdd2d8b3c 100644 --- a/benchmark/utils/cuda_timer.cu +++ b/benchmark/utils/cuda_timer.cu @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/utils/timer_impl.hpp" -#include "cuda/base/device_guard.hpp" /** @@ -61,8 +60,7 @@ public: { assert(exec != nullptr); exec_ = exec; - id_ = exec_->get_device_id(); - gko::cuda::device_guard g{id_}; + auto guard = exec_->get_scoped_device_id_guard(); GKO_ASSERT_NO_CUDA_ERRORS(cudaEventCreate(&start_)); GKO_ASSERT_NO_CUDA_ERRORS(cudaEventCreate(&stop_)); } @@ -71,14 +69,14 @@ protected: void tic_impl() override { exec_->synchronize(); - gko::cuda::device_guard g{id_}; + auto guard = exec_->get_scoped_device_id_guard(); // Currently, gko::CudaExecutor always use default stream. GKO_ASSERT_NO_CUDA_ERRORS(cudaEventRecord(start_)); } double toc_impl() override { - gko::cuda::device_guard g{id_}; + auto guard = exec_->get_scoped_device_id_guard(); // Currently, gko::CudaExecutor always use default stream. GKO_ASSERT_NO_CUDA_ERRORS(cudaEventRecord(stop_)); GKO_ASSERT_NO_CUDA_ERRORS(cudaEventSynchronize(stop_)); @@ -95,7 +93,6 @@ private: std::shared_ptr exec_; cudaEvent_t start_; cudaEvent_t stop_; - int id_; }; diff --git a/benchmark/utils/dpcpp_linops.dp.cpp b/benchmark/utils/dpcpp_linops.dp.cpp index 522a0f205fa..e03d8520ea3 100644 --- a/benchmark/utils/dpcpp_linops.dp.cpp +++ b/benchmark/utils/dpcpp_linops.dp.cpp @@ -126,7 +126,7 @@ class OnemklCsr OnemklCsr>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using Csr = gko::matrix::Csr; diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp index bd7d4e7650a..ae0a1a2d82d 100644 --- a/benchmark/utils/hip_linops.hip.cpp +++ b/benchmark/utils/hip_linops.hip.cpp @@ -41,7 +41,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/utils/sparselib_linops.hpp" #include "benchmark/utils/types.hpp" -#include "hip/base/device_guard.hip.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" @@ -94,13 +93,13 @@ class HipsparseBase : public gko::LinOp { void initialize_descr() { - const auto id = this->get_gpu_exec()->get_device_id(); - gko::hip::device_guard g{id}; + auto exec = this->get_gpu_exec(); + auto guard = exec->get_scoped_device_id_guard(); this->descr_ = handle_manager( reinterpret_cast( gko::kernels::hip::hipsparse::create_mat_descr()), - [id](hipsparseMatDescr* descr) { - gko::hip::device_guard g{id}; + [exec](hipsparseMatDescr* descr) { + auto guard = exec->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::destroy(descr); }); } @@ -120,7 +119,7 @@ class HipsparseCsr public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -156,8 +155,7 @@ class HipsparseCsr auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::hip::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::spmv( this->get_gpu_exec()->get_hipsparse_handle(), trans_, this->get_size()[0], this->get_size()[1], @@ -196,7 +194,7 @@ class HipsparseCsrmm public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -232,8 +230,7 @@ class HipsparseCsrmm auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::hip::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); 
gko::kernels::hip::hipsparse::spmm( this->get_gpu_exec()->get_hipsparse_handle(), trans_, this->get_size()[0], dense_b->get_size()[1], this->get_size()[1], @@ -277,7 +274,7 @@ class HipsparseHybrid HipsparseHybrid>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -301,8 +298,7 @@ class HipsparseHybrid t_csr->read(data); this->set_size(t_csr->get_size()); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::hip::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::csr2hyb( this->get_gpu_exec()->get_hipsparse_handle(), this->get_size()[0], this->get_size()[1], this->get_descr(), t_csr->get_const_values(), @@ -312,9 +308,8 @@ class HipsparseHybrid ~HipsparseHybrid() override { - const auto id = this->get_gpu_exec()->get_device_id(); try { - gko::hip::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseDestroyHybMat(hyb_)); } catch (const std::exception& e) { std::cerr << "Error when unallocating HipsparseHybrid hyb_ matrix: " @@ -334,8 +329,7 @@ class HipsparseHybrid auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::hip::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::spmv( this->get_gpu_exec()->get_hipsparse_handle(), trans_, &scalars.get_const_data()[0], this->get_descr(), hyb_, db, @@ -351,8 +345,7 @@ class HipsparseHybrid : gko::EnableLinOp(exec, size), trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) { - const auto id = this->get_gpu_exec()->get_device_id(); - gko::hip::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); 
GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateHybMat(&hyb_)); } diff --git a/benchmark/utils/hip_timer.hip.cpp b/benchmark/utils/hip_timer.hip.cpp index 2a6e6fe9c29..168c46ed3f8 100644 --- a/benchmark/utils/hip_timer.hip.cpp +++ b/benchmark/utils/hip_timer.hip.cpp @@ -34,7 +34,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/timer_impl.hpp" -#include "hip/base/device_guard.hip.hpp" /** @@ -60,8 +59,7 @@ class HipTimer : public Timer { { assert(exec != nullptr); exec_ = exec; - id_ = exec_->get_device_id(); - gko::hip::device_guard g{id_}; + auto guard = exec_->get_scoped_device_id_guard(); GKO_ASSERT_NO_HIP_ERRORS(hipEventCreate(&start_)); GKO_ASSERT_NO_HIP_ERRORS(hipEventCreate(&stop_)); } @@ -70,14 +68,14 @@ class HipTimer : public Timer { void tic_impl() override { exec_->synchronize(); - gko::hip::device_guard g{id_}; + auto guard = exec_->get_scoped_device_id_guard(); // Currently, gko::HipExecutor always use default stream. GKO_ASSERT_NO_HIP_ERRORS(hipEventRecord(start_)); } double toc_impl() override { - gko::hip::device_guard g{id_}; + auto guard = exec_->get_scoped_device_id_guard(); // Currently, gko::HipExecutor always use default stream. 
GKO_ASSERT_NO_HIP_ERRORS(hipEventRecord(stop_)); GKO_ASSERT_NO_HIP_ERRORS(hipEventSynchronize(stop_)); @@ -94,7 +92,6 @@ class HipTimer : public Timer { std::shared_ptr exec_; hipEvent_t start_; hipEvent_t stop_; - int id_; }; diff --git a/benchmark/utils/overhead_linop.hpp b/benchmark/utils/overhead_linop.hpp index dce50d49f15..af6b533d30d 100644 --- a/benchmark/utils/overhead_linop.hpp +++ b/benchmark/utils/overhead_linop.hpp @@ -101,7 +101,7 @@ template class Overhead : public EnableLinOp>, public Preconditionable { friend class EnableLinOp; - friend class EnablePolymorphicObject; + friend struct polymorphic_object_traits; public: GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) diff --git a/cmake/autodetect_executors.cmake b/cmake/autodetect_executors.cmake index 9e396ee96b9..3a5ba36b559 100644 --- a/cmake/autodetect_executors.cmake +++ b/cmake/autodetect_executors.cmake @@ -15,7 +15,7 @@ if (NOT DEFINED GINKGO_BUILD_OMP) endif() if (NOT DEFINED GINKGO_BUILD_MPI) - find_package(MPI) + find_package(MPI 3.1) if(MPI_FOUND) message(STATUS "Enabling MPI support") set(GINKGO_HAS_MPI ON) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index aa0a657b215..0bd181cffd5 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -1,5 +1,5 @@ set(gko_test_single_args "MPI_SIZE") -set(gko_test_multi_arg "DISABLE_EXECUTORS;ADDITIONAL_LIBRARIES;ADDITIONAL_INCLUDES") +set(gko_test_multi_args "DISABLE_EXECUTORS;ADDITIONAL_LIBRARIES;ADDITIONAL_INCLUDES") ## Replaces / by _ to create valid target names from relative paths function(ginkgo_build_test_name test_name target_name) @@ -7,12 +7,21 @@ function(ginkgo_build_test_name test_name target_name) ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") set(${target_name} ${TEST_TARGET_NAME} PARENT_SCOPE) -endfunction() +endfunction(ginkgo_build_test_name) + +function(ginkgo_create_gtest_mpi_main) + add_library(gtest_mpi_main "") + 
target_sources(gtest_mpi_main + PRIVATE + ${PROJECT_SOURCE_DIR}/core/test/mpi/gtest/mpi_listener.cpp) + find_package(MPI REQUIRED) + target_link_libraries(gtest_mpi_main PRIVATE GTest::GTest MPI::MPI_CXX) +endfunction(ginkgo_create_gtest_mpi_main) ## Set up shared target properties and handle ADDITIONAL_LIBRARIES/ADDITIONAL_INCLUDES ## `MPI_SIZE size` causes the tests to be run with `size` MPI processes. function(ginkgo_set_test_target_properties test_target_name) - cmake_parse_arguments(PARSE_ARGV 1 set_properties "" "${gko_test_single_args}" "${gko_test_multi_arg}") + cmake_parse_arguments(PARSE_ARGV 1 set_properties "" "${gko_test_single_args}" "${gko_test_multi_args}") if (GINKGO_FAST_TESTS) target_compile_definitions(${test_target_name} PRIVATE GINKGO_FAST_TESTS) endif() @@ -23,6 +32,9 @@ function(ginkgo_set_test_target_properties test_target_name) target_link_libraries(${test_target_name} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") endif() if (set_properties_MPI_SIZE) + if(NOT TARGET gtest_mpi_main) + ginkgo_create_gtest_mpi_main() + endif() set(gtest_main gtest_mpi_main MPI::MPI_CXX) else() set(gtest_main GTest::Main) @@ -40,7 +52,7 @@ endfunction() ## - `ADDITIONAL_LIBRARIES lib1 lib2` adds additional target link dependencies ## - `ADDITIONAL_INCLUDES path1 path2` adds additional target include paths function(ginkgo_add_test test_name test_target_name) - cmake_parse_arguments(PARSE_ARGV 2 add_test "" "${gko_test_single_arg}" "${gko_test_multi_arg}") + cmake_parse_arguments(PARSE_ARGV 2 add_test "" "${gko_test_single_args}" "${gko_test_multi_args}") file(RELATIVE_PATH REL_BINARY_DIR ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) set_target_properties(${test_target_name} PROPERTIES OUTPUT_NAME ${test_name}) if (add_test_MPI_SIZE) @@ -189,7 +201,7 @@ function(ginkgo_create_common_test test_name) endfunction(ginkgo_create_common_test) function(ginkgo_create_common_test_internal test_name exec_type exec) - cmake_parse_arguments(PARSE_ARGV 3 common_test "" 
"${gko_test_single_arg}" "${gko_test_multi_arg}") + cmake_parse_arguments(PARSE_ARGV 3 common_test "" "${gko_test_single_args}" "${gko_test_multi_args}") if(exec IN_LIST common_test_DISABLE_EXECUTORS) return() endif() @@ -211,7 +223,7 @@ endfunction(ginkgo_create_common_test_internal) ## Common test compiled with the device compiler, one target for each enabled backend function(ginkgo_create_common_device_test test_name) - cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_arg}" "${gko_test_multi_arg}") + cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_args}" "${gko_test_multi_args}") ginkgo_build_test_name(${test_name} test_target_name) if(GINKGO_BUILD_DPCPP) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 479b889aeaf..2cf8dd06c3f 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -130,7 +130,7 @@ foreach(log_type ${log_types}) "GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP") ginkgo_print_module_footer(${${log_type}} " Enabled features:") ginkgo_print_foreach_variable(${${log_type}} - "GINKGO_MIXED_PRECISION") + "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI") ginkgo_print_module_footer(${${log_type}} " Tests, benchmarks and examples:") ginkgo_print_foreach_variable(${${log_type}} "GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING") diff --git a/cmake/openmpi_test.cpp b/cmake/openmpi_test.cpp new file mode 100644 index 00000000000..18bf7669368 --- /dev/null +++ b/cmake/openmpi_test.cpp @@ -0,0 +1,44 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include +#include +int main() +{ +#if defined(OPEN_MPI) && OPEN_MPI + std::printf("%d.%d.%d", OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION); + return 1; +#else + return 0; +#endif +} diff --git a/common/cuda_hip/distributed/matrix_kernels.hpp.inc b/common/cuda_hip/distributed/matrix_kernels.hpp.inc new file mode 100644 index 00000000000..4b327a41872 --- /dev/null +++ b/common/cuda_hip/distributed/matrix_kernels.hpp.inc @@ -0,0 +1,291 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +template +struct input_type { + GlobalIndexType row; + GlobalIndexType col; + ValueType val; + size_type row_range; + size_type col_range; + + __forceinline__ __device__ __host__ + input_type(thrust::tuple + t) + : row(thrust::get<0>(t)), + col(thrust::get<1>(t)), + val(thrust::get<2>(t)), + row_range(thrust::get<3>(t)), + col_range(thrust::get<4>(t)) + {} +}; + + +template +void build_local_nonlocal( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + const experimental::distributed::Partition* + col_partition, + comm_index_type local_part, array& local_row_idxs, + array& local_col_idxs, array& local_values, + array& non_local_row_idxs, + array& non_local_col_idxs, + array& non_local_values, + array& local_gather_idxs, + array& recv_sizes, + array& non_local_to_global) +{ + auto input_vals = input.get_const_values(); + auto row_part_ids = row_partition->get_part_ids(); + auto col_part_ids = col_partition->get_part_ids(); + auto num_parts = static_cast(row_partition->get_num_parts()); + const auto* row_range_bounds = row_partition->get_range_bounds(); + const auto* col_range_bounds = col_partition->get_range_bounds(); + const auto* row_range_starting_indices = + row_partition->get_range_starting_indices(); + const auto* col_range_starting_indices = + col_partition->get_range_starting_indices(); + const auto 
num_row_ranges = row_partition->get_num_ranges(); + const auto num_col_ranges = col_partition->get_num_ranges(); + const auto num_input_elements = input.get_num_elems(); + + // precompute the row and column range id of each input element + auto input_row_idxs = input.get_const_row_idxs(); + auto input_col_idxs = input.get_const_col_idxs(); + array row_range_ids{exec, num_input_elements}; + thrust::upper_bound(thrust::device, row_range_bounds + 1, + row_range_bounds + num_row_ranges + 1, input_row_idxs, + input_row_idxs + num_input_elements, + row_range_ids.get_data()); + array col_range_ids{exec, input.get_num_elems()}; + thrust::upper_bound(thrust::device, col_range_bounds + 1, + col_range_bounds + num_col_ranges + 1, input_col_idxs, + input_col_idxs + num_input_elements, + col_range_ids.get_data()); + + // count number of local<0> and non-local<1> elements + auto range_ids_it = thrust::make_zip_iterator(thrust::make_tuple( + row_range_ids.get_const_data(), col_range_ids.get_const_data())); + auto num_elements_pair = thrust::transform_reduce( + thrust::device, range_ids_it, range_ids_it + num_input_elements, + [local_part, row_part_ids, col_part_ids] __host__ __device__( + const thrust::tuple& tuple) { + auto row_part = row_part_ids[thrust::get<0>(tuple)]; + auto col_part = col_part_ids[thrust::get<1>(tuple)]; + bool is_inner_entry = + row_part == local_part && col_part == local_part; + bool is_ghost_entry = + row_part == local_part && col_part != local_part; + return thrust::make_tuple( + is_inner_entry ? size_type{1} : size_type{0}, + is_ghost_entry ? 
size_type{1} : size_type{0}); + }, + thrust::make_tuple(size_type{}, size_type{}), + [] __host__ __device__(const thrust::tuple& a, + const thrust::tuple& b) { + return thrust::make_tuple(thrust::get<0>(a) + thrust::get<0>(b), + thrust::get<1>(a) + thrust::get<1>(b)); + }); + auto num_local_elements = thrust::get<0>(num_elements_pair); + auto num_non_local_elements = thrust::get<1>(num_elements_pair); + + // define global-to-local maps for row and column indices + auto map_to_local_row = + [row_range_bounds, row_range_starting_indices] __host__ __device__( + const GlobalIndexType row, const size_type range_id) { + return static_cast(row - + row_range_bounds[range_id]) + + row_range_starting_indices[range_id]; + }; + auto map_to_local_col = + [col_range_bounds, col_range_starting_indices] __host__ __device__( + const GlobalIndexType col, const size_type range_id) { + return static_cast(col - + col_range_bounds[range_id]) + + col_range_starting_indices[range_id]; + }; + + using input_type = input_type; + auto input_it = thrust::make_zip_iterator(thrust::make_tuple( + input.get_const_row_idxs(), input.get_const_col_idxs(), + input.get_const_values(), row_range_ids.get_const_data(), + col_range_ids.get_const_data())); + + // copy and transform local entries into arrays + local_row_idxs.resize_and_reset(num_local_elements); + local_col_idxs.resize_and_reset(num_local_elements); + local_values.resize_and_reset(num_local_elements); + auto local_it = thrust::make_transform_iterator( + input_it, [map_to_local_row, map_to_local_col] __host__ __device__( + const input_type input) { + auto local_row = map_to_local_row(input.row, input.row_range); + auto local_col = map_to_local_col(input.col, input.col_range); + return thrust::make_tuple(local_row, local_col, input.val); + }); + thrust::copy_if( + thrust::device, local_it, local_it + input.get_num_elems(), + range_ids_it, + thrust::make_zip_iterator(thrust::make_tuple(local_row_idxs.get_data(), + local_col_idxs.get_data(), + 
local_values.get_data())), + [local_part, row_part_ids, col_part_ids] __host__ __device__( + const thrust::tuple& tuple) { + auto row_part = row_part_ids[thrust::get<0>(tuple)]; + auto col_part = col_part_ids[thrust::get<1>(tuple)]; + return row_part == local_part && col_part == local_part; + }); + // copy and transform non-local entries into arrays. this keeps global + // column indices, and also stores the column part id for each non-local + // entry in an array + non_local_row_idxs.resize_and_reset(num_non_local_elements); + non_local_values.resize_and_reset(num_non_local_elements); + array non_local_global_col_idxs{exec, + num_non_local_elements}; + array non_local_col_part_ids{exec, num_non_local_elements}; + array non_local_col_range_ids{exec, num_non_local_elements}; + auto non_local_it = thrust::make_transform_iterator( + input_it, [map_to_local_row, map_to_local_col, + col_part_ids] __host__ __device__(const input_type input) { + auto local_row = map_to_local_row(input.row, input.row_range); + return thrust::make_tuple(local_row, input.col, input.val, + col_part_ids[input.col_range], + input.col_range); + }); + thrust::copy_if( + thrust::device, non_local_it, non_local_it + input.get_num_elems(), + range_ids_it, + thrust::make_zip_iterator(thrust::make_tuple( + non_local_row_idxs.get_data(), non_local_global_col_idxs.get_data(), + non_local_values.get_data(), non_local_col_part_ids.get_data(), + non_local_col_range_ids.get_data())), + [local_part, row_part_ids, col_part_ids] __host__ __device__( + const thrust::tuple& tuple) { + auto row_part = row_part_ids[thrust::get<0>(tuple)]; + auto col_part = col_part_ids[thrust::get<1>(tuple)]; + return row_part == local_part && col_part != local_part; + }); + + // 1. 
sort global columns, part-id and range-id according to + // their part-id and global columns + // the previous `non_local_global_col_idxs` is not modify to + // keep it consistent with the non-local row and values array + array sorted_non_local_global_col_idxs{ + exec, non_local_global_col_idxs}; + auto key_it = thrust::make_zip_iterator( + thrust::make_tuple(non_local_col_part_ids.get_data(), + sorted_non_local_global_col_idxs.get_data())); + thrust::sort_by_key(thrust::device, key_it, key_it + num_non_local_elements, + non_local_col_range_ids.get_data()); + + // 2. remove duplicate columns, now the new column i has global index + // non_local_global_col_idxs[i] + auto non_local_global_col_idxs_begin = + sorted_non_local_global_col_idxs.get_data(); + auto non_local_global_col_idxs_end = thrust::get<0>(thrust::unique_by_key( + thrust::device, non_local_global_col_idxs_begin, + non_local_global_col_idxs_begin + num_non_local_elements, + thrust::make_zip_iterator( + thrust::make_tuple(non_local_col_part_ids.get_data(), + non_local_col_range_ids.get_data())))); + auto num_non_local_cols = static_cast(thrust::distance( + non_local_global_col_idxs_begin, non_local_global_col_idxs_end)); + + // 2.5 copy unique_columns to non_local_to_global map + non_local_to_global.resize_and_reset(num_non_local_cols); + exec->copy(num_non_local_cols, non_local_global_col_idxs_begin, + non_local_to_global.get_data()); + + // 3. create mapping from unique_columns + // since we don't have hash tables on GPUs I'm first sorting the non-local + // global column indices and their new local index again by the global + // column index. Then I'm using binary searches to find the new local column + // index. 
+ array permutation{exec, num_non_local_cols}; + thrust::sequence(thrust::device, permutation.get_data(), + permutation.get_data() + num_non_local_cols); + thrust::sort_by_key( + thrust::device, non_local_global_col_idxs_begin, + non_local_global_col_idxs_begin + num_non_local_cols, + thrust::make_zip_iterator(thrust::make_tuple( + non_local_col_part_ids.get_data(), permutation.get_data()))); + + // 4. map column index of non-local entries to new columns + non_local_col_idxs.resize_and_reset(num_non_local_elements); + array lower_bounds{exec, num_non_local_elements}; + // I have to precompute the lower bounds because the calling binary + // searches from the device does not work: + // https://github.com/NVIDIA/thrust/issues/1415 + // TODO: compute lower bounds on-the-fly if available + thrust::lower_bound( + thrust::device, non_local_global_col_idxs_begin, + non_local_global_col_idxs_begin + num_non_local_cols, + non_local_global_col_idxs.get_data(), + non_local_global_col_idxs.get_data() + num_non_local_elements, + lower_bounds.get_data()); + auto permutation_data = permutation.get_data(); + thrust::transform( + thrust::device, lower_bounds.get_data(), + lower_bounds.get_data() + num_non_local_elements, + non_local_col_idxs.get_data(), + [permutation_data] __host__ __device__(const size_type lower_bound) { + return permutation_data[lower_bound]; + }); + + // 5. 
compute gather idxs and recv_sizes + local_gather_idxs.resize_and_reset(num_non_local_cols); + auto transform_it = thrust::make_zip_iterator(thrust::make_tuple( + non_local_to_global.get_data(), non_local_col_range_ids.get_data())); + thrust::transform( + thrust::device, transform_it, transform_it + num_non_local_cols, + local_gather_idxs.get_data(), + [map_to_local_col] __host__ __device__( + const thrust::tuple& tuple) { + return map_to_local_col(thrust::get<0>(tuple), + thrust::get<1>(tuple)); + }); + + auto recv_sizes_ptr = recv_sizes.get_data(); + thrust::fill_n(thrust::device, recv_sizes_ptr, num_parts, 0); + thrust::for_each_n(thrust::device, non_local_col_part_ids.get_data(), + num_non_local_cols, + [recv_sizes_ptr] __device__(const size_type part) { + atomic_add(recv_sizes_ptr + part, 1); + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_BUILD_LOCAL_NONLOCAL); diff --git a/common/cuda_hip/distributed/vector_kernels.hpp.inc b/common/cuda_hip/distributed/vector_kernels.hpp.inc new file mode 100644 index 00000000000..b3030ce5252 --- /dev/null +++ b/common/cuda_hip/distributed/vector_kernels.hpp.inc @@ -0,0 +1,96 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +template +void build_local( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + partition, + comm_index_type local_part, matrix::Dense* local_mtx) +{ + const auto* range_bounds = partition->get_range_bounds(); + const auto* range_starting_indices = + partition->get_range_starting_indices(); + const auto* part_ids = partition->get_part_ids(); + const auto num_ranges = partition->get_num_ranges(); + + array range_id{exec, input.get_num_elems()}; + thrust::upper_bound(thrust::device, range_bounds + 1, + range_bounds + num_ranges + 1, + input.get_const_row_idxs(), + input.get_const_row_idxs() + input.get_num_elems(), + range_id.get_data(), thrust::less()); + + // write values with local rows into the local matrix at the correct index + // this needs the following iterators: + // - local_row_it: (global_row, range_id) -> local row index + // - flat_idx_it: (local_row, col) -> flat index in local matrix values + // array + // the flat_idx_it is used by the scatter_if as an index map for the values + auto map_to_local_row = + [range_bounds, range_starting_indices] __host__ __device__( + const 
thrust::tuple& idx_range_id) { + const auto idx = thrust::get<0>(idx_range_id); + const auto rid = thrust::get<1>(idx_range_id); + return static_cast(idx - range_bounds[rid]) + + range_starting_indices[rid]; + }; + auto local_row_it = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple(input.get_const_row_idxs(), + range_id.get_data())), + map_to_local_row); + + auto stride = local_mtx->get_stride(); + auto map_to_flat_idx = + [stride] __host__ __device__( + const thrust::tuple& row_col) { + return thrust::get<0>(row_col) * stride + thrust::get<1>(row_col); + }; + auto flat_idx_it = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(local_row_it, input.get_const_col_idxs())), + map_to_flat_idx); + + auto is_local_row = + [part_ids, local_part] __host__ __device__(const size_type rid) { + return part_ids[rid] == local_part; + }; + thrust::scatter_if(thrust::device, input.get_const_values(), + input.get_const_values() + input.get_num_elems(), + flat_idx_it, range_id.get_data(), + local_mtx->get_values(), is_local_row); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); diff --git a/common/unified/distributed/partition_kernels.cpp b/common/unified/distributed/partition_kernels.cpp index ecb8e9b0733..8129dec3fc3 100644 --- a/common/unified/distributed/partition_kernels.cpp +++ b/common/unified/distributed/partition_kernels.cpp @@ -44,7 +44,7 @@ namespace GKO_DEVICE_NAMESPACE { namespace partition { -using distributed::comm_index_type; +using experimental::distributed::comm_index_type; void count_ranges(std::shared_ptr exec, const array& mapping, size_type& num_ranges) @@ -149,7 +149,8 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_PARTITION_BUILD_FROM_GLOBAL_SIZE); template void has_ordered_parts( std::shared_ptr exec, - const distributed::Partition* partition, + const experimental::distributed::Partition* + partition, bool* result) { const auto 
part_ids = partition->get_part_ids(); diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp index c1d1422c483..6d9f78d2cdb 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.cpp @@ -380,6 +380,38 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); +template +void compute_squared_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result, + array& tmp) +{ + run_kernel_col_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); }, + GKO_KERNEL_REDUCE_SUM(remove_complex), result->get_values(), + x->get_size(), tmp, x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); + + +template +void compute_sqrt(std::shared_ptr exec, + matrix::Dense* x) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto x) { + x(row, col) = sqrt(x(row, col)); + }, + x->get_size(), x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); + + template void symm_permute(std::shared_ptr exec, const array* permutation_indices, diff --git a/common/unified/solver/bicg_kernels.cpp b/common/unified/solver/bicg_kernels.cpp index 5b6b27e5737..c63e9b1467b 100644 --- a/common/unified/solver/bicg_kernels.cpp +++ b/common/unified/solver/bicg_kernels.cpp @@ -60,26 +60,38 @@ void initialize(std::shared_ptr exec, matrix::Dense* q2, array* stop_status) { - run_kernel_solver( - exec, - [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto z, auto p, - auto q, auto prev_rho, auto rho, auto r2, auto z2, - auto p2, auto q2, auto stop) { - if (row == 0) { + if (b->get_size()) { + run_kernel_solver( + exec, + [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto z, auto p, + auto q, auto prev_rho, auto rho, auto r2, auto z2, + auto p2, auto q2, auto stop) { + if (row == 0) { + rho[col] = zero(rho[col]); + prev_rho[col] = one(prev_rho[col]); + 
stop[col].reset(); + } + r(row, col) = b(row, col); + r2(row, col) = b(row, col); + z(row, col) = p(row, col) = q(row, col) = z2(row, col) = + p2(row, col) = q2(row, col) = zero(z(row, col)); + }, + b->get_size(), b->get_stride(), default_stride(b), + default_stride(r), default_stride(z), default_stride(p), + default_stride(q), row_vector(prev_rho), row_vector(rho), + default_stride(r2), default_stride(z2), default_stride(p2), + default_stride(q2), *stop_status); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto prev_rho, auto rho, auto stop) { rho[col] = zero(rho[col]); prev_rho[col] = one(prev_rho[col]); stop[col].reset(); - } - r(row, col) = b(row, col); - r2(row, col) = b(row, col); - z(row, col) = p(row, col) = q(row, col) = z2(row, col) = - p2(row, col) = q2(row, col) = zero(z(row, col)); - }, - b->get_size(), b->get_stride(), default_stride(b), default_stride(r), - default_stride(z), default_stride(p), default_stride(q), - row_vector(prev_rho), row_vector(rho), default_stride(r2), - default_stride(z2), default_stride(p2), default_stride(q2), - *stop_status); + }, + b->get_size()[1], row_vector(prev_rho), row_vector(rho), + *stop_status); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL); diff --git a/common/unified/solver/bicgstab_kernels.cpp b/common/unified/solver/bicgstab_kernels.cpp index 1c120065198..2e675514c5b 100644 --- a/common/unified/solver/bicgstab_kernels.cpp +++ b/common/unified/solver/bicgstab_kernels.cpp @@ -62,27 +62,41 @@ void initialize(std::shared_ptr exec, matrix::Dense* omega, array* stop_status) { - run_kernel_solver( - exec, - [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto rr, auto y, - auto s, auto t, auto z, auto v, auto p, auto prev_rho, - auto rho, auto alpha, auto beta, auto gamma, auto omega, - auto stop) { - if (row == 0) { + if (b->get_size()) { + run_kernel_solver( + exec, + [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto rr, auto y, + auto s, auto t, auto z, auto v, 
auto p, auto prev_rho, + auto rho, auto alpha, auto beta, auto gamma, + auto omega, auto stop) { + if (row == 0) { + rho[col] = prev_rho[col] = alpha[col] = beta[col] = + gamma[col] = omega[col] = one(rho[col]); + stop[col].reset(); + } + r(row, col) = b(row, col); + rr(row, col) = z(row, col) = v(row, col) = s(row, col) = t( + row, col) = y(row, col) = p(row, col) = zero(rr(row, col)); + }, + b->get_size(), b->get_stride(), default_stride(b), + default_stride(r), default_stride(rr), default_stride(y), + default_stride(s), default_stride(t), default_stride(z), + default_stride(v), default_stride(p), row_vector(prev_rho), + row_vector(rho), row_vector(alpha), row_vector(beta), + row_vector(gamma), row_vector(omega), *stop_status); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto prev_rho, auto rho, auto alpha, + auto beta, auto gamma, auto omega, auto stop) { rho[col] = prev_rho[col] = alpha[col] = beta[col] = gamma[col] = omega[col] = one(rho[col]); stop[col].reset(); - } - r(row, col) = b(row, col); - rr(row, col) = z(row, col) = v(row, col) = s(row, col) = - t(row, col) = y(row, col) = p(row, col) = zero(rr(row, col)); - }, - b->get_size(), b->get_stride(), default_stride(b), default_stride(r), - default_stride(rr), default_stride(y), default_stride(s), - default_stride(t), default_stride(z), default_stride(v), - default_stride(p), row_vector(prev_rho), row_vector(rho), - row_vector(alpha), row_vector(beta), row_vector(gamma), - row_vector(omega), *stop_status); + }, + b->get_size()[1], row_vector(prev_rho), row_vector(rho), + row_vector(alpha), row_vector(beta), row_vector(gamma), + row_vector(omega), *stop_status); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL); diff --git a/common/unified/solver/cg_kernels.cpp b/common/unified/solver/cg_kernels.cpp index 36a7fc3ce9b..966317adbcf 100644 --- a/common/unified/solver/cg_kernels.cpp +++ b/common/unified/solver/cg_kernels.cpp @@ -58,21 +58,33 @@ void 
initialize(std::shared_ptr exec, matrix::Dense* rho, array* stop_status) { - run_kernel_solver( - exec, - [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto z, auto p, - auto q, auto prev_rho, auto rho, auto stop) { - if (row == 0) { + if (b->get_size()) { + run_kernel_solver( + exec, + [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto z, auto p, + auto q, auto prev_rho, auto rho, auto stop) { + if (row == 0) { + rho[col] = zero(rho[col]); + prev_rho[col] = one(prev_rho[col]); + stop[col].reset(); + } + r(row, col) = b(row, col); + z(row, col) = p(row, col) = q(row, col) = zero(z(row, col)); + }, + b->get_size(), b->get_stride(), b, default_stride(r), + default_stride(z), default_stride(p), default_stride(q), + row_vector(prev_rho), row_vector(rho), *stop_status); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto prev_rho, auto rho, auto stop) { rho[col] = zero(rho[col]); prev_rho[col] = one(prev_rho[col]); stop[col].reset(); - } - r(row, col) = b(row, col); - z(row, col) = p(row, col) = q(row, col) = zero(z(row, col)); - }, - b->get_size(), b->get_stride(), b, default_stride(r), default_stride(z), - default_stride(p), default_stride(q), row_vector(prev_rho), - row_vector(rho), *stop_status); + }, + b->get_size()[1], row_vector(prev_rho), row_vector(rho), + *stop_status); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL); diff --git a/common/unified/solver/cgs_kernels.cpp b/common/unified/solver/cgs_kernels.cpp index 9878c0d9751..8cefb60e976 100644 --- a/common/unified/solver/cgs_kernels.cpp +++ b/common/unified/solver/cgs_kernels.cpp @@ -63,27 +63,43 @@ void initialize(std::shared_ptr exec, matrix::Dense* rho, array* stop_status) { - run_kernel_solver( - exec, - [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto r_tld, auto p, - auto q, auto u, auto u_hat, auto v_hat, auto t, - auto alpha, auto beta, auto gamma, auto prev_rho, - auto rho, auto stop) { - if (row == 0) { + if (b->get_size()) { + run_kernel_solver( 
+ exec, + [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto r_tld, + auto p, auto q, auto u, auto u_hat, auto v_hat, + auto t, auto alpha, auto beta, auto gamma, + auto prev_rho, auto rho, auto stop) { + if (row == 0) { + rho[col] = zero(rho[col]); + prev_rho[col] = alpha[col] = beta[col] = gamma[col] = + one(prev_rho[col]); + stop[col].reset(); + } + r(row, col) = r_tld(row, col) = b(row, col); + u(row, col) = u_hat(row, col) = p(row, col) = q(row, col) = + v_hat(row, col) = t(row, col) = zero(u(row, col)); + }, + b->get_size(), b->get_stride(), default_stride(b), + default_stride(r), default_stride(r_tld), default_stride(p), + default_stride(q), default_stride(u), default_stride(u_hat), + default_stride(v_hat), default_stride(t), row_vector(alpha), + row_vector(beta), row_vector(gamma), row_vector(prev_rho), + row_vector(rho), *stop_status); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto alpha, auto beta, auto gamma, + auto prev_rho, auto rho, auto stop) { rho[col] = zero(rho[col]); prev_rho[col] = alpha[col] = beta[col] = gamma[col] = one(prev_rho[col]); stop[col].reset(); - } - r(row, col) = r_tld(row, col) = b(row, col); - u(row, col) = u_hat(row, col) = p(row, col) = q(row, col) = - v_hat(row, col) = t(row, col) = zero(u(row, col)); - }, - b->get_size(), b->get_stride(), default_stride(b), default_stride(r), - default_stride(r_tld), default_stride(p), default_stride(q), - default_stride(u), default_stride(u_hat), default_stride(v_hat), - default_stride(t), row_vector(alpha), row_vector(beta), - row_vector(gamma), row_vector(prev_rho), row_vector(rho), *stop_status); + }, + b->get_size()[1], row_vector(alpha), row_vector(beta), + row_vector(gamma), row_vector(prev_rho), row_vector(rho), + *stop_status); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL); diff --git a/common/unified/solver/fcg_kernels.cpp b/common/unified/solver/fcg_kernels.cpp index 4277f45f5fa..fb03f686d56 100644 --- 
a/common/unified/solver/fcg_kernels.cpp +++ b/common/unified/solver/fcg_kernels.cpp @@ -59,23 +59,36 @@ void initialize(std::shared_ptr exec, matrix::Dense* rho, matrix::Dense* rho_t, array* stop_status) { - run_kernel_solver( - exec, - [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto z, auto p, - auto q, auto t, auto prev_rho, auto rho, auto rho_t, - auto stop) { - if (row == 0) { + if (b->get_size()) { + run_kernel_solver( + exec, + [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto z, auto p, + auto q, auto t, auto prev_rho, auto rho, auto rho_t, + auto stop) { + if (row == 0) { + rho[col] = zero(rho[col]); + prev_rho[col] = rho_t[col] = one(prev_rho[col]); + stop[col].reset(); + } + t(row, col) = r(row, col) = b(row, col); + z(row, col) = p(row, col) = q(row, col) = zero(z(row, col)); + }, + b->get_size(), b->get_stride(), default_stride(b), + default_stride(r), default_stride(z), default_stride(p), + default_stride(q), default_stride(t), row_vector(prev_rho), + row_vector(rho), row_vector(rho_t), *stop_status); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto prev_rho, auto rho, auto rho_t, + auto stop) { rho[col] = zero(rho[col]); prev_rho[col] = rho_t[col] = one(prev_rho[col]); stop[col].reset(); - } - t(row, col) = r(row, col) = b(row, col); - z(row, col) = p(row, col) = q(row, col) = zero(z(row, col)); - }, - b->get_size(), b->get_stride(), default_stride(b), default_stride(r), - default_stride(z), default_stride(p), default_stride(q), - default_stride(t), row_vector(prev_rho), row_vector(rho), - row_vector(rho_t), *stop_status); + }, + b->get_size()[1], row_vector(prev_rho), row_vector(rho), + row_vector(rho_t), *stop_status); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL); diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 427ac8091a3..26a7bf1c3e3 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -6,9 +6,11 @@ target_sources(ginkgo base/array.cpp base/combination.cpp 
base/composition.cpp + base/dense_cache.cpp base/device_matrix_data.cpp base/executor.cpp base/index_set.cpp + base/mpi.cpp base/mtx_io.cpp base/perturbation.cpp base/version.cpp @@ -67,7 +69,10 @@ endif() if (GINKGO_BUILD_MPI) target_sources(ginkgo - PRIVATE mpi/exception.cpp) + PRIVATE + mpi/exception.cpp + distributed/matrix.cpp + distributed/vector.cpp) endif() ginkgo_compile_features(ginkgo) diff --git a/core/base/dense_cache.cpp b/core/base/dense_cache.cpp new file mode 100644 index 00000000000..91e4a4247cd --- /dev/null +++ b/core/base/dense_cache.cpp @@ -0,0 +1,69 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +namespace gko { +namespace detail { + + +template +void DenseCache::init(std::shared_ptr exec, + dim<2> size) const +{ + if (!vec || vec->get_size() != size || vec->get_executor() != exec) { + vec = matrix::Dense::create(exec, size); + } +} + + +template +void DenseCache::init_from( + const matrix::Dense* template_vec) const +{ + if (!vec || vec->get_size() != template_vec->get_size() || + vec->get_executor() != template_vec->get_executor()) { + vec = matrix::Dense::create_with_config_of(template_vec); + } +} + + +#define GKO_DECLARE_DENSE_CACHE(_type) class DenseCache<_type> +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CACHE); + + +} // namespace detail +} // namespace gko diff --git a/core/base/mpi.cpp b/core/base/mpi.cpp new file mode 100644 index 00000000000..ed33cd38e42 --- /dev/null +++ b/core/base/mpi.cpp @@ -0,0 +1,99 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#if GINKGO_BUILD_MPI + + +#include + + +#include + + +namespace gko { +namespace experimental { +namespace mpi { + + +int map_rank_to_device_id(MPI_Comm comm, const int num_devices) +{ + GKO_ASSERT(num_devices > 0); + if (num_devices == 1) { + return 0; + } else { + auto mpi_node_local_rank = [](MPI_Comm comm_) { + int local_rank; + MPI_Comm local_comm; + GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_split_type( + comm_, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local_comm)); + GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_rank(local_comm, &local_rank)); + MPI_Comm_free(&local_comm); + return local_rank; + }; + + // When we are using MPI_COMM_WORLD, there might be already an + // environment variable describing the node local rank, so we + // prioritize it. 
If no suitable environment variable is found + // we determine the node-local rank with MPI calls. + int local_rank; + int compare_result; + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Comm_compare(comm, MPI_COMM_WORLD, &compare_result)); + if (compare_result != MPI_IDENT && compare_result != MPI_CONGRUENT) { + local_rank = mpi_node_local_rank(comm); + } else { + if (auto str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK")) { + local_rank = std::stoi(str); + } else if (auto str = std::getenv("OMPI_COMM_WORLD_LOCAL_RANK")) { + local_rank = std::stoi(str); + } else if (auto str = std::getenv("MPI_LOCALRANKID")) { + local_rank = std::stoi(str); + } else if (auto str = std::getenv("SLURM_LOCALID")) { + local_rank = std::stoi(str); + } else { + local_rank = mpi_node_local_rank(comm); + } + } + return local_rank % num_devices; + } +} + + +} // namespace mpi +} // namespace experimental +} // namespace gko + + +#endif // GKO_HAVE_MPI diff --git a/core/base/noop_scoped_device_id_guard.hpp b/core/base/noop_scoped_device_id_guard.hpp new file mode 100644 index 00000000000..ab6f514e9dc --- /dev/null +++ b/core/base/noop_scoped_device_id_guard.hpp @@ -0,0 +1,57 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GINKGO_CORE_BASE_NOOP_SCOPED_DEVICE_ID_GUARD_HPP +#define GINKGO_CORE_BASE_NOOP_SCOPED_DEVICE_ID_GUARD_HPP + + +#include + + +namespace gko { +namespace detail { + + +/** + * An implementation of generic_scoped_device_id_guard that does nothing. + * + * This is used for OmpExecutor and DpcppExecutor, since they don't require + * setting a device id. 
+ */ +class noop_scoped_device_id_guard : public generic_scoped_device_id_guard {}; + + +} // namespace detail +} // namespace gko + + +#endif // GINKGO_CORE_BASE_NOOP_SCOPED_DEVICE_ID_GUARD_HPP diff --git a/core/device_hooks/CMakeLists.txt b/core/device_hooks/CMakeLists.txt index fcb370a81a0..901acef7797 100644 --- a/core/device_hooks/CMakeLists.txt +++ b/core/device_hooks/CMakeLists.txt @@ -45,6 +45,7 @@ if (NOT GINKGO_BUILD_REFERENCE) add_library(ginkgo_reference $ reference_hooks.cpp) + target_link_libraries(ginkgo_reference PRIVATE ginkgo_omp) target_link_libraries(ginkgo_reference PUBLIC ginkgo_device) ginkgo_compile_features(ginkgo_reference) ginkgo_default_includes(ginkgo_reference) diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 83f6628404e..fdaf02b050d 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -43,7 +43,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/precision_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/components/reduce_array_kernels.hpp" +#include "core/distributed/matrix_kernels.hpp" #include "core/distributed/partition_kernels.hpp" +#include "core/distributed/vector_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/ic_kernels.hpp" @@ -144,6 +146,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(_macro) +#define GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ + template \ + _macro(ValueType, LocalIndexType, GlobalIndexType) \ + GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) + #define GKO_STUB_TEMPLATE_TYPE(_macro) \ template \ _macro(IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ @@ -243,6 +252,24 @@ GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_PARTITION_IS_ORDERED); } // namespace partition +namespace distributed_vector { + + +GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); + + +} + +namespace distributed_matrix { + + +GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_BUILD_LOCAL_NONLOCAL); + + +} // namespace distributed_matrix + + namespace dense { @@ -264,6 +291,8 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp index fa6dbe6c773..1d456e8173c 100644 --- a/core/device_hooks/cuda_hooks.cpp +++ b/core/device_hooks/cuda_hooks.cpp @@ -31,7 +31,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ #include -#include #include @@ -40,9 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include "core/matrix/csr_kernels.hpp" - - namespace gko { @@ -108,6 +104,10 @@ void CudaExecutor::raw_copy_to(const DpcppExecutor*, size_type num_bytes, void CudaExecutor::synchronize() const GKO_NOT_COMPILED(cuda); +scoped_device_id_guard CudaExecutor::get_scoped_device_id_guard() const + GKO_NOT_COMPILED(cuda); + + void CudaExecutor::run(const Operation& op) const { op.run( @@ -154,6 +154,11 @@ void CudaExecutor::set_gpu_property() {} void CudaExecutor::init_handles() {} +scoped_device_id_guard::scoped_device_id_guard(const CudaExecutor* exec, + int device_id) + GKO_NOT_COMPILED(cuda); + + } // namespace gko diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp index 9cd057fbeab..b1dbc3f666b 100644 --- a/core/device_hooks/dpcpp_hooks.cpp +++ b/core/device_hooks/dpcpp_hooks.cpp @@ -121,6 +121,10 @@ void DpcppExecutor::run(const Operation& op) const } +scoped_device_id_guard DpcppExecutor::get_scoped_device_id_guard() const + GKO_NOT_COMPILED(dpcpp); + + int DpcppExecutor::get_num_devices(std::string) { return 0; } @@ -142,6 +146,11 @@ bool DpcppExecutor::verify_memory_to(const DpcppExecutor* dest_exec) const } +scoped_device_id_guard::scoped_device_id_guard(const DpcppExecutor* exec, + int device_id) + GKO_NOT_COMPILED(dpcpp); + + } // namespace gko diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index 54cc6439956..35bdd30b068 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -112,6 +112,10 @@ void HipExecutor::run(const Operation& op) const } +scoped_device_id_guard HipExecutor::get_scoped_device_id_guard() const + GKO_NOT_COMPILED(hip); + + std::string HipError::get_error(int64) { return "ginkgo HIP module is not compiled"; @@ -151,6 +155,11 @@ void HipExecutor::set_gpu_property() {} void HipExecutor::init_handles() {} +scoped_device_id_guard::scoped_device_id_guard(const HipExecutor* exec, + int device_id) + GKO_NOT_COMPILED(hip); + + } // namespace gko 
diff --git a/core/device_hooks/omp_hooks.cpp b/core/device_hooks/omp_hooks.cpp index 29f568c6932..981585c909d 100644 --- a/core/device_hooks/omp_hooks.cpp +++ b/core/device_hooks/omp_hooks.cpp @@ -30,6 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include +#include #include @@ -44,6 +46,11 @@ version version_info::get_omp_version() noexcept } +scoped_device_id_guard::scoped_device_id_guard(const OmpExecutor* exec, + int device_id) + GKO_NOT_COMPILED(omp); + + } // namespace gko diff --git a/core/device_hooks/reference_hooks.cpp b/core/device_hooks/reference_hooks.cpp index ac39ed1070d..a3e2fe3a34d 100644 --- a/core/device_hooks/reference_hooks.cpp +++ b/core/device_hooks/reference_hooks.cpp @@ -30,6 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include +#include #include @@ -44,6 +46,11 @@ version version_info::get_reference_version() noexcept } +scoped_device_id_guard::scoped_device_id_guard(const ReferenceExecutor* exec, + int device_id) + GKO_NOT_COMPILED(reference); + + } // namespace gko diff --git a/core/distributed/helpers.hpp b/core/distributed/helpers.hpp new file mode 100644 index 00000000000..0e4f7b34e55 --- /dev/null +++ b/core/distributed/helpers.hpp @@ -0,0 +1,128 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include + + +namespace gko { +namespace detail { + + +template +std::unique_ptr> create_with_config_of( + const matrix::Dense* mtx) +{ + return matrix::Dense::create(mtx->get_executor(), + mtx->get_size(), mtx->get_stride()); +} + + +template +const matrix::Dense* get_local(const matrix::Dense* mtx) +{ + return mtx; +} + + +template +matrix::Dense* get_local(matrix::Dense* mtx) +{ + return mtx; +} + + +#if GINKGO_BUILD_MPI + + +template +std::unique_ptr> +create_with_config_of(const experimental::distributed::Vector* mtx) +{ + return experimental::distributed::Vector::create( + mtx->get_executor(), mtx->get_communicator(), mtx->get_size(), + mtx->get_local_vector()->get_size(), + mtx->get_local_vector()->get_stride()); +} + + +template +matrix::Dense* get_local( + experimental::distributed::Vector* mtx) +{ + return const_cast*>(mtx->get_local_vector()); +} + + +template +const matrix::Dense* get_local( + const experimental::distributed::Vector* mtx) +{ + return mtx->get_local_vector(); +} + + +#endif + + +template +bool is_distributed(Arg* linop) +{ +#if GINKGO_BUILD_MPI + return dynamic_cast( + linop); +#else + return false; +#endif +} + + +template +bool is_distributed(Arg* linop, Rest*... rest) +{ +#if GINKGO_BUILD_MPI + bool is_distributed_value = + dynamic_cast(linop); + GKO_ASSERT(is_distributed_value == is_distributed(rest...)); + return is_distributed_value; +#else + return false; +#endif +} + + +} // namespace detail +} // namespace gko diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp new file mode 100644 index 00000000000..924dc216086 --- /dev/null +++ b/core/distributed/matrix.cpp @@ -0,0 +1,450 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include + + +#include "core/distributed/matrix_kernels.hpp" + + +namespace gko { +namespace experimental { +namespace distributed { +namespace matrix { +namespace { + + +GKO_REGISTER_OPERATION(build_local_nonlocal, + distributed_matrix::build_local_nonlocal); + + +} // namespace +} // namespace matrix + + +template +Matrix::Matrix( + std::shared_ptr exec, mpi::communicator comm) + : Matrix(exec, comm, with_matrix_type()) +{} + + +template +Matrix::Matrix( + std::shared_ptr exec, mpi::communicator comm, + const LinOp* local_matrix_type) + : Matrix(exec, comm, local_matrix_type, local_matrix_type) +{} + + +template +Matrix::Matrix( + std::shared_ptr exec, mpi::communicator comm, + const LinOp* local_matrix_template, const LinOp* non_local_matrix_template) + : EnableLinOp< + Matrix>{exec}, + DistributedBase{comm}, + send_offsets_(comm.size() + 1), + send_sizes_(comm.size()), + recv_offsets_(comm.size() + 1), + recv_sizes_(comm.size()), + gather_idxs_{exec}, + non_local_to_global_{exec}, + one_scalar_{}, + local_mtx_{local_matrix_template->clone(exec)}, + non_local_mtx_{non_local_matrix_template->clone(exec)} +{ + GKO_ASSERT( + (dynamic_cast*>( + local_mtx_.get()))); + GKO_ASSERT( + (dynamic_cast*>( + non_local_mtx_.get()))); + one_scalar_.init(exec, dim<2>{1, 1}); + one_scalar_->fill(one()); +} + + +template +void Matrix::convert_to( + Matrix, local_index_type, global_index_type>* + result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->copy_from(this->local_mtx_.get()); + result->non_local_mtx_->copy_from(this->non_local_mtx_.get()); + result->gather_idxs_ = this->gather_idxs_; + result->send_offsets_ = this->send_offsets_; + result->recv_offsets_ = this->recv_offsets_; + result->recv_sizes_ = this->recv_sizes_; + result->send_sizes_ = this->send_sizes_; + result->non_local_to_global_ = 
this->non_local_to_global_; + result->set_size(this->get_size()); +} + + +template +void Matrix::move_to( + Matrix, local_index_type, global_index_type>* + result) +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->move_from(this->local_mtx_.get()); + result->non_local_mtx_->move_from(this->non_local_mtx_.get()); + result->gather_idxs_ = std::move(this->gather_idxs_); + result->send_offsets_ = std::move(this->send_offsets_); + result->recv_offsets_ = std::move(this->recv_offsets_); + result->recv_sizes_ = std::move(this->recv_sizes_); + result->send_sizes_ = std::move(this->send_sizes_); + result->non_local_to_global_ = std::move(this->non_local_to_global_); + result->set_size(this->get_size()); + this->set_size({}); +} + + +template +void Matrix::read_distributed( + const device_matrix_data& data, + const Partition* row_partition, + const Partition* col_partition) +{ + const auto comm = this->get_communicator(); + GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size()); + GKO_ASSERT_EQ(data.get_size()[1], col_partition->get_size()); + GKO_ASSERT_EQ(comm.size(), row_partition->get_num_parts()); + GKO_ASSERT_EQ(comm.size(), col_partition->get_num_parts()); + auto exec = this->get_executor(); + auto local_part = comm.rank(); + + // set up LinOp sizes + auto num_parts = static_cast(row_partition->get_num_parts()); + auto global_num_rows = row_partition->get_size(); + auto global_num_cols = col_partition->get_size(); + dim<2> global_dim{global_num_rows, global_num_cols}; + this->set_size(global_dim); + + // temporary storage for the output + array local_row_idxs{exec}; + array local_col_idxs{exec}; + array local_values{exec}; + array non_local_row_idxs{exec}; + array non_local_col_idxs{exec}; + array non_local_values{exec}; + array recv_gather_idxs{exec}; + array recv_sizes_array{exec, num_parts}; + + // build local, non-local matrix data and communication structures + 
exec->run(matrix::make_build_local_nonlocal( + data, make_temporary_clone(exec, row_partition).get(), + make_temporary_clone(exec, col_partition).get(), local_part, + local_row_idxs, local_col_idxs, local_values, non_local_row_idxs, + non_local_col_idxs, non_local_values, recv_gather_idxs, + recv_sizes_array, non_local_to_global_)); + + // read the local matrix data + const auto num_local_rows = + static_cast(row_partition->get_part_size(local_part)); + const auto num_local_cols = + static_cast(col_partition->get_part_size(local_part)); + const auto num_non_local_cols = non_local_to_global_.get_num_elems(); + device_matrix_data local_data{ + exec, dim<2>{num_local_rows, num_local_cols}, std::move(local_row_idxs), + std::move(local_col_idxs), std::move(local_values)}; + device_matrix_data non_local_data{ + exec, dim<2>{num_local_rows, num_non_local_cols}, + std::move(non_local_row_idxs), std::move(non_local_col_idxs), + std::move(non_local_values)}; + as>(this->local_mtx_) + ->read(std::move(local_data)); + as>(this->non_local_mtx_) + ->read(std::move(non_local_data)); + + // exchange step 1: determine recv_sizes, send_sizes, send_offsets + exec->get_master()->copy_from(exec.get(), num_parts, + recv_sizes_array.get_const_data(), + recv_sizes_.data()); + std::partial_sum(recv_sizes_.begin(), recv_sizes_.end(), + recv_offsets_.begin() + 1); + comm.all_to_all(exec, recv_sizes_.data(), 1, send_sizes_.data(), 1); + std::partial_sum(send_sizes_.begin(), send_sizes_.end(), + send_offsets_.begin() + 1); + send_offsets_[0] = 0; + recv_offsets_[0] = 0; + + // exchange step 2: exchange gather_idxs from receivers to senders + auto use_host_buffer = exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + recv_gather_idxs.set_executor(exec->get_master()); + gather_idxs_.clear(); + gather_idxs_.set_executor(exec->get_master()); + } + gather_idxs_.resize_and_reset(send_offsets_.back()); + comm.all_to_all_v(use_host_buffer ? 
exec->get_master() : exec, + recv_gather_idxs.get_const_data(), recv_sizes_.data(), + recv_offsets_.data(), gather_idxs_.get_data(), + send_sizes_.data(), send_offsets_.data()); + if (use_host_buffer) { + gather_idxs_.set_executor(exec); + } +} + + +template +void Matrix::read_distributed( + const matrix_data& data, + const Partition* row_partition, + const Partition* col_partition) +{ + this->read_distributed( + device_matrix_data::create_from_host( + this->get_executor(), data), + row_partition, col_partition); +} + + +template +void Matrix::read_distributed( + const matrix_data& data, + const Partition* partition) +{ + this->read_distributed( + device_matrix_data::create_from_host( + this->get_executor(), data), + partition, partition); +} + + +template +void Matrix::read_distributed( + const device_matrix_data& data, + const Partition* partition) +{ + this->read_distributed(data, partition, partition); +} + + +template +mpi::request Matrix::communicate( + const local_vector_type* local_b) const +{ + auto exec = this->get_executor(); + const auto comm = this->get_communicator(); + auto num_cols = local_b->get_size()[1]; + auto send_size = send_offsets_.back(); + auto recv_size = recv_offsets_.back(); + auto send_dim = dim<2>{static_cast(send_size), num_cols}; + auto recv_dim = dim<2>{static_cast(recv_size), num_cols}; + recv_buffer_.init(exec, recv_dim); + send_buffer_.init(exec, send_dim); + + local_b->row_gather(&gather_idxs_, send_buffer_.get()); + + auto use_host_buffer = exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + host_recv_buffer_.init(exec->get_master(), recv_dim); + host_send_buffer_.init(exec->get_master(), send_dim); + host_send_buffer_->copy_from(send_buffer_.get()); + } + + mpi::contiguous_type type(num_cols, mpi::type_impl::get_type()); + auto send_ptr = use_host_buffer ? host_send_buffer_->get_const_values() + : send_buffer_->get_const_values(); + auto recv_ptr = use_host_buffer ? 
host_recv_buffer_->get_values() + : recv_buffer_->get_values(); + exec->synchronize(); +#ifdef GINKGO_FORCE_SPMV_BLOCKING_COMM + comm.all_to_all_v(use_host_buffer ? exec->get_master() : exec, send_ptr, + send_sizes_.data(), send_offsets_.data(), type.get(), + recv_ptr, recv_sizes_.data(), recv_offsets_.data(), + type.get()); + return {}; +#else + return comm.i_all_to_all_v( + use_host_buffer ? exec->get_master() : exec, send_ptr, + send_sizes_.data(), send_offsets_.data(), type.get(), recv_ptr, + recv_sizes_.data(), recv_offsets_.data(), type.get()); +#endif +} + + +template +void Matrix::apply_impl( + const LinOp* b, LinOp* x) const +{ + distributed::precision_dispatch_real_complex( + [this](const auto dense_b, auto dense_x) { + auto x_exec = dense_x->get_executor(); + auto local_x = gko::matrix::Dense::create( + x_exec, dense_x->get_local_vector()->get_size(), + gko::make_array_view( + x_exec, + dense_x->get_local_vector()->get_num_stored_elements(), + dense_x->get_local_values()), + dense_x->get_local_vector()->get_stride()); + + auto req = this->communicate(dense_b->get_local_vector()); + local_mtx_->apply(dense_b->get_local_vector(), local_x.get()); + req.wait(); + + auto exec = this->get_executor(); + auto use_host_buffer = + exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + recv_buffer_->copy_from(host_recv_buffer_.get()); + } + non_local_mtx_->apply(one_scalar_.get(), recv_buffer_.get(), + one_scalar_.get(), local_x.get()); + }, + b, x); +} + + +template +void Matrix::apply_impl( + const LinOp* alpha, const LinOp* b, const LinOp* beta, LinOp* x) const +{ + distributed::precision_dispatch_real_complex( + [this](const auto local_alpha, const auto dense_b, + const auto local_beta, auto dense_x) { + const auto x_exec = dense_x->get_executor(); + auto local_x = gko::matrix::Dense::create( + x_exec, dense_x->get_local_vector()->get_size(), + gko::make_array_view( + x_exec, + dense_x->get_local_vector()->get_num_stored_elements(), + 
dense_x->get_local_values()), + dense_x->get_local_vector()->get_stride()); + + auto req = this->communicate(dense_b->get_local_vector()); + local_mtx_->apply(local_alpha, dense_b->get_local_vector(), + local_beta, local_x.get()); + req.wait(); + + auto exec = this->get_executor(); + auto use_host_buffer = + exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + recv_buffer_->copy_from(host_recv_buffer_.get()); + } + non_local_mtx_->apply(local_alpha, recv_buffer_.get(), + one_scalar_.get(), local_x.get()); + }, + alpha, b, beta, x); +} + + +template +Matrix::Matrix(const Matrix& other) + : EnableLinOp>{other.get_executor()}, + DistributedBase{other.get_communicator()} +{ + *this = other; +} + + +template +Matrix::Matrix( + Matrix&& other) noexcept + : EnableLinOp>{other.get_executor()}, + DistributedBase{other.get_communicator()} +{ + *this = std::move(other); +} + + +template +Matrix& +Matrix::operator=( + const Matrix& other) +{ + if (this != &other) { + GKO_ASSERT_EQ(other.get_communicator().size(), + this->get_communicator().size()); + this->set_size(other.get_size()); + local_mtx_->copy_from(other.local_mtx_.get()); + non_local_mtx_->copy_from(other.non_local_mtx_.get()); + gather_idxs_ = other.gather_idxs_; + send_offsets_ = other.send_offsets_; + recv_offsets_ = other.recv_offsets_; + send_sizes_ = other.send_sizes_; + recv_sizes_ = other.recv_sizes_; + non_local_to_global_ = other.non_local_to_global_; + one_scalar_.init(this->get_executor(), dim<2>{1, 1}); + one_scalar_->fill(one()); + } + return *this; +} + + +template +Matrix& +Matrix::operator=(Matrix&& other) +{ + if (this != &other) { + GKO_ASSERT_EQ(other.get_communicator().size(), + this->get_communicator().size()); + this->set_size(other.get_size()); + other.set_size({}); + local_mtx_->move_from(other.local_mtx_.get()); + non_local_mtx_->move_from(other.non_local_mtx_.get()); + gather_idxs_ = std::move(other.gather_idxs_); + send_offsets_ = std::move(other.send_offsets_); + 
recv_offsets_ = std::move(other.recv_offsets_); + send_sizes_ = std::move(other.send_sizes_); + recv_sizes_ = std::move(other.recv_sizes_); + non_local_to_global_ = std::move(other.non_local_to_global_); + one_scalar_.init(this->get_executor(), dim<2>{1, 1}); + one_scalar_->fill(one()); + } + return *this; +} + + +#define GKO_DECLARE_DISTRIBUTED_MATRIX(ValueType, LocalIndexType, \ + GlobalIndexType) \ + class Matrix +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_DISTRIBUTED_MATRIX); + + +} // namespace distributed +} // namespace experimental +} // namespace gko diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp new file mode 100644 index 00000000000..878e7fe3239 --- /dev/null +++ b/core/distributed/matrix_kernels.hpp @@ -0,0 +1,89 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_DISTRIBUTED_MATRIX_KERNELS_HPP_ +#define GKO_CORE_DISTRIBUTED_MATRIX_KERNELS_HPP_ + + +#include +#include +#include +#include +#include + + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_BUILD_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ + GlobalIndexType) \ + void build_local_nonlocal( \ + std::shared_ptr exec, \ + const device_matrix_data& input, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* row_partition, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* col_partition, \ + comm_index_type local_part, array& local_row_idxs, \ + array& local_col_idxs, array& local_values, \ + array& non_local_row_idxs, \ + array& non_local_col_idxs, \ + array& non_local_values, \ + array& local_gather_idxs, \ + array& recv_offsets, \ + array& non_local_to_global) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + using comm_index_type = experimental::distributed::comm_index_type; \ + template \ + GKO_DECLARE_BUILD_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ + GlobalIndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(distributed_matrix, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_DISTRIBUTED_MATRIX_KERNELS_HPP_ diff --git 
a/core/distributed/partition.cpp b/core/distributed/partition.cpp index 4a58ccdaf7a..7dc634f17a9 100644 --- a/core/distributed/partition.cpp +++ b/core/distributed/partition.cpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { +namespace experimental { namespace distributed { namespace partition { @@ -142,4 +143,5 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_PARTITION); } // namespace distributed +} // namespace experimental } // namespace gko diff --git a/core/distributed/partition_kernels.hpp b/core/distributed/partition_kernels.hpp index 4a1f76d94c5..9f9d162044a 100644 --- a/core/distributed/partition_kernels.hpp +++ b/core/distributed/partition_kernels.hpp @@ -76,27 +76,26 @@ namespace kernels { comm_index_type& num_empty_parts, \ LocalIndexType* ranks, LocalIndexType* sizes) -#define GKO_DECLARE_PARTITION_IS_ORDERED(LocalIndexType, GlobalIndexType) \ - void has_ordered_parts( \ - std::shared_ptr exec, \ - const distributed::Partition* \ - partition, \ - bool* result) - - -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - using comm_index_type = distributed::comm_index_type; \ - GKO_PARTITION_COUNT_RANGES; \ - template \ - GKO_PARTITION_BUILD_FROM_CONTIGUOUS(GlobalIndexType); \ - template \ - GKO_PARTITION_BUILD_FROM_MAPPING(GlobalIndexType); \ - template \ - GKO_PARTITION_BUILD_FROM_GLOBAL_SIZE(GlobalIndexType); \ - template \ - GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES(LocalIndexType, \ - GlobalIndexType); \ - template \ +#define GKO_DECLARE_PARTITION_IS_ORDERED(LocalIndexType, GlobalIndexType) \ + void has_ordered_parts(std::shared_ptr exec, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* partition, \ + bool* result) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + using comm_index_type = experimental::distributed::comm_index_type; \ + GKO_PARTITION_COUNT_RANGES; \ + template \ + GKO_PARTITION_BUILD_FROM_CONTIGUOUS(GlobalIndexType); \ + template \ + 
GKO_PARTITION_BUILD_FROM_MAPPING(GlobalIndexType); \ + template \ + GKO_PARTITION_BUILD_FROM_GLOBAL_SIZE(GlobalIndexType); \ + template \ + GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES(LocalIndexType, \ + GlobalIndexType); \ + template \ GKO_DECLARE_PARTITION_IS_ORDERED(LocalIndexType, GlobalIndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(partition, diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp new file mode 100644 index 00000000000..4d338d77f05 --- /dev/null +++ b/core/distributed/vector.cpp @@ -0,0 +1,543 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include "core/distributed/vector_kernels.hpp" +#include "core/matrix/dense_kernels.hpp" + + +namespace gko { +namespace experimental { +namespace distributed { +namespace vector { +namespace { + + +GKO_REGISTER_OPERATION(compute_squared_norm2, dense::compute_squared_norm2); +GKO_REGISTER_OPERATION(compute_sqrt, dense::compute_sqrt); +GKO_REGISTER_OPERATION(outplace_absolute_dense, dense::outplace_absolute_dense); +GKO_REGISTER_OPERATION(build_local, distributed_vector::build_local); + + +} // namespace +} // namespace vector + + +dim<2> compute_global_size(std::shared_ptr exec, + mpi::communicator comm, dim<2> local_size) +{ + size_type num_global_rows = local_size[0]; + comm.all_reduce(std::move(exec), &num_global_rows, 1, MPI_SUM); + return {num_global_rows, local_size[1]}; +} + + +template +void Vector::apply_impl(const LinOp* b, LinOp* x) const +{ + GKO_NOT_SUPPORTED(this); +} + + +template +void Vector::apply_impl(const LinOp* alpha, const LinOp* b, + const LinOp* beta, LinOp* x) const +{ + GKO_NOT_SUPPORTED(this); +} + +template +Vector::Vector(std::shared_ptr exec, + mpi::communicator comm, dim<2> global_size, + dim<2> local_size) + : Vector(exec, comm, global_size, local_size, local_size[1]) +{} + + +template +Vector::Vector(std::shared_ptr exec, + mpi::communicator comm, dim<2> global_size, + dim<2> local_size, size_type stride) + : 
EnableLinOp>{exec, global_size}, + DistributedBase{comm}, + local_{exec, local_size, stride} +{ + GKO_ASSERT_EQUAL_COLS(global_size, local_size); +} + +template +Vector::Vector(std::shared_ptr exec, + mpi::communicator comm, dim<2> global_size, + local_vector_type* local_vector) + : EnableLinOp>{exec, global_size}, + DistributedBase{comm}, + local_{exec} +{ + local_vector->move_to(&local_); +} + + +template +Vector::Vector(std::shared_ptr exec, + mpi::communicator comm, + local_vector_type* local_vector) + : EnableLinOp>{exec, {}}, + DistributedBase{comm}, + local_{exec} +{ + this->set_size(compute_global_size(exec, comm, local_vector->get_size())); + local_vector->move_to(&local_); +} + + +template +template +void Vector::read_distributed( + const device_matrix_data& data, + const Partition* partition) +{ + auto exec = this->get_executor(); + auto global_cols = data.get_size()[1]; + this->resize( + dim<2>(partition->get_size(), global_cols), + dim<2>(partition->get_part_size(this->get_communicator().rank()), + global_cols)); + + auto rank = this->get_communicator().rank(); + local_.fill(zero()); + exec->run(vector::make_build_local( + data, make_temporary_clone(exec, partition).get(), rank, &local_)); +} + + +template +template +void Vector::read_distributed( + const matrix_data& data, + const Partition* partition) + +{ + this->read_distributed( + device_matrix_data::create_from_host( + this->get_executor(), data), + std::move(partition)); +} + + +template +void Vector::fill(const ValueType value) +{ + local_.fill(value); +} + + +template +void Vector::convert_to( + Vector>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->set_size(this->get_size()); + this->get_local_vector()->convert_to(&result->local_); +} + + +template +void Vector::move_to(Vector>* result) +{ + this->convert_to(result); +} + + +template +std::unique_ptr::absolute_type> +Vector::compute_absolute() const +{ + auto exec = 
this->get_executor(); + + auto result = + absolute_type::create(exec, this->get_communicator(), this->get_size(), + this->get_local_vector()->get_size()); + + exec->run(vector::make_outplace_absolute_dense(this->get_local_vector(), + &result->local_)); + + return result; +} + + +template +void Vector::compute_absolute_inplace() +{ + local_.compute_absolute_inplace(); +} + + +template +const typename Vector::local_vector_type* +Vector::get_local_vector() const +{ + return &local_; +} + + +template +std::unique_ptr::complex_type> +Vector::make_complex() const +{ + auto result = complex_type::create( + this->get_executor(), this->get_communicator(), this->get_size(), + this->get_local_vector()->get_size(), + this->get_local_vector()->get_stride()); + this->make_complex(result.get()); + return result; +} + + +template +void Vector::make_complex(Vector::complex_type* result) const +{ + this->get_local_vector()->make_complex(&result->local_); +} + + +template +std::unique_ptr::real_type> +Vector::get_real() const +{ + auto result = real_type::create(this->get_executor(), + this->get_communicator(), this->get_size(), + this->get_local_vector()->get_size(), + this->get_local_vector()->get_stride()); + this->get_real(result.get()); + return result; +} + + +template +void Vector::get_real(Vector::real_type* result) const +{ + this->get_local_vector()->get_real(&result->local_); +} + + +template +std::unique_ptr::real_type> +Vector::get_imag() const +{ + auto result = real_type::create(this->get_executor(), + this->get_communicator(), this->get_size(), + this->get_local_vector()->get_size(), + this->get_local_vector()->get_stride()); + this->get_imag(result.get()); + return result; +} + + +template +void Vector::get_imag(Vector::real_type* result) const +{ + this->get_local_vector()->get_imag(&result->local_); +} + + +template +void Vector::scale(const LinOp* alpha) +{ + local_.scale(alpha); +} + + +template +void Vector::inv_scale(const LinOp* alpha) +{ + 
local_.inv_scale(alpha); +} + + +template +void Vector::add_scaled(const LinOp* alpha, const LinOp* b) +{ + auto dense_b = as>(b); + local_.add_scaled(alpha, dense_b->get_local_vector()); +} + + +template +void Vector::sub_scaled(const LinOp* alpha, const LinOp* b) +{ + auto dense_b = as>(b); + local_.sub_scaled(alpha, dense_b->get_local_vector()); +} + + +template +void Vector::compute_dot(const LinOp* b, LinOp* result) const +{ + array tmp{this->get_executor()}; + this->compute_dot(b, result, tmp); +} + + +template +void Vector::compute_dot(const LinOp* b, LinOp* result, + array& tmp) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + const auto comm = this->get_communicator(); + auto dense_res = + make_temporary_clone(exec, as>(result)); + this->get_local_vector()->compute_dot(as(b)->get_local_vector(), + dense_res.get(), tmp); + exec->synchronize(); + auto use_host_buffer = exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + host_reduction_buffer_.init(exec->get_master(), dense_res->get_size()); + host_reduction_buffer_->copy_from(dense_res.get()); + comm.all_reduce(exec->get_master(), + host_reduction_buffer_->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + dense_res->copy_from(host_reduction_buffer_.get()); + } else { + comm.all_reduce(exec, dense_res->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + } +} + + +template +void Vector::compute_conj_dot(const LinOp* b, LinOp* result) const +{ + array tmp{this->get_executor()}; + this->compute_conj_dot(b, result, tmp); +} + + +template +void Vector::compute_conj_dot(const LinOp* b, LinOp* result, + array& tmp) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + const auto comm = this->get_communicator(); + auto dense_res = + make_temporary_clone(exec, as>(result)); + this->get_local_vector()->compute_conj_dot( + 
as(b)->get_local_vector(), dense_res.get(), tmp); + exec->synchronize(); + auto use_host_buffer = exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + host_reduction_buffer_.init(exec->get_master(), dense_res->get_size()); + host_reduction_buffer_->copy_from(dense_res.get()); + comm.all_reduce(exec->get_master(), + host_reduction_buffer_->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + dense_res->copy_from(host_reduction_buffer_.get()); + } else { + comm.all_reduce(exec, dense_res->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + } +} + + +template +void Vector::compute_norm2(LinOp* result) const +{ + array tmp{this->get_executor()}; + this->compute_norm2(result, tmp); +} + + +template +void Vector::compute_norm2(LinOp* result, array& tmp) const +{ + using NormVector = typename local_vector_type::absolute_type; + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + const auto comm = this->get_communicator(); + auto dense_res = make_temporary_clone(exec, as(result)); + exec->run(vector::make_compute_squared_norm2(this->get_local_vector(), + dense_res.get(), tmp)); + exec->synchronize(); + auto use_host_buffer = exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + host_norm_buffer_.init(exec->get_master(), dense_res->get_size()); + host_norm_buffer_->copy_from(dense_res.get()); + comm.all_reduce(exec->get_master(), host_norm_buffer_->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + dense_res->copy_from(host_norm_buffer_.get()); + } else { + comm.all_reduce(exec, dense_res->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + } + exec->run(vector::make_compute_sqrt(dense_res.get())); +} + + +template +void Vector::compute_norm1(LinOp* result) const +{ + array tmp{this->get_executor()}; + this->compute_norm1(result, tmp); +} + + +template +void Vector::compute_norm1(LinOp* result, array& tmp) const +{ + using 
NormVector = typename local_vector_type::absolute_type; + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + const auto comm = this->get_communicator(); + auto dense_res = make_temporary_clone(exec, as(result)); + this->get_local_vector()->compute_norm1(dense_res.get()); + exec->synchronize(); + auto use_host_buffer = exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + host_norm_buffer_.init(exec->get_master(), dense_res->get_size()); + host_norm_buffer_->copy_from(dense_res.get()); + comm.all_reduce(exec->get_master(), host_norm_buffer_->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + dense_res->copy_from(host_norm_buffer_.get()); + } else { + comm.all_reduce(exec, dense_res->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + } +} + + +template +ValueType& Vector::at_local(size_type row, size_type col) noexcept +{ + return local_.at(row, col); +} + +template +ValueType Vector::at_local(size_type row, size_type col) const + noexcept +{ + return local_.at(row, col); +} + +template +ValueType& Vector::at_local(size_type idx) noexcept +{ + return local_.at(idx); +} + +template +ValueType Vector::at_local(size_type idx) const noexcept +{ + return local_.at(idx); +} + + +template +ValueType* Vector::get_local_values() +{ + return local_.get_values(); +} + + +template +const ValueType* Vector::get_const_local_values() const +{ + return local_.get_const_values(); +} + + +template +void Vector::resize(dim<2> global_size, dim<2> local_size) +{ + if (this->get_size() != global_size) { + this->set_size(global_size); + } + local_.resize(local_size); +} + + +template +std::unique_ptr::real_type> +Vector::create_real_view() const +{ + const auto num_global_rows = this->get_size()[0]; + const auto num_cols = + is_complex() ? 
2 * this->get_size()[1] : this->get_size()[1]; + + return real_type::create(this->get_executor(), this->get_communicator(), + dim<2>{num_global_rows, num_cols}, + const_cast( + local_.create_real_view().get())); +} + + +template +std::unique_ptr::real_type> +Vector::create_real_view() +{ + const auto num_global_rows = this->get_size()[0]; + const auto num_cols = + is_complex() ? 2 * this->get_size()[1] : this->get_size()[1]; + + return real_type::create(this->get_executor(), this->get_communicator(), + dim<2>{num_global_rows, num_cols}, + local_.create_real_view().get()); +} + + +#define GKO_DECLARE_DISTRIBUTED_VECTOR(ValueType) class Vector +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DISTRIBUTED_VECTOR); + + +#define GKO_DECLARE_DISTRIBUTED_VECTOR_READ_DISTRIBUTED( \ + ValueType, LocalIndexType, GlobalIndexType) \ + void Vector::read_distributed( \ + const device_matrix_data& data, \ + const Partition* partition); \ + template void \ + Vector::read_distributed( \ + const matrix_data& data, \ + const Partition* partition) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_DISTRIBUTED_VECTOR_READ_DISTRIBUTED); + + +} // namespace distributed +} // namespace experimental +} // namespace gko diff --git a/core/distributed/vector_kernels.hpp b/core/distributed/vector_kernels.hpp new file mode 100644 index 00000000000..91a008f50dd --- /dev/null +++ b/core/distributed/vector_kernels.hpp @@ -0,0 +1,81 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CORE_DISTRIBUTED_VECTOR_KERNELS_HPP_ +#define GKO_CORE_DISTRIBUTED_VECTOR_KERNELS_HPP_ + + +// can't include ginkgo/core/distributed/vector.hpp since that requires linking +// against MPI +#include +#include +#include +#include + + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL(ValueType, LocalIndexType, \ + GlobalIndexType) \ + void build_local( \ + std::shared_ptr exec, \ + const device_matrix_data& input, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* partition, \ + comm_index_type local_part, matrix::Dense* local_mtx) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + using comm_index_type = experimental::distributed::comm_index_type; \ + template \ + GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL(ValueType, LocalIndexType, \ + GlobalIndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(distributed_vector, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_DISTRIBUTED_VECTOR_KERNELS_HPP_ diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 4c1027421b7..2969fa4fec1 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -80,6 +80,8 @@ GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot_dispatch); GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot_dispatch); GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2_dispatch); GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1); +GKO_REGISTER_OPERATION(compute_squared_norm2, dense::compute_squared_norm2); +GKO_REGISTER_OPERATION(compute_sqrt, dense::compute_sqrt); GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row); GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs, hybrid::compute_coo_row_ptrs); diff --git a/core/matrix/dense_kernels.hpp 
b/core/matrix/dense_kernels.hpp index f6041ece443..c31d00e5337 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -151,6 +151,16 @@ namespace kernels { const device_matrix_data<_type, _prec>& data, \ matrix::Dense<_type>* output) +#define GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL(_type) \ + void compute_squared_norm2(std::shared_ptr exec, \ + const matrix::Dense<_type>* x, \ + matrix::Dense>* result, \ + array& tmp) + +#define GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL(_type) \ + void compute_sqrt(std::shared_ptr exec, \ + matrix::Dense<_type>* data) + #define GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(_type, _prec) \ void convert_to_coo(std::shared_ptr exec, \ const matrix::Dense<_type>* source, \ @@ -341,6 +351,10 @@ namespace kernels { GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \ template \ GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL(ValueType); \ template \ GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(ValueType, IndexType); \ template \ diff --git a/core/matrix/identity.cpp b/core/matrix/identity.cpp index 21c5d91dc3a..66898f2c74d 100644 --- a/core/matrix/identity.cpp +++ b/core/matrix/identity.cpp @@ -54,7 +54,7 @@ template void Identity::apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta, LinOp* x) const { - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { dense_x->scale(dense_beta); dense_x->add_scaled(dense_alpha, dense_b); diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp index 1808489c1ad..6a1410cbeb2 100644 --- a/core/solver/bicg.cpp +++ b/core/solver/bicg.cpp @@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include #include "core/solver/bicg_kernels.hpp" diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index 820bd51f0e3..1d6a6472048 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/distributed/helpers.hpp" #include "core/solver/bicgstab_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -95,7 +96,7 @@ void Bicgstab::apply_impl(const LinOp* b, LinOp* x) const if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_b, auto dense_x) { this->apply_dense_impl(dense_b, dense_x); }, @@ -104,12 +105,11 @@ void Bicgstab::apply_impl(const LinOp* b, LinOp* x) const template -void Bicgstab::apply_dense_impl( - const matrix::Dense* dense_b, - matrix::Dense* dense_x) const +template +void Bicgstab::apply_dense_impl(const VectorType* dense_b, + VectorType* dense_x) const { using std::swap; - using Vector = matrix::Dense; constexpr uint8 RelativeStoppingId{1}; @@ -141,9 +141,13 @@ void Bicgstab::apply_dense_impl( // prev_rho = rho = omega = alpha = beta = gamma = 1.0 // rr = v = s = t = z = y = p = 0 // stop_status = 0x00 - exec->run(bicgstab::make_initialize(dense_b, r, rr, y, s, t, z, v, p, - prev_rho, rho, alpha, beta, gamma, - omega, &stop_status)); + exec->run(bicgstab::make_initialize( + gko::detail::get_local(dense_b), gko::detail::get_local(r), + gko::detail::get_local(rr), gko::detail::get_local(y), + gko::detail::get_local(s), gko::detail::get_local(t), + gko::detail::get_local(z), gko::detail::get_local(v), + gko::detail::get_local(p), prev_rho, rho, alpha, beta, gamma, omega, + &stop_status)); // r = b - Ax this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, r); @@ -183,8 +187,10 @@ void Bicgstab::apply_dense_impl( // tmp = rho / prev_rho * alpha / omega // p = r + tmp * (p - 
omega * v) - exec->run(bicgstab::make_step_1(r, p, v, rho, prev_rho, alpha, omega, - &stop_status)); + exec->run(bicgstab::make_step_1(gko::detail::get_local(r), + gko::detail::get_local(p), + gko::detail::get_local(v), rho, + prev_rho, alpha, omega, &stop_status)); // y = preconditioner * p this->get_preconditioner()->apply(p, y); @@ -194,8 +200,9 @@ void Bicgstab::apply_dense_impl( rr->compute_conj_dot(v, beta, reduction_tmp); // alpha = rho / beta // s = r - alpha * v - exec->run( - bicgstab::make_step_2(r, s, v, rho, alpha, beta, &stop_status)); + exec->run(bicgstab::make_step_2( + gko::detail::get_local(r), gko::detail::get_local(s), + gko::detail::get_local(v), rho, alpha, beta, &stop_status)); auto all_converged = stop_criterion->update() @@ -205,7 +212,9 @@ void Bicgstab::apply_dense_impl( // .solution(dense_x) // outdated at this point .check(RelativeStoppingId, false, &stop_status, &one_changed); if (one_changed) { - exec->run(bicgstab::make_finalize(dense_x, y, alpha, &stop_status)); + exec->run(bicgstab::make_finalize(gko::detail::get_local(dense_x), + gko::detail::get_local(y), alpha, + &stop_status)); } if (all_converged) { break; @@ -222,8 +231,11 @@ void Bicgstab::apply_dense_impl( // omega = gamma / beta // x = x + alpha * y + omega * z // r = s - omega * t - exec->run(bicgstab::make_step_3(dense_x, r, s, t, y, z, alpha, beta, - gamma, omega, &stop_status)); + exec->run(bicgstab::make_step_3( + gko::detail::get_local(dense_x), gko::detail::get_local(r), + gko::detail::get_local(s), gko::detail::get_local(t), + gko::detail::get_local(y), gko::detail::get_local(z), alpha, beta, + gamma, omega, &stop_status)); swap(prev_rho, rho); } } @@ -236,7 +248,7 @@ void Bicgstab::apply_impl(const LinOp* alpha, const LinOp* b, if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { auto x_clone = 
dense_x->clone(); this->apply_dense_impl(dense_b, x_clone.get()); diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp index 12d67fbe563..8038a361e86 100644 --- a/core/solver/cg.cpp +++ b/core/solver/cg.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/distributed/helpers.hpp" #include "core/solver/cg_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -93,7 +94,7 @@ void Cg::apply_impl(const LinOp* b, LinOp* x) const if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_b, auto dense_x) { this->apply_dense_impl(dense_b, dense_x); }, @@ -102,11 +103,12 @@ void Cg::apply_impl(const LinOp* b, LinOp* x) const template -void Cg::apply_dense_impl(const matrix::Dense* dense_b, - matrix::Dense* dense_x) const +template +void Cg::apply_dense_impl(const VectorType* dense_b, + VectorType* dense_x) const { using std::swap; - using Vector = matrix::Dense; + using LocalVector = matrix::Dense; constexpr uint8 RelativeStoppingId{1}; @@ -132,8 +134,10 @@ void Cg::apply_dense_impl(const matrix::Dense* dense_b, // rho = 0.0 // prev_rho = 1.0 // z = p = q = 0 - exec->run( - cg::make_initialize(dense_b, r, z, p, q, prev_rho, rho, &stop_status)); + exec->run(cg::make_initialize( + gko::detail::get_local(dense_b), gko::detail::get_local(r), + gko::detail::get_local(z), gko::detail::get_local(p), + gko::detail::get_local(q), prev_rho, rho, &stop_status)); this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, r); auto stop_criterion = this->get_stop_criterion_factory()->generate( @@ -170,7 +174,9 @@ void Cg::apply_dense_impl(const matrix::Dense* dense_b, // tmp = rho / prev_rho // p = z + tmp * p - exec->run(cg::make_step_1(p, z, rho, prev_rho, &stop_status)); + exec->run(cg::make_step_1(gko::detail::get_local(p), + gko::detail::get_local(z), rho, prev_rho, + &stop_status)); // q = A * p 
this->get_system_matrix()->apply(p, q); // beta = dot(p, q) @@ -178,7 +184,10 @@ void Cg::apply_dense_impl(const matrix::Dense* dense_b, // tmp = rho / beta // x = x + tmp * p // r = r - tmp * q - exec->run(cg::make_step_2(dense_x, r, p, q, beta, rho, &stop_status)); + exec->run(cg::make_step_2( + gko::detail::get_local(dense_x), gko::detail::get_local(r), + gko::detail::get_local(p), gko::detail::get_local(q), beta, rho, + &stop_status)); swap(prev_rho, rho); } } @@ -191,7 +200,7 @@ void Cg::apply_impl(const LinOp* alpha, const LinOp* b, if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { auto x_clone = dense_x->clone(); this->apply_dense_impl(dense_b, x_clone.get()); diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp index 53ddd773563..abf39f90a7e 100644 --- a/core/solver/cgs.cpp +++ b/core/solver/cgs.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/distributed/helpers.hpp" #include "core/solver/cgs_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -94,7 +95,7 @@ void Cgs::apply_impl(const LinOp* b, LinOp* x) const if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_b, auto dense_x) { this->apply_dense_impl(dense_b, dense_x); }, @@ -103,11 +104,12 @@ void Cgs::apply_impl(const LinOp* b, LinOp* x) const template -void Cgs::apply_dense_impl(const matrix::Dense* dense_b, - matrix::Dense* dense_x) const +template +void Cgs::apply_dense_impl(const VectorType* dense_b, + VectorType* dense_x) const { using std::swap; - using Vector = matrix::Dense; + using LocalVector = matrix::Dense; constexpr uint8 RelativeStoppingId{1}; @@ -139,9 +141,13 @@ void Cgs::apply_dense_impl(const matrix::Dense* dense_b, // rho = 0.0 // prev_rho = alpha = beta = gamma = 1.0 // p = q = u = u_hat = v_hat = t = 0 - exec->run(cgs::make_initialize(dense_b, r, r_tld, p, q, u, u_hat, v_hat, t, - alpha, beta, gamma, prev_rho, rho, - &stop_status)); + exec->run(cgs::make_initialize( + gko::detail::get_local(dense_b), gko::detail::get_local(r), + gko::detail::get_local(r_tld), gko::detail::get_local(p), + gko::detail::get_local(q), gko::detail::get_local(u), + gko::detail::get_local(u_hat), gko::detail::get_local(v_hat), + gko::detail::get_local(t), alpha, beta, gamma, prev_rho, rho, + &stop_status)); this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, r); auto stop_criterion = this->get_stop_criterion_factory()->generate( @@ -178,22 +184,29 @@ void Cgs::apply_dense_impl(const matrix::Dense* dense_b, // beta = rho / prev_rho // u = r + beta * q // p = u + beta * ( q + beta * p ) - exec->run( - cgs::make_step_1(r, u, p, q, beta, rho, prev_rho, &stop_status)); + exec->run(cgs::make_step_1( + gko::detail::get_local(r), gko::detail::get_local(u), + gko::detail::get_local(p), gko::detail::get_local(q), 
beta, rho, + prev_rho, &stop_status)); this->get_preconditioner()->apply(p, t); this->get_system_matrix()->apply(t, v_hat); r_tld->compute_conj_dot(v_hat, gamma, reduction_tmp); // alpha = rho / gamma // q = u - alpha * v_hat // t = u + q - exec->run( - cgs::make_step_2(u, v_hat, q, t, alpha, rho, gamma, &stop_status)); + exec->run(cgs::make_step_2( + gko::detail::get_local(u), gko::detail::get_local(v_hat), + gko::detail::get_local(q), gko::detail::get_local(t), alpha, rho, + gamma, &stop_status)); this->get_preconditioner()->apply(t, u_hat); this->get_system_matrix()->apply(u_hat, t); // r = r - alpha * t // x = x + alpha * u_hat - exec->run(cgs::make_step_3(t, u_hat, r, dense_x, alpha, &stop_status)); + exec->run(cgs::make_step_3( + gko::detail::get_local(t), gko::detail::get_local(u_hat), + gko::detail::get_local(r), gko::detail::get_local(dense_x), alpha, + &stop_status)); swap(prev_rho, rho); } @@ -207,7 +220,7 @@ void Cgs::apply_impl(const LinOp* alpha, const LinOp* b, if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { auto x_clone = dense_x->clone(); this->apply_dense_impl(dense_b, x_clone.get()); diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp index ce789c75e4d..bf6dda5b036 100644 --- a/core/solver/fcg.cpp +++ b/core/solver/fcg.cpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/distributed/helpers.hpp" #include "core/solver/fcg_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -92,7 +93,7 @@ void Fcg::apply_impl(const LinOp* b, LinOp* x) const if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_b, auto dense_x) { this->apply_dense_impl(dense_b, dense_x); }, @@ -101,11 +102,12 @@ void Fcg::apply_impl(const LinOp* b, LinOp* x) const template -void Fcg::apply_dense_impl(const matrix::Dense* dense_b, - matrix::Dense* dense_x) const +template +void Fcg::apply_dense_impl(const VectorType* dense_b, + VectorType* dense_x) const { using std::swap; - using Vector = matrix::Dense; + using LocalVector = matrix::Dense; constexpr uint8 RelativeStoppingId{1}; @@ -129,15 +131,17 @@ void Fcg::apply_dense_impl(const matrix::Dense* dense_b, bool one_changed{}; GKO_SOLVER_STOP_REDUCTION_ARRAYS(); - // TODO: replace this with automatic merged kernel generator - exec->run(fcg::make_initialize(dense_b, r, z, p, q, t, prev_rho, rho, rho_t, - &stop_status)); // r = dense_b // t = r // rho = 0.0 // prev_rho = 1.0 // rho_t = 1.0 // z = p = q = 0 + exec->run(fcg::make_initialize( + gko::detail::get_local(dense_b), gko::detail::get_local(r), + gko::detail::get_local(z), gko::detail::get_local(p), + gko::detail::get_local(q), gko::detail::get_local(t), prev_rho, rho, + rho_t, &stop_status)); this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, r); auto stop_criterion = this->get_stop_criterion_factory()->generate( @@ -173,7 +177,9 @@ void Fcg::apply_dense_impl(const matrix::Dense* dense_b, // tmp = rho_t / prev_rho // p = z + tmp * p - exec->run(fcg::make_step_1(p, z, rho_t, prev_rho, &stop_status)); + exec->run(fcg::make_step_1( + gko::detail::get_local(p), gko::detail::get_local(z), + gko::detail::get_local(rho_t), prev_rho, &stop_status)); this->get_system_matrix()->apply(p, q); p->compute_conj_dot(q, beta, 
reduction_tmp); // tmp = rho / beta @@ -181,8 +187,10 @@ void Fcg::apply_dense_impl(const matrix::Dense* dense_b, // x = x + tmp * p // r = r - tmp * q // t = r - [prev_r] - exec->run( - fcg::make_step_2(dense_x, r, t, p, q, beta, rho, &stop_status)); + exec->run(fcg::make_step_2( + gko::detail::get_local(dense_x), gko::detail::get_local(r), + gko::detail::get_local(t), gko::detail::get_local(p), + gko::detail::get_local(q), beta, rho, &stop_status)); swap(prev_rho, rho); } } @@ -195,7 +203,7 @@ void Fcg::apply_impl(const LinOp* alpha, const LinOp* b, if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { auto x_clone = dense_x->clone(); this->apply_dense_impl(dense_b, x_clone.get()); diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index 92c61b062e4..dbc110c07e1 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -38,10 +38,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include #include +#include "core/distributed/helpers.hpp" #include "core/solver/idr_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -90,11 +90,12 @@ std::unique_ptr Idr::conj_transpose() const template -template -void Idr::iterate(const matrix::Dense* dense_b, - matrix::Dense* dense_x) const +template +void Idr::iterate(const VectorType* dense_b, + VectorType* dense_x) const { using std::swap; + using SubspaceType = typename VectorType::value_type; using Vector = matrix::Dense; using AbsType = remove_complex; using ws = workspace_traits; @@ -161,8 +162,9 @@ void Idr::iterate(const matrix::Dense* dense_b, std::default_random_engine(15)); subspace_vectors->read(subspace_vectors_data); } - exec->run(idr::make_initialize(nrhs, m, subspace_vectors, is_deterministic, - &stop_status)); + exec->run(idr::make_initialize(nrhs, gko::detail::get_local(m), + gko::detail::get_local(subspace_vectors), + is_deterministic, &stop_status)); // omega = 1 omega->fill(one()); @@ -221,14 +223,19 @@ void Idr::iterate(const matrix::Dense* dense_b, for (size_type k = 0; k < subspace_dim; k++) { // c = M \ f = (c_1, ..., c_s)^T // v = residual - sum i=[k,s) of (c_i * g_i) - exec->run(idr::make_step_1(nrhs, k, m, f, residual, g, c, v, - &stop_status)); + exec->run(idr::make_step_1( + nrhs, k, gko::detail::get_local(m), gko::detail::get_local(f), + gko::detail::get_local(residual), gko::detail::get_local(g), + gko::detail::get_local(c), gko::detail::get_local(v), + &stop_status)); this->get_preconditioner()->apply(v, helper); // u_k = omega * precond_vector + sum i=[k,s) of (c_i * u_i) - exec->run( - idr::make_step_2(nrhs, k, omega, helper, c, u, &stop_status)); + exec->run(idr::make_step_2( + nrhs, k, gko::detail::get_local(omega), + gko::detail::get_local(helper), gko::detail::get_local(c), + gko::detail::get_local(u), &stop_status)); auto u_k = u->create_submatrix(span{0, problem_size}, span{k * nrhs, (k + 1) * nrhs}); @@ -249,9 +256,13 @@ void 
Idr::iterate(const matrix::Dense* dense_b, // residual -= beta * g_k // dense_x += beta * u_k // f = (0,...,0,f_k+1 - beta * m_k+1,k,...,f_s-1 - beta * m_s-1,k) - exec->run(idr::make_step_3(nrhs, k, subspace_vectors, g, helper, u, - m, f, alpha, residual, dense_x, - &stop_status)); + exec->run(idr::make_step_3( + nrhs, k, gko::detail::get_local(subspace_vectors), + gko::detail::get_local(g), gko::detail::get_local(helper), + gko::detail::get_local(u), gko::detail::get_local(m), + gko::detail::get_local(f), gko::detail::get_local(alpha), + gko::detail::get_local(residual), + gko::detail::get_local(dense_x), &stop_status)); } this->get_preconditioner()->apply(residual, helper); @@ -268,8 +279,10 @@ void Idr::iterate(const matrix::Dense* dense_b, // end if // residual -= omega * t // dense_x += omega * v - exec->run(idr::make_compute_omega(nrhs, kappa, tht, residual_norm, - omega, &stop_status)); + exec->run(idr::make_compute_omega( + nrhs, kappa, gko::detail::get_local(tht), + gko::detail::get_local(residual_norm), + gko::detail::get_local(omega), &stop_status)); t->scale(subspace_neg_one_op); residual->add_scaled(omega, t); diff --git a/core/solver/ir.cpp b/core/solver/ir.cpp index 1a596cf1d0c..75368453b71 100644 --- a/core/solver/ir.cpp +++ b/core/solver/ir.cpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/distributed/helpers.hpp" #include "core/solver/ir_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -164,7 +165,7 @@ void Ir::apply_impl(const LinOp* b, LinOp* x) const if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_b, auto dense_x) { this->apply_dense_impl(dense_b, dense_x); }, @@ -173,8 +174,9 @@ void Ir::apply_impl(const LinOp* b, LinOp* x) const template -void Ir::apply_dense_impl(const matrix::Dense* dense_b, - matrix::Dense* dense_x) const +template +void Ir::apply_dense_impl(const VectorType* dense_b, + VectorType* dense_x) const { using Vector = matrix::Dense; using ws = workspace_traits; @@ -250,7 +252,7 @@ void Ir::apply_impl(const LinOp* alpha, const LinOp* b, if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { auto x_clone = dense_x->clone(); this->apply_dense_impl(dense_b, x_clone.get()); diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp index e699e1cd122..1319ca9b027 100644 --- a/core/stop/residual_norm.cpp +++ b/core/stop/residual_norm.cpp @@ -33,7 +33,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + +#include "core/base/dispatch_helper.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/distributed/helpers.hpp" #include "core/stop/residual_norm_kernels.hpp" @@ -62,6 +67,128 @@ GKO_REGISTER_OPERATION(implicit_residual_norm, } // namespace implicit_residual_norm +template +bool any_is_complex() +{ + return false; +} + + +template +bool any_is_complex(const LinOp* in, Rest&&... 
rest) +{ +#if GINKGO_BUILD_MPI + bool is_complex_distributed = dynamic_cast>>*>(in); +#else + bool is_complex_distributed = false; +#endif + + return is_complex() || is_complex_distributed || + dynamic_cast< + const ConvertibleTo>>*>(in) || + any_is_complex(std::forward(rest)...); +} + + +template +void norm_dispatch(Function&& fn, LinOps*... linops) +{ +#if GINKGO_BUILD_MPI + if (gko::detail::is_distributed(linops...)) { + if (any_is_complex(linops...)) { + experimental::distributed::precision_dispatch< + to_complex>(std::forward(fn), linops...); + } else { + experimental::distributed::precision_dispatch( + std::forward(fn), linops...); + } + } else +#endif + { + if (any_is_complex(linops...)) { + precision_dispatch>( + std::forward(fn), linops...); + } else { + precision_dispatch(std::forward(fn), + linops...); + } + } +} + + +template +ResidualNormBase::ResidualNormBase( + std::shared_ptr exec, const CriterionArgs& args, + remove_complex reduction_factor, mode baseline) + : EnablePolymorphicObject(exec), + device_storage_{exec, 2}, + reduction_factor_{reduction_factor}, + baseline_{baseline}, + system_matrix_{args.system_matrix}, + b_{args.b}, + one_{gko::initialize({1}, exec)}, + neg_one_{gko::initialize({-1}, exec)} +{ + switch (baseline_) { + case mode::initial_resnorm: { + if (args.initial_residual == nullptr) { + if (args.system_matrix == nullptr || args.b == nullptr || + args.x == nullptr) { + GKO_NOT_SUPPORTED(nullptr); + } else { + this->starting_tau_ = + NormVector::create(exec, dim<2>{1, args.b->get_size()[1]}); + auto b_clone = share(args.b->clone()); + args.system_matrix->apply(neg_one_.get(), args.x, one_.get(), + b_clone.get()); + norm_dispatch( + [&](auto dense_r) { + dense_r->compute_norm2(this->starting_tau_.get()); + }, + b_clone.get()); + } + } else { + this->starting_tau_ = NormVector::create( + exec, dim<2>{1, args.initial_residual->get_size()[1]}); + norm_dispatch( + [&](auto dense_r) { + dense_r->compute_norm2(this->starting_tau_.get()); + 
}, + args.initial_residual); + } + break; + } + case mode::rhs_norm: { + if (args.b == nullptr) { + GKO_NOT_SUPPORTED(nullptr); + } + this->starting_tau_ = + NormVector::create(exec, dim<2>{1, args.b->get_size()[1]}); + norm_dispatch( + [&](auto dense_r) { + dense_r->compute_norm2(this->starting_tau_.get()); + }, + args.b.get()); + break; + } + case mode::absolute: { + if (args.b == nullptr) { + GKO_NOT_SUPPORTED(nullptr); + } + this->starting_tau_ = + NormVector::create(exec, dim<2>{1, args.b->get_size()[1]}); + this->starting_tau_->fill(gko::one>()); + break; + } + default: + GKO_NOT_SUPPORTED(nullptr); + } + this->u_dense_tau_ = + NormVector::create_with_config_of(this->starting_tau_.get()); +} + + template bool ResidualNormBase::check_impl( uint8 stopping_id, bool set_finalized, array* stop_status, @@ -71,33 +198,21 @@ bool ResidualNormBase::check_impl( if (updater.residual_norm_ != nullptr) { dense_tau = as(updater.residual_norm_); } else if (updater.residual_ != nullptr) { - if (dynamic_cast(updater.residual_)) { - auto* dense_r = as(updater.residual_); - dense_r->compute_norm2(u_dense_tau_.get()); - } else { - auto* dense_r = as(updater.residual_); - dense_r->compute_norm2(u_dense_tau_.get()); - } + norm_dispatch( + [&](auto dense_r) { dense_r->compute_norm2(u_dense_tau_.get()); }, + updater.residual_); dense_tau = u_dense_tau_.get(); } else if (updater.solution_ != nullptr && system_matrix_ != nullptr && b_ != nullptr) { auto exec = this->get_executor(); - // when LinOp is real but rhs is complex, we use real view on complex, - // so it still uses the same type of scalar in apply. 
- if (auto vec_b = std::dynamic_pointer_cast(b_)) { - auto dense_r = vec_b->clone(); - system_matrix_->apply(neg_one_.get(), updater.solution_, one_.get(), - dense_r.get()); - dense_r->compute_norm2(u_dense_tau_.get()); - } else if (auto vec_b = - std::dynamic_pointer_cast(b_)) { - auto dense_r = vec_b->clone(); - system_matrix_->apply(neg_one_.get(), updater.solution_, one_.get(), - dense_r.get()); - dense_r->compute_norm2(u_dense_tau_.get()); - } else { - GKO_NOT_SUPPORTED(nullptr); - } + norm_dispatch( + [&](auto dense_b, auto dense_x) { + auto dense_r = dense_b->clone(); + system_matrix_->apply(neg_one_.get(), dense_x, one_.get(), + dense_r.get()); + dense_r->compute_norm2(u_dense_tau_.get()); + }, + b_.get(), updater.solution_); dense_tau = u_dense_tau_.get(); } else { GKO_NOT_SUPPORTED(nullptr); diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt index b35dcfe723e..aa79ca3ed92 100644 --- a/core/test/base/CMakeLists.txt +++ b/core/test/base/CMakeLists.txt @@ -1,6 +1,7 @@ ginkgo_create_test(abstract_factory) ginkgo_create_test(allocator) ginkgo_create_test(array) +ginkgo_create_test(dense_cache) ginkgo_create_test(combination) ginkgo_create_test(composition) ginkgo_create_test(dim) diff --git a/core/test/base/abstract_factory.cpp b/core/test/base/abstract_factory.cpp index 07d2e490f18..108e32436e7 100644 --- a/core/test/base/abstract_factory.cpp +++ b/core/test/base/abstract_factory.cpp @@ -55,7 +55,7 @@ using base = gko::AbstractFactory; struct IntFactory : gko::EnableDefaultFactory { friend class gko::enable_parameters_type; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; using gko::EnableDefaultFactory::EnableDefaultFactory; }; diff --git a/core/test/base/dense_cache.cpp b/core/test/base/dense_cache.cpp new file mode 100644 index 00000000000..41bac8c01c6 --- /dev/null +++ b/core/test/base/dense_cache.cpp @@ -0,0 +1,229 @@ +/************************************************************* 
+Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class DenseCache : public ::testing::Test { +protected: + using value_type = ValueType; + + DenseCache() {} + + void SetUp() { ref = gko::ReferenceExecutor::create(); } + + void TearDown() {} + + void gen_cache(gko::dim<2> size) { cache.init(ref, size); } + + std::shared_ptr ref; + gko::detail::DenseCache cache; +}; + + +TYPED_TEST_SUITE(DenseCache, gko::test::ValueTypes, TypenameNameGenerator); + + +TYPED_TEST(DenseCache, CanDefaultConstruct) +{ + using value_type = typename TestFixture::value_type; + gko::detail::DenseCache cache; + + ASSERT_EQ(cache.get(), nullptr); +} + + +TYPED_TEST(DenseCache, CanInitWithSize) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{4, 7}; + + this->cache.init(this->ref, size); + + ASSERT_NE(this->cache.get(), nullptr); + GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), size); + ASSERT_EQ(this->cache->get_executor(), this->ref); +} + + +TYPED_TEST(DenseCache, SecondInitWithSameSizeIsNoOp) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{4, 7}; + this->cache.init(this->ref, size); + auto first_ptr = this->cache.get(); + + this->cache.init(this->ref, size); + + ASSERT_NE(this->cache.get(), nullptr); + ASSERT_EQ(first_ptr, this->cache.get()); +} + + +TYPED_TEST(DenseCache, SecondInitWithDifferentSizeInitializes) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{4, 7}; + gko::dim<2> second_size{7, 4}; + this->cache.init(this->ref, size); + auto first_ptr = this->cache.get(); + + this->cache.init(this->ref, second_size); + + ASSERT_NE(this->cache.get(), nullptr); + ASSERT_NE(first_ptr, this->cache.get()); +} + + +TYPED_TEST(DenseCache, CanInitFromDense) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{5, 2}; + auto dense = 
gko::matrix::Dense::create(this->ref, size); + + this->cache.init_from(dense.get()); + + ASSERT_NE(this->cache.get(), nullptr); + GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), size); + ASSERT_EQ(this->cache->get_executor(), dense->get_executor()); +} + + +TYPED_TEST(DenseCache, SecondInitFromSameDenseIsNoOp) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{4, 7}; + auto dense = gko::matrix::Dense::create(this->ref, size); + this->cache.init_from(dense.get()); + auto first_ptr = this->cache.get(); + + this->cache.init_from(dense.get()); + + ASSERT_NE(this->cache.get(), nullptr); + ASSERT_EQ(first_ptr, this->cache.get()); +} + + +TYPED_TEST(DenseCache, SecondInitFromDifferentDenseWithSameSizeIsNoOp) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{4, 7}; + auto first_dense = gko::matrix::Dense::create(this->ref, size); + auto second_dense = gko::matrix::Dense::create(this->ref, size); + this->cache.init_from(first_dense.get()); + auto first_ptr = this->cache.get(); + + this->cache.init_from(second_dense.get()); + + ASSERT_NE(this->cache.get(), nullptr); + ASSERT_EQ(first_ptr, this->cache.get()); +} + + +TYPED_TEST(DenseCache, SecondInitFromDifferentDenseWithDifferentSizeInitializes) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{4, 7}; + gko::dim<2> second_size{7, 4}; + auto first_dense = gko::matrix::Dense::create(this->ref, size); + auto second_dense = + gko::matrix::Dense::create(this->ref, second_size); + this->cache.init_from(first_dense.get()); + auto first_ptr = this->cache.get(); + + this->cache.init_from(second_dense.get()); + + ASSERT_NE(this->cache.get(), nullptr); + ASSERT_NE(first_ptr, this->cache.get()); +} + + +TYPED_TEST(DenseCache, VectorIsNotCopied) +{ + using value_type = typename TestFixture::value_type; + this->gen_cache({1, 1}); + gko::detail::DenseCache cache(this->cache); + + ASSERT_EQ(cache.get(), nullptr); + 
GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), gko::dim<2>(1, 1)); +} + + +TYPED_TEST(DenseCache, VectorIsNotMoved) +{ + using value_type = typename TestFixture::value_type; + this->gen_cache({1, 1}); + gko::detail::DenseCache cache(std::move(this->cache)); + + ASSERT_EQ(cache.get(), nullptr); + GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), gko::dim<2>(1, 1)); +} + + +TYPED_TEST(DenseCache, VectorIsNotCopyAssigned) +{ + using value_type = typename TestFixture::value_type; + this->gen_cache({1, 1}); + gko::detail::DenseCache cache; + cache = this->cache; + + ASSERT_EQ(cache.get(), nullptr); + GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), gko::dim<2>(1, 1)); +} + + +TYPED_TEST(DenseCache, VectorIsNotMoveAssigned) +{ + using value_type = typename TestFixture::value_type; + this->gen_cache({1, 1}); + gko::detail::DenseCache cache; + cache = std::move(this->cache); + + ASSERT_EQ(cache.get(), nullptr); + GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), gko::dim<2>(1, 1)); +} + + +} // namespace diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp index 7f492cdc991..79e7ba35b35 100644 --- a/core/test/base/mtx_io.cpp +++ b/core/test/base/mtx_io.cpp @@ -870,7 +870,7 @@ class DummyLinOp public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData, public gko::WritableToMatrixData { - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; friend class gko::EnableCreateMethod; public: diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp index dbf9670c5c2..7493c5eb727 100644 --- a/core/test/matrix/dense.cpp +++ b/core/test/matrix/dense.cpp @@ -430,6 +430,7 @@ TYPED_TEST(Dense, CanMakeConstView) class CustomDense : public gko::EnableLinOp> { friend class gko::EnablePolymorphicObject>; + friend struct gko::polymorphic_object_traits; public: static std::unique_ptr create( diff --git a/core/test/mpi/CMakeLists.txt b/core/test/mpi/CMakeLists.txt index 8edc6781c4e..eb2c9192ebc 100644 --- 
a/core/test/mpi/CMakeLists.txt +++ b/core/test/mpi/CMakeLists.txt @@ -1,7 +1,2 @@ -add_library(gtest_mpi_main "") -target_sources(gtest_mpi_main - PRIVATE - gtest/mpi_listener.cpp) -find_package(MPI REQUIRED) -target_link_libraries(gtest_mpi_main PRIVATE GTest::GTest MPI::MPI_CXX) add_subdirectory(base) +add_subdirectory(distributed) diff --git a/core/test/mpi/base/CMakeLists.txt b/core/test/mpi/base/CMakeLists.txt index 0b22157d269..bdf82c4337c 100644 --- a/core/test/mpi/base/CMakeLists.txt +++ b/core/test/mpi/base/CMakeLists.txt @@ -1,3 +1,7 @@ ginkgo_create_test(communicator MPI_SIZE 8) ginkgo_create_test(exception_helpers MPI_SIZE 2) ginkgo_create_test(bindings MPI_SIZE 4) +if(NOT (MSVC OR WIN32)) + # This test uses some UNIX function so it's disabled on Windows + ginkgo_create_test(rank_mapping MPI_SIZE 4) +endif() diff --git a/core/test/mpi/base/bindings.cpp b/core/test/mpi/base/bindings.cpp index ee34e8aa451..04b90441b5e 100644 --- a/core/test/mpi/base/bindings.cpp +++ b/core/test/mpi/base/bindings.cpp @@ -60,7 +60,7 @@ TYPED_TEST_SUITE(MpiBindings, gko::test::PODTypes, TypenameNameGenerator); TYPED_TEST(MpiBindings, CanSetADefaultwindow) { - gko::mpi::window win; + gko::experimental::mpi::window win; ASSERT_EQ(win.get_window(), MPI_WIN_NULL); } @@ -68,10 +68,10 @@ TYPED_TEST(MpiBindings, CanSetADefaultwindow) TYPED_TEST(MpiBindings, CanCreatewindow) { auto data = std::vector{1, 2, 3, 4}; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto win = - gko::mpi::window(data.data(), 4 * sizeof(TypeParam), comm); + auto win = gko::experimental::mpi::window( + this->ref, data.data(), 4 * sizeof(TypeParam), comm); ASSERT_NE(win.get_window(), MPI_WIN_NULL); win.lock_all(); @@ -81,7 +81,7 @@ TYPED_TEST(MpiBindings, CanCreatewindow) TYPED_TEST(MpiBindings, CanSendAndRecvValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = 
gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto recv_array = gko::array{this->ref}; @@ -90,12 +90,12 @@ TYPED_TEST(MpiBindings, CanSendAndRecvValues) auto send_array = std::vector{1, 2, 3, 4}; for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - comm.send(send_array.data(), 4, rank, 40 + rank); + comm.send(this->ref, send_array.data(), 4, rank, 40 + rank); } } } else { recv_array = gko::array{this->ref, 4}; - comm.recv(recv_array.get_data(), 4, 0, 40 + my_rank); + comm.recv(this->ref, recv_array.get_data(), 4, 0, 40 + my_rank); } if (my_rank != 0) { @@ -107,25 +107,27 @@ TYPED_TEST(MpiBindings, CanSendAndRecvValues) TYPED_TEST(MpiBindings, CanNonBlockingSendAndNonBlockingRecvValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector send_array; auto recv_array = gko::array{this->ref}; TypeParam* data; - auto req1 = std::vector(num_ranks); - auto req2 = gko::mpi::request(); + auto req1 = std::vector(num_ranks); + auto req2 = gko::experimental::mpi::request(); if (my_rank == 0) { send_array = std::vector{1, 2, 3, 4}; for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - req1[rank] = comm.i_send(send_array.data(), 4, rank, 40 + rank); + req1[rank] = comm.i_send(this->ref, send_array.data(), 4, rank, + 40 + rank); } } } else { recv_array = gko::array{this->ref, 4}; - req2 = comm.i_recv(recv_array.get_data(), 4, 0, 40 + my_rank); + req2 = + comm.i_recv(this->ref, recv_array.get_data(), 4, 0, 40 + my_rank); } if (my_rank == 0) { @@ -142,8 +144,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingSendAndNonBlockingRecvValues) TYPED_TEST(MpiBindings, CanPutValuesWithLockAll) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = 
gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -154,12 +156,12 @@ TYPED_TEST(MpiBindings, CanPutValuesWithLockAll) } { - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank == 0) { win.lock_all(); for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - win.put(data.data(), 4, rank, 0, 4); + win.put(this->ref, data.data(), 4, rank, 0, 4); } } win.unlock_all(); @@ -173,8 +175,8 @@ TYPED_TEST(MpiBindings, CanPutValuesWithLockAll) TYPED_TEST(MpiBindings, CanNonBlockingPutValuesWithLockAll) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -185,13 +187,13 @@ TYPED_TEST(MpiBindings, CanNonBlockingPutValuesWithLockAll) } { - gko::mpi::request req; - auto win = window(data.data(), 4, comm); + gko::experimental::mpi::request req; + auto win = window(this->ref, data.data(), 4, comm); if (my_rank == 0) { win.lock_all(); for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - req = win.r_put(data.data(), 4, rank, 0, 4); + req = win.r_put(this->ref, data.data(), 4, rank, 0, 4); } } req.wait(); @@ -206,8 +208,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingPutValuesWithLockAll) TYPED_TEST(MpiBindings, CanPutValuesWithExclusiveLock) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -219,12 +221,12 @@ TYPED_TEST(MpiBindings, CanPutValuesWithExclusiveLock) } { - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, 
comm); if (my_rank == 0) { for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { win.lock(rank, window::lock_type::exclusive); - win.put(data.data(), 4, rank, 0, 4); + win.put(this->ref, data.data(), 4, rank, 0, 4); win.flush(0); win.unlock(rank); } @@ -239,8 +241,8 @@ TYPED_TEST(MpiBindings, CanPutValuesWithExclusiveLock) TYPED_TEST(MpiBindings, CanPutValuesWithSharedLock) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -252,12 +254,12 @@ TYPED_TEST(MpiBindings, CanPutValuesWithSharedLock) } { - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank == 0) { for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { win.lock(rank); - win.put(data.data(), 4, rank, 0, 4); + win.put(this->ref, data.data(), 4, rank, 0, 4); win.flush(0); win.unlock(rank); } @@ -272,8 +274,8 @@ TYPED_TEST(MpiBindings, CanPutValuesWithSharedLock) TYPED_TEST(MpiBindings, CanPutValuesWithFence) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -282,13 +284,13 @@ TYPED_TEST(MpiBindings, CanPutValuesWithFence) } else { data = std::vector{0, 0, 0, 0}; } - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); win.fence(); if (my_rank == 0) { for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - win.put(data.data(), 4, rank, 0, 4); + win.put(this->ref, data.data(), 4, rank, 0, 4); } } } @@ -301,8 +303,8 @@ TYPED_TEST(MpiBindings, CanPutValuesWithFence) TYPED_TEST(MpiBindings, 
CanAccumulateValues) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -317,12 +319,13 @@ TYPED_TEST(MpiBindings, CanAccumulateValues) } { - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank == 0) { win.lock_all(); for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - win.accumulate(data.data(), 4, rank, 0, 4, MPI_SUM); + win.accumulate(this->ref, data.data(), 4, rank, 0, 4, + MPI_SUM); } } win.unlock_all(); @@ -348,8 +351,8 @@ TYPED_TEST(MpiBindings, CanAccumulateValues) TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -363,14 +366,15 @@ TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues) data = std::vector{0, 0, 0, 0}; } - gko::mpi::request req; + gko::experimental::mpi::request req; { - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank == 0) { win.lock_all(); for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - req = win.r_accumulate(data.data(), 4, rank, 0, 4, MPI_SUM); + req = win.r_accumulate(this->ref, data.data(), 4, rank, 0, + 4, MPI_SUM); } } win.unlock_all(); @@ -397,8 +401,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues) TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = 
gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -407,11 +411,11 @@ TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) } else { data = std::vector{0, 0, 0, 0}; } - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank != 0) { win.lock_all(); - win.get(data.data(), 4, 0, 0, 4); + win.get(this->ref, data.data(), 4, 0, 0, 4); win.unlock_all(); } @@ -422,8 +426,8 @@ TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -432,12 +436,12 @@ TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) } else { data = std::vector{0, 0, 0, 0}; } - gko::mpi::request req; - auto win = window(data.data(), 4, comm); + gko::experimental::mpi::request req; + auto win = window(this->ref, data.data(), 4, comm); if (my_rank != 0) { win.lock_all(); - req = win.r_get(data.data(), 4, 0, 0, 4); + req = win.r_get(this->ref, data.data(), 4, 0, 0, 4); win.unlock_all(); } @@ -449,8 +453,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -459,11 +463,11 @@ TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) } else { data = std::vector{0, 0, 0, 0}; } - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank != 0) { 
win.lock(0, window::lock_type::exclusive); - win.get(data.data(), 4, 0, 0, 4); + win.get(this->ref, data.data(), 4, 0, 0, 4); win.unlock(0); } @@ -474,8 +478,8 @@ TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -484,11 +488,11 @@ TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) } else { data = std::vector{0, 0, 0, 0}; } - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank != 0) { win.lock(0); - win.get(data.data(), 4, 0, 0, 4); + win.get(this->ref, data.data(), 4, 0, 0, 4); win.unlock(0); } @@ -499,8 +503,8 @@ TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) TYPED_TEST(MpiBindings, CanGetValuesWithFence) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -509,11 +513,11 @@ TYPED_TEST(MpiBindings, CanGetValuesWithFence) } else { data = std::vector{0, 0, 0, 0}; } - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); win.fence(); if (my_rank != 0) { - win.get(data.data(), 4, 0, 0, 4); + win.get(this->ref, data.data(), 4, 0, 0, 4); } win.fence(); @@ -524,8 +528,8 @@ TYPED_TEST(MpiBindings, CanGetValuesWithFence) TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = 
comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -546,12 +550,12 @@ TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll) } { - auto win = window(target.data(), 4, comm); + auto win = window(this->ref, target.data(), 4, comm); if (my_rank == 2) { win.lock_all(); - win.get_accumulate(data.data(), 4, result.data(), 4, 0, 0, 4, - MPI_SUM); + win.get_accumulate(this->ref, data.data(), 4, result.data(), 4, 0, + 0, 4, MPI_SUM); win.unlock_all(); } } @@ -570,8 +574,8 @@ TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll) TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -590,15 +594,15 @@ TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll) data = std::vector{0, 0, 0, 0}; target = std::vector{0, 0, 0, 0}; } - gko::mpi::request req; + gko::experimental::mpi::request req; { - auto win = window(target.data(), 4, comm); + auto win = window(this->ref, target.data(), 4, comm); if (my_rank == 2) { win.lock_all(); - req = win.r_get_accumulate(data.data(), 4, result.data(), 4, 0, 0, - 4, MPI_SUM); + req = win.r_get_accumulate(this->ref, data.data(), 4, result.data(), + 4, 0, 0, 4, MPI_SUM); win.unlock_all(); } } @@ -623,8 +627,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll) TYPED_TEST(MpiBindings, CanFetchAndOperate) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -645,11 +649,12 @@ TYPED_TEST(MpiBindings, CanFetchAndOperate) } { - auto win = window(target.data(), 4, comm); + 
auto win = window(this->ref, target.data(), 4, comm); if (my_rank == 2) { win.lock_all(); - win.fetch_and_op(data.data(), result.data(), 0, 1, MPI_SUM); + win.fetch_and_op(this->ref, data.data(), result.data(), 0, 1, + MPI_SUM); win.unlock_all(); } } @@ -668,7 +673,7 @@ TYPED_TEST(MpiBindings, CanFetchAndOperate) TYPED_TEST(MpiBindings, CanBroadcastValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto array = gko::array{this->ref, 8}; @@ -676,7 +681,7 @@ TYPED_TEST(MpiBindings, CanBroadcastValues) array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); } - comm.broadcast(array.get_data(), 8, 0); + comm.broadcast(this->ref, array.get_data(), 8, 0); auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); GKO_ASSERT_ARRAY_EQ(ref, array); @@ -685,7 +690,7 @@ TYPED_TEST(MpiBindings, CanBroadcastValues) TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto array = gko::array{this->ref, 8}; @@ -693,7 +698,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues) array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); } - auto req = comm.i_broadcast(array.get_data(), 8, 0); + auto req = comm.i_broadcast(this->ref, array.get_data(), 8, 0); req.wait(); auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); @@ -703,7 +708,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues) TYPED_TEST(MpiBindings, CanReduceValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data, sum, max, min; @@ -717,9 +722,9 @@ TYPED_TEST(MpiBindings, CanReduceValues) data = 6; } - comm.reduce(&data, &sum, 1, 
MPI_SUM, 0); - comm.reduce(&data, &max, 1, MPI_MAX, 0); - comm.reduce(&data, &min, 1, MPI_MIN, 0); + comm.reduce(this->ref, &data, &sum, 1, MPI_SUM, 0); + comm.reduce(this->ref, &data, &max, 1, MPI_MAX, 0); + comm.reduce(this->ref, &data, &min, 1, MPI_MIN, 0); if (my_rank == 0) { EXPECT_EQ(sum, TypeParam{16}); @@ -731,7 +736,7 @@ TYPED_TEST(MpiBindings, CanReduceValues) TYPED_TEST(MpiBindings, CanNonBlockingReduceValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data, sum, max, min; @@ -745,9 +750,9 @@ TYPED_TEST(MpiBindings, CanNonBlockingReduceValues) data = 6; } - auto req1 = comm.i_reduce(&data, &sum, 1, MPI_SUM, 0); - auto req2 = comm.i_reduce(&data, &max, 1, MPI_MAX, 0); - auto req3 = comm.i_reduce(&data, &min, 1, MPI_MIN, 0); + auto req1 = comm.i_reduce(this->ref, &data, &sum, 1, MPI_SUM, 0); + auto req2 = comm.i_reduce(this->ref, &data, &max, 1, MPI_MAX, 0); + auto req3 = comm.i_reduce(this->ref, &data, &min, 1, MPI_MIN, 0); req1.wait(); req2.wait(); @@ -762,7 +767,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingReduceValues) TYPED_TEST(MpiBindings, CanAllReduceValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data, sum; @@ -776,7 +781,7 @@ TYPED_TEST(MpiBindings, CanAllReduceValues) data = 6; } - comm.all_reduce(&data, &sum, 1, MPI_SUM); + comm.all_reduce(this->ref, &data, &sum, 1, MPI_SUM); ASSERT_EQ(sum, TypeParam{16}); } @@ -784,7 +789,7 @@ TYPED_TEST(MpiBindings, CanAllReduceValues) TYPED_TEST(MpiBindings, CanAllReduceValuesInPlace) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data; @@ -798,7 +803,7 @@ 
TYPED_TEST(MpiBindings, CanAllReduceValuesInPlace) data = 6; } - comm.all_reduce(&data, 1, MPI_SUM); + comm.all_reduce(this->ref, &data, 1, MPI_SUM); ASSERT_EQ(data, TypeParam{16}); } @@ -806,7 +811,7 @@ TYPED_TEST(MpiBindings, CanAllReduceValuesInPlace) TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data, sum; @@ -820,7 +825,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValues) data = 6; } - auto req = comm.i_all_reduce(&data, &sum, 1, MPI_SUM); + auto req = comm.i_all_reduce(this->ref, &data, &sum, 1, MPI_SUM); req.wait(); ASSERT_EQ(sum, TypeParam{16}); @@ -829,7 +834,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValues) TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValuesInPlace) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data; @@ -843,7 +848,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValuesInPlace) data = 6; } - auto req = comm.i_all_reduce(&data, 1, MPI_SUM); + auto req = comm.i_all_reduce(this->ref, &data, 1, MPI_SUM); req.wait(); ASSERT_EQ(data, TypeParam{16}); @@ -852,7 +857,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValuesInPlace) TYPED_TEST(MpiBindings, CanGatherValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data; @@ -868,7 +873,7 @@ TYPED_TEST(MpiBindings, CanGatherValues) auto gather_array = gko::array{ this->ref, static_cast(num_ranks)}; - comm.gather(&data, 1, gather_array.get_data(), 1, 0); + comm.gather(this->ref, &data, 1, gather_array.get_data(), 1, 0); if (my_rank == 0) { auto ref = gko::array(this->ref, {3, 5, 2, 6}); @@ 
-879,7 +884,7 @@ TYPED_TEST(MpiBindings, CanGatherValues) TYPED_TEST(MpiBindings, CanNonBlockingGatherValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data; @@ -895,7 +900,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingGatherValues) auto gather_array = gko::array{ this->ref, static_cast(num_ranks)}; - auto req = comm.i_gather(&data, 1, gather_array.get_data(), 1, 0); + auto req = + comm.i_gather(this->ref, &data, 1, gather_array.get_data(), 1, 0); req.wait(); if (my_rank == 0) { @@ -907,7 +913,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingGatherValues) TYPED_TEST(MpiBindings, CanAllGatherValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data; @@ -923,7 +929,7 @@ TYPED_TEST(MpiBindings, CanAllGatherValues) auto gather_array = gko::array{ this->ref, static_cast(num_ranks)}; - comm.all_gather(&data, 1, gather_array.get_data(), 1); + comm.all_gather(this->ref, &data, 1, gather_array.get_data(), 1); auto ref = gko::array(this->ref, {3, 5, 2, 6}); GKO_ASSERT_ARRAY_EQ(ref, gather_array); @@ -932,7 +938,7 @@ TYPED_TEST(MpiBindings, CanAllGatherValues) TYPED_TEST(MpiBindings, CanNonBlockingAllGatherValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data; @@ -948,7 +954,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllGatherValues) auto gather_array = gko::array{ this->ref, static_cast(num_ranks)}; - auto req = comm.i_all_gather(&data, 1, gather_array.get_data(), 1); + auto req = + comm.i_all_gather(this->ref, &data, 1, gather_array.get_data(), 1); req.wait(); auto ref = gko::array(this->ref, {3, 5, 2, 6}); @@ -958,7 +965,7 @@ 
TYPED_TEST(MpiBindings, CanNonBlockingAllGatherValues) TYPED_TEST(MpiBindings, CanGatherValuesWithDisplacements) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto gather_from_array = gko::array{this->ref}; @@ -983,8 +990,8 @@ TYPED_TEST(MpiBindings, CanGatherValuesWithDisplacements) gather_from_array = gko::array{this->ref, {1, -4, 5}}; } - comm.gather(&nelems, 1, r_counts.get_data(), 1, 0); - comm.gather_v(gather_from_array.get_data(), nelems, + comm.gather(this->ref, &nelems, 1, r_counts.get_data(), 1, 0); + comm.gather_v(this->ref, gather_from_array.get_data(), nelems, gather_into_array.get_data(), r_counts.get_data(), displacements.get_data(), 0); @@ -1001,7 +1008,7 @@ TYPED_TEST(MpiBindings, CanGatherValuesWithDisplacements) TYPED_TEST(MpiBindings, CanNonBlockingGatherValuesWithDisplacements) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto gather_from_array = gko::array{this->ref}; @@ -1026,10 +1033,11 @@ TYPED_TEST(MpiBindings, CanNonBlockingGatherValuesWithDisplacements) gather_from_array = gko::array{this->ref, {1, -4, 5}}; } - comm.gather(&nelems, 1, r_counts.get_data(), 1, 0); - auto req = comm.i_gather_v( - gather_from_array.get_data(), nelems, gather_into_array.get_data(), - r_counts.get_data(), displacements.get_data(), 0); + comm.gather(this->ref, &nelems, 1, r_counts.get_data(), 1, 0); + auto req = + comm.i_gather_v(this->ref, gather_from_array.get_data(), nelems, + gather_into_array.get_data(), r_counts.get_data(), + displacements.get_data(), 0); req.wait(); auto comp_data = gather_into_array.get_data(); @@ -1045,7 +1053,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingGatherValuesWithDisplacements) TYPED_TEST(MpiBindings, CanScatterValues) { - auto comm = 
gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto scatter_from_array = gko::array{this->ref}; @@ -1055,7 +1063,7 @@ TYPED_TEST(MpiBindings, CanScatterValues) } auto scatter_into_array = gko::array{this->ref, 2}; - comm.scatter(scatter_from_array.get_data(), 2, + comm.scatter(this->ref, scatter_from_array.get_data(), 2, scatter_into_array.get_data(), 2, 0); auto comp_data = scatter_into_array.get_data(); @@ -1078,7 +1086,7 @@ TYPED_TEST(MpiBindings, CanScatterValues) TYPED_TEST(MpiBindings, CanNonBlockingScatterValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto scatter_from_array = gko::array{this->ref}; @@ -1088,7 +1096,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingScatterValues) } auto scatter_into_array = gko::array{this->ref, 2}; - auto req = comm.i_scatter(scatter_from_array.get_data(), 2, + auto req = comm.i_scatter(this->ref, scatter_from_array.get_data(), 2, scatter_into_array.get_data(), 2, 0); req.wait(); @@ -1112,7 +1120,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingScatterValues) TYPED_TEST(MpiBindings, CanScatterValuesWithDisplacements) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto scatter_from_array = gko::array{this->ref}; @@ -1136,10 +1144,10 @@ TYPED_TEST(MpiBindings, CanScatterValuesWithDisplacements) scatter_into_array = gko::array{this->ref, static_cast(nelems)}; - comm.gather(&nelems, 1, s_counts.get_data(), 1, 0); - comm.scatter_v(scatter_from_array.get_data(), s_counts.get_data(), - displacements.get_data(), scatter_into_array.get_data(), - nelems, 0); + comm.gather(this->ref, &nelems, 1, s_counts.get_data(), 1, 0); + comm.scatter_v(this->ref, 
scatter_from_array.get_data(), + s_counts.get_data(), displacements.get_data(), + scatter_into_array.get_data(), nelems, 0); auto comp_data = scatter_into_array.get_data(); if (my_rank == 0) { @@ -1163,7 +1171,7 @@ TYPED_TEST(MpiBindings, CanScatterValuesWithDisplacements) TYPED_TEST(MpiBindings, CanNonBlockingScatterValuesWithDisplacements) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto scatter_from_array = gko::array{this->ref}; @@ -1187,8 +1195,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingScatterValuesWithDisplacements) scatter_into_array = gko::array{this->ref, static_cast(nelems)}; - comm.gather(&nelems, 1, s_counts.get_data(), 1, 0); - auto req = comm.i_scatter_v(scatter_from_array.get_data(), + comm.gather(this->ref, &nelems, 1, s_counts.get_data(), 1, 0); + auto req = comm.i_scatter_v(this->ref, scatter_from_array.get_data(), s_counts.get_data(), displacements.get_data(), scatter_into_array.get_data(), nelems, 0); @@ -1215,7 +1223,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingScatterValuesWithDisplacements) TYPED_TEST(MpiBindings, AllToAllWorksCorrectly) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto send_array = gko::array{this->ref}; @@ -1236,7 +1244,8 @@ TYPED_TEST(MpiBindings, AllToAllWorksCorrectly) ref_array = gko::array(this->ref, {2, 2, 0, -2}); } - comm.all_to_all(send_array.get_data(), 1, recv_array.get_data(), 1); + comm.all_to_all(this->ref, send_array.get_data(), 1, recv_array.get_data(), + 1); GKO_ASSERT_ARRAY_EQ(recv_array, ref_array); } @@ -1244,7 +1253,7 @@ TYPED_TEST(MpiBindings, AllToAllWorksCorrectly) TYPED_TEST(MpiBindings, NonBlockingAllToAllWorksCorrectly) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = 
gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto send_array = gko::array{this->ref}; @@ -1265,8 +1274,8 @@ TYPED_TEST(MpiBindings, NonBlockingAllToAllWorksCorrectly) ref_array = gko::array(this->ref, {2, 2, 0, -2}); } - auto req = - comm.i_all_to_all(send_array.get_data(), 1, recv_array.get_data(), 1); + auto req = comm.i_all_to_all(this->ref, send_array.get_data(), 1, + recv_array.get_data(), 1); req.wait(); GKO_ASSERT_ARRAY_EQ(recv_array, ref_array); @@ -1275,7 +1284,7 @@ TYPED_TEST(MpiBindings, NonBlockingAllToAllWorksCorrectly) TYPED_TEST(MpiBindings, AllToAllInPlaceWorksCorrectly) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto recv_array = gko::array{this->ref}; @@ -1295,14 +1304,14 @@ TYPED_TEST(MpiBindings, AllToAllInPlaceWorksCorrectly) ref_array = gko::array(this->ref, {2, 2, 0, -2}); } - comm.all_to_all(recv_array.get_data(), 1); + comm.all_to_all(this->ref, recv_array.get_data(), 1); GKO_ASSERT_ARRAY_EQ(recv_array, ref_array); } TYPED_TEST(MpiBindings, NonBlockingAllToAllInPlaceWorksCorrectly) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto recv_array = gko::array{this->ref}; @@ -1322,7 +1331,7 @@ TYPED_TEST(MpiBindings, NonBlockingAllToAllInPlaceWorksCorrectly) ref_array = gko::array(this->ref, {2, 2, 0, -2}); } - auto req = comm.i_all_to_all(recv_array.get_data(), 1); + auto req = comm.i_all_to_all(this->ref, recv_array.get_data(), 1); req.wait(); GKO_ASSERT_ARRAY_EQ(recv_array, ref_array); @@ -1331,7 +1340,7 @@ TYPED_TEST(MpiBindings, NonBlockingAllToAllInPlaceWorksCorrectly) TYPED_TEST(MpiBindings, AllToAllVWorksCorrectly) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = 
gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto send_array = gko::array{this->ref}; @@ -1375,16 +1384,17 @@ TYPED_TEST(MpiBindings, AllToAllVWorksCorrectly) ref_array = gko::array{this->ref, {0, 2, 3, 3}}; } - comm.all_to_all_v(send_array.get_data(), scounts_array.get_data(), - soffset_array.get_data(), recv_array.get_data(), - rcounts_array.get_data(), roffset_array.get_data()); + comm.all_to_all_v(this->ref, send_array.get_data(), + scounts_array.get_data(), soffset_array.get_data(), + recv_array.get_data(), rcounts_array.get_data(), + roffset_array.get_data()); GKO_ASSERT_ARRAY_EQ(recv_array, ref_array); } TYPED_TEST(MpiBindings, NonBlockingAllToAllVWorksCorrectly) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto send_array = gko::array{this->ref}; @@ -1428,10 +1438,10 @@ TYPED_TEST(MpiBindings, NonBlockingAllToAllVWorksCorrectly) ref_array = gko::array{this->ref, {0, 2, 3, 3}}; } - auto req = - comm.i_all_to_all_v(send_array.get_data(), scounts_array.get_data(), - soffset_array.get_data(), recv_array.get_data(), - rcounts_array.get_data(), roffset_array.get_data()); + auto req = comm.i_all_to_all_v( + this->ref, send_array.get_data(), scounts_array.get_data(), + soffset_array.get_data(), recv_array.get_data(), + rcounts_array.get_data(), roffset_array.get_data()); req.wait(); GKO_ASSERT_ARRAY_EQ(recv_array, ref_array); @@ -1440,7 +1450,7 @@ TYPED_TEST(MpiBindings, NonBlockingAllToAllVWorksCorrectly) TYPED_TEST(MpiBindings, CanScanValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data, sum, max, min; @@ -1454,9 +1464,9 @@ TYPED_TEST(MpiBindings, CanScanValues) data = 6; } - comm.scan(&data, &sum, 1, MPI_SUM); - 
comm.scan(&data, &max, 1, MPI_MAX); - comm.scan(&data, &min, 1, MPI_MIN); + comm.scan(this->ref, &data, &sum, 1, MPI_SUM); + comm.scan(this->ref, &data, &max, 1, MPI_MAX); + comm.scan(this->ref, &data, &min, 1, MPI_MIN); if (my_rank == 0) { EXPECT_EQ(sum, TypeParam{3}); @@ -1480,7 +1490,7 @@ TYPED_TEST(MpiBindings, CanScanValues) TYPED_TEST(MpiBindings, CanNonBlockingScanValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data, sum, max, min; @@ -1494,9 +1504,9 @@ TYPED_TEST(MpiBindings, CanNonBlockingScanValues) data = 6; } - auto req1 = comm.i_scan(&data, &sum, 1, MPI_SUM); - auto req2 = comm.i_scan(&data, &max, 1, MPI_MAX); - auto req3 = comm.i_scan(&data, &min, 1, MPI_MIN); + auto req1 = comm.i_scan(this->ref, &data, &sum, 1, MPI_SUM); + auto req2 = comm.i_scan(this->ref, &data, &max, 1, MPI_MAX); + auto req3 = comm.i_scan(this->ref, &data, &min, 1, MPI_MIN); req1.wait(); req2.wait(); diff --git a/core/test/mpi/base/communicator.cpp b/core/test/mpi/base/communicator.cpp index d335b84ce09..2121ef41503 100644 --- a/core/test/mpi/base/communicator.cpp +++ b/core/test/mpi/base/communicator.cpp @@ -53,7 +53,7 @@ class Communicator : public ::testing::Test { ASSERT_EQ(comm.size(), 8); } - gko::mpi::communicator comm; + gko::experimental::mpi::communicator comm; int rank; }; @@ -88,7 +88,7 @@ TEST_F(Communicator, CommKnowsItsLocalRank) TEST_F(Communicator, CommunicatorCanBeCopyConstructed) { - gko::mpi::communicator copy(comm); + gko::experimental::mpi::communicator copy(comm); EXPECT_TRUE(copy == comm); } @@ -96,7 +96,7 @@ TEST_F(Communicator, CommunicatorCanBeCopyConstructed) TEST_F(Communicator, CommunicatorCanBeCopyAssigned) { - gko::mpi::communicator copy = comm; + gko::experimental::mpi::communicator copy = comm; EXPECT_TRUE(copy == comm); } @@ -104,8 +104,8 @@ TEST_F(Communicator, CommunicatorCanBeCopyAssigned) 
TEST_F(Communicator, CommunicatorCanBeMoveConstructed) { - gko::mpi::communicator comm2(MPI_COMM_WORLD); - gko::mpi::communicator copy(std::move(comm2)); + gko::experimental::mpi::communicator comm2(MPI_COMM_WORLD); + gko::experimental::mpi::communicator copy(std::move(comm2)); EXPECT_TRUE(copy == comm); } @@ -113,8 +113,8 @@ TEST_F(Communicator, CommunicatorCanBeMoveConstructed) TEST_F(Communicator, CommunicatorCanBeMoveAssigned) { - gko::mpi::communicator comm2(MPI_COMM_WORLD); - gko::mpi::communicator copy(MPI_COMM_NULL); + gko::experimental::mpi::communicator comm2(MPI_COMM_WORLD); + gko::experimental::mpi::communicator copy(MPI_COMM_NULL); copy = std::move(comm2); EXPECT_TRUE(copy == comm); @@ -133,7 +133,8 @@ TEST_F(Communicator, CanSetCustomCommunicator) auto world_size = comm.size(); auto color = world_rank / 4; - auto row_comm = gko::mpi::communicator(comm.get(), color, world_rank); + auto row_comm = + gko::experimental::mpi::communicator(comm.get(), color, world_rank); for (auto i = 0; i < world_size; ++i) { EXPECT_LT(row_comm.rank(), 4); } diff --git a/core/test/mpi/base/rank_mapping.cpp b/core/test/mpi/base/rank_mapping.cpp new file mode 100644 index 00000000000..ab7459de66b --- /dev/null +++ b/core/test/mpi/base/rank_mapping.cpp @@ -0,0 +1,133 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +class MapRankToDevice : public ::testing::Test { +protected: + MapRankToDevice() + : comm(MPI_COMM_WORLD), + rank(gko::experimental::mpi::communicator(comm).rank()), + size(gko::experimental::mpi::communicator(comm).size()), + env({{"MV2_COMM_WORLD_LOCAL_RANK", ""}, + {"OMPI_COMM_WORLD_LOCAL_RANK", ""}, + {"MPI_LOCALRANKID", ""}, + {"SLURM_LOCALID", ""}}) + {} + + void SetUp() override + { + for (auto& it : env) { + const auto& env_name = it.first; + if (auto v = std::getenv(env_name.c_str())) { + env[env_name] = std::string(v); + } + unsetenv(env_name.c_str()); + } + } + + void TearDown() override + { + for (auto& it : env) { + const auto& env_name = it.first; + const auto& env_value = it.second; + setenv(env_name.c_str(), env_value.c_str(), 1); + } + } + + MPI_Comm comm; + int rank; + int size; + std::map env; +}; + + +TEST_F(MapRankToDevice, OneDevice) +{ + 
ASSERT_EQ(gko::experimental::mpi::map_rank_to_device_id(comm, 1), 0); +} + + +TEST_F(MapRankToDevice, EqualDevicesAndRanks) +{ + auto id = gko::experimental::mpi::map_rank_to_device_id(comm, size); + + ASSERT_EQ(id, rank); +} + + +TEST_F(MapRankToDevice, LessDevicesThanRanks) +{ + int target_id[] = {0, 1, 2, 0}; + + auto id = gko::experimental::mpi::map_rank_to_device_id(comm, 3); + + ASSERT_EQ(id, target_id[rank]); +} + + +TEST_F(MapRankToDevice, UsesRankFromEnvironment) +{ + int reordered_rank[] = {2, 3, 1, 0}; + for (const auto& it : env) { + SCOPED_TRACE("Using environment variable " + it.first); + setenv(it.first.c_str(), std::to_string(reordered_rank[rank]).c_str(), + 1); + + auto id = gko::experimental::mpi::map_rank_to_device_id(comm, size); + + ASSERT_EQ(id, reordered_rank[rank]); + unsetenv(it.first.c_str()); + } +} + + +TEST_F(MapRankToDevice, NonCommWorld) +{ + MPI_Comm split; + MPI_Comm_split(comm, static_cast(rank < 3), rank, &split); + int target_id[] = {0, 1, 0, 0}; + + auto id = gko::experimental::mpi::map_rank_to_device_id(split, 2); + + ASSERT_EQ(id, target_id[rank]); +} diff --git a/core/test/mpi/distributed/CMakeLists.txt b/core/test/mpi/distributed/CMakeLists.txt new file mode 100644 index 00000000000..ec4f4dc5954 --- /dev/null +++ b/core/test/mpi/distributed/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_create_test(matrix MPI_SIZE 3) diff --git a/core/test/mpi/distributed/matrix.cpp b/core/test/mpi/distributed/matrix.cpp new file mode 100644 index 00000000000..98167bd5d1f --- /dev/null +++ b/core/test/mpi/distributed/matrix.cpp @@ -0,0 +1,282 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +using comm_index_type = gko::experimental::distributed::comm_index_type; + + +template +class CustomLinOp + : public gko::EnableLinOp>, + public gko::ReadableFromMatrixData, + public gko::EnableCreateMethod> { +public: + void read(const gko::matrix_data& data) override {} + + explicit CustomLinOp(std::shared_ptr exec) + : gko::EnableLinOp(exec) + {} + +protected: + void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override {} + + void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, + const gko::LinOp* beta, gko::LinOp* x) const override + {} +}; + + +template +class MatrixBuilder : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype( + ValueLocalGlobalIndexType())>::type; + using local_index_type = + typename std::tuple_element<1, decltype( + ValueLocalGlobalIndexType())>::type; + using global_index_type = + typename std::tuple_element<2, decltype( + ValueLocalGlobalIndexType())>::type; + using dist_mtx_type = + gko::experimental::distributed::Matrix; + using dist_vec_type = gko::experimental::distributed::Vector; + + MatrixBuilder() + : ref(gko::ReferenceExecutor::create()), + comm(gko::experimental::mpi::communicator(MPI_COMM_WORLD)) + {} + + void SetUp() override {} + + template + void forall_matrix_types(F&& f) + { + using namespace gko::matrix; + auto empty_test = [](const gko::LinOp*) {}; + { + SCOPED_TRACE("With Coo"); + f(gko::with_matrix_type(), + Coo::create(this->ref), empty_test); + } + { + SCOPED_TRACE("With Csr"); + f(gko::with_matrix_type(), + Csr::create(this->ref), empty_test); + } + { + SCOPED_TRACE("With Csr with strategy"); + using ConcreteCsr = Csr; + f(gko::with_matrix_type( + std::make_shared()), + ConcreteCsr::create(this->ref), 
[](const gko::LinOp* local_mat) { + auto local_csr = gko::as(local_mat); + + ASSERT_NO_THROW(gko::as( + local_csr->get_strategy())); + }); + } + { + SCOPED_TRACE("With Ell"); + f(gko::with_matrix_type(), + Ell::create(this->ref), empty_test); + } + { + SCOPED_TRACE("With Fbcsr"); + f(gko::with_matrix_type(), + Fbcsr::create(this->ref), + empty_test); + } + { + SCOPED_TRACE("With Fbcsr with block_size"); + f(gko::with_matrix_type(5), + Fbcsr::create(this->ref), + [](const gko::LinOp* local_mat) { + auto local_fbcsr = + gko::as>(local_mat); + + ASSERT_EQ(local_fbcsr->get_block_size(), 5); + }); + } + { + SCOPED_TRACE("With Hybrid"); + f(gko::with_matrix_type(), + Hybrid::create(this->ref), + empty_test); + } + { + SCOPED_TRACE("With Hybrid with strategy"); + using Concrete = Hybrid; + f(gko::with_matrix_type( + std::make_shared(11)), + Concrete::create(this->ref), [](const gko::LinOp* local_mat) { + auto local_hy = gko::as(local_mat); + + ASSERT_NO_THROW(gko::as( + local_hy->get_strategy())); + ASSERT_EQ(gko::as( + local_hy->get_strategy()) + ->get_num_columns(), + 11); + }); + } + { + SCOPED_TRACE("With Sellp"); + f(gko::with_matrix_type(), + Sellp::create(this->ref), + empty_test); + } + } + + template + void expected_interface_no_throw(dist_mtx_type* mat, + LocalMatrixType local_matrix_type, + NonLocalMatrixType non_local_matrix_type) + { + auto num_rows = mat->get_size()[0]; + auto a = dist_vec_type::create(ref, comm); + auto b = dist_vec_type::create(ref, comm); + auto convert_result = dist_mtx_type::create( + ref, comm, local_matrix_type, non_local_matrix_type); + auto move_result = dist_mtx_type::create(ref, comm, local_matrix_type, + non_local_matrix_type); + + ASSERT_NO_THROW(mat->apply(a.get(), b.get())); + ASSERT_NO_THROW(mat->convert_to(convert_result.get())); + ASSERT_NO_THROW(mat->move_to(move_result.get())); + } + + + std::shared_ptr ref; + gko::experimental::mpi::communicator comm; +}; + +TYPED_TEST_SUITE(MatrixBuilder, 
gko::test::ValueLocalGlobalIndexTypes); + + +TYPED_TEST(MatrixBuilder, BuildWithLocal) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::local_index_type; + using dist_mat_type = typename TestFixture::dist_mtx_type; + this->template forall_matrix_types([this](auto with_matrix_type, + auto expected_type_ptr, + auto additional_test) { + using expected_type = typename std::remove_pointer::type; + + auto mat = + dist_mat_type ::create(this->ref, this->comm, with_matrix_type); + + ASSERT_NO_THROW(gko::as(mat->get_local_matrix())); + additional_test(mat->get_local_matrix().get()); + additional_test(mat->get_non_local_matrix().get()); + this->expected_interface_no_throw(mat.get(), with_matrix_type, + with_matrix_type); + }); +} + + +TYPED_TEST(MatrixBuilder, BuildWithLocalAndNonLocal) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::local_index_type; + using dist_mat_type = typename TestFixture::dist_mtx_type; + this->template forall_matrix_types([this](auto with_local_matrix_type, + auto expected_local_type_ptr, + auto additional_local_test) { + using expected_local_type = typename std::remove_pointer::type; + this->forall_matrix_types([=](auto with_non_local_matrix_type, + auto expected_non_local_type_ptr, + auto additional_non_local_test) { + using expected_non_local_type = + typename std::remove_pointer::type; + + auto mat = dist_mat_type ::create(this->ref, this->comm, + with_local_matrix_type, + with_non_local_matrix_type); + + ASSERT_NO_THROW( + gko::as(mat->get_local_matrix())); + ASSERT_NO_THROW( + gko::as(mat->get_non_local_matrix())); + additional_local_test(mat->get_local_matrix().get()); + additional_non_local_test(mat->get_non_local_matrix().get()); + this->expected_interface_no_throw(mat.get(), with_local_matrix_type, + with_non_local_matrix_type); + }); + }); +} + + +TYPED_TEST(MatrixBuilder, BuildWithCustomLinOp) +{ + using value_type = typename 
TestFixture::value_type; + using index_type = typename TestFixture::local_index_type; + using dist_mat_type = typename TestFixture::dist_mtx_type; + using custom_type = CustomLinOp; + + auto mat = dist_mat_type::create(this->ref, this->comm, + gko::with_matrix_type()); + + ASSERT_NO_THROW(gko::as(mat->get_local_matrix())); + this->expected_interface_no_throw(mat.get(), + gko::with_matrix_type(), + gko::with_matrix_type()); +} + + +} // namespace diff --git a/core/test/mpi/gtest/mpi_listener.cpp b/core/test/mpi/gtest/mpi_listener.cpp index d74a77040aa..f26f8d5d60b 100644 --- a/core/test/mpi/gtest/mpi_listener.cpp +++ b/core/test/mpi/gtest/mpi_listener.cpp @@ -40,12 +40,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include -#include #include #include #include +#include + + #include @@ -383,5 +385,5 @@ int main(int argc, char** argv) listeners.Append( new GTestMPIListener::MPIWrapperPrinter(l, MPI_COMM_WORLD)); int result = RUN_ALL_TESTS(); - return 0; + return result; } diff --git a/core/test/stop/criterion.cpp b/core/test/stop/criterion.cpp index 61b5439fe5a..c9776df7da2 100644 --- a/core/test/stop/criterion.cpp +++ b/core/test/stop/criterion.cpp @@ -73,8 +73,7 @@ struct DummyLogger : public gko::log::Logger { class DummyCriterion : public gko::EnablePolymorphicObject { - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: explicit DummyCriterion(std::shared_ptr exec) diff --git a/core/test/utils.hpp b/core/test/utils.hpp index c99b323108f..361aaa74e72 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -185,6 +185,30 @@ using TwoValueIndexType = #endif +using ValueLocalGlobalIndexTypes = +#if GINKGO_DPCPP_SINGLE_MODE + ::testing::Types, + std::tuple, + std::tuple, + std::tuple, gko::int32, int32>, + std::tuple, gko::int32, int64>, + std::tuple, gko::int64, int64>>; +#else + ::testing::Types, + 
std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, gko::int32, int32>, + std::tuple, gko::int32, int64>, + std::tuple, gko::int64, int64>, + std::tuple, gko::int32, int32>, + std::tuple, gko::int32, int64>, + std::tuple, gko::int64, int64>>; +#endif + + template struct reduction_factor { using nc_output = remove_complex; diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp index cc03ade81c9..f4e6e4e26dc 100644 --- a/core/test/utils/matrix_generator.hpp +++ b/core/test/utils/matrix_generator.hpp @@ -132,6 +132,30 @@ matrix_data generate_random_matrix_data( } +/** + * Generates device matrix data for a random matrix. + * + * @see generate_random_matrix_data + */ +template +gko::device_matrix_data +generate_random_device_matrix_data(gko::size_type num_rows, + gko::size_type num_cols, + NonzeroDistribution&& nonzero_dist, + ValueDistribution&& value_dist, + Engine&& engine, + std::shared_ptr exec) +{ + auto md = gko::test::generate_random_matrix_data( + num_rows, num_cols, std::forward(nonzero_dist), + std::forward(value_dist), + std::forward(engine)); + return gko::device_matrix_data::create_from_host(exec, + md); +} + + /** * Generates a random matrix. 
* diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 11a91936679..ac4e5c5bb8c 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -5,9 +5,12 @@ target_sources(ginkgo_cuda base/exception.cpp base/executor.cpp base/index_set_kernels.cpp + base/scoped_device_id.cpp base/version.cpp components/prefix_sum_kernels.cu + distributed/matrix_kernels.cu distributed/partition_kernels.cu + distributed/vector_kernels.cu factorization/cholesky_kernels.cu factorization/factorization_kernels.cu factorization/ic_kernels.cu diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index 949fafbe0a2..d5c8cf323eb 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -47,7 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/cusparse_handle.hpp" -#include "cuda/base/device_guard.hpp" +#include "cuda/base/scoped_device_id.hpp" namespace gko { @@ -71,7 +71,7 @@ std::shared_ptr CudaExecutor::create( auto& num_execs = nvidia_device::get_num_execs(device_id); num_execs--; if (!num_execs && device_reset) { - cuda::device_guard g(device_id); + detail::cuda_scoped_device_id_guard g(device_id); cudaDeviceReset(); } }); @@ -82,7 +82,7 @@ void CudaExecutor::populate_exec_info(const machine_topology* mach_topo) { if (this->get_device_id() < this->get_num_devices() && this->get_device_id() >= 0) { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS( cudaDeviceGetPCIBusId(&(this->get_exec_info().pci_bus_id.front()), 13, this->get_device_id())); @@ -102,7 +102,7 @@ void OmpExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { if (num_bytes > 0) { - cuda::device_guard g(dest->get_device_id()); + detail::cuda_scoped_device_id_guard g(dest->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS( cudaMemcpy(dest_ptr, 
src_ptr, num_bytes, cudaMemcpyHostToDevice)); } @@ -111,7 +111,7 @@ void OmpExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, void CudaExecutor::raw_free(void* ptr) const noexcept { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); auto error_code = cudaFree(ptr); if (error_code != cudaSuccess) { #if GKO_VERBOSE_LEVEL >= 1 @@ -130,7 +130,7 @@ void CudaExecutor::raw_free(void* ptr) const noexcept void* CudaExecutor::raw_alloc(size_type num_bytes) const { void* dev_ptr = nullptr; - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); int error_code = 0; if (this->alloc_mode_ == allocation_mode::unified_host) { error_code = cudaMallocManaged(&dev_ptr, num_bytes, cudaMemAttachHost); @@ -154,7 +154,7 @@ void CudaExecutor::raw_copy_to(const OmpExecutor*, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { if (num_bytes > 0) { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS( cudaMemcpy(dest_ptr, src_ptr, num_bytes, cudaMemcpyDeviceToHost)); } @@ -166,7 +166,7 @@ void CudaExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, { #if GINKGO_HIP_PLATFORM_NVCC == 1 if (num_bytes > 0) { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS( cudaMemcpyPeer(dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(), num_bytes)); @@ -188,7 +188,7 @@ void CudaExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { if (num_bytes > 0) { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS( cudaMemcpyPeer(dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(), num_bytes)); @@ -198,15 +198,21 @@ void 
CudaExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, void CudaExecutor::synchronize() const { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceSynchronize()); } +scoped_device_id_guard CudaExecutor::get_scoped_device_id_guard() const +{ + return {this, this->get_device_id()}; +} + + void CudaExecutor::run(const Operation& op) const { this->template log(this, &op); - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); op.run( std::static_pointer_cast(this->shared_from_this())); this->template log(this, &op); @@ -229,7 +235,7 @@ void CudaExecutor::set_gpu_property() { if (this->get_device_id() < this->get_num_devices() && this->get_device_id() >= 0) { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute( &this->get_exec_info().major, cudaDevAttrComputeCapabilityMajor, this->get_device_id())); @@ -270,15 +276,15 @@ void CudaExecutor::init_handles() if (this->get_device_id() < this->get_num_devices() && this->get_device_id() >= 0) { const auto id = this->get_device_id(); - cuda::device_guard g(id); + detail::cuda_scoped_device_id_guard g(id); this->cublas_handle_ = handle_manager( kernels::cuda::cublas::init(), [id](cublasHandle_t handle) { - cuda::device_guard g(id); + detail::cuda_scoped_device_id_guard g(id); kernels::cuda::cublas::destroy(handle); }); this->cusparse_handle_ = handle_manager( kernels::cuda::cusparse::init(), [id](cusparseHandle_t handle) { - cuda::device_guard g(id); + detail::cuda_scoped_device_id_guard g(id); kernels::cuda::cusparse::destroy(handle); }); } diff --git a/cuda/base/scoped_device_id.cpp b/cuda/base/scoped_device_id.cpp new file mode 100644 index 00000000000..f43117c4ca1 --- /dev/null +++ b/cuda/base/scoped_device_id.cpp @@ -0,0 +1,107 @@ 
+/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include +#include + + +#include + + +#include + + +#include "cuda/base/scoped_device_id.hpp" + + +namespace gko { +namespace detail { + + +cuda_scoped_device_id_guard::cuda_scoped_device_id_guard(int device_id) + : original_device_id_{}, need_reset_{} +{ + GKO_ASSERT_NO_CUDA_ERRORS(cudaGetDevice(&original_device_id_)); + if (original_device_id_ != device_id) { + GKO_ASSERT_NO_CUDA_ERRORS(cudaSetDevice(device_id)); + need_reset_ = true; + } +} + + +cuda_scoped_device_id_guard::~cuda_scoped_device_id_guard() +{ + if (need_reset_) { + auto error_code = cudaSetDevice(original_device_id_); + if (error_code != cudaSuccess) { +#if GKO_VERBOSE_LEVEL >= 1 + std::cerr + << "Unrecoverable CUDA error while resetting the device id to " + << original_device_id_ << " in " << __func__ << ": " + << cudaGetErrorName(error_code) << ": " + << cudaGetErrorString(error_code) << std::endl + << "Exiting program" << std::endl; +#endif // GKO_VERBOSE_LEVEL >= 1 + std::exit(error_code); + } + } +} + + +cuda_scoped_device_id_guard::cuda_scoped_device_id_guard( + gko::detail::cuda_scoped_device_id_guard&& other) noexcept +{ + *this = std::move(other); +} + + +cuda_scoped_device_id_guard& cuda_scoped_device_id_guard::operator=( + gko::detail::cuda_scoped_device_id_guard&& other) noexcept +{ + if (this != &other) { + original_device_id_ = std::exchange(other.original_device_id_, 0); + need_reset_ = std::exchange(other.need_reset_, false); + } + return *this; +} + + +} // namespace detail + + +scoped_device_id_guard::scoped_device_id_guard(const CudaExecutor* exec, + int device_id) + : scope_(std::make_unique(device_id)) +{} + + +} // namespace gko diff --git a/cuda/base/scoped_device_id.hpp b/cuda/base/scoped_device_id.hpp new file mode 100644 index 00000000000..1e5f57e122a --- /dev/null +++ b/cuda/base/scoped_device_id.hpp @@ -0,0 +1,77 @@ +/************************************************************* +Copyright (c) 2017-2022, 
the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CUDA_BASE_SCOPED_DEVICE_ID_HPP_ +#define GKO_CUDA_BASE_SCOPED_DEVICE_ID_HPP_ + + +#include + + +namespace gko { +namespace detail { + + +/** + * A scoped device id for CUDA. + */ +class cuda_scoped_device_id_guard : public generic_scoped_device_id_guard { +public: + /** + * The constructor sets the device id to the passed in value for the + * lifetime of the created object. 
+ * + * @param device_id Set the device id to this. + */ + explicit cuda_scoped_device_id_guard(int device_id); + + /** + * This resets the device id. If this fails, the program is terminated. + */ + ~cuda_scoped_device_id_guard() override; + + cuda_scoped_device_id_guard(cuda_scoped_device_id_guard&& other) noexcept; + + cuda_scoped_device_id_guard& operator=( + cuda_scoped_device_id_guard&& other) noexcept; + +private: + int original_device_id_; + bool need_reset_; +}; + + +} // namespace detail +} // namespace gko + + +#endif // GKO_CUDA_BASE_SCOPED_DEVICE_ID_HPP_ diff --git a/cuda/base/device_guard.hpp b/cuda/distributed/matrix_kernels.cu similarity index 52% rename from cuda/base/device_guard.hpp rename to cuda/distributed/matrix_kernels.cu index e6f885e4c66..f4629799b63 100644 --- a/cuda/base/device_guard.hpp +++ b/cuda/distributed/matrix_kernels.cu @@ -30,70 +30,38 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_CUDA_BASE_DEVICE_GUARD_HPP_ -#define GKO_CUDA_BASE_DEVICE_GUARD_HPP_ +#include "core/distributed/matrix_kernels.hpp" -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#include +#include -#include +#include "cuda/components/atomic.cuh" namespace gko { +namespace kernels { namespace cuda { +namespace distributed_matrix { -/** - * This class defines a device guard for the cuda functions and the cuda module. - * The guard is used to make sure that the device code is run on the correct - * cuda device, when run with multiple devices. The class records the current - * device id and uses `cudaSetDevice` to set the device id to the one being - * passed in. After the scope has been exited, the destructor sets the device_id - * back to the one before entering the scope. 
- */ -class device_guard { -public: - device_guard(int device_id) : original_device_id{}, need_reset{} - { - GKO_ASSERT_NO_CUDA_ERRORS(cudaGetDevice(&original_device_id)); - if (original_device_id != device_id) { - GKO_ASSERT_NO_CUDA_ERRORS(cudaSetDevice(device_id)); - need_reset = true; - } - } - - device_guard(device_guard& other) = delete; - - device_guard& operator=(const device_guard& other) = delete; - - device_guard(device_guard&& other) = delete; - - device_guard const& operator=(device_guard&& other) = delete; - - ~device_guard() noexcept(false) - { - if (need_reset) { - /* Ignore the error during stack unwinding for this call */ - if (std::uncaught_exception()) { - cudaSetDevice(original_device_id); - } else { - GKO_ASSERT_NO_CUDA_ERRORS(cudaSetDevice(original_device_id)); - } - } - } - -private: - int original_device_id; - bool need_reset; -}; +#include "common/cuda_hip/distributed/matrix_kernels.hpp.inc" +} // namespace distributed_matrix } // namespace cuda +} // namespace kernels } // namespace gko - - -#endif // GKO_CUDA_BASE_DEVICE_GUARD_HPP_ diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu new file mode 100644 index 00000000000..def3fc8ec87 --- /dev/null +++ b/cuda/distributed/vector_kernels.cu @@ -0,0 +1,59 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/distributed/vector_kernels.hpp" + + +#include +#include +#include +#include +#include +#include + + +#include + + +namespace gko { +namespace kernels { +namespace cuda { +namespace distributed_vector { + + +#include "common/cuda_hip/distributed/vector_kernels.hpp.inc" + + +} // namespace distributed_vector +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt index 483d1a47913..7b0cd28436c 100644 --- a/cuda/test/base/CMakeLists.txt +++ b/cuda/test/base/CMakeLists.txt @@ -10,3 +10,4 @@ ginkgo_create_cuda_test(exception_helpers) ginkgo_create_cuda_test(kernel_launch) ginkgo_create_cuda_test(lin_op) ginkgo_create_cuda_test(math) +ginkgo_create_cuda_test(scoped_device_id) diff --git a/cuda/test/base/scoped_device_id.cu b/cuda/test/base/scoped_device_id.cu new file mode 100644 index 00000000000..608d8fcc3db --- /dev/null +++ b/cuda/test/base/scoped_device_id.cu @@ 
-0,0 +1,90 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// force-top: on +// prevent compilation failure related to disappearing assert(...) 
statements +#include +// force-top: off + + +#include + + +#include + + +#include "cuda/base/scoped_device_id.hpp" + + +namespace { + + +class ScopedDeviceIdGuard : public ::testing::Test { +protected: + ScopedDeviceIdGuard() + : ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)) + {} + + std::shared_ptr ref; + std::shared_ptr cuda; +}; + + +TEST_F(ScopedDeviceIdGuard, SetsId) +{ + auto new_device_id = std::max(cuda->get_num_devices() - 1, 0); + + gko::detail::cuda_scoped_device_id_guard g{new_device_id}; + + int device_id; + cudaGetDevice(&device_id); + ASSERT_EQ(device_id, new_device_id); +} + + +TEST_F(ScopedDeviceIdGuard, ResetsId) +{ + auto old_device_id = cuda->get_device_id(); + + { + auto new_device_id = std::max(cuda->get_num_devices() - 1, 0); + gko::detail::cuda_scoped_device_id_guard g{new_device_id}; + } + + int device_id; + cudaGetDevice(&device_id); + ASSERT_EQ(device_id, old_device_id); +} + + +} // namespace diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup index 1756481e2e4..25455dada69 100644 --- a/dev_tools/scripts/regroup +++ b/dev_tools/scripts/regroup @@ -2,7 +2,7 @@ IncludeBlocks: Regroup IncludeCategories: - Regex: '^<(rapidjson|gflags|gtest|papi).*' Priority: 3 - - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi).*' + - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi|mpi).*' Priority: 2 - Regex: '^ Use multigrid with different precision multigrid_level as a * solver. * + + * + * @ref distributed_solver + * Use a distributed solver to solve a 1D Laplace equation. + * * * * @@ -405,5 +410,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* @ref mixed_multigrid_solver * * + + * + * Distributed + * + * @ref distributed_solver + * + * * */ diff --git a/doc/scripts/examples.pl b/doc/scripts/examples.pl index 87632d2e721..4194d8ae7a7 100644 --- a/doc/scripts/examples.pl +++ b/doc/scripts/examples.pl @@ -35,6 +35,7 @@ "stopping-criteria" => ',height=.25,width=.25,fillcolor="deepskyblue"', "preconditioners" => ',height=.25,width=.25,fillcolor="crimson"', "mixed-precision" => ',height=.25,width=.25,fillcolor="aquamarine"', + "distributed" => ',height=.25,width=.25,fillcolor="plum"', "unfinished" => ',height=.25,width=.25,style="dashed"', ); @@ -196,6 +197,7 @@ "stopping-criteria" => 'Stopping criteria', "preconditioners" => 'Preconditioners', "mixed-precision" => 'Mixed Precision', + "distributed" => 'Distributed techniques', "unfinished" => 'Unfinished codes', ); @@ -213,12 +215,12 @@ } # now add connections to make sure they appear nicely next to each other # in the legend -print " basic -- techniques -- logging -- stopping_criteria -- preconditioners -- mixed_precision -- unfinished;\n"; +print " basic -- techniques -- logging -- stopping_criteria -- preconditioners -- mixed_precision -- distributed -- unfinished;\n"; # we need to tell 'dot' that all of these are at the same # rank to ensure they appear next to (as opposed to atop) # each other -print " {rank=same; basic, techniques, logging, stopping_criteria, preconditioners, mixed_precision, unfinished}"; +print " {rank=same; basic, techniques, logging, stopping_criteria, preconditioners, mixed_precision, distributed, unfinished}"; # end the graph print "}\n"; diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index e0f9806cf96..c1fae0ede26 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -13,9 +13,12 @@ target_sources(ginkgo_dpcpp base/executor.dp.cpp base/helper.dp.cpp base/index_set_kernels.dp.cpp + base/scoped_device_id.dp.cpp base/version.dp.cpp components/prefix_sum_kernels.dp.cpp + distributed/matrix_kernels.dp.cpp 
distributed/partition_kernels.dp.cpp + distributed/vector_kernels.dp.cpp factorization/cholesky_kernels.dp.cpp factorization/ic_kernels.dp.cpp factorization/ilu_kernels.dp.cpp diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index cf51b504e64..8930d7afe2d 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -187,6 +187,11 @@ void DpcppExecutor::raw_copy_to(const DpcppExecutor* dest, size_type num_bytes, void DpcppExecutor::synchronize() const { queue_->wait_and_throw(); } +scoped_device_id_guard DpcppExecutor::get_scoped_device_id_guard() const +{ + return {this, this->get_device_id()}; +} + void DpcppExecutor::run(const Operation& op) const { diff --git a/dpcpp/base/scoped_device_id.dp.cpp b/dpcpp/base/scoped_device_id.dp.cpp new file mode 100644 index 00000000000..8bcda156266 --- /dev/null +++ b/dpcpp/base/scoped_device_id.dp.cpp @@ -0,0 +1,49 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include +#include + + +#include "core/base/noop_scoped_device_id_guard.hpp" + + +namespace gko { + + +scoped_device_id_guard::scoped_device_id_guard(const DpcppExecutor* exec, + int device_id) + : scope_(std::make_unique()) +{} + + +} // namespace gko diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp new file mode 100644 index 00000000000..cf94ec43ef0 --- /dev/null +++ b/dpcpp/distributed/matrix_kernels.dp.cpp @@ -0,0 +1,69 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/distributed/matrix_kernels.hpp" + + +#include + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace distributed_matrix { + + +template +void build_local_nonlocal( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + const experimental::distributed::Partition* + col_partition, + comm_index_type local_part, array& local_row_idxs, + array& local_col_idxs, array& local_values, + array& non_local_row_idxs, + array& non_local_col_idxs, + array& non_local_values, + array& local_gather_idxs, + array& recv_sizes, + array& non_local_to_global) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_BUILD_LOCAL_NONLOCAL); + + +} // namespace distributed_matrix +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/distributed/vector_kernels.dp.cpp b/dpcpp/distributed/vector_kernels.dp.cpp new file mode 100644 index 00000000000..c294ab0c0fb --- /dev/null +++ b/dpcpp/distributed/vector_kernels.dp.cpp @@ -0,0 +1,61 @@ +/************************************************************* 
+Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/distributed/vector_kernels.hpp" + + +#include + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace distributed_vector { + + +template +void build_local( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + partition, + comm_index_type local_part, + matrix::Dense* local_mtx) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); + + +} // namespace distributed_vector +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b284bd244fe..2f4f1392fe8 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -49,6 +49,10 @@ if(GINKGO_HAVE_PAPI_SDE) list(APPEND EXAMPLES_LIST papi-logging) endif() +if(GINKGO_BUILD_MPI) + list(APPEND EXAMPLES_LIST distributed-solver) +endif() + foreach(example ${EXAMPLES_LIST}) add_subdirectory(${example}) endforeach() diff --git a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp index 3a6291e9e14..fbdb6b01384 100644 --- a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp +++ b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp @@ -47,8 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ class ByInteraction : public gko::EnablePolymorphicObject { - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; using Criterion = gko::stop::Criterion; public: diff --git a/examples/distributed-solver/CMakeLists.txt b/examples/distributed-solver/CMakeLists.txt new file mode 100644 index 00000000000..9e520b71559 --- /dev/null +++ b/examples/distributed-solver/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(distributed-solver distributed-solver.cpp) +target_link_libraries(distributed-solver Ginkgo::ginkgo) diff --git a/examples/distributed-solver/distributed-solver.cpp b/examples/distributed-solver/distributed-solver.cpp new file mode 100644 index 00000000000..d9a1050f32d --- /dev/null +++ b/examples/distributed-solver/distributed-solver.cpp @@ -0,0 +1,268 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// @sect3{Include files} + +// This is the main ginkgo header file. +#include + +// Add the C++ iostream header to output information to the console. +#include +// Add the STL map header for the executor selection +#include +// Add the string manipulation header to handle strings. +#include + + +int main(int argc, char* argv[]) +{ + // @sect3{Type Definitions} + // Define the needed types. In a parallel program we need to differentiate + // between global and local indices, thus we have two index types. + using GlobalIndexType = gko::int64; + using LocalIndexType = gko::int32; + // The underlying value type. + using ValueType = double; + // As vector type we use the following, which implements a subset of @ref + // gko::matrix::Dense. + using dist_vec = gko::experimental::distributed::Vector; + // As matrix type we simply use the following type, which can read + // distributed data and be applied to a distributed vector. + using dist_mtx = + gko::experimental::distributed::Matrix; + // We still need a localized vector type to be used as scalars in the + // advanced apply operations. + using vec = gko::matrix::Dense; + // The partition type describes how the rows of the matrices are + // distributed. + using part_type = + gko::experimental::distributed::Partition; + // We can use here the same solver type as you would use in a + // non-distributed program. 
Please note that not all solvers support + // distributed systems at the moment. + using solver = gko::solver::Cg; + + // @sect3{Initialization and User Input Handling} + // Since this is an MPI program, we need to initialize and finalize + // MPI at the beginning and end, respectively, of our program. This can be easily + // done with the following helper construct that uses RAII to automate the + // initialization and finalization. + const gko::experimental::mpi::environment env(argc, argv); + + // Create an MPI communicator wrapper and get the rank. + const gko::experimental::mpi::communicator comm{MPI_COMM_WORLD}; + const auto rank = comm.rank(); + + // Print the ginkgo version information and help message. + if (rank == 0) { + std::cout << gko::version_info::get() << std::endl; + } + if (argc == 2 && (std::string(argv[1]) == "--help")) { + if (rank == 0) { + std::cerr << "Usage: " << argv[0] + << " [executor] [num_grid_points] " << std::endl; + } + std::exit(-1); + } + + ValueType t_init = gko::experimental::mpi::get_walltime(); + + // User input settings: + // - The executor, defaults to reference. + // - The number of grid points, defaults to 100. + const auto executor_string = argc >= 2 ? argv[1] : "reference"; + const auto grid_dim = + static_cast(argc >= 3 ? std::atoi(argv[2]) : 100); + + // Pick the requested executor. 
+ std::map()>> + exec_map{ + {"omp", [] { return gko::OmpExecutor::create(); }}, + {"cuda", + [&] { + return gko::CudaExecutor::create( + gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, gko::CudaExecutor::get_num_devices()), + gko::ReferenceExecutor::create(), false, + gko::allocation_mode::device); + }}, + {"hip", + [&] { + return gko::HipExecutor::create( + gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, gko::HipExecutor::get_num_devices()), + gko::ReferenceExecutor::create(), true); + }}, + {"dpcpp", + [&] { + auto ref = gko::ReferenceExecutor::create(); + if (gko::DpcppExecutor::get_num_devices("gpu") > 0) { + return gko::DpcppExecutor::create( + gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, + gko::DpcppExecutor::get_num_devices("gpu")), + ref); + } else if (gko::DpcppExecutor::get_num_devices("cpu") > 0) { + return gko::DpcppExecutor::create( + gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, + gko::DpcppExecutor::get_num_devices("cpu")), + ref); + } else { + throw std::runtime_error("No suitable DPC++ devices"); + } + }}, + {"reference", [] { return gko::ReferenceExecutor::create(); }}}; + const auto exec = exec_map.at(executor_string)(); + + // @sect3{Creating the Distributed Matrix and Vectors} + // As a first step, we create a partition of the rows. The partition + // consists of ranges of consecutive rows which are assigned a part-id. + // These part-ids will be used for the distributed data structures to + // determine which rows will be stored locally. In this example each rank + // has (nearly) the same number of rows, so we can use the following + // specialized constructor. See @ref + // gko::experimental::distributed::Partition for other modes of creating a + // partition. 
+ const auto num_rows = grid_dim; + auto partition = gko::share(part_type::build_from_global_size_uniform( + exec->get_master(), comm.size(), + static_cast(num_rows))); + + // Assemble the matrix using a 3-pt stencil and fill the right-hand-side + // with a sine value. The distributed matrix supports only constructing an + // empty matrix of zero size and filling in the values with + // gko::experimental::distributed::Matrix::read_distributed. Only the data + // that belongs to the rows by this rank will be assembled. + gko::matrix_data A_data; + gko::matrix_data b_data; + gko::matrix_data x_data; + A_data.size = {num_rows, num_rows}; + b_data.size = {num_rows, 1}; + x_data.size = {num_rows, 1}; + const auto range_start = partition->get_range_bounds()[rank]; + const auto range_end = partition->get_range_bounds()[rank + 1]; + for (int i = range_start; i < range_end; i++) { + if (i > 0) { + A_data.nonzeros.emplace_back(i, i - 1, -1); + } + A_data.nonzeros.emplace_back(i, i, 2); + if (i < grid_dim - 1) { + A_data.nonzeros.emplace_back(i, i + 1, -1); + } + b_data.nonzeros.emplace_back(i, 0, std::sin(i * 0.01)); + x_data.nonzeros.emplace_back(i, 0, gko::zero()); + } + + // Take timings. + comm.synchronize(); + ValueType t_init_end = gko::experimental::mpi::get_walltime(); + + // Read the matrix data, currently this is only supported on CPU executors. + // This will also set up the communication pattern needed for the + // distributed matrix-vector multiplication. + auto A_host = gko::share(dist_mtx::create(exec->get_master(), comm)); + auto x_host = dist_vec::create(exec->get_master(), comm); + auto b_host = dist_vec::create(exec->get_master(), comm); + A_host->read_distributed(A_data, partition.get()); + b_host->read_distributed(b_data, partition.get()); + x_host->read_distributed(x_data, partition.get()); + // After reading, the matrix and vector can be moved to the chosen executor, + // since the distributed matrix supports SpMV also on devices. 
+ auto A = gko::share(dist_mtx::create(exec, comm)); + auto x = dist_vec::create(exec, comm); + auto b = dist_vec::create(exec, comm); + A->copy_from(A_host.get()); + b->copy_from(b_host.get()); + x->copy_from(x_host.get()); + + // Take timings. + comm.synchronize(); + ValueType t_read_setup_end = gko::experimental::mpi::get_walltime(); + + // @sect3{Solve the Distributed System} + // Generate the solver, this is the same as in the non-distributed case. + auto Ainv = + solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(100u).on(exec), + gko::stop::ResidualNorm::build() + .with_baseline(gko::stop::mode::absolute) + .with_reduction_factor(1e-4) + .on(exec)) + .on(exec) + ->generate(A); + + // Take timings. + comm.synchronize(); + ValueType t_solver_generate_end = gko::experimental::mpi::get_walltime(); + + // Apply the distributed solver, this is the same as in the non-distributed + // case. + Ainv->apply(gko::lend(b), gko::lend(x)); + + // Take timings. + comm.synchronize(); + ValueType t_solver_apply_end = gko::experimental::mpi::get_walltime(); + + // Compute the residual, this is done in the same way as in the + // non-distributed case. + x_host->copy_from(x.get()); + auto one = gko::initialize({1.0}, exec); + auto minus_one = gko::initialize({-1.0}, exec); + A_host->apply(gko::lend(minus_one), gko::lend(x_host), gko::lend(one), + gko::lend(b_host)); + auto res_norm = gko::initialize({0.0}, exec->get_master()); + b_host->compute_norm2(gko::lend(res_norm)); + + // Take timings. + comm.synchronize(); + ValueType t_end = gko::experimental::mpi::get_walltime(); + + // @sect3{Printing Results} + // Print the achieved residual norm and timings on rank 0. 
+ if (comm.rank() == 0) { + // clang-format off + std::cout << "\nNum rows in matrix: " << num_rows + << "\nNum ranks: " << comm.size() + << "\nFinal Res norm: " << *res_norm->get_values() + << "\nInit time: " << t_init_end - t_init + << "\nRead time: " << t_read_setup_end - t_init + << "\nSolver generate time: " << t_solver_generate_end - t_read_setup_end + << "\nSolver apply time: " << t_solver_apply_end - t_solver_generate_end + << "\nTotal time: " << t_end - t_init + << std::endl; + // clang-format on + } +} diff --git a/examples/distributed-solver/doc/builds-on b/examples/distributed-solver/doc/builds-on new file mode 100644 index 00000000000..896db74e274 --- /dev/null +++ b/examples/distributed-solver/doc/builds-on @@ -0,0 +1 @@ +simple-solver three-pt-stencil-solver diff --git a/examples/distributed-solver/doc/intro.dox b/examples/distributed-solver/doc/intro.dox new file mode 100644 index 00000000000..4f5e6532b6f --- /dev/null +++ b/examples/distributed-solver/doc/intro.dox @@ -0,0 +1,8 @@ + +

Introduction

+This distributed solver example should help you understand the basics of using Ginkgo in a distributed setting. +The example will solve a simple 1D Laplace equation where the system can be distributed row-wise to multiple processes. +To run the solver with multiple processes, use `mpirun -n NUM_PROCS ./distributed-solver [executor] [num_grid_points]`. + +If you are using GPU devices, please make sure that you run this example with at most as many processes as you have GPU +devices available. diff --git a/examples/distributed-solver/doc/kind b/examples/distributed-solver/doc/kind new file mode 100644 index 00000000000..196aa616342 --- /dev/null +++ b/examples/distributed-solver/doc/kind @@ -0,0 +1 @@ +distributed diff --git a/examples/distributed-solver/doc/results.dox b/examples/distributed-solver/doc/results.dox new file mode 100644 index 00000000000..e888bf14a6f --- /dev/null +++ b/examples/distributed-solver/doc/results.dox @@ -0,0 +1,17 @@ +

Results

+This is the expected output for `mpirun -n 4 ./distributed-solver`: + +@code{.cpp} + +Num rows in matrix: 100 +Num ranks: 4 +Final Res norm: 5.58392e-12 +Init time: 0.0663887 +Read time: 0.0729806 +Solver generate time: 7.6348e-05 +Solver apply time: 0.0680783 +Total time: 0.141351 + +@endcode + +The timings may vary depending on the machine. diff --git a/examples/distributed-solver/doc/short-intro b/examples/distributed-solver/doc/short-intro new file mode 100644 index 00000000000..57a54287458 --- /dev/null +++ b/examples/distributed-solver/doc/short-intro @@ -0,0 +1 @@ +The distributed solver example. diff --git a/examples/distributed-solver/doc/tooltip b/examples/distributed-solver/doc/tooltip new file mode 100644 index 00000000000..3e6cc291852 --- /dev/null +++ b/examples/distributed-solver/doc/tooltip @@ -0,0 +1 @@ +Solves a distributed linear system. diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 3300e73b483..35f19e77406 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -3,9 +3,12 @@ set(GINKGO_HIP_SOURCES base/exception.hip.cpp base/executor.hip.cpp base/index_set_kernels.hip.cpp + base/scoped_device_id.hip.cpp base/version.hip.cpp components/prefix_sum_kernels.hip.cpp + distributed/matrix_kernels.hip.cpp distributed/partition_kernels.hip.cpp + distributed/vector_kernels.hip.cpp factorization/cholesky_kernels.hip.cpp factorization/factorization_kernels.hip.cpp factorization/ic_kernels.hip.cpp diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 0832a87a39a..d7fc631bb2d 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -45,9 +45,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "hip/base/config.hip.hpp" -#include "hip/base/device_guard.hip.hpp" #include "hip/base/hipblas_bindings.hip.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/scoped_device_id.hip.hpp" namespace gko { @@ -77,7 +77,7 @@ std::shared_ptr HipExecutor::create( auto& num_execs = hip_device_class::get_num_execs(device_id); num_execs--; if (!num_execs && device_reset) { - hip::device_guard g(device_id); + detail::hip_scoped_device_id_guard g(device_id); hipDeviceReset(); } }); @@ -88,7 +88,7 @@ void HipExecutor::populate_exec_info(const machine_topology* mach_topo) { if (this->get_device_id() < this->get_num_devices() && this->get_device_id() >= 0) { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS( hipDeviceGetPCIBusId(&(this->get_exec_info().pci_bus_id.front()), 13, this->get_device_id())); @@ -108,7 +108,7 @@ void OmpExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { if (num_bytes > 0) { - hip::device_guard g(dest->get_device_id()); + detail::hip_scoped_device_id_guard g(dest->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS( hipMemcpy(dest_ptr, src_ptr, num_bytes, hipMemcpyHostToDevice)); } @@ -117,7 +117,7 @@ void OmpExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, void HipExecutor::raw_free(void* ptr) const noexcept { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); auto error_code = hipFree(ptr); if (error_code != hipSuccess) { #if GKO_VERBOSE_LEVEL >= 1 @@ -136,7 +136,7 @@ void HipExecutor::raw_free(void* ptr) const noexcept void* HipExecutor::raw_alloc(size_type num_bytes) const { void* dev_ptr = nullptr; - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); int error_code = 0; if (this->alloc_mode_ == allocation_mode::device) { error_code = hipMalloc(&dev_ptr, 
num_bytes); @@ -161,7 +161,7 @@ void HipExecutor::raw_copy_to(const OmpExecutor*, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { if (num_bytes > 0) { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS( hipMemcpy(dest_ptr, src_ptr, num_bytes, hipMemcpyDeviceToHost)); } @@ -173,7 +173,7 @@ void HipExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, { #if GINKGO_HIP_PLATFORM_NVCC == 1 if (num_bytes > 0) { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS(hipMemcpyPeer(dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(), num_bytes)); @@ -195,7 +195,7 @@ void HipExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { if (num_bytes > 0) { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS(hipMemcpyPeer(dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(), num_bytes)); @@ -205,7 +205,7 @@ void HipExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, void HipExecutor::synchronize() const { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS(hipDeviceSynchronize()); } @@ -213,13 +213,19 @@ void HipExecutor::synchronize() const void HipExecutor::run(const Operation& op) const { this->template log(this, &op); - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); op.run( std::static_pointer_cast(this->shared_from_this())); this->template log(this, &op); } +scoped_device_id_guard HipExecutor::get_scoped_device_id_guard() const +{ + return {this, this->get_device_id()}; +} + + int HipExecutor::get_num_devices() { int deviceCount = 0; @@ -236,7 +242,7 @@ void 
HipExecutor::set_gpu_property() { if (this->get_device_id() < this->get_num_devices() && this->get_device_id() >= 0) { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute( &this->get_exec_info().num_computing_units, hipDeviceAttributeMultiprocessorCount, this->get_device_id())); @@ -285,15 +291,15 @@ void HipExecutor::init_handles() if (this->get_device_id() < this->get_num_devices() && this->get_device_id() >= 0) { const auto id = this->get_device_id(); - hip::device_guard g(id); + detail::hip_scoped_device_id_guard g(id); this->hipblas_handle_ = handle_manager( kernels::hip::hipblas::init(), [id](hipblasContext* handle) { - hip::device_guard g(id); + detail::hip_scoped_device_id_guard g(id); kernels::hip::hipblas::destroy_hipblas_handle(handle); }); this->hipsparse_handle_ = handle_manager( kernels::hip::hipsparse::init(), [id](hipsparseContext* handle) { - hip::device_guard g(id); + detail::hip_scoped_device_id_guard g(id); kernels::hip::hipsparse::destroy_hipsparse_handle(handle); }); } diff --git a/hip/base/scoped_device_id.hip.cpp b/hip/base/scoped_device_id.hip.cpp new file mode 100644 index 00000000000..a6d59f1122a --- /dev/null +++ b/hip/base/scoped_device_id.hip.cpp @@ -0,0 +1,107 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include +#include + + +#include + + +#include + + +#include "hip/base/scoped_device_id.hip.hpp" + + +namespace gko { +namespace detail { + + +hip_scoped_device_id_guard::hip_scoped_device_id_guard(int device_id) + : original_device_id_{}, need_reset_{} +{ + GKO_ASSERT_NO_HIP_ERRORS(hipGetDevice(&original_device_id_)); + if (original_device_id_ != device_id) { + GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(device_id)); + need_reset_ = true; + } +} + + +hip_scoped_device_id_guard::~hip_scoped_device_id_guard() +{ + if (need_reset_) { + auto error_code = hipSetDevice(original_device_id_); + if (error_code != hipSuccess) { +#if GKO_VERBOSE_LEVEL >= 1 + std::cerr + << "Unrecoverable CUDA error while resetting the device id to " + << original_device_id_ << " in " << __func__ << ": " + << hipGetErrorName(error_code) << ": " + << hipGetErrorString(error_code) << std::endl + << "Exiting program" << std::endl; +#endif // GKO_VERBOSE_LEVEL >= 1 + std::exit(error_code); + } + 
} +} + + +hip_scoped_device_id_guard::hip_scoped_device_id_guard( + hip_scoped_device_id_guard&& other) noexcept +{ + *this = std::move(other); +} + + +hip_scoped_device_id_guard& hip_scoped_device_id_guard::operator=( + gko::detail::hip_scoped_device_id_guard&& other) noexcept +{ + if (this != &other) { + original_device_id_ = std::exchange(other.original_device_id_, 0); + need_reset_ = std::exchange(other.need_reset_, false); + } + return *this; +} + + +} // namespace detail + + +scoped_device_id_guard::scoped_device_id_guard(const HipExecutor* exec, + int device_id) + : scope_(std::make_unique(device_id)) +{} + + +} // namespace gko diff --git a/hip/base/scoped_device_id.hip.hpp b/hip/base/scoped_device_id.hip.hpp new file mode 100644 index 00000000000..b64825998a0 --- /dev/null +++ b/hip/base/scoped_device_id.hip.hpp @@ -0,0 +1,77 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_BASE_SCOPED_DEVICE_ID_HIP_HPP_ +#define GKO_HIP_BASE_SCOPED_DEVICE_ID_HIP_HPP_ + + +#include + + +namespace gko { +namespace detail { + + +/** + * A scoped device id for HIP. + */ +class hip_scoped_device_id_guard : public generic_scoped_device_id_guard { +public: + /** + * The constructor sets the device id to the passed in value for the + * lifetime of the created object. + * + * @param device_id Set the device id to this. + */ + explicit hip_scoped_device_id_guard(int device_id); + + /** + * This resets the device id. If this fails, the program is terminated. 
+ */ + ~hip_scoped_device_id_guard() override; + + hip_scoped_device_id_guard(hip_scoped_device_id_guard&& other) noexcept; + + hip_scoped_device_id_guard& operator=( + hip_scoped_device_id_guard&& other) noexcept; + +private: + int original_device_id_; + bool need_reset_; +}; + + +} // namespace detail +} // namespace gko + + +#endif // GKO_HIP_BASE_SCOPED_DEVICE_ID_HIP_HPP_ diff --git a/hip/base/device_guard.hip.hpp b/hip/distributed/matrix_kernels.hip.cpp similarity index 52% rename from hip/base/device_guard.hip.hpp rename to hip/distributed/matrix_kernels.hip.cpp index 3999ebb7be8..03d46967831 100644 --- a/hip/base/device_guard.hip.hpp +++ b/hip/distributed/matrix_kernels.hip.cpp @@ -30,70 +30,38 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_HIP_BASE_DEVICE_GUARD_HIP_HPP_ -#define GKO_HIP_BASE_DEVICE_GUARD_HIP_HPP_ +#include "core/distributed/matrix_kernels.hpp" -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#include +#include -#include +#include "hip/components/atomic.hip.hpp" namespace gko { +namespace kernels { namespace hip { +namespace distributed_matrix { -/** - * This class defines a device guard for the hip functions and the hip module. - * The guard is used to make sure that the device code is run on the correct - * hip device, when run with multiple devices. The class records the current - * device id and uses `hipSetDevice` to set the device id to the one being - * passed in. After the scope has been exited, the destructor sets the device_id - * back to the one before entering the scope. 
- */ -class device_guard { -public: - device_guard(int device_id) : original_device_id{}, need_reset{} - { - GKO_ASSERT_NO_HIP_ERRORS(hipGetDevice(&original_device_id)); - if (original_device_id != device_id) { - GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(device_id)); - need_reset = true; - } - } - - device_guard(device_guard& other) = delete; - - device_guard& operator=(const device_guard& other) = delete; - - device_guard(device_guard&& other) = delete; - - device_guard const& operator=(device_guard&& other) = delete; - - ~device_guard() noexcept(false) - { - if (need_reset) { - /* Ignore the error during stack unwinding for this call */ - if (std::uncaught_exception()) { - hipSetDevice(original_device_id); - } else { - GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(original_device_id)); - } - } - } - -private: - int original_device_id; - bool need_reset; -}; +#include "common/cuda_hip/distributed/matrix_kernels.hpp.inc" +} // namespace distributed_matrix } // namespace hip +} // namespace kernels } // namespace gko - - -#endif // GKO_HIP_BASE_DEVICE_GUARD_HIP_HPP_ diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp new file mode 100644 index 00000000000..6cbfa1224e9 --- /dev/null +++ b/hip/distributed/vector_kernels.hip.cpp @@ -0,0 +1,62 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/distributed/vector_kernels.hpp" + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include + + +namespace gko { +namespace kernels { +namespace hip { +namespace distributed_vector { + + +#include "common/cuda_hip/distributed/vector_kernels.hpp.inc" + + +} // namespace distributed_vector +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt index 970746acb35..91dd4209d5d 100644 --- a/hip/test/base/CMakeLists.txt +++ b/hip/test/base/CMakeLists.txt @@ -15,3 +15,4 @@ if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") else() ginkgo_create_hip_test(exception_helpers) endif() +ginkgo_create_hip_test(scoped_device_id) diff --git a/hip/test/base/scoped_device_id.hip.cpp b/hip/test/base/scoped_device_id.hip.cpp new file mode 100644 index 00000000000..032476ab3fb --- /dev/null +++ b/hip/test/base/scoped_device_id.hip.cpp @@ -0,0 
+1,90 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// force-top: on +// prevent compilation failure related to disappearing assert(...) 
statements +#include +// force-top: off + + +#include + + +#include + + +#include "hip/base/scoped_device_id.hip.hpp" + + +namespace { + + +class ScopedDeviceIdGuard : public ::testing::Test { +protected: + ScopedDeviceIdGuard() + : ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)) + {} + + std::shared_ptr ref; + std::shared_ptr hip; +}; + + +TEST_F(ScopedDeviceIdGuard, SetsId) +{ + auto new_device_id = std::max(hip->get_num_devices() - 1, 0); + + gko::detail::hip_scoped_device_id_guard g{new_device_id}; + + int device_id; + hipGetDevice(&device_id); + ASSERT_EQ(device_id, new_device_id); +} + + +TEST_F(ScopedDeviceIdGuard, ResetsId) +{ + auto old_device_id = hip->get_device_id(); + + { + auto new_device_id = std::max(hip->get_num_devices() - 1, 0); + gko::detail::hip_scoped_device_id_guard g{new_device_id}; + } + + int device_id; + hipGetDevice(&device_id); + ASSERT_EQ(device_id, old_device_id); +} + + +} // namespace diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 198f465d4d0..a87ce01d37a 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -87,6 +87,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #cmakedefine01 GINKGO_BUILD_MPI // clang-format on +/* Is the MPI implementation GPU aware? */ +// clang-format off +#cmakedefine01 GINKGO_HAVE_GPU_AWARE_MPI +// clang-format on + /* Is HWLOC available ? */ // clang-format off @@ -94,4 +99,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // clang-format on +/* Do we need to use blocking communication in our SpMV? 
*/ +// clang-format off +#cmakedefine GINKGO_FORCE_SPMV_BLOCKING_COMM +// clang-format on + + #endif // GKO_INCLUDE_CONFIG_H diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp index 3b7ecd712b6..1fae391d63d 100644 --- a/include/ginkgo/core/base/abstract_factory.hpp +++ b/include/ginkgo/core/base/abstract_factory.hpp @@ -150,7 +150,7 @@ class EnableDefaultFactory : public EnablePolymorphicObject, public EnablePolymorphicAssignment { public: - friend class EnablePolymorphicObject; + friend struct polymorphic_object_traits; using product_type = ProductType; using parameters_type = ParametersType; diff --git a/include/ginkgo/core/base/combination.hpp b/include/ginkgo/core/base/combination.hpp index 8686854720d..9e86dbd80d9 100644 --- a/include/ginkgo/core/base/combination.hpp +++ b/include/ginkgo/core/base/combination.hpp @@ -59,7 +59,7 @@ template class Combination : public EnableLinOp>, public EnableCreateMethod>, public Transposable { - friend class EnablePolymorphicObject; + friend struct polymorphic_object_traits; friend class EnableCreateMethod; public: diff --git a/include/ginkgo/core/base/composition.hpp b/include/ginkgo/core/base/composition.hpp index 6c9b9666323..02bd0ed2431 100644 --- a/include/ginkgo/core/base/composition.hpp +++ b/include/ginkgo/core/base/composition.hpp @@ -67,7 +67,7 @@ template class Composition : public EnableLinOp>, public EnableCreateMethod>, public Transposable { - friend class EnablePolymorphicObject; + friend struct polymorphic_object_traits; friend class EnableCreateMethod; public: diff --git a/include/ginkgo/core/base/dense_cache.hpp b/include/ginkgo/core/base/dense_cache.hpp new file mode 100644 index 00000000000..ea2a29ddf3c --- /dev/null +++ b/include/ginkgo/core/base/dense_cache.hpp @@ -0,0 +1,126 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_DENSE_CACHE_HPP_ +#define GKO_PUBLIC_CORE_BASE_DENSE_CACHE_HPP_ + + +#include + + +#include + + +namespace gko { +namespace matrix { + + +template +class Dense; + + +} + + +namespace detail { + + +/** + * Manages a Dense vector that is buffered and reused internally to avoid + * repeated allocations. Copying an instance will only yield an empty object + * since copying the cached vector would not make sense. 
The stored object is + * always mutable, so the cache can be used in a const-context. + * + * @internal The struct is present to wrap cache-like buffer storage that will + * not be copied when the outer object gets copied. + */ +template +struct DenseCache { + DenseCache() = default; + ~DenseCache() = default; + DenseCache(const DenseCache&) {} + DenseCache(DenseCache&&) noexcept {} + DenseCache& operator=(const DenseCache&) { return *this; } + DenseCache& operator=(DenseCache&&) noexcept { return *this; } + mutable std::unique_ptr> vec{}; + + + /** + * Initializes the buffered vector with the same configuration as the + * template vector, if + * - the current vector is null, + * - the sizes of the buffered and template vector differ, + * - the executor of the buffered and template vector differ. + * + * @note This does not copy any data from the template vector. + * + * @param template_vec Defines the configuration (executor, size, stride) + * of the buffered vector. + */ + void init_from(const matrix::Dense* template_vec) const; + + /** + * Initializes the buffered vector, if + * - the current vector is null, + * - the sizes differ, + * - the executor differs. + * + * @param exec Executor of the buffered vector. + * @param size Size of the buffered vector. + */ + void init(std::shared_ptr exec, dim<2> size) const; + + /** + * Reference access to the underlying vector. + * @return Reference to the stored vector. + */ + matrix::Dense& operator*() const { return *vec; } + + /** + * Pointer access to the underlying vector. + * @return Pointer to the stored vector. + */ + matrix::Dense* operator->() const { return vec.get(); } + + /** + * Pointer access to the underlying vector. + * @return Pointer to the stored vector. 
+ */ + matrix::Dense* get() const { return vec.get(); } +}; + + +} // namespace detail +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_BASE_DENSE_CACHE_HPP_ diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 697e86f941c..7623411d657 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -47,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include @@ -796,6 +797,8 @@ class Executor : public log::EnableLogging { return this->verify_memory_from(other.get()); } + virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0; + protected: /** * A struct that abstracts the executor info for different executors @@ -1272,6 +1275,8 @@ class OmpExecutor : public detail::ExecutorBase, return this->get_exec_info().num_pu_per_cu; } + scoped_device_id_guard get_scoped_device_id_guard() const override; + protected: OmpExecutor() { @@ -1327,6 +1332,11 @@ class ReferenceExecutor : public OmpExecutor { this->template log(this, &op); } + scoped_device_id_guard get_scoped_device_id_guard() const override + { + return {this, 0}; + } + protected: ReferenceExecutor() { @@ -1401,6 +1411,8 @@ class CudaExecutor : public detail::ExecutorBase, void run(const Operation& op) const override; + scoped_device_id_guard get_scoped_device_id_guard() const override; + /** * Get the CUDA device id of the device associated to this executor. */ @@ -1606,6 +1618,8 @@ class HipExecutor : public detail::ExecutorBase, void run(const Operation& op) const override; + scoped_device_id_guard get_scoped_device_id_guard() const override; + /** * Get the HIP device id of the device associated to this executor. 
*/ @@ -1807,6 +1821,8 @@ class DpcppExecutor : public detail::ExecutorBase, void run(const Operation& op) const override; + scoped_device_id_guard get_scoped_device_id_guard() const override; + /** * Get the DPCPP device id of the device associated to this executor. * diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index e29230186f6..f174bfcfe6c 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -1041,8 +1041,7 @@ public: \ class _factory_name \ : public ::gko::EnableDefaultLinOpFactory<_factory_name, _lin_op, \ _parameters_name##_type> { \ - friend class ::gko::EnablePolymorphicObject<_factory_name, \ - ::gko::LinOpFactory>; \ + friend class ::gko::polymorphic_object_traits<_factory_name>; \ friend class ::gko::enable_parameters_type<_parameters_name##_type, \ _factory_name>; \ explicit _factory_name(std::shared_ptr exec) \ diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index c25f92d3f34..e41a6dd98ed 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -35,9 +35,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include #include +#include #include @@ -55,15 +54,53 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { +namespace experimental { +/** + * @brief The mpi namespace, contains wrapper for many MPI functions. + * + * @ingroup mpi + * @ingroup distributed + */ namespace mpi { +/** + * Return if GPU aware functionality is available + */ +inline constexpr bool is_gpu_aware() +{ +#if GINKGO_HAVE_GPU_AWARE_MPI + return true; +#else + return false; +#endif +} + + +/** + * Maps each MPI rank to a single device id in a round robin manner. + * @param comm used to determine the node-local rank, if no suitable + * environment variable is available. + * @param num_devices the number of devices per node. 
+ * @return device id that this rank should use. + */ +int map_rank_to_device_id(MPI_Comm comm, int num_devices); + + #define GKO_REGISTER_MPI_TYPE(input_type, mpi_type) \ template <> \ struct type_impl { \ static MPI_Datatype get_type() { return mpi_type; } \ } +/** + * A struct that is used to determine the MPI_Datatype of a specified type. + * + * @tparam T type of which the MPI_Datatype should be inferred. + * + * @note any specialization of this type has to provide a static function + * `get_type()` that returns an MPI_Datatype + */ template struct type_impl {}; @@ -75,6 +112,7 @@ GKO_REGISTER_MPI_TYPE(int, MPI_INT); GKO_REGISTER_MPI_TYPE(unsigned short, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(unsigned long, MPI_UNSIGNED_LONG); GKO_REGISTER_MPI_TYPE(long, MPI_LONG); +GKO_REGISTER_MPI_TYPE(long long, MPI_LONG_LONG_INT); GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT); GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE); GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE); @@ -82,11 +120,86 @@ GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_COMPLEX); GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_DOUBLE_COMPLEX); -template -inline const T* in_place() -{ - return reinterpret_cast(MPI_IN_PLACE); -} +/** + * A move-only wrapper for a contiguous MPI_Datatype. + * + * The underlying MPI_Datatype is automatically created and committed when an + * object of this type is constructed, and freed when it is destructed. + */ +class contiguous_type { +public: + /** + * Constructs a wrapper for a contiguous MPI_Datatype. + * + * @param count the number of old_type elements the new datatype contains. + * @param old_type the MPI_Datatype that is contained. + */ + contiguous_type(int count, MPI_Datatype old_type) : type_(MPI_DATATYPE_NULL) + { + GKO_ASSERT_NO_MPI_ERRORS(MPI_Type_contiguous(count, old_type, &type_)); + GKO_ASSERT_NO_MPI_ERRORS(MPI_Type_commit(&type_)); + } + + /** + * Constructs empty wrapper with MPI_DATATYPE_NULL.
+ */ + contiguous_type() : type_(MPI_DATATYPE_NULL) {} + + /** + * Disallow copying of wrapper type. + */ + contiguous_type(const contiguous_type&) = delete; + + /** + * Disallow copying of wrapper type. + */ + contiguous_type& operator=(const contiguous_type&) = delete; + + /** + * Move constructor, leaves other with MPI_DATATYPE_NULL. + * + * @param other to be moved from object. + */ + contiguous_type(contiguous_type&& other) noexcept : type_(MPI_DATATYPE_NULL) + { + *this = std::move(other); + } + + /** + * Move assignment, leaves other with MPI_DATATYPE_NULL. + * + * @param other to be moved from object. + * + * @return this object. + */ + contiguous_type& operator=(contiguous_type&& other) noexcept + { + if (this != &other) { + this->type_ = std::exchange(other.type_, MPI_DATATYPE_NULL); + } + return *this; + } + + /** + * Destructs object by freeing wrapped MPI_Datatype. + */ + ~contiguous_type() + { + if (type_ != MPI_DATATYPE_NULL) { + MPI_Type_free(&type_); + } + } + + /** + * Access the underlying MPI_Datatype. + * + * @return the underlying MPI_Datatype. + */ + MPI_Datatype get() const { return type_; } + +private: + MPI_Datatype type_; +}; /** @@ -228,7 +341,8 @@ struct status { /** - * The request class is a light wrapper around the MPI_Request handle class. + * The request class is a light, move-only wrapper around the MPI_Request + * handle. 
*/ class request { public: @@ -238,6 +352,30 @@ class request { */ request() : req_(MPI_REQUEST_NULL) {} + request(const request&) = delete; + + request& operator=(const request&) = delete; + + request(request&& o) noexcept { *this = std::move(o); } + + request& operator=(request&& o) noexcept + { + if (this != &o) { + this->req_ = std::exchange(o.req_, MPI_REQUEST_NULL); + } + return *this; + } + + ~request() + { + if (req_ != MPI_REQUEST_NULL) { + if (MPI_Request_free(&req_) != MPI_SUCCESS) { + std::terminate(); // since we can't throw in destructors, we + // have to terminate the program + } + } + } + /** * Get a pointer to the underlying MPI_Request handle. * @@ -282,9 +420,18 @@ inline std::vector wait_all(std::vector& req) /** - * A communicator class that takes in the given communicator and duplicates it - * for our purposes. As the class or object goes out of scope, the communicator - * is freed. + * A thin wrapper of MPI_Comm that supports most MPI calls. + * + * A wrapper class that takes in the given MPI communicator. If a bare MPI_Comm + * is provided, the wrapper takes no ownership of the MPI_Comm. Thus the + * MPI_Comm must remain valid throughout the lifetime of the communicator. If + * the communicator was created through splitting, the wrapper takes ownership + * of the MPI_Comm. In this case, as the class or object goes out of scope, the + * underlying MPI_Comm is freed. + * + * @note All MPI calls that work on a buffer take in an Executor as an + * additional argument. This argument specifies the memory space the + * buffer lives in. */ class communicator { public: @@ -389,15 +536,22 @@ class communicator { /** * Send (Blocking) data from calling process to destination rank. * + * @param exec The executor, on which the message buffer is located. 
* @param send_buffer the buffer to send * @param send_count the number of elements to send * @param destination_rank the rank to send the data to * @param send_tag the tag for the send call + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. */ template - void send(const SendType* send_buffer, const int send_count, - const int destination_rank, const int send_tag) const + void send(std::shared_ptr exec, const SendType* send_buffer, + const int send_count, const int destination_rank, + const int send_tag) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS( MPI_Send(send_buffer, send_count, type_impl::get_type(), destination_rank, send_tag, this->get())); @@ -407,17 +561,24 @@ class communicator { * Send (Non-blocking, Immediate return) data from calling process to * destination rank. * + * @param exec The executor, on which the message buffer is located. * @param send_buffer the buffer to send * @param send_count the number of elements to send * @param destination_rank the rank to send the data to * @param send_tag the tag for the send call * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * * @return the request handle for the send call */ template - request i_send(const SendType* send_buffer, const int send_count, + request i_send(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, const int destination_rank, const int send_tag) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS( MPI_Isend(send_buffer, send_count, type_impl::get_type(), @@ -428,17 +589,24 @@ class communicator { /** * Receive data from source rank. * + * @param exec The executor, on which the message buffer is located. 
* @param recv_buffer the buffer to receive * @param recv_count the number of elements to receive * @param source_rank the rank to receive the data from * @param recv_tag the tag for the recv call * + * @tparam RecvType the type of the data to receive. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * * @return the status of completion of this call */ template - status recv(RecvType* recv_buffer, const int recv_count, - const int source_rank, const int recv_tag) const + status recv(std::shared_ptr exec, RecvType* recv_buffer, + const int recv_count, const int source_rank, + const int recv_tag) const { + auto guard = exec->get_scoped_device_id_guard(); status st; GKO_ASSERT_NO_MPI_ERRORS( MPI_Recv(recv_buffer, recv_count, type_impl::get_type(), @@ -449,17 +617,24 @@ class communicator { /** * Receive (Non-blocking, Immediate return) data from source rank. * + * @param exec The executor, on which the message buffer is located. * @param recv_buffer the buffer to send * @param recv_count the number of elements to receive * @param source_rank the rank to receive the data from * @param recv_tag the tag for the recv call * + * @tparam RecvType the type of the data to receive. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * * @return the request handle for the recv call */ template - request i_recv(RecvType* recv_buffer, const int recv_count, - const int source_rank, const int recv_tag) const + request i_recv(std::shared_ptr exec, RecvType* recv_buffer, + const int recv_count, const int source_rank, + const int recv_tag) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS( MPI_Irecv(recv_buffer, recv_count, type_impl::get_type(), @@ -470,13 +645,20 @@ class communicator { /** * Broadcast data from calling process to all ranks in the communicator * + * @param exec The executor, on which the message buffer is located. 
* @param buffer the buffer to broadcast + * @param count the number of elements to broadcast + * @param root_rank the rank to broadcast from + * + * @tparam BroadcastType the type of the data to broadcast. Has to be a + * type which has a specialization of type_impl that + * defines its MPI_Datatype. */ template - void broadcast(BroadcastType* buffer, int count, int root_rank) const + void broadcast(std::shared_ptr exec, BroadcastType* buffer, + int count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Bcast(buffer, count, type_impl::get_type(), root_rank, this->get())); @@ -486,15 +668,22 @@ * (Non-blocking) Broadcast data from calling process to all ranks in the * communicator * + * @param exec The executor, on which the message buffer is located. * @param buffer the buffer to broadcast * @param count the number of elements to broadcast * @param root_rank the rank to broadcast from * + * @tparam BroadcastType the type of the data to broadcast. Has to be a + * type which has a specialization of type_impl that + * defines its MPI_Datatype. + * * @return the request handle for the call */ template - request i_broadcast(BroadcastType* buffer, int count, int root_rank) const + request i_broadcast(std::shared_ptr exec, + BroadcastType* buffer, int count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS( MPI_Ibcast(buffer, count, type_impl::get_type(), @@ -506,15 +695,22 @@ * Reduce data into root from all calling processes on the same * communicator. * + * @param exec The executor, on which the message buffer is located. * @param send_buffer the buffer to reduce * @param recv_buffer the reduced result * @param count the number of elements to reduce * @param operation the MPI_Op type reduce operation. + * + * @tparam ReduceType the type of the data to reduce.
Has to be a type + * which has a specialization of type_impl that defines + * its MPI_Datatype. */ template - void reduce(const ReduceType* send_buffer, ReduceType* recv_buffer, + void reduce(std::shared_ptr exec, + const ReduceType* send_buffer, ReduceType* recv_buffer, int count, MPI_Op operation, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Reduce(send_buffer, recv_buffer, count, type_impl::get_type(), operation, root_rank, this->get())); @@ -524,17 +720,24 @@ class communicator { * (Non-blocking) Reduce data into root from all calling processes on the * same communicator. * + * @param exec The executor, on which the message buffer is located. * @param send_buffer the buffer to reduce * @param recv_buffer the reduced result * @param count the number of elements to reduce * @param operation the MPI_Op type reduce operation. * + * @tparam ReduceType the type of the data to reduce. Has to be a type + * which has a specialization of type_impl that defines + * its MPI_Datatype. + * * @return the request handle for the call */ template - request i_reduce(const ReduceType* send_buffer, ReduceType* recv_buffer, + request i_reduce(std::shared_ptr exec, + const ReduceType* send_buffer, ReduceType* recv_buffer, int count, MPI_Op operation, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Ireduce( send_buffer, recv_buffer, count, type_impl::get_type(), @@ -546,37 +749,50 @@ class communicator { * (In-place) Reduce data from all calling processes from all calling * processes on same communicator. * + * @param exec The executor, on which the message buffer is located. * @param recv_buffer the data to reduce and the reduced result * @param count the number of elements to reduce * @param operation the MPI_Op type reduce operation. + * + * @tparam ReduceType the type of the data to send. 
Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. */ template - void all_reduce(ReduceType* recv_buffer, int count, MPI_Op operation) const + void all_reduce(std::shared_ptr exec, + ReduceType* recv_buffer, int count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( - in_place(), recv_buffer, count, - type_impl::get_type(), operation, this->get())); + MPI_IN_PLACE, recv_buffer, count, type_impl::get_type(), + operation, this->get())); } /** * (In-place, non-blocking) Reduce data from all calling processes from all * calling processes on same communicator. * + * @param exec The executor, on which the message buffer is located. * @param recv_buffer the data to reduce and the reduced result * @param count the number of elements to reduce * @param operation the reduce operation. See @MPI_Op * + * @tparam ReduceType the type of the data to reduce. Has to be a type + * which has a specialization of type_impl that defines + * its MPI_Datatype. + * * @return the request handle for the call */ template - request i_all_reduce(ReduceType* recv_buffer, int count, + request i_all_reduce(std::shared_ptr exec, + ReduceType* recv_buffer, int count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); request req; - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Iallreduce(in_place(), recv_buffer, count, - type_impl::get_type(), operation, - this->get(), req.get())); + GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce( + MPI_IN_PLACE, recv_buffer, count, type_impl::get_type(), + operation, this->get(), req.get())); return req; } @@ -584,15 +800,22 @@ class communicator { * Reduce data from all calling processes from all calling processes on same * communicator. * + * @param exec The executor, on which the message buffers are located. 
* @param send_buffer the data to reduce * @param recv_buffer the reduced result * @param count the number of elements to reduce * @param operation the reduce operation. See @MPI_Op + * + * @tparam ReduceType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. */ template - void all_reduce(const ReduceType* send_buffer, ReduceType* recv_buffer, + void all_reduce(std::shared_ptr exec, + const ReduceType* send_buffer, ReduceType* recv_buffer, int count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( send_buffer, recv_buffer, count, type_impl::get_type(), operation, this->get())); @@ -602,17 +825,24 @@ class communicator { * Reduce data from all calling processes from all calling processes on same * communicator. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the data to reduce * @param recv_buffer the reduced result * @param count the number of elements to reduce * @param operation the reduce operation. See @MPI_Op * + * @tparam ReduceType the type of the data to reduce. Has to be a type + * which has a specialization of type_impl that defines + * its MPI_Datatype. + * * @return the request handle for the call */ template - request i_all_reduce(const ReduceType* send_buffer, ReduceType* recv_buffer, + request i_all_reduce(std::shared_ptr exec, + const ReduceType* send_buffer, ReduceType* recv_buffer, int count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce( send_buffer, recv_buffer, count, type_impl::get_type(), @@ -623,17 +853,26 @@ class communicator { /** * Gather data onto the root rank from all ranks in the communicator. * + * @param exec The executor, on which the message buffers are located. 
* @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * @param root_rank the rank to gather into + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. */ template - void gather(const SendType* send_buffer, const int send_count, + void gather(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS( MPI_Gather(send_buffer, send_count, type_impl::get_type(), recv_buffer, recv_count, type_impl::get_type(), @@ -644,19 +883,28 @@ class communicator { * (Non-blocking) Gather data onto the root rank from all ranks in the * communicator. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * @param root_rank the rank to gather into * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. 
+ * * @return the request handle for the call */ template - request i_gather(const SendType* send_buffer, const int send_count, + request i_gather(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Igather( send_buffer, send_count, type_impl::get_type(), @@ -669,18 +917,27 @@ class communicator { * Gather data onto the root rank from all ranks in the communicator with * offsets. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * @param displacements the offsets for the buffer * @param root_rank the rank to gather into + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. */ template - void gather_v(const SendType* send_buffer, const int send_count, + void gather_v(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int* recv_counts, const int* displacements, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Gatherv( send_buffer, send_count, type_impl::get_type(), recv_buffer, recv_counts, displacements, @@ -691,6 +948,7 @@ class communicator { * (Non-blocking) Gather data onto the root rank from all ranks in the * communicator with offsets. * + * @param exec The executor, on which the message buffers are located. 
* @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into @@ -698,13 +956,21 @@ class communicator { * @param displacements the offsets for the buffer * @param root_rank the rank to gather into * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. + * * @return the request handle for the call */ template - request i_gather_v(const SendType* send_buffer, const int send_count, + request i_gather_v(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int* recv_counts, const int* displacements, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Igatherv( send_buffer, send_count, type_impl::get_type(), @@ -717,15 +983,24 @@ class communicator { /** * Gather data onto all ranks from all ranks in the communicator. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. 
*/ template - void all_gather(const SendType* send_buffer, const int send_count, + void all_gather(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Allgather( send_buffer, send_count, type_impl::get_type(), recv_buffer, recv_count, type_impl::get_type(), @@ -736,17 +1011,26 @@ class communicator { * (Non-blocking) Gather data onto all ranks from all ranks in the * communicator. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. + * * @return the request handle for the call */ template - request i_all_gather(const SendType* send_buffer, const int send_count, + request i_all_gather(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallgather( send_buffer, send_count, type_impl::get_type(), @@ -758,16 +1042,25 @@ class communicator { /** * Scatter data from root rank to all ranks in the communicator. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive + * + * @tparam SendType the type of the data to send. 
Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. */ template - void scatter(const SendType* send_buffer, const int send_count, + void scatter(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Scatter( send_buffer, send_count, type_impl::get_type(), recv_buffer, recv_count, type_impl::get_type(), root_rank, @@ -778,18 +1071,27 @@ class communicator { * (Non-blocking) Scatter data from root rank to all ranks in the * communicator. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. + * * @return the request handle for the call */ template - request i_scatter(const SendType* send_buffer, const int send_count, + request i_scatter(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Iscatter( send_buffer, send_count, type_impl::get_type(), @@ -802,18 +1104,27 @@ class communicator { * Scatter data from root rank to all ranks in the communicator with * offsets. * + * @param exec The executor, on which the message buffers are located. 
* @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * @param displacements the offsets for the buffer * @param comm the communicator + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. */ template - void scatter_v(const SendType* send_buffer, const int* send_counts, + void scatter_v(std::shared_ptr exec, + const SendType* send_buffer, const int* send_counts, const int* displacements, RecvType* recv_buffer, const int recv_count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Scatterv( send_buffer, send_counts, displacements, type_impl::get_type(), recv_buffer, recv_count, @@ -824,6 +1135,7 @@ class communicator { * (Non-blocking) Scatter data from root rank to all ranks in the * communicator with offsets. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into @@ -831,13 +1143,21 @@ class communicator { * @param displacements the offsets for the buffer * @param comm the communicator * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. 
+ * * @return the request handle for the call */ template - request i_scatter_v(const SendType* send_buffer, const int* send_counts, + request i_scatter_v(std::shared_ptr exec, + const SendType* send_buffer, const int* send_counts, const int* displacements, RecvType* recv_buffer, const int recv_count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS( MPI_Iscatterv(send_buffer, send_counts, displacements, @@ -851,41 +1171,55 @@ class communicator { * (In-place) Communicate data from all ranks to all other ranks in place * (MPI_Alltoall). See MPI documentation for more details. * + * @param exec The executor, on which the message buffer is located. * @param buffer the buffer to send and the buffer receive * @param recv_count the number of elements to receive * @param comm the communicator * + * @tparam RecvType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * * @note This overload uses MPI_IN_PLACE and the source and destination * buffers are the same. */ template - void all_to_all(RecvType* recv_buffer, const int recv_count) const + void all_to_all(std::shared_ptr exec, RecvType* recv_buffer, + const int recv_count) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoall( - in_place(), recv_count, type_impl::get_type(), + MPI_IN_PLACE, recv_count, type_impl::get_type(), recv_buffer, recv_count, type_impl::get_type(), this->get())); } /** * (In-place, Non-blocking) Communicate data from all ranks to all other - * ranks in place (MPI_Alltoall). See MPI documentation for more details. + * ranks in place (MPI_Ialltoall). See MPI documentation for more details. * + * @param exec The executor, on which the message buffer is located. 
* @param buffer the buffer to send and the buffer receive * @param recv_count the number of elements to receive * @param comm the communicator * + * @tparam RecvType the type of the data to receive. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * * @return the request handle for the call * * @note This overload uses MPI_IN_PLACE and the source and destination * buffers are the same. */ template - request i_all_to_all(RecvType* recv_buffer, const int recv_count) const + request i_all_to_all(std::shared_ptr exec, + RecvType* recv_buffer, const int recv_count) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoall( - in_place(), recv_count, type_impl::get_type(), + MPI_IN_PLACE, recv_count, type_impl::get_type(), recv_buffer, recv_count, type_impl::get_type(), this->get(), req.get())); return req; @@ -895,15 +1229,24 @@ class communicator { * Communicate data from all ranks to all other ranks (MPI_Alltoall). * See MPI documentation for more details. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to send * @param send_count the number of elements to send * @param recv_buffer the buffer to receive * @param recv_count the number of elements to receive + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. 
*/ template - void all_to_all(const SendType* send_buffer, const int send_count, + void all_to_all(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoall( send_buffer, send_count, type_impl::get_type(), recv_buffer, recv_count, type_impl::get_type(), @@ -912,19 +1255,28 @@ class communicator { /** * (Non-blocking) Communicate data from all ranks to all other ranks - * (MPI_Alltoall). See MPI documentation for more details. + * (MPI_Ialltoall). See MPI documentation for more details. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to send * @param send_count the number of elements to send * @param recv_buffer the buffer to receive * @param recv_count the number of elements to receive * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. + * * @return the request handle for the call */ template - request i_all_to_all(const SendType* send_buffer, const int send_count, + request i_all_to_all(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoall( send_buffer, send_count, type_impl::get_type(), @@ -937,6 +1289,7 @@ class communicator { * Communicate data from all ranks to all other ranks with * offsets (MPI_Alltoallv). See MPI documentation for more details. * + * @param exec The executor, on which the message buffers are located. 
* @param send_buffer the buffer to send * @param send_count the number of elements to send * @param send_offsets the offsets for the send buffer @@ -944,59 +1297,138 @@ class communicator { * @param recv_count the number of elements to receive * @param recv_offsets the offsets for the recv buffer * @param comm the communicator + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. */ template - void all_to_all_v(const SendType* send_buffer, const int* send_counts, + void all_to_all_v(std::shared_ptr exec, + const SendType* send_buffer, const int* send_counts, const int* send_offsets, RecvType* recv_buffer, const int* recv_counts, const int* recv_offsets) const { - GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoallv( - send_buffer, send_counts, send_offsets, - type_impl::get_type(), recv_buffer, recv_counts, - recv_offsets, type_impl::get_type(), this->get())); + this->all_to_all_v(std::move(exec), send_buffer, send_counts, + send_offsets, type_impl::get_type(), + recv_buffer, recv_counts, recv_offsets, + type_impl::get_type()); } /** * Communicate data from all ranks to all other ranks with * offsets (MPI_Alltoallv). See MPI documentation for more details. * + * @param exec The executor, on which the message buffers are located. 
+ * @param send_buffer the buffer to send + * @param send_count the number of elements to send + * @param send_offsets the offsets for the send buffer + * @param send_type the MPI_Datatype for the send buffer + * @param recv_buffer the buffer to gather into + * @param recv_count the number of elements to receive + * @param recv_offsets the offsets for the recv buffer + * @param recv_type the MPI_Datatype for the recv buffer + * @param comm the communicator + */ + void all_to_all_v(std::shared_ptr exec, + const void* send_buffer, const int* send_counts, + const int* send_offsets, MPI_Datatype send_type, + void* recv_buffer, const int* recv_counts, + const int* recv_offsets, MPI_Datatype recv_type) const + { + auto guard = exec->get_scoped_device_id_guard(); + GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoallv( + send_buffer, send_counts, send_offsets, send_type, recv_buffer, + recv_counts, recv_offsets, recv_type, this->get())); + } + + /** + * Communicate data from all ranks to all other ranks with + * offsets (MPI_Ialltoallv). See MPI documentation for more details. + * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to send * @param send_count the number of elements to send * @param send_offsets the offsets for the send buffer + * @param send_type the MPI_Datatype for the send buffer * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * @param recv_offsets the offsets for the recv buffer + * @param recv_type the MPI_Datatype for the recv buffer + * + * @return the request handle for the call + * + * @note This overload allows specifying the MPI_Datatype for both + * the send and received data. 
+ */ + request i_all_to_all_v(std::shared_ptr exec, + const void* send_buffer, const int* send_counts, + const int* send_offsets, MPI_Datatype send_type, + void* recv_buffer, const int* recv_counts, + const int* recv_offsets, + MPI_Datatype recv_type) const + { + auto guard = exec->get_scoped_device_id_guard(); + request req; + GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoallv( + send_buffer, send_counts, send_offsets, send_type, recv_buffer, + recv_counts, recv_offsets, recv_type, this->get(), req.get())); + return req; + } + + /** + * Communicate data from all ranks to all other ranks with + * offsets (MPI_Ialltoallv). See MPI documentation for more details. + * + * @param exec The executor, on which the message buffers are located. + * @param send_buffer the buffer to send + * @param send_count the number of elements to send + * @param send_offsets the offsets for the send buffer + * @param recv_buffer the buffer to gather into + * @param recv_count the number of elements to receive + * @param recv_offsets the offsets for the recv buffer + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. 
* * @return the request handle for the call */ template - request i_all_to_all_v(const SendType* send_buffer, const int* send_counts, + request i_all_to_all_v(std::shared_ptr exec, + const SendType* send_buffer, const int* send_counts, const int* send_offsets, RecvType* recv_buffer, const int* recv_counts, const int* recv_offsets) const { - request req; - GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoallv( - send_buffer, send_counts, send_offsets, + return this->i_all_to_all_v( + std::move(exec), send_buffer, send_counts, send_offsets, type_impl::get_type(), recv_buffer, recv_counts, - recv_offsets, type_impl::get_type(), this->get(), - req.get())); - return req; + recv_offsets, type_impl::get_type()); } /** * Does a scan operation with the given operator. * (MPI_Scan). See MPI documentation for more details. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to scan from * @param recv_buffer the result buffer * @param recv_count the number of elements to scan * @param operation the operation type to be used for the scan. See @MPI_Op + * + * @tparam ScanType the type of the data to scan. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. */ template - void scan(const ScanType* send_buffer, ScanType* recv_buffer, int count, - MPI_Op operation) const + void scan(std::shared_ptr exec, const ScanType* send_buffer, + ScanType* recv_buffer, int count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Scan(send_buffer, recv_buffer, count, type_impl::get_type(), operation, this->get())); @@ -1004,19 +1436,26 @@ class communicator { /** * Does a scan operation with the given operator. - * (MPI_Scan). See MPI documentation for more details. + * (MPI_Iscan). See MPI documentation for more details. * + * @param exec The executor, on which the message buffers are located. 
* @param send_buffer the buffer to scan from * @param recv_buffer the result buffer * @param recv_count the number of elements to scan * @param operation the operation type to be used for the scan. See @MPI_Op * + * @tparam ScanType the type of the data to scan. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * * @return the request handle for the call */ template - request i_scan(const ScanType* send_buffer, ScanType* recv_buffer, + request i_scan(std::shared_ptr exec, + const ScanType* send_buffer, ScanType* recv_buffer, int count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Iscan(send_buffer, recv_buffer, count, type_impl::get_type(), @@ -1026,9 +1465,6 @@ class communicator { private: std::shared_ptr comm_; - int size_{}; - int rank_{}; - int node_local_rank_{}; int get_my_rank() const { @@ -1126,6 +1562,7 @@ class window { * Create a window object with a given data pointer and type. A collective * operation. * + * @param exec The executor, on which the base pointer is located. * @param base the base pointer for the window object. * @param num_elems the num_elems of type ValueType the window points to. * @param comm the communicator whose ranks will have windows created. @@ -1133,11 +1570,12 @@ class window { * @param input_info the MPI_Info object used to set certain properties. * @param c_type the type of creation method to use to create the window. 
*/ - window(ValueType* base, int num_elems, const communicator& comm, - const int disp_unit = sizeof(ValueType), + window(std::shared_ptr exec, ValueType* base, int num_elems, + const communicator& comm, const int disp_unit = sizeof(ValueType), MPI_Info input_info = MPI_INFO_NULL, create_type c_type = create_type::create) { + auto guard = exec->get_scoped_device_id_guard(); unsigned size = num_elems * sizeof(ValueType); if (c_type == create_type::create) { GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_create( @@ -1283,6 +1721,7 @@ class window { /** * Put data into the target window. * + * @param exec The executor, on which the message buffer is located. * @param origin_buffer the buffer to send * @param origin_count the number of elements to put * @param target_rank the rank to put the data to @@ -1290,10 +1729,11 @@ class window { * @param target_count the request handle for the send call */ template - void put(const PutType* origin_buffer, const int origin_count, - const int target_rank, const unsigned int target_disp, - const int target_count) const + void put(std::shared_ptr exec, const PutType* origin_buffer, + const int origin_count, const int target_rank, + const unsigned int target_disp, const int target_count) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS( MPI_Put(origin_buffer, origin_count, type_impl::get_type(), target_rank, target_disp, target_count, @@ -1303,6 +1743,7 @@ class window { /** * Put data into the target window. * + * @param exec The executor, on which the message buffer is located. 
* @param origin_buffer the buffer to send * @param origin_count the number of elements to put * @param target_rank the rank to put the data to @@ -1312,10 +1753,12 @@ class window { * @return the request handle for the send call */ template - request r_put(const PutType* origin_buffer, const int origin_count, + request r_put(std::shared_ptr exec, + const PutType* origin_buffer, const int origin_count, const int target_rank, const unsigned int target_disp, const int target_count) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Rput( origin_buffer, origin_count, type_impl::get_type(), @@ -1327,6 +1770,7 @@ class window { /** * Accumulate data into the target window. * + * @param exec The executor, on which the message buffer is located. * @param origin_buffer the buffer to send * @param origin_count the number of elements to put * @param target_rank the rank to put the data to @@ -1335,10 +1779,12 @@ class window { * @param operation the reduce operation. See @MPI_Op */ template - void accumulate(const PutType* origin_buffer, const int origin_count, + void accumulate(std::shared_ptr exec, + const PutType* origin_buffer, const int origin_count, const int target_rank, const unsigned int target_disp, const int target_count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Accumulate( origin_buffer, origin_count, type_impl::get_type(), target_rank, target_disp, target_count, @@ -1348,6 +1794,7 @@ class window { /** * (Non-blocking) Accumulate data into the target window. * + * @param exec The executor, on which the message buffer is located. 
* @param origin_buffer the buffer to send * @param origin_count the number of elements to put * @param target_rank the rank to put the data to @@ -1358,10 +1805,12 @@ class window { * @return the request handle for the send call */ template - request r_accumulate(const PutType* origin_buffer, const int origin_count, + request r_accumulate(std::shared_ptr exec, + const PutType* origin_buffer, const int origin_count, const int target_rank, const unsigned int target_disp, const int target_count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Raccumulate( origin_buffer, origin_count, type_impl::get_type(), @@ -1374,6 +1823,7 @@ class window { /** * Get data from the target window. * + * @param exec The executor, on which the message buffer is located. * @param origin_buffer the buffer to send * @param origin_count the number of elements to get * @param target_rank the rank to get the data from @@ -1381,10 +1831,11 @@ class window { * @param target_count the request handle for the send call */ template - void get(GetType* origin_buffer, const int origin_count, - const int target_rank, const unsigned int target_disp, - const int target_count) const + void get(std::shared_ptr exec, GetType* origin_buffer, + const int origin_count, const int target_rank, + const unsigned int target_disp, const int target_count) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS( MPI_Get(origin_buffer, origin_count, type_impl::get_type(), target_rank, target_disp, target_count, @@ -1394,6 +1845,7 @@ class window { /** * Get data (with handle) from the target window. * + * @param exec The executor, on which the message buffer is located. 
* @param origin_buffer the buffer to send * @param origin_count the number of elements to get * @param target_rank the rank to get the data from @@ -1403,10 +1855,11 @@ class window { * @return the request handle for the send call */ template - request r_get(GetType* origin_buffer, const int origin_count, - const int target_rank, const unsigned int target_disp, - const int target_count) const + request r_get(std::shared_ptr exec, GetType* origin_buffer, + const int origin_count, const int target_rank, + const unsigned int target_disp, const int target_count) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Rget( origin_buffer, origin_count, type_impl::get_type(), @@ -1418,6 +1871,7 @@ class window { /** * Get Accumulate data from the target window. * + * @param exec The executor, on which the message buffers are located. * @param origin_buffer the buffer to send * @param origin_count the number of elements to get * @param result_buffer the buffer to receive the target data @@ -1428,11 +1882,13 @@ class window { * @param operation the reduce operation. See @MPI_Op */ template - void get_accumulate(GetType* origin_buffer, const int origin_count, + void get_accumulate(std::shared_ptr exec, + GetType* origin_buffer, const int origin_count, GetType* result_buffer, const int result_count, const int target_rank, const unsigned int target_disp, const int target_count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Get_accumulate( origin_buffer, origin_count, type_impl::get_type(), result_buffer, result_count, type_impl::get_type(), @@ -1443,6 +1899,7 @@ class window { /** * (Non-blocking) Get Accumulate data (with handle) from the target window. * + * @param exec The executor, on which the message buffers are located. 
* @param origin_buffer the buffer to send * @param origin_count the number of elements to get * @param result_buffer the buffer to receive the target data @@ -1455,12 +1912,14 @@ class window { * @return the request handle for the send call */ template - request r_get_accumulate(GetType* origin_buffer, const int origin_count, + request r_get_accumulate(std::shared_ptr exec, + GetType* origin_buffer, const int origin_count, GetType* result_buffer, const int result_count, const int target_rank, const unsigned int target_disp, const int target_count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Rget_accumulate( origin_buffer, origin_count, type_impl::get_type(), @@ -1475,16 +1934,19 @@ class window { * Fetch and operate on data from the target window (An optimized version of * Get_accumulate). * + * @param exec The executor, on which the message buffer is located. * @param origin_buffer the buffer to send * @param target_rank the rank to get the data from * @param target_disp the displacement at the target window * @param operation the reduce operation. 
See @MPI_Op */ template - void fetch_and_op(GetType* origin_buffer, GetType* result_buffer, + void fetch_and_op(std::shared_ptr exec, + GetType* origin_buffer, GetType* result_buffer, const int target_rank, const unsigned int target_disp, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Fetch_and_op( origin_buffer, result_buffer, type_impl::get_type(), target_rank, target_disp, operation, this->get_window())); @@ -1496,6 +1958,7 @@ class window { } // namespace mpi +} // namespace experimental } // namespace gko diff --git a/include/ginkgo/core/base/perturbation.hpp b/include/ginkgo/core/base/perturbation.hpp index 38da78be2cf..0da68a375bc 100644 --- a/include/ginkgo/core/base/perturbation.hpp +++ b/include/ginkgo/core/base/perturbation.hpp @@ -66,7 +66,7 @@ namespace gko { template class Perturbation : public EnableLinOp>, public EnableCreateMethod> { - friend class EnablePolymorphicObject; + friend struct polymorphic_object_traits; friend class EnableCreateMethod; public: diff --git a/include/ginkgo/core/base/polymorphic_object.hpp b/include/ginkgo/core/base/polymorphic_object.hpp index 19af66924bb..281838dfedf 100644 --- a/include/ginkgo/core/base/polymorphic_object.hpp +++ b/include/ginkgo/core/base/polymorphic_object.hpp @@ -609,6 +609,29 @@ std::shared_ptr copy_and_convert_to( } +template +struct polymorphic_object_traits { + static std::unique_ptr create_default_impl( + const ConcreteObject* self, std::shared_ptr exec) + { + return std::unique_ptr{new ConcreteObject(exec)}; + } + + template + static std::unique_ptr create_conversion_target_impl( + const OtherType* self, std::shared_ptr exec) + { + return std::unique_ptr{new ConcreteObject(exec)}; + } + + static PolymorphicObject* clear_impl(ConcreteObject* self) + { + *self = ConcreteObject{self->get_executor()}; + return self; + } +}; + + /** * This mixin inherits from (a subclass of) PolymorphicObject and provides a * base implementation of a new 
concrete polymorphic object. @@ -653,7 +676,8 @@ class EnablePolymorphicObject std::unique_ptr create_default_impl( std::shared_ptr exec) const override { - return std::unique_ptr{new ConcreteObject(exec)}; + return polymorphic_object_traits::create_default_impl( + self(), std::move(exec)); } PolymorphicObject* copy_from_impl(const PolymorphicObject* other) override @@ -684,8 +708,7 @@ class EnablePolymorphicObject PolymorphicObject* clear_impl() override { - *self() = ConcreteObject{this->get_executor()}; - return this; + return polymorphic_object_traits::clear_impl(self()); } private: diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index a10f1871718..3e03519ff7b 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -34,8 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_PUBLIC_CORE_BASE_PRECISION_DISPATCH_HPP_ +#include #include #include +#include #include @@ -330,6 +332,288 @@ void mixed_precision_dispatch_real_complex(Function fn, const LinOp* in, } +namespace experimental { + + +#if GINKGO_BUILD_MPI + + +namespace distributed { + + +/** + * Convert the given LinOp from experimental::distributed::Vector<...> to + * experimental::distributed::Vector. The conversion tries to convert + * the input LinOp to all Dense types with value type recursively reachable by + * next_precision<...> starting from the ValueType template parameter. This + * means that all real-to-real and complex-to-complex conversions for default + * precisions are being considered. If the input matrix is non-const, the + * contents of the modified converted object will be converted back to the input + * matrix when the returned object is destroyed. This may lead to a loss of + * precision! + * + * @param matrix the input matrix which is supposed to be converted. 
It is + * wrapped unchanged if it is already of type + * experimental::distributed::Vector, otherwise it + * will be converted to this type if possible. + * + * @returns a detail::temporary_conversion pointing to the (potentially + * converted) object. + * + * @throws NotSupported if the input matrix cannot be converted to + * experimental::distributed::Vector + * + * @tparam ValueType the value type into whose associated + * experimental::distributed::Vector type to convert the input LinOp. + */ +template +detail::temporary_conversion> +make_temporary_conversion(LinOp* matrix) +{ + auto result = detail::temporary_conversion< + experimental::distributed::Vector>:: + template create< + experimental::distributed::Vector>>( + matrix); + if (!result) { + GKO_NOT_SUPPORTED(matrix); + } + return result; +} + + +/** + * @copydoc make_temporary_conversion + */ +template +detail::temporary_conversion> +make_temporary_conversion(const LinOp* matrix) +{ + auto result = detail::temporary_conversion< + const experimental::distributed::Vector>:: + template create< + experimental::distributed::Vector>>( + matrix); + if (!result) { + GKO_NOT_SUPPORTED(matrix); + } + return result; +} + + +/** + * Calls the given function with each given argument LinOp temporarily + * converted into experimental::distributed::Vector as parameters. + * + * @param fn the given function. It will be passed one (potentially const) + * experimental::distributed::Vector* parameter per + * parameter in the parameter pack `linops`. + * @param linops the given arguments to be converted and passed on to fn. + * + * @tparam ValueType the value type to use for the parameters of `fn`. + * @tparam Function the function pointer, lambda or other functor type to call + * with the converted arguments. + * @tparam Args the argument type list. + */ +template +void precision_dispatch(Function fn, Args*... 
linops) +{ + fn(distributed::make_temporary_conversion(linops).get()...); +} + + +/** + * Calls the given function with the given LinOps temporarily converted to + * experimental::distributed::Vector* as parameters. + * If ValueType is real and both input vectors are complex, uses + * experimental::distributed::Vector::get_real_view() to convert them into real + * matrices after precision conversion. + * + * @see precision_dispatch() + */ +template +void precision_dispatch_real_complex(Function fn, const LinOp* in, LinOp* out) +{ + auto complex_to_real = !( + is_complex() || + dynamic_cast>*>( + in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>(in); + auto dense_out = + distributed::make_temporary_conversion>(out); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dynamic_cast(dense_in->create_real_view().get()), + dynamic_cast(dense_out->create_real_view().get())); + } else { + distributed::precision_dispatch(fn, in, out); + } +} + + +/** + * @copydoc precision_dispatch_real_complex(Function, const LinOp*, LinOp*) + */ +template +void precision_dispatch_real_complex(Function fn, const LinOp* alpha, + const LinOp* in, LinOp* out) +{ + auto complex_to_real = !( + is_complex() || + dynamic_cast>*>( + in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>(in); + auto dense_out = + distributed::make_temporary_conversion>(out); + auto dense_alpha = gko::make_temporary_conversion(alpha); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dense_alpha.get(), + dynamic_cast(dense_in->create_real_view().get()), + 
dynamic_cast(dense_out->create_real_view().get())); + } else { + fn(gko::make_temporary_conversion(alpha).get(), + distributed::make_temporary_conversion(in).get(), + distributed::make_temporary_conversion(out).get()); + } +} + + +/** + * @copydoc precision_dispatch_real_complex(Function, const LinOp*, LinOp*) + */ +template +void precision_dispatch_real_complex(Function fn, const LinOp* alpha, + const LinOp* in, const LinOp* beta, + LinOp* out) +{ + auto complex_to_real = !( + is_complex() || + dynamic_cast>*>( + in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>(in); + auto dense_out = + distributed::make_temporary_conversion>(out); + auto dense_alpha = gko::make_temporary_conversion(alpha); + auto dense_beta = gko::make_temporary_conversion(beta); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dense_alpha.get(), + dynamic_cast(dense_in->create_real_view().get()), + dense_beta.get(), + dynamic_cast(dense_out->create_real_view().get())); + } else { + fn(gko::make_temporary_conversion(alpha).get(), + distributed::make_temporary_conversion(in).get(), + gko::make_temporary_conversion(beta).get(), + distributed::make_temporary_conversion(out).get()); + } +} + + +} // namespace distributed + + +/** + * Calls the given function with the given LinOps temporarily converted to + * either experimental::distributed::Vector* or + * matrix::Dense as parameters. The choice depends on the runtime + * type of `in` and `out` is assumed to fall into the same category. If + * ValueType is real and both input vectors are complex, uses + * experimental::distributed::Vector::get_real_view(), or + * matrix::Dense::get_real_view() to convert them into real matrices after + * precision conversion. 
+ * + * @see precision_dispatch() + * @see distributed::precision_dispatch() + */ +template +void precision_dispatch_real_complex_distributed(Function fn, const LinOp* in, + LinOp* out) +{ + if (dynamic_cast(in)) { + experimental::distributed::precision_dispatch_real_complex( + fn, in, out); + } else { + gko::precision_dispatch_real_complex(fn, in, out); + } +} + + +/** + * @copydoc precision_dispatch_real_complex_distributed(Function, const LinOp*, + * LinOp*) + */ +template +void precision_dispatch_real_complex_distributed(Function fn, + const LinOp* alpha, + const LinOp* in, LinOp* out) +{ + if (dynamic_cast(in)) { + experimental::distributed::precision_dispatch_real_complex( + fn, alpha, in, out); + } else { + gko::precision_dispatch_real_complex(fn, alpha, in, out); + } +} + + +/** + * @copydoc precision_dispatch_real_complex_distributed(Function, const LinOp*, + * LinOp*) + */ +template +void precision_dispatch_real_complex_distributed(Function fn, + const LinOp* alpha, + const LinOp* in, + const LinOp* beta, LinOp* out) +{ + if (dynamic_cast(in)) { + experimental::distributed::precision_dispatch_real_complex( + fn, alpha, in, beta, out); + } else { + gko::precision_dispatch_real_complex(fn, alpha, in, beta, + out); + } +} + + +#else + + +/** + * Calls the given function with the given LinOps temporarily converted to + * matrix::Dense as parameters. + * If ValueType is real and both input vectors are complex, uses + * experimental::distributed::Vector::get_real_view(), or + * matrix::Dense::get_real_view() to convert them into real matrices after + * precision conversion. + * + * @see precision_dispatch() + */ +template +void precision_dispatch_real_complex_distributed(Function fn, Args*... 
args) +{ + precision_dispatch_real_complex(fn, args...); +} + + +#endif + + +} // namespace experimental } // namespace gko diff --git a/include/ginkgo/core/base/scoped_device_id_guard.hpp b/include/ginkgo/core/base/scoped_device_id_guard.hpp new file mode 100644 index 00000000000..d68598c73f1 --- /dev/null +++ b/include/ginkgo/core/base/scoped_device_id_guard.hpp @@ -0,0 +1,181 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_SCOPED_DEVICE_ID_GUARD_HPP_ +#define GKO_PUBLIC_CORE_BASE_SCOPED_DEVICE_ID_GUARD_HPP_ + + +#include + + +namespace gko { + + +class OmpExecutor; +class ReferenceExecutor; +class CudaExecutor; +class HipExecutor; +class DpcppExecutor; + + +namespace detail { + + +/** + * A RAII, move-only base class for the scoped device id used for different + * executors. + */ +class generic_scoped_device_id_guard { +public: + generic_scoped_device_id_guard() = default; + + // TODO: this should be a purely virtual funtion, but somehow that leads to + // linker errors + virtual ~generic_scoped_device_id_guard() = default; + + // Prohibit copy construction + generic_scoped_device_id_guard( + const generic_scoped_device_id_guard& other) = delete; + + // Prohibit copy assignment + generic_scoped_device_id_guard& operator=( + const generic_scoped_device_id_guard& other) = delete; +}; + + +} // namespace detail + + +/** + * This move-only class uses RAII to set the device id within a scoped block, if + * necessary. + * + * The class behaves similar to std::scoped_lock. The scoped guard will make + * sure that the device code is run on the correct device within one scoped + * block, when run with multiple devices. Depending on the executor it will + * record the current device id and set the device id to the one being passed + * in. After the scope has been exited, the destructor sets the device_id back + * to the one before entering the scope. The OmpExecutor and DpcppExecutor don't + * require setting an device id, so in those cases, the class is a no-op. + * + * The device id scope has to be constructed from a executor with concrete type + * (not plain Executor) and a device id. Only the type of the executor object is + * relevant, so the pointer will not be accessed, and may even be a nullptr. 
+ * From the executor type the correct derived class of + * detail::generic_scoped_device_id_guard is picked. The following illustrates + * the usage of this class: + * ``` + * { + * scoped_device_id_guard g{static_cast(nullptr), 1}; + * // now the device id is set to 1 + * } + * // now the device id is reverted again + * ``` + */ +class scoped_device_id_guard { +public: + /** + * Create a scoped device id from an Reference. + * + * The resulting object will be a noop. + * + * @param exec Not used. + * @param device_id Not used. + */ + scoped_device_id_guard(const ReferenceExecutor* exec, int device_id); + + /** + * Create a scoped device id from an OmpExecutor. + * + * The resulting object will be a noop. + * + * @param exec Not used. + * @param device_id Not used. + */ + scoped_device_id_guard(const OmpExecutor* exec, int device_id); + + /** + * Create a scoped device id from an CudaExecutor. + * + * The resulting object will set the cuda device id accordingly. + * + * @param exec Not used. + * @param device_id The device id to use within the scope. + */ + scoped_device_id_guard(const CudaExecutor* exec, int device_id); + + /** + * Create a scoped device id from an HipExecutor. + * + * The resulting object will set the hip device id accordingly. + * + * @param exec Not used. + * @param device_id The device id to use within the scope. + */ + scoped_device_id_guard(const HipExecutor* exec, int device_id); + + /** + * Create a scoped device id from an DpcppExecutor. + * + * The resulting object will be a noop. + * + * @param exec Not used. + * @param device_id Not used. + */ + scoped_device_id_guard(const DpcppExecutor* exec, int device_id); + + scoped_device_id_guard() = default; + + // Prohibit copy construction. + scoped_device_id_guard(const scoped_device_id_guard&) = delete; + + // Allow move construction. + // These are needed, since C++14 does not guarantee copy elision. 
+ scoped_device_id_guard(scoped_device_id_guard&&) = default; + + // Prohibit copy assignment. + scoped_device_id_guard& operator=(const scoped_device_id_guard&) = delete; + + // Allow move construction. + // These are needed, since C++14 does not guarantee copy elision. + scoped_device_id_guard& operator=(scoped_device_id_guard&&) = default; + + ~scoped_device_id_guard() = default; + +private: + std::unique_ptr scope_; +}; + + +} // namespace gko + +#endif // GKO_PUBLIC_CORE_BASE_SCOPED_DEVICE_ID_GUARD_HPP_ diff --git a/include/ginkgo/core/base/temporary_conversion.hpp b/include/ginkgo/core/base/temporary_conversion.hpp index 314e169cc78..a553011c02c 100644 --- a/include/ginkgo/core/base/temporary_conversion.hpp +++ b/include/ginkgo/core/base/temporary_conversion.hpp @@ -146,7 +146,11 @@ struct conversion_helper { if ((cast_obj = dynamic_cast(obj))) { // if the cast is successful, obj is of dynamic type candidate_type // so we can convert from this type to TargetType - auto converted = TargetType::create(obj->get_executor()); + auto converted = + polymorphic_object_traits>:: + create_conversion_target_impl(cast_obj, + cast_obj->get_executor()); + // TargetType::create(obj->get_executor()); cast_obj->convert_to(converted.get()); // Make sure ConvertibleTo is available and symmetric static_assert( diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 009c2e182cd..efb6dffe30d 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -152,7 +152,7 @@ using uint64 = std::uint64_t; /** - * + * Unsigned integer type capable of holding a pointer to void */ using uintptr = std::uintptr_t; @@ -559,6 +559,73 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #endif +/** + * Instantiates a template for each non-complex value, local and global index + * type compiled by Ginkgo. 
+ * + * @param _macro A macro which expands the template instantiation + * (not including the leading `template` specifier). + * Should take three arguments, which are replaced by the + * value, the local and the global index types. + */ +#if GINKGO_DPCPP_SINGLE_MODE +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ + _macro) \ + template _macro(float, int32, int32); \ + template _macro(float, int32, int64); \ + template _macro(float, int64, int64); \ + template <> \ + _macro(double, int32, int32) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(double, int32, int64) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(double, int64, int64) GKO_NOT_IMPLEMENTED +#else +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ + _macro) \ + template _macro(float, int32, int32); \ + template _macro(float, int32, int64); \ + template _macro(float, int64, int64); \ + template _macro(double, int32, int32); \ + template _macro(double, int32, int64); \ + template _macro(double, int64, int64) +#endif + + +/** + * Instantiates a template for each value and index type compiled by Ginkgo. + * + * @param _macro A macro which expands the template instantiation + * (not including the leading `template` specifier). + * Should take two arguments, which are replaced by the + * value and index types. 
+ */ +#if GINKGO_DPCPP_SINGLE_MODE +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ + _macro); \ + template _macro(std::complex, int32, int32); \ + template _macro(std::complex, int32, int64); \ + template _macro(std::complex, int64, int64); \ + template <> \ + _macro(std::complex, int32, int32) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(std::complex, int32, int64) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(std::complex, int64, int64) GKO_NOT_IMPLEMENTED +#else +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ + _macro); \ + template _macro(std::complex, int32, int32); \ + template _macro(std::complex, int32, int64); \ + template _macro(std::complex, int64, int64); \ + template _macro(std::complex, int32, int32); \ + template _macro(std::complex, int32, int64); \ + template _macro(std::complex, int64, int64) +#endif + + #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ template <> \ @@ -580,8 +647,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED #else - - /** * Instantiates a template for each value type conversion pair compiled by * Ginkgo. 
@@ -699,6 +764,7 @@ inline constexpr GKO_ATTRIBUTES IndexType invalid_index() } +namespace experimental { namespace distributed { @@ -726,6 +792,7 @@ using comm_index_type = int; } // namespace distributed +} // namespace experimental } // namespace gko diff --git a/include/ginkgo/core/distributed/base.hpp b/include/ginkgo/core/distributed/base.hpp new file mode 100644 index 00000000000..70459d9cbf0 --- /dev/null +++ b/include/ginkgo/core/distributed/base.hpp @@ -0,0 +1,106 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_BASE_HPP_ +#define GKO_PUBLIC_CORE_DISTRIBUTED_BASE_HPP_ + + +#include + + +#if GINKGO_BUILD_MPI + + +#include + + +namespace gko { +namespace experimental { +namespace distributed { + + +/** + * A base class for distributed objects. + * + * This class stores and gives access to the used mpi::communicator object. + * + * @note The communicator is not changed on assignment. + * + * @ingroup distributed + */ +class DistributedBase { +public: + virtual ~DistributedBase() = default; + + DistributedBase(const DistributedBase& other) = default; + + DistributedBase(DistributedBase&& other) = default; + + /** + * Copy assignment that doesn't change the used mpi::communicator. + * @return unmodified *this + */ + DistributedBase& operator=(const DistributedBase&) { return *this; } + + /** + * Move assignment that doesn't change the used mpi::communicator. + * @return unmodified *this + */ + DistributedBase& operator=(DistributedBase&&) noexcept { return *this; } + + /** + * Access the used mpi::communicator. + * @return used mpi::communicator + */ + mpi::communicator get_communicator() const { return comm_; } + +protected: + /** + * Creates a new DistributedBase with the specified mpi::communicator. 
+ * @param comm used mpi::communicator + */ + explicit DistributedBase(mpi::communicator comm) : comm_{std::move(comm)} {} + +private: + mpi::communicator comm_; +}; + + +} // namespace distributed +} // namespace experimental +} // namespace gko + + +#endif // GINKGO_BUILD_MPI + + +#endif // GKO_PUBLIC_CORE_DISTRIBUTED_BASE_HPP_ diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp new file mode 100644 index 00000000000..0000d56600c --- /dev/null +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -0,0 +1,568 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_MATRIX_HPP_ +#define GKO_PUBLIC_CORE_DISTRIBUTED_MATRIX_HPP_ + + +#include + + +#if GINKGO_BUILD_MPI + + +#include +#include +#include +#include + + +namespace gko { +namespace matrix { + + +template +class Csr; + + +} + + +namespace detail { + + +template