Skip to content

Commit

Permalink
Merge distributed capabilities
Browse files Browse the repository at this point in the history
This PR adds basic distributed data structures (matrix and vector) and enables some solvers for these types. It contains the following PRs:
- #961
- #971 
- #976 
- #985 
- #1007 
- #1030 
- #1054

# Additional Changes

- moves new types into experimental namespace
- moves existing Partition class into experimental namespace
- moves existing mpi namespace into experimental namespace
- makes the generic_scoped_device_id_guard destructor noexcept by terminating if restoring the original device id fails
- switches to blocking communication in the SpMV if OpenMPI version 4.0.x is used
- disables HoreKa MPI tests and uses nla-gpu instead

Related PR: #1133
  • Loading branch information
MarcelKoch authored Oct 31, 2022
2 parents cff9742 + b59a9dd commit c1f8bd4
Show file tree
Hide file tree
Showing 192 changed files with 11,800 additions and 892 deletions.
39 changes: 4 additions & 35 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -341,38 +341,6 @@ build/cuda102/nompi/intel/cuda/debug/static:
CUDA_ARCH: 35

# cuda 11.0 and friends on HoreKa with tests
build/cuda110/mvapich2/gcc/cuda/debug/shared:
extends:
- .build_template
- .default_variables
- .full_test_condition
- .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
variables:
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_MPI: "ON"
BUILD_TYPE: "Debug"
FAST_TESTS: "ON"
CUDA_ARCH: 80
USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}"
KEEP_CONTAINER: "ON"
USE_SLURM: 0

test/cuda110/mvapich2/gcc/cuda/debug/shared:
extends:
- .horeka_test_template
- .default_variables
- .full_test_condition
- .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
variables:
USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}"
SLURM_PARTITION: "accelerated"
SLURM_GRES: "gpu:1"
SLURM_TIME: "00:45:00"
dependencies: null
needs: [ "build/cuda110/mvapich2/gcc/cuda/debug/shared" ]


build/cuda110/nompi/clang/cuda/release/static:
extends:
- .build_template
Expand Down Expand Up @@ -533,13 +501,15 @@ build/amd/openmpi/clang/rocm502/release/shared:
extends:
- .build_and_test_template
- .default_variables
- .quick_test_condition
- .use_gko-rocm502-openmpi-gnu11-llvm11
- .full_test_condition
- .use_gko-rocm502-openmpi-gnu11-llvm11-multi-gpu
variables:
C_COMPILER: "clang"
CXX_COMPILER: "clang++"
BUILD_OMP: "ON"
BUILD_HIP: "ON"
BUILD_MPI: "ON"
MPI_AS_ROOT: "ON"
RUN_EXAMPLES: "ON"
BUILD_TYPE: "Release"

Expand Down Expand Up @@ -834,7 +804,6 @@ iwyu:
variables:
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_CUDA: "HIP"
EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_IWYU=ON'
allow_failure: yes

Expand Down
6 changes: 6 additions & 0 deletions .gitlab/image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,12 @@
- amdci
- gpu

.use_gko-rocm502-openmpi-gnu11-llvm11-multi-gpu:
image: ginkgohub/rocm:502-openmpi-gnu11-llvm11
tags:
- private_ci
- nla-gpu

.use_gko-oneapi-cpu:
image: ginkgohub/oneapi:latest
tags:
Expand Down
41 changes: 33 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ bac
option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON)
option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." ON)
option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF)
option(GINKGO_FORCE_GPU_AWARE_MPI "Assert that the MPI library is GPU aware. This forces Ginkgo to assume that GPU aware functionality is available (OFF (default) or ON), but may fail
catastrophically in case the MPI implementation is not GPU Aware, and GPU aware functionality has been forced" OFF)

# load executor-specific configuration
if(GINKGO_BUILD_CUDA)
Expand All @@ -107,10 +109,10 @@ include(cmake/build_type_helpers.cmake)
include(cmake/build_helpers.cmake)
include(cmake/install_helpers.cmake)

if (MSVC)
if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
endif()
if (MINGW OR CYGWIN)
if(MINGW OR CYGWIN)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wa,-mbig-obj")
endif()

Expand Down Expand Up @@ -204,8 +206,31 @@ else()
message(STATUS "HWLOC is being forcibly switched off")
endif()

set(GINKGO_HAVE_GPU_AWARE_MPI OFF)
set(GINKGO_FORCE_SPMV_BLOCKING_COMM OFF)
if(GINKGO_BUILD_MPI)
find_package(MPI REQUIRED)
if(GINKGO_FORCE_GPU_AWARE_MPI)
set(GINKGO_HAVE_GPU_AWARE_MPI ON)
else()
set(GINKGO_HAVE_GPU_AWARE_MPI OFF)
endif()

try_run(uses_openmpi gko_result_unused
${PROJECT_BINARY_DIR}
${CMAKE_SOURCE_DIR}/cmake/openmpi_test.cpp
LINK_LIBRARIES MPI::MPI_CXX
RUN_OUTPUT_VARIABLE openmpi_version
)
if(uses_openmpi)
if(openmpi_version VERSION_LESS "4.1")
message(WARNING
"OpenMPI v4.0.x has a bug that forces us to use blocking communication in our distributed "
"matrix class. To enable faster, non-blocking communication, consider updating your OpenMPI version or "
"switch to a different vendor.")
set(GINKGO_FORCE_SPMV_BLOCKING_COMM ON)
endif()
endif()
endif()

# Try to find the third party packages before using our subdirectories
Expand Down Expand Up @@ -241,21 +266,21 @@ add_subdirectory(common) # Import list of unified kernel source files
if(GINKGO_BUILD_CUDA)
add_subdirectory(cuda) # High-performance kernels for NVIDIA GPUs
endif()
if (GINKGO_BUILD_REFERENCE)
if(GINKGO_BUILD_REFERENCE)
add_subdirectory(reference) # Reference kernel implementations
endif()
if(GINKGO_BUILD_HIP)
add_subdirectory(hip) # High-performance kernels for AMD or NVIDIA GPUs
endif()
if (GINKGO_BUILD_DPCPP)
if(GINKGO_BUILD_DPCPP)
add_subdirectory(dpcpp) # High-performance DPC++ kernels
endif()
if (GINKGO_BUILD_OMP)
if(GINKGO_BUILD_OMP)
add_subdirectory(omp) # High-performance omp kernels
endif()
add_subdirectory(core) # Core Ginkgo types and top-level functions
add_subdirectory(include) # Public API self-contained check
if (GINKGO_BUILD_TESTS)
if(GINKGO_BUILD_TESTS)
add_subdirectory(test) # Tests running on all executors
endif()

Expand Down Expand Up @@ -323,7 +348,7 @@ endif()
configure_file(${Ginkgo_SOURCE_DIR}/cmake/ginkgo.pc.in
${Ginkgo_BINARY_DIR}/ginkgo.pc.in @ONLY)
file(GENERATE OUTPUT ${Ginkgo_BINARY_DIR}/ginkgo_$<CONFIG>.pc
INPUT ${Ginkgo_BINARY_DIR}/ginkgo.pc.in)
INPUT ${Ginkgo_BINARY_DIR}/ginkgo.pc.in)

# WINDOWS NVCC has " inside the string, add escape character
# to avoid config problem.
Expand Down Expand Up @@ -356,7 +381,7 @@ endif()
file(MAKE_DIRECTORY "${GINKGO_TEST_INSTALL_BIN_DIR}")
file(MAKE_DIRECTORY "${GINKGO_TEST_EXPORTBUILD_BIN_DIR}")
set(TOOLSET "")
if (NOT "${CMAKE_GENERATOR_TOOLSET}" STREQUAL "")
if(NOT "${CMAKE_GENERATOR_TOOLSET}" STREQUAL "")
set(TOOLSET "-T${CMAKE_GENERATOR_TOOLSET}")
endif()
add_custom_target(test_install
Expand Down
56 changes: 22 additions & 34 deletions benchmark/utils/cuda_linops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "benchmark/utils/sparselib_linops.hpp"
#include "benchmark/utils/types.hpp"
#include "cuda/base/cusparse_bindings.hpp"
#include "cuda/base/device_guard.hpp"
#include "cuda/base/pointer_mode_guard.hpp"
#include "cuda/base/types.hpp"

Expand Down Expand Up @@ -102,12 +101,12 @@ protected:

void initialize_descr()
{
const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto exec = this->get_gpu_exec();
auto guard = exec->get_scoped_device_id_guard();
this->descr_ = handle_manager<cusparseMatDescr>(
gko::kernels::cuda::cusparse::create_mat_descr(),
[id](cusparseMatDescr_t descr) {
gko::cuda::device_guard g{id};
[exec](cusparseMatDescr_t descr) {
auto guard = exec->get_scoped_device_id_guard();
gko::kernels::cuda::cusparse::destroy(descr);
});
}
Expand All @@ -130,7 +129,7 @@ class CusparseCsrmp
public gko::ReadableFromMatrixData<ValueType, IndexType>,
public gko::EnableCreateMethod<CusparseCsrmp<ValueType, IndexType>> {
friend class gko::EnableCreateMethod<CusparseCsrmp>;
friend class gko::EnablePolymorphicObject<CusparseCsrmp, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseCsrmp>;

public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
Expand Down Expand Up @@ -166,8 +165,7 @@ protected:
auto db = dense_b->get_const_values();
auto dx = dense_x->get_values();

const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
gko::kernels::cuda::cusparse::spmv_mp(
this->get_gpu_exec()->get_cusparse_handle(), trans_,
this->get_size()[0], this->get_size()[1],
Expand Down Expand Up @@ -205,7 +203,7 @@ class CusparseCsr
public gko::EnableCreateMethod<CusparseCsr<ValueType, IndexType>>,
public gko::ReadableFromMatrixData<ValueType, IndexType> {
friend class gko::EnableCreateMethod<CusparseCsr>;
friend class gko::EnablePolymorphicObject<CusparseCsr, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseCsr>;

public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
Expand Down Expand Up @@ -241,8 +239,7 @@ protected:
auto db = dense_b->get_const_values();
auto dx = dense_x->get_values();

const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
gko::kernels::cuda::cusparse::spmv(
this->get_gpu_exec()->get_cusparse_handle(), trans_,
this->get_size()[0], this->get_size()[1],
Expand Down Expand Up @@ -281,7 +278,7 @@ class CusparseCsrmm
public gko::EnableCreateMethod<CusparseCsrmm<ValueType, IndexType>>,
public gko::ReadableFromMatrixData<ValueType, IndexType> {
friend class gko::EnableCreateMethod<CusparseCsrmm>;
friend class gko::EnablePolymorphicObject<CusparseCsrmm, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseCsrmm>;

public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
Expand Down Expand Up @@ -317,8 +314,7 @@ protected:
auto db = dense_b->get_const_values();
auto dx = dense_x->get_values();

const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
gko::kernels::cuda::cusparse::spmm(
this->get_gpu_exec()->get_cusparse_handle(), trans_,
this->get_size()[0], dense_b->get_size()[1], this->get_size()[1],
Expand Down Expand Up @@ -361,7 +357,7 @@ class CusparseCsrEx
public gko::EnableCreateMethod<CusparseCsrEx<ValueType, IndexType>>,
public gko::ReadableFromMatrixData<ValueType, IndexType> {
friend class gko::EnableCreateMethod<CusparseCsrEx>;
friend class gko::EnablePolymorphicObject<CusparseCsrEx, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseCsrEx>;

public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
Expand Down Expand Up @@ -404,8 +400,7 @@ protected:
ValueType beta = gko::zero<ValueType>();
gko::size_type buffer_size = 0;

const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
auto handle = this->get_gpu_exec()->get_cusparse_handle();
// This function seems to require the pointer mode to be set to HOST.
// Ginkgo use pointer mode DEVICE by default, so we change this
Expand Down Expand Up @@ -468,7 +463,7 @@ class CusparseHybrid
CusparseHybrid<ValueType, IndexType, Partition, Threshold>>,
public gko::ReadableFromMatrixData<ValueType, IndexType> {
friend class gko::EnableCreateMethod<CusparseHybrid>;
friend class gko::EnablePolymorphicObject<CusparseHybrid, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseHybrid>;

public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
Expand All @@ -492,8 +487,7 @@ public:
t_csr->read(data);
this->set_size(t_csr->get_size());

const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
gko::kernels::cuda::cusparse::csr2hyb(
this->get_gpu_exec()->get_cusparse_handle(), this->get_size()[0],
this->get_size()[1], this->get_descr(), t_csr->get_const_values(),
Expand All @@ -503,9 +497,8 @@ public:

~CusparseHybrid() override
{
const auto id = this->get_gpu_exec()->get_device_id();
try {
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyHybMat(hyb_));
} catch (const std::exception& e) {
std::cerr << "Error when unallocating CusparseHybrid hyb_ matrix: "
Expand All @@ -525,8 +518,7 @@ protected:
auto db = dense_b->get_const_values();
auto dx = dense_x->get_values();

const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
gko::kernels::cuda::cusparse::spmv(
this->get_gpu_exec()->get_cusparse_handle(), trans_,
&scalars.get_const_data()[0], this->get_descr(), hyb_, db,
Expand All @@ -542,8 +534,7 @@ protected:
: gko::EnableLinOp<CusparseHybrid, CusparseBase>(exec, size),
trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
{
const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateHybMat(&hyb_));
}

Expand Down Expand Up @@ -576,8 +567,7 @@ void cusparse_generic_spmv(std::shared_ptr<const gko::CudaExecutor> gpu_exec,
auto dense_x = gko::as<gko::matrix::Dense<ValueType>>(x);
auto db = dense_b->get_const_values();
auto dx = dense_x->get_values();
const auto id = gpu_exec->get_device_id();
gko::cuda::device_guard g{id};
auto guard = gpu_exec->get_scoped_device_id_guard();
cusparseDnVecDescr_t vecb, vecx;
GKO_ASSERT_NO_CUSPARSE_ERRORS(
cusparseCreateDnVec(&vecx, dense_x->get_num_stored_elements(),
Expand Down Expand Up @@ -612,7 +602,7 @@ class CusparseGenericCsr
CusparseGenericCsr<ValueType, IndexType, Alg>>,
public gko::ReadableFromMatrixData<ValueType, IndexType> {
friend class gko::EnableCreateMethod<CusparseGenericCsr>;
friend class gko::EnablePolymorphicObject<CusparseGenericCsr, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseGenericCsr>;

public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
Expand Down Expand Up @@ -653,9 +643,8 @@ public:

~CusparseGenericCsr() override
{
const auto id = this->get_gpu_exec()->get_device_id();
try {
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_));
} catch (const std::exception& e) {
std::cerr
Expand Down Expand Up @@ -705,7 +694,7 @@ class CusparseGenericCoo
public gko::EnableCreateMethod<CusparseGenericCoo<ValueType, IndexType>>,
public gko::ReadableFromMatrixData<ValueType, IndexType> {
friend class gko::EnableCreateMethod<CusparseGenericCoo>;
friend class gko::EnablePolymorphicObject<CusparseGenericCoo, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseGenericCoo>;

public:
using coo = gko::matrix::Coo<ValueType, IndexType>;
Expand Down Expand Up @@ -746,9 +735,8 @@ public:

~CusparseGenericCoo() override
{
const auto id = this->get_gpu_exec()->get_device_id();
try {
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_));
} catch (const std::exception& e) {
std::cerr
Expand Down
Loading

0 comments on commit c1f8bd4

Please sign in to comment.