Skip to content

Commit

Permalink
Merge distributed capabilities
Browse files Browse the repository at this point in the history
This PR adds basic distributed data structures (matrix and vector) and enables some solvers for these types. It contains the following PRs:
- #961
- #971 
- #976 
- #985 
- #1007 
- #1030 
- #1054

# Additional Changes

- moves new types into experimental namespace
- moves existing Partition class into experimental namespace
- moves existing mpi namespace into experimental namespace
- makes the generic_scoped_device_id_guard destructor noexcept by terminating if restoring the original device id fails
- switches to blocking communication in the SpMV if OpenMPI version 4.0.x is used
- disables HoreKa MPI tests and uses nla-gpu instead

Related PR: #1133
  • Loading branch information
MarcelKoch authored Oct 31, 2022
2 parents cff9742 + b59a9dd commit c1f8bd4
Show file tree
Hide file tree
Showing 192 changed files with 11,800 additions and 892 deletions.
39 changes: 4 additions & 35 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -341,38 +341,6 @@ build/cuda102/nompi/intel/cuda/debug/static:
CUDA_ARCH: 35

# cuda 11.0 and friends on HoreKa with tests
build/cuda110/mvapich2/gcc/cuda/debug/shared:
extends:
- .build_template
- .default_variables
- .full_test_condition
- .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
variables:
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_MPI: "ON"
BUILD_TYPE: "Debug"
FAST_TESTS: "ON"
CUDA_ARCH: 80
USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}"
KEEP_CONTAINER: "ON"
USE_SLURM: 0

test/cuda110/mvapich2/gcc/cuda/debug/shared:
extends:
- .horeka_test_template
- .default_variables
- .full_test_condition
- .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020
variables:
USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}"
SLURM_PARTITION: "accelerated"
SLURM_GRES: "gpu:1"
SLURM_TIME: "00:45:00"
dependencies: null
needs: [ "build/cuda110/mvapich2/gcc/cuda/debug/shared" ]


build/cuda110/nompi/clang/cuda/release/static:
extends:
- .build_template
Expand Down Expand Up @@ -533,13 +501,15 @@ build/amd/openmpi/clang/rocm502/release/shared:
extends:
- .build_and_test_template
- .default_variables
- .quick_test_condition
- .use_gko-rocm502-openmpi-gnu11-llvm11
- .full_test_condition
- .use_gko-rocm502-openmpi-gnu11-llvm11-multi-gpu
variables:
C_COMPILER: "clang"
CXX_COMPILER: "clang++"
BUILD_OMP: "ON"
BUILD_HIP: "ON"
BUILD_MPI: "ON"
MPI_AS_ROOT: "ON"
RUN_EXAMPLES: "ON"
BUILD_TYPE: "Release"

Expand Down Expand Up @@ -834,7 +804,6 @@ iwyu:
variables:
BUILD_OMP: "ON"
BUILD_CUDA: "ON"
BUILD_CUDA: "HIP"
EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_IWYU=ON'
allow_failure: yes

Expand Down
6 changes: 6 additions & 0 deletions .gitlab/image.yml
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,12 @@
- amdci
- gpu

.use_gko-rocm502-openmpi-gnu11-llvm11-multi-gpu:
image: ginkgohub/rocm:502-openmpi-gnu11-llvm11
tags:
- private_ci
- nla-gpu

.use_gko-oneapi-cpu:
image: ginkgohub/oneapi:latest
tags:
Expand Down
41 changes: 33 additions & 8 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,8 @@ option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ bac
option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON)
option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." ON)
option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF)
option(GINKGO_FORCE_GPU_AWARE_MPI "Assert that the MPI library is GPU aware. This forces Ginkgo to assume that GPU aware functionality is available (OFF (default) or ON), but may fail
catastrophically in case the MPI implementation is not GPU Aware, and GPU aware functionality has been forced" OFF)

# load executor-specific configuration
if(GINKGO_BUILD_CUDA)
Expand All @@ -107,10 +109,10 @@ include(cmake/build_type_helpers.cmake)
include(cmake/build_helpers.cmake)
include(cmake/install_helpers.cmake)

if (MSVC)
if(MSVC)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj")
endif()
if (MINGW OR CYGWIN)
if(MINGW OR CYGWIN)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wa,-mbig-obj")
endif()

Expand Down Expand Up @@ -204,8 +206,31 @@ else()
message(STATUS "HWLOC is being forcibly switched off")
endif()

set(GINKGO_HAVE_GPU_AWARE_MPI OFF)
set(GINKGO_FORCE_SPMV_BLOCKING_COMM OFF)
if(GINKGO_BUILD_MPI)
find_package(MPI REQUIRED)
if(GINKGO_FORCE_GPU_AWARE_MPI)
set(GINKGO_HAVE_GPU_AWARE_MPI ON)
else()
set(GINKGO_HAVE_GPU_AWARE_MPI OFF)
endif()

try_run(uses_openmpi gko_result_unused
${PROJECT_BINARY_DIR}
${CMAKE_SOURCE_DIR}/cmake/openmpi_test.cpp
LINK_LIBRARIES MPI::MPI_CXX
RUN_OUTPUT_VARIABLE openmpi_version
)
if(uses_openmpi)
if(openmpi_version VERSION_LESS "4.1")
message(WARNING
"OpenMPI v4.0.x has a bug that forces us to use blocking communication in our distributed "
"matrix class. To enable faster, non-blocking communication, consider updating your OpenMPI version or "
"switch to a different vendor.")
set(GINKGO_FORCE_SPMV_BLOCKING_COMM ON)
endif()
endif()
endif()

# Try to find the third party packages before using our subdirectories
Expand Down Expand Up @@ -241,21 +266,21 @@ add_subdirectory(common) # Import list of unified kernel source files
if(GINKGO_BUILD_CUDA)
add_subdirectory(cuda) # High-performance kernels for NVIDIA GPUs
endif()
if (GINKGO_BUILD_REFERENCE)
if(GINKGO_BUILD_REFERENCE)
add_subdirectory(reference) # Reference kernel implementations
endif()
if(GINKGO_BUILD_HIP)
add_subdirectory(hip) # High-performance kernels for AMD or NVIDIA GPUs
endif()
if (GINKGO_BUILD_DPCPP)
if(GINKGO_BUILD_DPCPP)
add_subdirectory(dpcpp) # High-performance DPC++ kernels
endif()
if (GINKGO_BUILD_OMP)
if(GINKGO_BUILD_OMP)
add_subdirectory(omp) # High-performance omp kernels
endif()
add_subdirectory(core) # Core Ginkgo types and top-level functions
add_subdirectory(include) # Public API self-contained check
if (GINKGO_BUILD_TESTS)
if(GINKGO_BUILD_TESTS)
add_subdirectory(test) # Tests running on all executors
endif()

Expand Down Expand Up @@ -323,7 +348,7 @@ endif()
configure_file(${Ginkgo_SOURCE_DIR}/cmake/ginkgo.pc.in
${Ginkgo_BINARY_DIR}/ginkgo.pc.in @ONLY)
file(GENERATE OUTPUT ${Ginkgo_BINARY_DIR}/ginkgo_$<CONFIG>.pc
INPUT ${Ginkgo_BINARY_DIR}/ginkgo.pc.in)
INPUT ${Ginkgo_BINARY_DIR}/ginkgo.pc.in)

# WINDOWS NVCC has " inside the string, add escape character
# to avoid config problem.
Expand Down Expand Up @@ -356,7 +381,7 @@ endif()
file(MAKE_DIRECTORY "${GINKGO_TEST_INSTALL_BIN_DIR}")
file(MAKE_DIRECTORY "${GINKGO_TEST_EXPORTBUILD_BIN_DIR}")
set(TOOLSET "")
if (NOT "${CMAKE_GENERATOR_TOOLSET}" STREQUAL "")
if(NOT "${CMAKE_GENERATOR_TOOLSET}" STREQUAL "")
set(TOOLSET "-T${CMAKE_GENERATOR_TOOLSET}")
endif()
add_custom_target(test_install
Expand Down
56 changes: 22 additions & 34 deletions benchmark/utils/cuda_linops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "benchmark/utils/sparselib_linops.hpp"
#include "benchmark/utils/types.hpp"
#include "cuda/base/cusparse_bindings.hpp"
#include "cuda/base/device_guard.hpp"
#include "cuda/base/pointer_mode_guard.hpp"
#include "cuda/base/types.hpp"

Expand Down Expand Up @@ -102,12 +101,12 @@ protected:

void initialize_descr()
{
const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto exec = this->get_gpu_exec();
auto guard = exec->get_scoped_device_id_guard();
this->descr_ = handle_manager<cusparseMatDescr>(
gko::kernels::cuda::cusparse::create_mat_descr(),
[id](cusparseMatDescr_t descr) {
gko::cuda::device_guard g{id};
[exec](cusparseMatDescr_t descr) {
auto guard = exec->get_scoped_device_id_guard();
gko::kernels::cuda::cusparse::destroy(descr);
});
}
Expand All @@ -130,7 +129,7 @@ class CusparseCsrmp
public gko::ReadableFromMatrixData<ValueType, IndexType>,
public gko::EnableCreateMethod<CusparseCsrmp<ValueType, IndexType>> {
friend class gko::EnableCreateMethod<CusparseCsrmp>;
friend class gko::EnablePolymorphicObject<CusparseCsrmp, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseCsrmp>;

public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
Expand Down Expand Up @@ -166,8 +165,7 @@ protected:
auto db = dense_b->get_const_values();
auto dx = dense_x->get_values();

const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
gko::kernels::cuda::cusparse::spmv_mp(
this->get_gpu_exec()->get_cusparse_handle(), trans_,
this->get_size()[0], this->get_size()[1],
Expand Down Expand Up @@ -205,7 +203,7 @@ class CusparseCsr
public gko::EnableCreateMethod<CusparseCsr<ValueType, IndexType>>,
public gko::ReadableFromMatrixData<ValueType, IndexType> {
friend class gko::EnableCreateMethod<CusparseCsr>;
friend class gko::EnablePolymorphicObject<CusparseCsr, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseCsr>;

public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
Expand Down Expand Up @@ -241,8 +239,7 @@ protected:
auto db = dense_b->get_const_values();
auto dx = dense_x->get_values();

const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
gko::kernels::cuda::cusparse::spmv(
this->get_gpu_exec()->get_cusparse_handle(), trans_,
this->get_size()[0], this->get_size()[1],
Expand Down Expand Up @@ -281,7 +278,7 @@ class CusparseCsrmm
public gko::EnableCreateMethod<CusparseCsrmm<ValueType, IndexType>>,
public gko::ReadableFromMatrixData<ValueType, IndexType> {
friend class gko::EnableCreateMethod<CusparseCsrmm>;
friend class gko::EnablePolymorphicObject<CusparseCsrmm, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseCsrmm>;

public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
Expand Down Expand Up @@ -317,8 +314,7 @@ protected:
auto db = dense_b->get_const_values();
auto dx = dense_x->get_values();

const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
gko::kernels::cuda::cusparse::spmm(
this->get_gpu_exec()->get_cusparse_handle(), trans_,
this->get_size()[0], dense_b->get_size()[1], this->get_size()[1],
Expand Down Expand Up @@ -361,7 +357,7 @@ class CusparseCsrEx
public gko::EnableCreateMethod<CusparseCsrEx<ValueType, IndexType>>,
public gko::ReadableFromMatrixData<ValueType, IndexType> {
friend class gko::EnableCreateMethod<CusparseCsrEx>;
friend class gko::EnablePolymorphicObject<CusparseCsrEx, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseCsrEx>;

public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
Expand Down Expand Up @@ -404,8 +400,7 @@ protected:
ValueType beta = gko::zero<ValueType>();
gko::size_type buffer_size = 0;

const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
auto handle = this->get_gpu_exec()->get_cusparse_handle();
// This function seems to require the pointer mode to be set to HOST.
// Ginkgo use pointer mode DEVICE by default, so we change this
Expand Down Expand Up @@ -468,7 +463,7 @@ class CusparseHybrid
CusparseHybrid<ValueType, IndexType, Partition, Threshold>>,
public gko::ReadableFromMatrixData<ValueType, IndexType> {
friend class gko::EnableCreateMethod<CusparseHybrid>;
friend class gko::EnablePolymorphicObject<CusparseHybrid, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseHybrid>;

public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
Expand All @@ -492,8 +487,7 @@ public:
t_csr->read(data);
this->set_size(t_csr->get_size());

const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
gko::kernels::cuda::cusparse::csr2hyb(
this->get_gpu_exec()->get_cusparse_handle(), this->get_size()[0],
this->get_size()[1], this->get_descr(), t_csr->get_const_values(),
Expand All @@ -503,9 +497,8 @@ public:

~CusparseHybrid() override
{
const auto id = this->get_gpu_exec()->get_device_id();
try {
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyHybMat(hyb_));
} catch (const std::exception& e) {
std::cerr << "Error when unallocating CusparseHybrid hyb_ matrix: "
Expand All @@ -525,8 +518,7 @@ protected:
auto db = dense_b->get_const_values();
auto dx = dense_x->get_values();

const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
gko::kernels::cuda::cusparse::spmv(
this->get_gpu_exec()->get_cusparse_handle(), trans_,
&scalars.get_const_data()[0], this->get_descr(), hyb_, db,
Expand All @@ -542,8 +534,7 @@ protected:
: gko::EnableLinOp<CusparseHybrid, CusparseBase>(exec, size),
trans_(CUSPARSE_OPERATION_NON_TRANSPOSE)
{
const auto id = this->get_gpu_exec()->get_device_id();
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateHybMat(&hyb_));
}

Expand Down Expand Up @@ -576,8 +567,7 @@ void cusparse_generic_spmv(std::shared_ptr<const gko::CudaExecutor> gpu_exec,
auto dense_x = gko::as<gko::matrix::Dense<ValueType>>(x);
auto db = dense_b->get_const_values();
auto dx = dense_x->get_values();
const auto id = gpu_exec->get_device_id();
gko::cuda::device_guard g{id};
auto guard = gpu_exec->get_scoped_device_id_guard();
cusparseDnVecDescr_t vecb, vecx;
GKO_ASSERT_NO_CUSPARSE_ERRORS(
cusparseCreateDnVec(&vecx, dense_x->get_num_stored_elements(),
Expand Down Expand Up @@ -612,7 +602,7 @@ class CusparseGenericCsr
CusparseGenericCsr<ValueType, IndexType, Alg>>,
public gko::ReadableFromMatrixData<ValueType, IndexType> {
friend class gko::EnableCreateMethod<CusparseGenericCsr>;
friend class gko::EnablePolymorphicObject<CusparseGenericCsr, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseGenericCsr>;

public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
Expand Down Expand Up @@ -653,9 +643,8 @@ public:

~CusparseGenericCsr() override
{
const auto id = this->get_gpu_exec()->get_device_id();
try {
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_));
} catch (const std::exception& e) {
std::cerr
Expand Down Expand Up @@ -705,7 +694,7 @@ class CusparseGenericCoo
public gko::EnableCreateMethod<CusparseGenericCoo<ValueType, IndexType>>,
public gko::ReadableFromMatrixData<ValueType, IndexType> {
friend class gko::EnableCreateMethod<CusparseGenericCoo>;
friend class gko::EnablePolymorphicObject<CusparseGenericCoo, CusparseBase>;
friend class gko::polymorphic_object_traits<CusparseGenericCoo>;

public:
using coo = gko::matrix::Coo<ValueType, IndexType>;
Expand Down Expand Up @@ -746,9 +735,8 @@ public:

~CusparseGenericCoo() override
{
const auto id = this->get_gpu_exec()->get_device_id();
try {
gko::cuda::device_guard g{id};
auto guard = this->get_gpu_exec()->get_scoped_device_id_guard();
GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_));
} catch (const std::exception& e) {
std::cerr
Expand Down
Loading

0 comments on commit c1f8bd4

Please sign in to comment.