diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ab76637eab0..da7afaf6b55 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -341,38 +341,6 @@ build/cuda102/nompi/intel/cuda/debug/static: CUDA_ARCH: 35 # cuda 11.0 and friends on HoreKa with tests -build/cuda110/mvapich2/gcc/cuda/debug/shared: - extends: - - .build_template - - .default_variables - - .full_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 - variables: - BUILD_OMP: "ON" - BUILD_CUDA: "ON" - BUILD_MPI: "ON" - BUILD_TYPE: "Debug" - FAST_TESTS: "ON" - CUDA_ARCH: 80 - USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}" - KEEP_CONTAINER: "ON" - USE_SLURM: 0 - -test/cuda110/mvapich2/gcc/cuda/debug/shared: - extends: - - .horeka_test_template - - .default_variables - - .full_test_condition - - .use_gko-cuda110-mvapich2-gnu9-llvm9-intel2020 - variables: - USE_NAME: "cuda110-mvapich2-gcc-${CI_PIPELINE_ID}" - SLURM_PARTITION: "accelerated" - SLURM_GRES: "gpu:1" - SLURM_TIME: "00:45:00" - dependencies: null - needs: [ "build/cuda110/mvapich2/gcc/cuda/debug/shared" ] - - build/cuda110/nompi/clang/cuda/release/static: extends: - .build_template @@ -533,13 +501,15 @@ build/amd/openmpi/clang/rocm502/release/shared: extends: - .build_and_test_template - .default_variables - - .quick_test_condition - - .use_gko-rocm502-openmpi-gnu11-llvm11 + - .full_test_condition + - .use_gko-rocm502-openmpi-gnu11-llvm11-multi-gpu variables: C_COMPILER: "clang" CXX_COMPILER: "clang++" BUILD_OMP: "ON" BUILD_HIP: "ON" + BUILD_MPI: "ON" + MPI_AS_ROOT: "ON" RUN_EXAMPLES: "ON" BUILD_TYPE: "Release" @@ -834,7 +804,6 @@ iwyu: variables: BUILD_OMP: "ON" BUILD_CUDA: "ON" - BUILD_CUDA: "HIP" EXTRA_CMAKE_FLAGS: '-DGINKGO_WITH_IWYU=ON' allow_failure: yes diff --git a/.gitlab/image.yml b/.gitlab/image.yml index 04aabfebcdb..0f8128ea2f1 100644 --- a/.gitlab/image.yml +++ b/.gitlab/image.yml @@ -84,6 +84,12 @@ - amdci - gpu +.use_gko-rocm502-openmpi-gnu11-llvm11-multi-gpu: + image: ginkgohub/rocm:502-openmpi-gnu11-llvm11 + tags: 
+ - private_ci + - nla-gpu + .use_gko-oneapi-cpu: image: ginkgohub/oneapi:latest tags: diff --git a/CMakeLists.txt b/CMakeLists.txt index 9376cef03aa..23cac48d3c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -88,6 +88,8 @@ option(GINKGO_DPCPP_SINGLE_MODE "Do not compile double kernels for the DPC++ bac option(GINKGO_INSTALL_RPATH "Set the RPATH when installing its libraries." ON) option(GINKGO_INSTALL_RPATH_ORIGIN "Add $ORIGIN (Linux) or @loader_path (MacOS) to the installation RPATH." ON) option(GINKGO_INSTALL_RPATH_DEPENDENCIES "Add dependencies to the installation RPATH." OFF) +option(GINKGO_FORCE_GPU_AWARE_MPI "Assert that the MPI library is GPU aware. This forces Ginkgo to assume that GPU aware functionality is available (OFF (default) or ON), but may fail + catastrophically in case the MPI implementation is not GPU Aware, and GPU aware functionality has been forced" OFF) # load executor-specific configuration if(GINKGO_BUILD_CUDA) @@ -107,10 +109,10 @@ include(cmake/build_type_helpers.cmake) include(cmake/build_helpers.cmake) include(cmake/install_helpers.cmake) -if (MSVC) +if(MSVC) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj") endif() -if (MINGW OR CYGWIN) +if(MINGW OR CYGWIN) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wa,-mbig-obj") endif() @@ -204,8 +206,31 @@ else() message(STATUS "HWLOC is being forcibly switched off") endif() +set(GINKGO_HAVE_GPU_AWARE_MPI OFF) +set(GINKGO_FORCE_SPMV_BLOCKING_COMM OFF) if(GINKGO_BUILD_MPI) find_package(MPI REQUIRED) + if(GINKGO_FORCE_GPU_AWARE_MPI) + set(GINKGO_HAVE_GPU_AWARE_MPI ON) + else() + set(GINKGO_HAVE_GPU_AWARE_MPI OFF) + endif() + + try_run(uses_openmpi gko_result_unused + ${PROJECT_BINARY_DIR} + ${CMAKE_SOURCE_DIR}/cmake/openmpi_test.cpp + LINK_LIBRARIES MPI::MPI_CXX + RUN_OUTPUT_VARIABLE openmpi_version + ) + if(uses_openmpi) + if(openmpi_version VERSION_LESS "4.1") + message(WARNING + "OpenMPI v4.0.x has a bug that forces us to use blocking communication in our distributed " + "matrix class. 
To enable faster, non-blocking communication, consider updating your OpenMPI version or " + "switch to a different vendor.") + set(GINKGO_FORCE_SPMV_BLOCKING_COMM ON) + endif() + endif() endif() # Try to find the third party packages before using our subdirectories @@ -241,21 +266,21 @@ add_subdirectory(common) # Import list of unified kernel source files if(GINKGO_BUILD_CUDA) add_subdirectory(cuda) # High-performance kernels for NVIDIA GPUs endif() -if (GINKGO_BUILD_REFERENCE) +if(GINKGO_BUILD_REFERENCE) add_subdirectory(reference) # Reference kernel implementations endif() if(GINKGO_BUILD_HIP) add_subdirectory(hip) # High-performance kernels for AMD or NVIDIA GPUs endif() -if (GINKGO_BUILD_DPCPP) +if(GINKGO_BUILD_DPCPP) add_subdirectory(dpcpp) # High-performance DPC++ kernels endif() -if (GINKGO_BUILD_OMP) +if(GINKGO_BUILD_OMP) add_subdirectory(omp) # High-performance omp kernels endif() add_subdirectory(core) # Core Ginkgo types and top-level functions add_subdirectory(include) # Public API self-contained check -if (GINKGO_BUILD_TESTS) +if(GINKGO_BUILD_TESTS) add_subdirectory(test) # Tests running on all executors endif() @@ -323,7 +348,7 @@ endif() configure_file(${Ginkgo_SOURCE_DIR}/cmake/ginkgo.pc.in ${Ginkgo_BINARY_DIR}/ginkgo.pc.in @ONLY) file(GENERATE OUTPUT ${Ginkgo_BINARY_DIR}/ginkgo_$.pc - INPUT ${Ginkgo_BINARY_DIR}/ginkgo.pc.in) + INPUT ${Ginkgo_BINARY_DIR}/ginkgo.pc.in) # WINDOWS NVCC has " inside the string, add escape character # to avoid config problem. 
@@ -356,7 +381,7 @@ endif() file(MAKE_DIRECTORY "${GINKGO_TEST_INSTALL_BIN_DIR}") file(MAKE_DIRECTORY "${GINKGO_TEST_EXPORTBUILD_BIN_DIR}") set(TOOLSET "") -if (NOT "${CMAKE_GENERATOR_TOOLSET}" STREQUAL "") +if(NOT "${CMAKE_GENERATOR_TOOLSET}" STREQUAL "") set(TOOLSET "-T${CMAKE_GENERATOR_TOOLSET}") endif() add_custom_target(test_install diff --git a/benchmark/utils/cuda_linops.cu b/benchmark/utils/cuda_linops.cu index 9eeb309017e..502ccb89c7c 100644 --- a/benchmark/utils/cuda_linops.cu +++ b/benchmark/utils/cuda_linops.cu @@ -44,7 +44,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/sparselib_linops.hpp" #include "benchmark/utils/types.hpp" #include "cuda/base/cusparse_bindings.hpp" -#include "cuda/base/device_guard.hpp" #include "cuda/base/pointer_mode_guard.hpp" #include "cuda/base/types.hpp" @@ -102,12 +101,12 @@ protected: void initialize_descr() { - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto exec = this->get_gpu_exec(); + auto guard = exec->get_scoped_device_id_guard(); this->descr_ = handle_manager( gko::kernels::cuda::cusparse::create_mat_descr(), - [id](cusparseMatDescr_t descr) { - gko::cuda::device_guard g{id}; + [exec](cusparseMatDescr_t descr) { + auto guard = exec->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::destroy(descr); }); } @@ -130,7 +129,7 @@ class CusparseCsrmp public gko::ReadableFromMatrixData, public gko::EnableCreateMethod> { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -166,8 +165,7 @@ protected: auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmv_mp( 
this->get_gpu_exec()->get_cusparse_handle(), trans_, this->get_size()[0], this->get_size()[1], @@ -205,7 +203,7 @@ class CusparseCsr public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -241,8 +239,7 @@ protected: auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmv( this->get_gpu_exec()->get_cusparse_handle(), trans_, this->get_size()[0], this->get_size()[1], @@ -281,7 +278,7 @@ class CusparseCsrmm public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -317,8 +314,7 @@ protected: auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmm( this->get_gpu_exec()->get_cusparse_handle(), trans_, this->get_size()[0], dense_b->get_size()[1], this->get_size()[1], @@ -361,7 +357,7 @@ class CusparseCsrEx public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -404,8 +400,7 @@ protected: ValueType beta = gko::zero(); gko::size_type buffer_size = 0; - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); auto handle = 
this->get_gpu_exec()->get_cusparse_handle(); // This function seems to require the pointer mode to be set to HOST. // Ginkgo use pointer mode DEVICE by default, so we change this @@ -468,7 +463,7 @@ class CusparseHybrid CusparseHybrid>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -492,8 +487,7 @@ public: t_csr->read(data); this->set_size(t_csr->get_size()); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::csr2hyb( this->get_gpu_exec()->get_cusparse_handle(), this->get_size()[0], this->get_size()[1], this->get_descr(), t_csr->get_const_values(), @@ -503,9 +497,8 @@ public: ~CusparseHybrid() override { - const auto id = this->get_gpu_exec()->get_device_id(); try { - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroyHybMat(hyb_)); } catch (const std::exception& e) { std::cerr << "Error when unallocating CusparseHybrid hyb_ matrix: " @@ -525,8 +518,7 @@ protected: auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::cuda::cusparse::spmv( this->get_gpu_exec()->get_cusparse_handle(), trans_, &scalars.get_const_data()[0], this->get_descr(), hyb_, db, @@ -542,8 +534,7 @@ protected: : gko::EnableLinOp(exec, size), trans_(CUSPARSE_OPERATION_NON_TRANSPOSE) { - const auto id = this->get_gpu_exec()->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseCreateHybMat(&hyb_)); } @@ -576,8 +567,7 @@ void 
cusparse_generic_spmv(std::shared_ptr gpu_exec, auto dense_x = gko::as>(x); auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = gpu_exec->get_device_id(); - gko::cuda::device_guard g{id}; + auto guard = gpu_exec->get_scoped_device_id_guard(); cusparseDnVecDescr_t vecb, vecx; GKO_ASSERT_NO_CUSPARSE_ERRORS( cusparseCreateDnVec(&vecx, dense_x->get_num_stored_elements(), @@ -612,7 +602,7 @@ class CusparseGenericCsr CusparseGenericCsr>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -653,9 +643,8 @@ public: ~CusparseGenericCsr() override { - const auto id = this->get_gpu_exec()->get_device_id(); try { - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_)); } catch (const std::exception& e) { std::cerr @@ -705,7 +694,7 @@ class CusparseGenericCoo public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using coo = gko::matrix::Coo; @@ -746,9 +735,8 @@ public: ~CusparseGenericCoo() override { - const auto id = this->get_gpu_exec()->get_device_id(); try { - gko::cuda::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_CUSPARSE_ERRORS(cusparseDestroySpMat(mat_)); } catch (const std::exception& e) { std::cerr diff --git a/benchmark/utils/cuda_timer.cu b/benchmark/utils/cuda_timer.cu index c4222dcaa73..3ccdd2d8b3c 100644 --- a/benchmark/utils/cuda_timer.cu +++ b/benchmark/utils/cuda_timer.cu @@ -35,7 +35,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/utils/timer_impl.hpp" -#include "cuda/base/device_guard.hpp" /** @@ -61,8 +60,7 @@ public: { assert(exec != nullptr); exec_ = exec; - id_ = exec_->get_device_id(); - gko::cuda::device_guard g{id_}; + auto guard = exec_->get_scoped_device_id_guard(); GKO_ASSERT_NO_CUDA_ERRORS(cudaEventCreate(&start_)); GKO_ASSERT_NO_CUDA_ERRORS(cudaEventCreate(&stop_)); } @@ -71,14 +69,14 @@ protected: void tic_impl() override { exec_->synchronize(); - gko::cuda::device_guard g{id_}; + auto guard = exec_->get_scoped_device_id_guard(); // Currently, gko::CudaExecutor always use default stream. GKO_ASSERT_NO_CUDA_ERRORS(cudaEventRecord(start_)); } double toc_impl() override { - gko::cuda::device_guard g{id_}; + auto guard = exec_->get_scoped_device_id_guard(); // Currently, gko::CudaExecutor always use default stream. GKO_ASSERT_NO_CUDA_ERRORS(cudaEventRecord(stop_)); GKO_ASSERT_NO_CUDA_ERRORS(cudaEventSynchronize(stop_)); @@ -95,7 +93,6 @@ private: std::shared_ptr exec_; cudaEvent_t start_; cudaEvent_t stop_; - int id_; }; diff --git a/benchmark/utils/dpcpp_linops.dp.cpp b/benchmark/utils/dpcpp_linops.dp.cpp index 522a0f205fa..e03d8520ea3 100644 --- a/benchmark/utils/dpcpp_linops.dp.cpp +++ b/benchmark/utils/dpcpp_linops.dp.cpp @@ -126,7 +126,7 @@ class OnemklCsr OnemklCsr>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using Csr = gko::matrix::Csr; diff --git a/benchmark/utils/hip_linops.hip.cpp b/benchmark/utils/hip_linops.hip.cpp index bd7d4e7650a..ae0a1a2d82d 100644 --- a/benchmark/utils/hip_linops.hip.cpp +++ b/benchmark/utils/hip_linops.hip.cpp @@ -41,7 +41,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "benchmark/utils/sparselib_linops.hpp" #include "benchmark/utils/types.hpp" -#include "hip/base/device_guard.hip.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" @@ -94,13 +93,13 @@ class HipsparseBase : public gko::LinOp { void initialize_descr() { - const auto id = this->get_gpu_exec()->get_device_id(); - gko::hip::device_guard g{id}; + auto exec = this->get_gpu_exec(); + auto guard = exec->get_scoped_device_id_guard(); this->descr_ = handle_manager( reinterpret_cast( gko::kernels::hip::hipsparse::create_mat_descr()), - [id](hipsparseMatDescr* descr) { - gko::hip::device_guard g{id}; + [exec](hipsparseMatDescr* descr) { + auto guard = exec->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::destroy(descr); }); } @@ -120,7 +119,7 @@ class HipsparseCsr public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -156,8 +155,7 @@ class HipsparseCsr auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::hip::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::spmv( this->get_gpu_exec()->get_hipsparse_handle(), trans_, this->get_size()[0], this->get_size()[1], @@ -196,7 +194,7 @@ class HipsparseCsrmm public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -232,8 +230,7 @@ class HipsparseCsrmm auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::hip::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); 
gko::kernels::hip::hipsparse::spmm( this->get_gpu_exec()->get_hipsparse_handle(), trans_, this->get_size()[0], dense_b->get_size()[1], this->get_size()[1], @@ -277,7 +274,7 @@ class HipsparseHybrid HipsparseHybrid>, public gko::ReadableFromMatrixData { friend class gko::EnableCreateMethod; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: using csr = gko::matrix::Csr; @@ -301,8 +298,7 @@ class HipsparseHybrid t_csr->read(data); this->set_size(t_csr->get_size()); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::hip::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::csr2hyb( this->get_gpu_exec()->get_hipsparse_handle(), this->get_size()[0], this->get_size()[1], this->get_descr(), t_csr->get_const_values(), @@ -312,9 +308,8 @@ class HipsparseHybrid ~HipsparseHybrid() override { - const auto id = this->get_gpu_exec()->get_device_id(); try { - gko::hip::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseDestroyHybMat(hyb_)); } catch (const std::exception& e) { std::cerr << "Error when unallocating HipsparseHybrid hyb_ matrix: " @@ -334,8 +329,7 @@ class HipsparseHybrid auto db = dense_b->get_const_values(); auto dx = dense_x->get_values(); - const auto id = this->get_gpu_exec()->get_device_id(); - gko::hip::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); gko::kernels::hip::hipsparse::spmv( this->get_gpu_exec()->get_hipsparse_handle(), trans_, &scalars.get_const_data()[0], this->get_descr(), hyb_, db, @@ -351,8 +345,7 @@ class HipsparseHybrid : gko::EnableLinOp(exec, size), trans_(HIPSPARSE_OPERATION_NON_TRANSPOSE) { - const auto id = this->get_gpu_exec()->get_device_id(); - gko::hip::device_guard g{id}; + auto guard = this->get_gpu_exec()->get_scoped_device_id_guard(); 
GKO_ASSERT_NO_HIPSPARSE_ERRORS(hipsparseCreateHybMat(&hyb_)); } diff --git a/benchmark/utils/hip_timer.hip.cpp b/benchmark/utils/hip_timer.hip.cpp index 2a6e6fe9c29..168c46ed3f8 100644 --- a/benchmark/utils/hip_timer.hip.cpp +++ b/benchmark/utils/hip_timer.hip.cpp @@ -34,7 +34,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "benchmark/utils/timer_impl.hpp" -#include "hip/base/device_guard.hip.hpp" /** @@ -60,8 +59,7 @@ class HipTimer : public Timer { { assert(exec != nullptr); exec_ = exec; - id_ = exec_->get_device_id(); - gko::hip::device_guard g{id_}; + auto guard = exec_->get_scoped_device_id_guard(); GKO_ASSERT_NO_HIP_ERRORS(hipEventCreate(&start_)); GKO_ASSERT_NO_HIP_ERRORS(hipEventCreate(&stop_)); } @@ -70,14 +68,14 @@ class HipTimer : public Timer { void tic_impl() override { exec_->synchronize(); - gko::hip::device_guard g{id_}; + auto guard = exec_->get_scoped_device_id_guard(); // Currently, gko::HipExecutor always use default stream. GKO_ASSERT_NO_HIP_ERRORS(hipEventRecord(start_)); } double toc_impl() override { - gko::hip::device_guard g{id_}; + auto guard = exec_->get_scoped_device_id_guard(); // Currently, gko::HipExecutor always use default stream. 
GKO_ASSERT_NO_HIP_ERRORS(hipEventRecord(stop_)); GKO_ASSERT_NO_HIP_ERRORS(hipEventSynchronize(stop_)); @@ -94,7 +92,6 @@ class HipTimer : public Timer { std::shared_ptr exec_; hipEvent_t start_; hipEvent_t stop_; - int id_; }; diff --git a/benchmark/utils/overhead_linop.hpp b/benchmark/utils/overhead_linop.hpp index dce50d49f15..af6b533d30d 100644 --- a/benchmark/utils/overhead_linop.hpp +++ b/benchmark/utils/overhead_linop.hpp @@ -101,7 +101,7 @@ template class Overhead : public EnableLinOp>, public Preconditionable { friend class EnableLinOp; - friend class EnablePolymorphicObject; + friend struct polymorphic_object_traits; public: GKO_CREATE_FACTORY_PARAMETERS(parameters, Factory) diff --git a/cmake/autodetect_executors.cmake b/cmake/autodetect_executors.cmake index 9e396ee96b9..3a5ba36b559 100644 --- a/cmake/autodetect_executors.cmake +++ b/cmake/autodetect_executors.cmake @@ -15,7 +15,7 @@ if (NOT DEFINED GINKGO_BUILD_OMP) endif() if (NOT DEFINED GINKGO_BUILD_MPI) - find_package(MPI) + find_package(MPI 3.1) if(MPI_FOUND) message(STATUS "Enabling MPI support") set(GINKGO_HAS_MPI ON) diff --git a/cmake/create_test.cmake b/cmake/create_test.cmake index aa0a657b215..0bd181cffd5 100644 --- a/cmake/create_test.cmake +++ b/cmake/create_test.cmake @@ -1,5 +1,5 @@ set(gko_test_single_args "MPI_SIZE") -set(gko_test_multi_arg "DISABLE_EXECUTORS;ADDITIONAL_LIBRARIES;ADDITIONAL_INCLUDES") +set(gko_test_multi_args "DISABLE_EXECUTORS;ADDITIONAL_LIBRARIES;ADDITIONAL_INCLUDES") ## Replaces / by _ to create valid target names from relative paths function(ginkgo_build_test_name test_name target_name) @@ -7,12 +7,21 @@ function(ginkgo_build_test_name test_name target_name) ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) string(REPLACE "/" "_" TEST_TARGET_NAME "${REL_BINARY_DIR}/${test_name}") set(${target_name} ${TEST_TARGET_NAME} PARENT_SCOPE) -endfunction() +endfunction(ginkgo_build_test_name) + +function(ginkgo_create_gtest_mpi_main) + add_library(gtest_mpi_main "") + 
target_sources(gtest_mpi_main + PRIVATE + ${PROJECT_SOURCE_DIR}/core/test/mpi/gtest/mpi_listener.cpp) + find_package(MPI REQUIRED) + target_link_libraries(gtest_mpi_main PRIVATE GTest::GTest MPI::MPI_CXX) +endfunction(ginkgo_create_gtest_mpi_main) ## Set up shared target properties and handle ADDITIONAL_LIBRARIES/ADDITIONAL_INCLUDES ## `MPI_SIZE size` causes the tests to be run with `size` MPI processes. function(ginkgo_set_test_target_properties test_target_name) - cmake_parse_arguments(PARSE_ARGV 1 set_properties "" "${gko_test_single_args}" "${gko_test_multi_arg}") + cmake_parse_arguments(PARSE_ARGV 1 set_properties "" "${gko_test_single_args}" "${gko_test_multi_args}") if (GINKGO_FAST_TESTS) target_compile_definitions(${test_target_name} PRIVATE GINKGO_FAST_TESTS) endif() @@ -23,6 +32,9 @@ function(ginkgo_set_test_target_properties test_target_name) target_link_libraries(${test_target_name} PRIVATE "${GINKGO_CIRCULAR_DEPS_FLAGS}") endif() if (set_properties_MPI_SIZE) + if(NOT TARGET gtest_mpi_main) + ginkgo_create_gtest_mpi_main() + endif() set(gtest_main gtest_mpi_main MPI::MPI_CXX) else() set(gtest_main GTest::Main) @@ -40,7 +52,7 @@ endfunction() ## - `ADDITIONAL_LIBRARIES lib1 lib2` adds additional target link dependencies ## - `ADDITIONAL_INCLUDES path1 path2` adds additional target include paths function(ginkgo_add_test test_name test_target_name) - cmake_parse_arguments(PARSE_ARGV 2 add_test "" "${gko_test_single_arg}" "${gko_test_multi_arg}") + cmake_parse_arguments(PARSE_ARGV 2 add_test "" "${gko_test_single_args}" "${gko_test_multi_args}") file(RELATIVE_PATH REL_BINARY_DIR ${PROJECT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}) set_target_properties(${test_target_name} PROPERTIES OUTPUT_NAME ${test_name}) if (add_test_MPI_SIZE) @@ -189,7 +201,7 @@ function(ginkgo_create_common_test test_name) endfunction(ginkgo_create_common_test) function(ginkgo_create_common_test_internal test_name exec_type exec) - cmake_parse_arguments(PARSE_ARGV 3 common_test "" 
"${gko_test_single_arg}" "${gko_test_multi_arg}") + cmake_parse_arguments(PARSE_ARGV 3 common_test "" "${gko_test_single_args}" "${gko_test_multi_args}") if(exec IN_LIST common_test_DISABLE_EXECUTORS) return() endif() @@ -211,7 +223,7 @@ endfunction(ginkgo_create_common_test_internal) ## Common test compiled with the device compiler, one target for each enabled backend function(ginkgo_create_common_device_test test_name) - cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_arg}" "${gko_test_multi_arg}") + cmake_parse_arguments(PARSE_ARGV 1 common_device_test "" "${gko_test_single_args}" "${gko_test_multi_args}") ginkgo_build_test_name(${test_name} test_target_name) if(GINKGO_BUILD_DPCPP) ginkgo_create_common_test_internal(${test_name} DpcppExecutor dpcpp ${ARGN}) diff --git a/cmake/get_info.cmake b/cmake/get_info.cmake index 479b889aeaf..2cf8dd06c3f 100644 --- a/cmake/get_info.cmake +++ b/cmake/get_info.cmake @@ -130,7 +130,7 @@ foreach(log_type ${log_types}) "GINKGO_BUILD_OMP;GINKGO_BUILD_MPI;GINKGO_BUILD_REFERENCE;GINKGO_BUILD_CUDA;GINKGO_BUILD_HIP;GINKGO_BUILD_DPCPP") ginkgo_print_module_footer(${${log_type}} " Enabled features:") ginkgo_print_foreach_variable(${${log_type}} - "GINKGO_MIXED_PRECISION") + "GINKGO_MIXED_PRECISION;GINKGO_HAVE_GPU_AWARE_MPI") ginkgo_print_module_footer(${${log_type}} " Tests, benchmarks and examples:") ginkgo_print_foreach_variable(${${log_type}} "GINKGO_BUILD_TESTS;GINKGO_FAST_TESTS;GINKGO_BUILD_EXAMPLES;GINKGO_EXTLIB_EXAMPLE;GINKGO_BUILD_BENCHMARKS;GINKGO_BENCHMARK_ENABLE_TUNING") diff --git a/cmake/openmpi_test.cpp b/cmake/openmpi_test.cpp new file mode 100644 index 00000000000..18bf7669368 --- /dev/null +++ b/cmake/openmpi_test.cpp @@ -0,0 +1,44 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include +#include +int main() +{ +#if defined(OPEN_MPI) && OPEN_MPI + std::printf("%d.%d.%d", OMPI_MAJOR_VERSION, OMPI_MINOR_VERSION, + OMPI_RELEASE_VERSION); + return 1; +#else + return 0; +#endif +} diff --git a/common/cuda_hip/distributed/matrix_kernels.hpp.inc b/common/cuda_hip/distributed/matrix_kernels.hpp.inc new file mode 100644 index 00000000000..4b327a41872 --- /dev/null +++ b/common/cuda_hip/distributed/matrix_kernels.hpp.inc @@ -0,0 +1,291 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +template +struct input_type { + GlobalIndexType row; + GlobalIndexType col; + ValueType val; + size_type row_range; + size_type col_range; + + __forceinline__ __device__ __host__ + input_type(thrust::tuple + t) + : row(thrust::get<0>(t)), + col(thrust::get<1>(t)), + val(thrust::get<2>(t)), + row_range(thrust::get<3>(t)), + col_range(thrust::get<4>(t)) + {} +}; + + +template +void build_local_nonlocal( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + const experimental::distributed::Partition* + col_partition, + comm_index_type local_part, array& local_row_idxs, + array& local_col_idxs, array& local_values, + array& non_local_row_idxs, + array& non_local_col_idxs, + array& non_local_values, + array& local_gather_idxs, + array& recv_sizes, + array& non_local_to_global) +{ + auto input_vals = input.get_const_values(); + auto row_part_ids = row_partition->get_part_ids(); + auto col_part_ids = col_partition->get_part_ids(); + auto num_parts = static_cast(row_partition->get_num_parts()); + const auto* row_range_bounds = row_partition->get_range_bounds(); + const auto* col_range_bounds = col_partition->get_range_bounds(); + const auto* row_range_starting_indices = + row_partition->get_range_starting_indices(); + const auto* col_range_starting_indices = + col_partition->get_range_starting_indices(); + const auto 
num_row_ranges = row_partition->get_num_ranges(); + const auto num_col_ranges = col_partition->get_num_ranges(); + const auto num_input_elements = input.get_num_elems(); + + // precompute the row and column range id of each input element + auto input_row_idxs = input.get_const_row_idxs(); + auto input_col_idxs = input.get_const_col_idxs(); + array row_range_ids{exec, num_input_elements}; + thrust::upper_bound(thrust::device, row_range_bounds + 1, + row_range_bounds + num_row_ranges + 1, input_row_idxs, + input_row_idxs + num_input_elements, + row_range_ids.get_data()); + array col_range_ids{exec, input.get_num_elems()}; + thrust::upper_bound(thrust::device, col_range_bounds + 1, + col_range_bounds + num_col_ranges + 1, input_col_idxs, + input_col_idxs + num_input_elements, + col_range_ids.get_data()); + + // count number of local<0> and non-local<1> elements + auto range_ids_it = thrust::make_zip_iterator(thrust::make_tuple( + row_range_ids.get_const_data(), col_range_ids.get_const_data())); + auto num_elements_pair = thrust::transform_reduce( + thrust::device, range_ids_it, range_ids_it + num_input_elements, + [local_part, row_part_ids, col_part_ids] __host__ __device__( + const thrust::tuple& tuple) { + auto row_part = row_part_ids[thrust::get<0>(tuple)]; + auto col_part = col_part_ids[thrust::get<1>(tuple)]; + bool is_inner_entry = + row_part == local_part && col_part == local_part; + bool is_ghost_entry = + row_part == local_part && col_part != local_part; + return thrust::make_tuple( + is_inner_entry ? size_type{1} : size_type{0}, + is_ghost_entry ? 
size_type{1} : size_type{0}); + }, + thrust::make_tuple(size_type{}, size_type{}), + [] __host__ __device__(const thrust::tuple& a, + const thrust::tuple& b) { + return thrust::make_tuple(thrust::get<0>(a) + thrust::get<0>(b), + thrust::get<1>(a) + thrust::get<1>(b)); + }); + auto num_local_elements = thrust::get<0>(num_elements_pair); + auto num_non_local_elements = thrust::get<1>(num_elements_pair); + + // define global-to-local maps for row and column indices + auto map_to_local_row = + [row_range_bounds, row_range_starting_indices] __host__ __device__( + const GlobalIndexType row, const size_type range_id) { + return static_cast(row - + row_range_bounds[range_id]) + + row_range_starting_indices[range_id]; + }; + auto map_to_local_col = + [col_range_bounds, col_range_starting_indices] __host__ __device__( + const GlobalIndexType col, const size_type range_id) { + return static_cast(col - + col_range_bounds[range_id]) + + col_range_starting_indices[range_id]; + }; + + using input_type = input_type; + auto input_it = thrust::make_zip_iterator(thrust::make_tuple( + input.get_const_row_idxs(), input.get_const_col_idxs(), + input.get_const_values(), row_range_ids.get_const_data(), + col_range_ids.get_const_data())); + + // copy and transform local entries into arrays + local_row_idxs.resize_and_reset(num_local_elements); + local_col_idxs.resize_and_reset(num_local_elements); + local_values.resize_and_reset(num_local_elements); + auto local_it = thrust::make_transform_iterator( + input_it, [map_to_local_row, map_to_local_col] __host__ __device__( + const input_type input) { + auto local_row = map_to_local_row(input.row, input.row_range); + auto local_col = map_to_local_col(input.col, input.col_range); + return thrust::make_tuple(local_row, local_col, input.val); + }); + thrust::copy_if( + thrust::device, local_it, local_it + input.get_num_elems(), + range_ids_it, + thrust::make_zip_iterator(thrust::make_tuple(local_row_idxs.get_data(), + local_col_idxs.get_data(), + 
local_values.get_data())), + [local_part, row_part_ids, col_part_ids] __host__ __device__( + const thrust::tuple& tuple) { + auto row_part = row_part_ids[thrust::get<0>(tuple)]; + auto col_part = col_part_ids[thrust::get<1>(tuple)]; + return row_part == local_part && col_part == local_part; + }); + // copy and transform non-local entries into arrays. this keeps global + // column indices, and also stores the column part id for each non-local + // entry in an array + non_local_row_idxs.resize_and_reset(num_non_local_elements); + non_local_values.resize_and_reset(num_non_local_elements); + array non_local_global_col_idxs{exec, + num_non_local_elements}; + array non_local_col_part_ids{exec, num_non_local_elements}; + array non_local_col_range_ids{exec, num_non_local_elements}; + auto non_local_it = thrust::make_transform_iterator( + input_it, [map_to_local_row, map_to_local_col, + col_part_ids] __host__ __device__(const input_type input) { + auto local_row = map_to_local_row(input.row, input.row_range); + return thrust::make_tuple(local_row, input.col, input.val, + col_part_ids[input.col_range], + input.col_range); + }); + thrust::copy_if( + thrust::device, non_local_it, non_local_it + input.get_num_elems(), + range_ids_it, + thrust::make_zip_iterator(thrust::make_tuple( + non_local_row_idxs.get_data(), non_local_global_col_idxs.get_data(), + non_local_values.get_data(), non_local_col_part_ids.get_data(), + non_local_col_range_ids.get_data())), + [local_part, row_part_ids, col_part_ids] __host__ __device__( + const thrust::tuple& tuple) { + auto row_part = row_part_ids[thrust::get<0>(tuple)]; + auto col_part = col_part_ids[thrust::get<1>(tuple)]; + return row_part == local_part && col_part != local_part; + }); + + // 1. 
sort global columns, part-id and range-id according to + // their part-id and global columns + // the previous `non_local_global_col_idxs` is not modify to + // keep it consistent with the non-local row and values array + array sorted_non_local_global_col_idxs{ + exec, non_local_global_col_idxs}; + auto key_it = thrust::make_zip_iterator( + thrust::make_tuple(non_local_col_part_ids.get_data(), + sorted_non_local_global_col_idxs.get_data())); + thrust::sort_by_key(thrust::device, key_it, key_it + num_non_local_elements, + non_local_col_range_ids.get_data()); + + // 2. remove duplicate columns, now the new column i has global index + // non_local_global_col_idxs[i] + auto non_local_global_col_idxs_begin = + sorted_non_local_global_col_idxs.get_data(); + auto non_local_global_col_idxs_end = thrust::get<0>(thrust::unique_by_key( + thrust::device, non_local_global_col_idxs_begin, + non_local_global_col_idxs_begin + num_non_local_elements, + thrust::make_zip_iterator( + thrust::make_tuple(non_local_col_part_ids.get_data(), + non_local_col_range_ids.get_data())))); + auto num_non_local_cols = static_cast(thrust::distance( + non_local_global_col_idxs_begin, non_local_global_col_idxs_end)); + + // 2.5 copy unique_columns to non_local_to_global map + non_local_to_global.resize_and_reset(num_non_local_cols); + exec->copy(num_non_local_cols, non_local_global_col_idxs_begin, + non_local_to_global.get_data()); + + // 3. create mapping from unique_columns + // since we don't have hash tables on GPUs I'm first sorting the non-local + // global column indices and their new local index again by the global + // column index. Then I'm using binary searches to find the new local column + // index. 
+ array permutation{exec, num_non_local_cols}; + thrust::sequence(thrust::device, permutation.get_data(), + permutation.get_data() + num_non_local_cols); + thrust::sort_by_key( + thrust::device, non_local_global_col_idxs_begin, + non_local_global_col_idxs_begin + num_non_local_cols, + thrust::make_zip_iterator(thrust::make_tuple( + non_local_col_part_ids.get_data(), permutation.get_data()))); + + // 4. map column index of non-local entries to new columns + non_local_col_idxs.resize_and_reset(num_non_local_elements); + array lower_bounds{exec, num_non_local_elements}; + // I have to precompute the lower bounds because the calling binary + // searches from the device does not work: + // https://github.com/NVIDIA/thrust/issues/1415 + // TODO: compute lower bounds on-the-fly if available + thrust::lower_bound( + thrust::device, non_local_global_col_idxs_begin, + non_local_global_col_idxs_begin + num_non_local_cols, + non_local_global_col_idxs.get_data(), + non_local_global_col_idxs.get_data() + num_non_local_elements, + lower_bounds.get_data()); + auto permutation_data = permutation.get_data(); + thrust::transform( + thrust::device, lower_bounds.get_data(), + lower_bounds.get_data() + num_non_local_elements, + non_local_col_idxs.get_data(), + [permutation_data] __host__ __device__(const size_type lower_bound) { + return permutation_data[lower_bound]; + }); + + // 5. 
compute gather idxs and recv_sizes + local_gather_idxs.resize_and_reset(num_non_local_cols); + auto transform_it = thrust::make_zip_iterator(thrust::make_tuple( + non_local_to_global.get_data(), non_local_col_range_ids.get_data())); + thrust::transform( + thrust::device, transform_it, transform_it + num_non_local_cols, + local_gather_idxs.get_data(), + [map_to_local_col] __host__ __device__( + const thrust::tuple& tuple) { + return map_to_local_col(thrust::get<0>(tuple), + thrust::get<1>(tuple)); + }); + + auto recv_sizes_ptr = recv_sizes.get_data(); + thrust::fill_n(thrust::device, recv_sizes_ptr, num_parts, 0); + thrust::for_each_n(thrust::device, non_local_col_part_ids.get_data(), + num_non_local_cols, + [recv_sizes_ptr] __device__(const size_type part) { + atomic_add(recv_sizes_ptr + part, 1); + }); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_BUILD_LOCAL_NONLOCAL); diff --git a/common/cuda_hip/distributed/vector_kernels.hpp.inc b/common/cuda_hip/distributed/vector_kernels.hpp.inc new file mode 100644 index 00000000000..b3030ce5252 --- /dev/null +++ b/common/cuda_hip/distributed/vector_kernels.hpp.inc @@ -0,0 +1,96 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + + +template +void build_local( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + partition, + comm_index_type local_part, matrix::Dense* local_mtx) +{ + const auto* range_bounds = partition->get_range_bounds(); + const auto* range_starting_indices = + partition->get_range_starting_indices(); + const auto* part_ids = partition->get_part_ids(); + const auto num_ranges = partition->get_num_ranges(); + + array range_id{exec, input.get_num_elems()}; + thrust::upper_bound(thrust::device, range_bounds + 1, + range_bounds + num_ranges + 1, + input.get_const_row_idxs(), + input.get_const_row_idxs() + input.get_num_elems(), + range_id.get_data(), thrust::less()); + + // write values with local rows into the local matrix at the correct index + // this needs the following iterators: + // - local_row_it: (global_row, range_id) -> local row index + // - flat_idx_it: (local_row, col) -> flat index in local matrix values + // array + // the flat_idx_it is used by the scatter_if as an index map for the values + auto map_to_local_row = + [range_bounds, range_starting_indices] __host__ __device__( + const 
thrust::tuple& idx_range_id) { + const auto idx = thrust::get<0>(idx_range_id); + const auto rid = thrust::get<1>(idx_range_id); + return static_cast(idx - range_bounds[rid]) + + range_starting_indices[rid]; + }; + auto local_row_it = thrust::make_transform_iterator( + thrust::make_zip_iterator(thrust::make_tuple(input.get_const_row_idxs(), + range_id.get_data())), + map_to_local_row); + + auto stride = local_mtx->get_stride(); + auto map_to_flat_idx = + [stride] __host__ __device__( + const thrust::tuple& row_col) { + return thrust::get<0>(row_col) * stride + thrust::get<1>(row_col); + }; + auto flat_idx_it = thrust::make_transform_iterator( + thrust::make_zip_iterator( + thrust::make_tuple(local_row_it, input.get_const_col_idxs())), + map_to_flat_idx); + + auto is_local_row = + [part_ids, local_part] __host__ __device__(const size_type rid) { + return part_ids[rid] == local_part; + }; + thrust::scatter_if(thrust::device, input.get_const_values(), + input.get_const_values() + input.get_num_elems(), + flat_idx_it, range_id.get_data(), + local_mtx->get_values(), is_local_row); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); diff --git a/common/unified/distributed/partition_kernels.cpp b/common/unified/distributed/partition_kernels.cpp index ecb8e9b0733..8129dec3fc3 100644 --- a/common/unified/distributed/partition_kernels.cpp +++ b/common/unified/distributed/partition_kernels.cpp @@ -44,7 +44,7 @@ namespace GKO_DEVICE_NAMESPACE { namespace partition { -using distributed::comm_index_type; +using experimental::distributed::comm_index_type; void count_ranges(std::shared_ptr exec, const array& mapping, size_type& num_ranges) @@ -149,7 +149,8 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE(GKO_PARTITION_BUILD_FROM_GLOBAL_SIZE); template void has_ordered_parts( std::shared_ptr exec, - const distributed::Partition* partition, + const experimental::distributed::Partition* + partition, bool* result) { const auto 
part_ids = partition->get_part_ids(); diff --git a/common/unified/matrix/dense_kernels.cpp b/common/unified/matrix/dense_kernels.cpp index c1d1422c483..6d9f78d2cdb 100644 --- a/common/unified/matrix/dense_kernels.cpp +++ b/common/unified/matrix/dense_kernels.cpp @@ -380,6 +380,38 @@ GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( GKO_DECLARE_DENSE_COUNT_NONZEROS_PER_ROW_KERNEL_SIZE_T); +template +void compute_squared_norm2(std::shared_ptr exec, + const matrix::Dense* x, + matrix::Dense>* result, + array& tmp) +{ + run_kernel_col_reduction_cached( + exec, + [] GKO_KERNEL(auto i, auto j, auto x) { return squared_norm(x(i, j)); }, + GKO_KERNEL_REDUCE_SUM(remove_complex), result->get_values(), + x->get_size(), tmp, x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE( + GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); + + +template +void compute_sqrt(std::shared_ptr exec, + matrix::Dense* x) +{ + run_kernel( + exec, + [] GKO_KERNEL(auto row, auto col, auto x) { + x(row, col) = sqrt(x(row, col)); + }, + x->get_size(), x); +} + +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); + + template void symm_permute(std::shared_ptr exec, const array* permutation_indices, diff --git a/common/unified/solver/bicg_kernels.cpp b/common/unified/solver/bicg_kernels.cpp index 5b6b27e5737..c63e9b1467b 100644 --- a/common/unified/solver/bicg_kernels.cpp +++ b/common/unified/solver/bicg_kernels.cpp @@ -60,26 +60,38 @@ void initialize(std::shared_ptr exec, matrix::Dense* q2, array* stop_status) { - run_kernel_solver( - exec, - [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto z, auto p, - auto q, auto prev_rho, auto rho, auto r2, auto z2, - auto p2, auto q2, auto stop) { - if (row == 0) { + if (b->get_size()) { + run_kernel_solver( + exec, + [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto z, auto p, + auto q, auto prev_rho, auto rho, auto r2, auto z2, + auto p2, auto q2, auto stop) { + if (row == 0) { + rho[col] = zero(rho[col]); + prev_rho[col] = one(prev_rho[col]); + 
stop[col].reset(); + } + r(row, col) = b(row, col); + r2(row, col) = b(row, col); + z(row, col) = p(row, col) = q(row, col) = z2(row, col) = + p2(row, col) = q2(row, col) = zero(z(row, col)); + }, + b->get_size(), b->get_stride(), default_stride(b), + default_stride(r), default_stride(z), default_stride(p), + default_stride(q), row_vector(prev_rho), row_vector(rho), + default_stride(r2), default_stride(z2), default_stride(p2), + default_stride(q2), *stop_status); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto prev_rho, auto rho, auto stop) { rho[col] = zero(rho[col]); prev_rho[col] = one(prev_rho[col]); stop[col].reset(); - } - r(row, col) = b(row, col); - r2(row, col) = b(row, col); - z(row, col) = p(row, col) = q(row, col) = z2(row, col) = - p2(row, col) = q2(row, col) = zero(z(row, col)); - }, - b->get_size(), b->get_stride(), default_stride(b), default_stride(r), - default_stride(z), default_stride(p), default_stride(q), - row_vector(prev_rho), row_vector(rho), default_stride(r2), - default_stride(z2), default_stride(p2), default_stride(q2), - *stop_status); + }, + b->get_size()[1], row_vector(prev_rho), row_vector(rho), + *stop_status); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICG_INITIALIZE_KERNEL); diff --git a/common/unified/solver/bicgstab_kernels.cpp b/common/unified/solver/bicgstab_kernels.cpp index 1c120065198..2e675514c5b 100644 --- a/common/unified/solver/bicgstab_kernels.cpp +++ b/common/unified/solver/bicgstab_kernels.cpp @@ -62,27 +62,41 @@ void initialize(std::shared_ptr exec, matrix::Dense* omega, array* stop_status) { - run_kernel_solver( - exec, - [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto rr, auto y, - auto s, auto t, auto z, auto v, auto p, auto prev_rho, - auto rho, auto alpha, auto beta, auto gamma, auto omega, - auto stop) { - if (row == 0) { + if (b->get_size()) { + run_kernel_solver( + exec, + [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto rr, auto y, + auto s, auto t, auto z, auto v, 
auto p, auto prev_rho, + auto rho, auto alpha, auto beta, auto gamma, + auto omega, auto stop) { + if (row == 0) { + rho[col] = prev_rho[col] = alpha[col] = beta[col] = + gamma[col] = omega[col] = one(rho[col]); + stop[col].reset(); + } + r(row, col) = b(row, col); + rr(row, col) = z(row, col) = v(row, col) = s(row, col) = t( + row, col) = y(row, col) = p(row, col) = zero(rr(row, col)); + }, + b->get_size(), b->get_stride(), default_stride(b), + default_stride(r), default_stride(rr), default_stride(y), + default_stride(s), default_stride(t), default_stride(z), + default_stride(v), default_stride(p), row_vector(prev_rho), + row_vector(rho), row_vector(alpha), row_vector(beta), + row_vector(gamma), row_vector(omega), *stop_status); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto prev_rho, auto rho, auto alpha, + auto beta, auto gamma, auto omega, auto stop) { rho[col] = prev_rho[col] = alpha[col] = beta[col] = gamma[col] = omega[col] = one(rho[col]); stop[col].reset(); - } - r(row, col) = b(row, col); - rr(row, col) = z(row, col) = v(row, col) = s(row, col) = - t(row, col) = y(row, col) = p(row, col) = zero(rr(row, col)); - }, - b->get_size(), b->get_stride(), default_stride(b), default_stride(r), - default_stride(rr), default_stride(y), default_stride(s), - default_stride(t), default_stride(z), default_stride(v), - default_stride(p), row_vector(prev_rho), row_vector(rho), - row_vector(alpha), row_vector(beta), row_vector(gamma), - row_vector(omega), *stop_status); + }, + b->get_size()[1], row_vector(prev_rho), row_vector(rho), + row_vector(alpha), row_vector(beta), row_vector(gamma), + row_vector(omega), *stop_status); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_BICGSTAB_INITIALIZE_KERNEL); diff --git a/common/unified/solver/cg_kernels.cpp b/common/unified/solver/cg_kernels.cpp index 36a7fc3ce9b..966317adbcf 100644 --- a/common/unified/solver/cg_kernels.cpp +++ b/common/unified/solver/cg_kernels.cpp @@ -58,21 +58,33 @@ void 
initialize(std::shared_ptr exec, matrix::Dense* rho, array* stop_status) { - run_kernel_solver( - exec, - [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto z, auto p, - auto q, auto prev_rho, auto rho, auto stop) { - if (row == 0) { + if (b->get_size()) { + run_kernel_solver( + exec, + [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto z, auto p, + auto q, auto prev_rho, auto rho, auto stop) { + if (row == 0) { + rho[col] = zero(rho[col]); + prev_rho[col] = one(prev_rho[col]); + stop[col].reset(); + } + r(row, col) = b(row, col); + z(row, col) = p(row, col) = q(row, col) = zero(z(row, col)); + }, + b->get_size(), b->get_stride(), b, default_stride(r), + default_stride(z), default_stride(p), default_stride(q), + row_vector(prev_rho), row_vector(rho), *stop_status); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto prev_rho, auto rho, auto stop) { rho[col] = zero(rho[col]); prev_rho[col] = one(prev_rho[col]); stop[col].reset(); - } - r(row, col) = b(row, col); - z(row, col) = p(row, col) = q(row, col) = zero(z(row, col)); - }, - b->get_size(), b->get_stride(), b, default_stride(r), default_stride(z), - default_stride(p), default_stride(q), row_vector(prev_rho), - row_vector(rho), *stop_status); + }, + b->get_size()[1], row_vector(prev_rho), row_vector(rho), + *stop_status); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CG_INITIALIZE_KERNEL); diff --git a/common/unified/solver/cgs_kernels.cpp b/common/unified/solver/cgs_kernels.cpp index 9878c0d9751..8cefb60e976 100644 --- a/common/unified/solver/cgs_kernels.cpp +++ b/common/unified/solver/cgs_kernels.cpp @@ -63,27 +63,43 @@ void initialize(std::shared_ptr exec, matrix::Dense* rho, array* stop_status) { - run_kernel_solver( - exec, - [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto r_tld, auto p, - auto q, auto u, auto u_hat, auto v_hat, auto t, - auto alpha, auto beta, auto gamma, auto prev_rho, - auto rho, auto stop) { - if (row == 0) { + if (b->get_size()) { + run_kernel_solver( 
+ exec, + [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto r_tld, + auto p, auto q, auto u, auto u_hat, auto v_hat, + auto t, auto alpha, auto beta, auto gamma, + auto prev_rho, auto rho, auto stop) { + if (row == 0) { + rho[col] = zero(rho[col]); + prev_rho[col] = alpha[col] = beta[col] = gamma[col] = + one(prev_rho[col]); + stop[col].reset(); + } + r(row, col) = r_tld(row, col) = b(row, col); + u(row, col) = u_hat(row, col) = p(row, col) = q(row, col) = + v_hat(row, col) = t(row, col) = zero(u(row, col)); + }, + b->get_size(), b->get_stride(), default_stride(b), + default_stride(r), default_stride(r_tld), default_stride(p), + default_stride(q), default_stride(u), default_stride(u_hat), + default_stride(v_hat), default_stride(t), row_vector(alpha), + row_vector(beta), row_vector(gamma), row_vector(prev_rho), + row_vector(rho), *stop_status); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto alpha, auto beta, auto gamma, + auto prev_rho, auto rho, auto stop) { rho[col] = zero(rho[col]); prev_rho[col] = alpha[col] = beta[col] = gamma[col] = one(prev_rho[col]); stop[col].reset(); - } - r(row, col) = r_tld(row, col) = b(row, col); - u(row, col) = u_hat(row, col) = p(row, col) = q(row, col) = - v_hat(row, col) = t(row, col) = zero(u(row, col)); - }, - b->get_size(), b->get_stride(), default_stride(b), default_stride(r), - default_stride(r_tld), default_stride(p), default_stride(q), - default_stride(u), default_stride(u_hat), default_stride(v_hat), - default_stride(t), row_vector(alpha), row_vector(beta), - row_vector(gamma), row_vector(prev_rho), row_vector(rho), *stop_status); + }, + b->get_size()[1], row_vector(alpha), row_vector(beta), + row_vector(gamma), row_vector(prev_rho), row_vector(rho), + *stop_status); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_CGS_INITIALIZE_KERNEL); diff --git a/common/unified/solver/fcg_kernels.cpp b/common/unified/solver/fcg_kernels.cpp index 4277f45f5fa..fb03f686d56 100644 --- 
a/common/unified/solver/fcg_kernels.cpp +++ b/common/unified/solver/fcg_kernels.cpp @@ -59,23 +59,36 @@ void initialize(std::shared_ptr exec, matrix::Dense* rho, matrix::Dense* rho_t, array* stop_status) { - run_kernel_solver( - exec, - [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto z, auto p, - auto q, auto t, auto prev_rho, auto rho, auto rho_t, - auto stop) { - if (row == 0) { + if (b->get_size()) { + run_kernel_solver( + exec, + [] GKO_KERNEL(auto row, auto col, auto b, auto r, auto z, auto p, + auto q, auto t, auto prev_rho, auto rho, auto rho_t, + auto stop) { + if (row == 0) { + rho[col] = zero(rho[col]); + prev_rho[col] = rho_t[col] = one(prev_rho[col]); + stop[col].reset(); + } + t(row, col) = r(row, col) = b(row, col); + z(row, col) = p(row, col) = q(row, col) = zero(z(row, col)); + }, + b->get_size(), b->get_stride(), default_stride(b), + default_stride(r), default_stride(z), default_stride(p), + default_stride(q), default_stride(t), row_vector(prev_rho), + row_vector(rho), row_vector(rho_t), *stop_status); + } else { + run_kernel( + exec, + [] GKO_KERNEL(auto col, auto prev_rho, auto rho, auto rho_t, + auto stop) { rho[col] = zero(rho[col]); prev_rho[col] = rho_t[col] = one(prev_rho[col]); stop[col].reset(); - } - t(row, col) = r(row, col) = b(row, col); - z(row, col) = p(row, col) = q(row, col) = zero(z(row, col)); - }, - b->get_size(), b->get_stride(), default_stride(b), default_stride(r), - default_stride(z), default_stride(p), default_stride(q), - default_stride(t), row_vector(prev_rho), row_vector(rho), - row_vector(rho_t), *stop_status); + }, + b->get_size()[1], row_vector(prev_rho), row_vector(rho), + row_vector(rho_t), *stop_status); + } } GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_FCG_INITIALIZE_KERNEL); diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 427ac8091a3..26a7bf1c3e3 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -6,9 +6,11 @@ target_sources(ginkgo base/array.cpp base/combination.cpp 
base/composition.cpp + base/dense_cache.cpp base/device_matrix_data.cpp base/executor.cpp base/index_set.cpp + base/mpi.cpp base/mtx_io.cpp base/perturbation.cpp base/version.cpp @@ -67,7 +69,10 @@ endif() if (GINKGO_BUILD_MPI) target_sources(ginkgo - PRIVATE mpi/exception.cpp) + PRIVATE + mpi/exception.cpp + distributed/matrix.cpp + distributed/vector.cpp) endif() ginkgo_compile_features(ginkgo) diff --git a/core/base/dense_cache.cpp b/core/base/dense_cache.cpp new file mode 100644 index 00000000000..91e4a4247cd --- /dev/null +++ b/core/base/dense_cache.cpp @@ -0,0 +1,69 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +namespace gko { +namespace detail { + + +template +void DenseCache::init(std::shared_ptr exec, + dim<2> size) const +{ + if (!vec || vec->get_size() != size || vec->get_executor() != exec) { + vec = matrix::Dense::create(exec, size); + } +} + + +template +void DenseCache::init_from( + const matrix::Dense* template_vec) const +{ + if (!vec || vec->get_size() != template_vec->get_size() || + vec->get_executor() != template_vec->get_executor()) { + vec = matrix::Dense::create_with_config_of(template_vec); + } +} + + +#define GKO_DECLARE_DENSE_CACHE(_type) class DenseCache<_type> +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DENSE_CACHE); + + +} // namespace detail +} // namespace gko diff --git a/core/base/mpi.cpp b/core/base/mpi.cpp new file mode 100644 index 00000000000..ed33cd38e42 --- /dev/null +++ b/core/base/mpi.cpp @@ -0,0 +1,99 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#if GINKGO_BUILD_MPI + + +#include + + +#include + + +namespace gko { +namespace experimental { +namespace mpi { + + +int map_rank_to_device_id(MPI_Comm comm, const int num_devices) +{ + GKO_ASSERT(num_devices > 0); + if (num_devices == 1) { + return 0; + } else { + auto mpi_node_local_rank = [](MPI_Comm comm_) { + int local_rank; + MPI_Comm local_comm; + GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_split_type( + comm_, MPI_COMM_TYPE_SHARED, 0, MPI_INFO_NULL, &local_comm)); + GKO_ASSERT_NO_MPI_ERRORS(MPI_Comm_rank(local_comm, &local_rank)); + MPI_Comm_free(&local_comm); + return local_rank; + }; + + // When we are using MPI_COMM_WORLD, there might be already an + // environment variable describing the node local rank, so we + // prioritize it. 
If no suitable environment variable is found + // we determine the node-local rank with MPI calls. + int local_rank; + int compare_result; + GKO_ASSERT_NO_MPI_ERRORS( + MPI_Comm_compare(comm, MPI_COMM_WORLD, &compare_result)); + if (compare_result != MPI_IDENT && compare_result != MPI_CONGRUENT) { + local_rank = mpi_node_local_rank(comm); + } else { + if (auto str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK")) { + local_rank = std::stoi(str); + } else if (auto str = std::getenv("OMPI_COMM_WORLD_LOCAL_RANK")) { + local_rank = std::stoi(str); + } else if (auto str = std::getenv("MPI_LOCALRANKID")) { + local_rank = std::stoi(str); + } else if (auto str = std::getenv("SLURM_LOCALID")) { + local_rank = std::stoi(str); + } else { + local_rank = mpi_node_local_rank(comm); + } + } + return local_rank % num_devices; + } +} + + +} // namespace mpi +} // namespace experimental +} // namespace gko + + +#endif // GKO_HAVE_MPI diff --git a/core/base/noop_scoped_device_id_guard.hpp b/core/base/noop_scoped_device_id_guard.hpp new file mode 100644 index 00000000000..ab6f514e9dc --- /dev/null +++ b/core/base/noop_scoped_device_id_guard.hpp @@ -0,0 +1,57 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GINKGO_CORE_BASE_NOOP_SCOPED_DEVICE_ID_GUARD_HPP +#define GINKGO_CORE_BASE_NOOP_SCOPED_DEVICE_ID_GUARD_HPP + + +#include + + +namespace gko { +namespace detail { + + +/** + * An implementation of generic_scoped_device_id_guard that does nothing. + * + * This is used for OmpExecutor and DpcppExecutor, since they don't require + * setting a device id. 
+ */ +class noop_scoped_device_id_guard : public generic_scoped_device_id_guard {}; + + +} // namespace detail +} // namespace gko + + +#endif // GINKGO_CORE_BASE_NOOP_SCOPED_DEVICE_ID_GUARD_HPP diff --git a/core/device_hooks/CMakeLists.txt b/core/device_hooks/CMakeLists.txt index fcb370a81a0..901acef7797 100644 --- a/core/device_hooks/CMakeLists.txt +++ b/core/device_hooks/CMakeLists.txt @@ -45,6 +45,7 @@ if (NOT GINKGO_BUILD_REFERENCE) add_library(ginkgo_reference $ reference_hooks.cpp) + target_link_libraries(ginkgo_reference PRIVATE ginkgo_omp) target_link_libraries(ginkgo_reference PUBLIC ginkgo_device) ginkgo_compile_features(ginkgo_reference) ginkgo_default_includes(ginkgo_reference) diff --git a/core/device_hooks/common_kernels.inc.cpp b/core/device_hooks/common_kernels.inc.cpp index 83f6628404e..fdaf02b050d 100644 --- a/core/device_hooks/common_kernels.inc.cpp +++ b/core/device_hooks/common_kernels.inc.cpp @@ -43,7 +43,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "core/components/precision_conversion_kernels.hpp" #include "core/components/prefix_sum_kernels.hpp" #include "core/components/reduce_array_kernels.hpp" +#include "core/distributed/matrix_kernels.hpp" #include "core/distributed/partition_kernels.hpp" +#include "core/distributed/vector_kernels.hpp" #include "core/factorization/cholesky_kernels.hpp" #include "core/factorization/factorization_kernels.hpp" #include "core/factorization/ic_kernels.hpp" @@ -144,6 +146,13 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ GKO_INSTANTIATE_FOR_EACH_MIXED_VALUE_AND_INDEX_TYPE_2(_macro) +#define GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ + template \ + _macro(ValueType, LocalIndexType, GlobalIndexType) \ + GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ + GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) + #define GKO_STUB_TEMPLATE_TYPE(_macro) \ template \ _macro(IndexType) GKO_NOT_COMPILED(GKO_HOOK_MODULE); \ @@ -243,6 +252,24 @@ GKO_STUB_LOCAL_GLOBAL_TYPE(GKO_DECLARE_PARTITION_IS_ORDERED); } // namespace partition +namespace distributed_vector { + + +GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); + + +} + +namespace distributed_matrix { + + +GKO_STUB_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_BUILD_LOCAL_NONLOCAL); + + +} // namespace distributed_matrix + + namespace dense { @@ -264,6 +291,8 @@ GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_CONJ_DOT_DISPATCH_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM2_DISPATCH_KERNEL); GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL); +GKO_STUB_VALUE_TYPE(GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL); GKO_STUB_VALUE_AND_INDEX_TYPE(GKO_DECLARE_DENSE_CONVERT_TO_CSR_KERNEL); diff --git a/core/device_hooks/cuda_hooks.cpp b/core/device_hooks/cuda_hooks.cpp index fa6dbe6c773..1d456e8173c 100644 --- a/core/device_hooks/cuda_hooks.cpp +++ b/core/device_hooks/cuda_hooks.cpp @@ -31,7 +31,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ #include -#include #include @@ -40,9 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include -#include "core/matrix/csr_kernels.hpp" - - namespace gko { @@ -108,6 +104,10 @@ void CudaExecutor::raw_copy_to(const DpcppExecutor*, size_type num_bytes, void CudaExecutor::synchronize() const GKO_NOT_COMPILED(cuda); +scoped_device_id_guard CudaExecutor::get_scoped_device_id_guard() const + GKO_NOT_COMPILED(cuda); + + void CudaExecutor::run(const Operation& op) const { op.run( @@ -154,6 +154,11 @@ void CudaExecutor::set_gpu_property() {} void CudaExecutor::init_handles() {} +scoped_device_id_guard::scoped_device_id_guard(const CudaExecutor* exec, + int device_id) + GKO_NOT_COMPILED(cuda); + + } // namespace gko diff --git a/core/device_hooks/dpcpp_hooks.cpp b/core/device_hooks/dpcpp_hooks.cpp index 9cd057fbeab..b1dbc3f666b 100644 --- a/core/device_hooks/dpcpp_hooks.cpp +++ b/core/device_hooks/dpcpp_hooks.cpp @@ -121,6 +121,10 @@ void DpcppExecutor::run(const Operation& op) const } +scoped_device_id_guard DpcppExecutor::get_scoped_device_id_guard() const + GKO_NOT_COMPILED(dpcpp); + + int DpcppExecutor::get_num_devices(std::string) { return 0; } @@ -142,6 +146,11 @@ bool DpcppExecutor::verify_memory_to(const DpcppExecutor* dest_exec) const } +scoped_device_id_guard::scoped_device_id_guard(const DpcppExecutor* exec, + int device_id) + GKO_NOT_COMPILED(dpcpp); + + } // namespace gko diff --git a/core/device_hooks/hip_hooks.cpp b/core/device_hooks/hip_hooks.cpp index 54cc6439956..35bdd30b068 100644 --- a/core/device_hooks/hip_hooks.cpp +++ b/core/device_hooks/hip_hooks.cpp @@ -112,6 +112,10 @@ void HipExecutor::run(const Operation& op) const } +scoped_device_id_guard HipExecutor::get_scoped_device_id_guard() const + GKO_NOT_COMPILED(hip); + + std::string HipError::get_error(int64) { return "ginkgo HIP module is not compiled"; @@ -151,6 +155,11 @@ void HipExecutor::set_gpu_property() {} void HipExecutor::init_handles() {} +scoped_device_id_guard::scoped_device_id_guard(const HipExecutor* exec, + int device_id) + GKO_NOT_COMPILED(hip); + + } // namespace gko 
diff --git a/core/device_hooks/omp_hooks.cpp b/core/device_hooks/omp_hooks.cpp index 29f568c6932..981585c909d 100644 --- a/core/device_hooks/omp_hooks.cpp +++ b/core/device_hooks/omp_hooks.cpp @@ -30,6 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include +#include #include @@ -44,6 +46,11 @@ version version_info::get_omp_version() noexcept } +scoped_device_id_guard::scoped_device_id_guard(const OmpExecutor* exec, + int device_id) + GKO_NOT_COMPILED(omp); + + } // namespace gko diff --git a/core/device_hooks/reference_hooks.cpp b/core/device_hooks/reference_hooks.cpp index ac39ed1070d..a3e2fe3a34d 100644 --- a/core/device_hooks/reference_hooks.cpp +++ b/core/device_hooks/reference_hooks.cpp @@ -30,6 +30,8 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ +#include +#include #include @@ -44,6 +46,11 @@ version version_info::get_reference_version() noexcept } +scoped_device_id_guard::scoped_device_id_guard(const ReferenceExecutor* exec, + int device_id) + GKO_NOT_COMPILED(reference); + + } // namespace gko diff --git a/core/distributed/helpers.hpp b/core/distributed/helpers.hpp new file mode 100644 index 00000000000..0e4f7b34e55 --- /dev/null +++ b/core/distributed/helpers.hpp @@ -0,0 +1,128 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include + + +namespace gko { +namespace detail { + + +template +std::unique_ptr> create_with_config_of( + const matrix::Dense* mtx) +{ + return matrix::Dense::create(mtx->get_executor(), + mtx->get_size(), mtx->get_stride()); +} + + +template +const matrix::Dense* get_local(const matrix::Dense* mtx) +{ + return mtx; +} + + +template +matrix::Dense* get_local(matrix::Dense* mtx) +{ + return mtx; +} + + +#if GINKGO_BUILD_MPI + + +template +std::unique_ptr> +create_with_config_of(const experimental::distributed::Vector* mtx) +{ + return experimental::distributed::Vector::create( + mtx->get_executor(), mtx->get_communicator(), mtx->get_size(), + mtx->get_local_vector()->get_size(), + mtx->get_local_vector()->get_stride()); +} + + +template +matrix::Dense* get_local( + experimental::distributed::Vector* mtx) +{ + return const_cast*>(mtx->get_local_vector()); +} + + +template +const matrix::Dense* get_local( + const experimental::distributed::Vector* mtx) +{ + return mtx->get_local_vector(); +} + + +#endif + + +template +bool is_distributed(Arg* linop) +{ +#if GINKGO_BUILD_MPI + return dynamic_cast( + linop); +#else + return false; +#endif +} + + +template +bool is_distributed(Arg* linop, Rest*... rest) +{ +#if GINKGO_BUILD_MPI + bool is_distributed_value = + dynamic_cast(linop); + GKO_ASSERT(is_distributed_value == is_distributed(rest...)); + return is_distributed_value; +#else + return false; +#endif +} + + +} // namespace detail +} // namespace gko diff --git a/core/distributed/matrix.cpp b/core/distributed/matrix.cpp new file mode 100644 index 00000000000..924dc216086 --- /dev/null +++ b/core/distributed/matrix.cpp @@ -0,0 +1,450 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include + + +#include "core/distributed/matrix_kernels.hpp" + + +namespace gko { +namespace experimental { +namespace distributed { +namespace matrix { +namespace { + + +GKO_REGISTER_OPERATION(build_local_nonlocal, + distributed_matrix::build_local_nonlocal); + + +} // namespace +} // namespace matrix + + +template +Matrix::Matrix( + std::shared_ptr exec, mpi::communicator comm) + : Matrix(exec, comm, with_matrix_type()) +{} + + +template +Matrix::Matrix( + std::shared_ptr exec, mpi::communicator comm, + const LinOp* local_matrix_type) + : Matrix(exec, comm, local_matrix_type, local_matrix_type) +{} + + +template +Matrix::Matrix( + std::shared_ptr exec, mpi::communicator comm, + const LinOp* local_matrix_template, const LinOp* non_local_matrix_template) + : EnableLinOp< + Matrix>{exec}, + DistributedBase{comm}, + send_offsets_(comm.size() + 1), + send_sizes_(comm.size()), + recv_offsets_(comm.size() + 1), + recv_sizes_(comm.size()), + gather_idxs_{exec}, + non_local_to_global_{exec}, + one_scalar_{}, + local_mtx_{local_matrix_template->clone(exec)}, + non_local_mtx_{non_local_matrix_template->clone(exec)} +{ + GKO_ASSERT( + (dynamic_cast*>( + local_mtx_.get()))); + GKO_ASSERT( + (dynamic_cast*>( + non_local_mtx_.get()))); + one_scalar_.init(exec, dim<2>{1, 1}); + one_scalar_->fill(one()); +} + + +template +void Matrix::convert_to( + Matrix, local_index_type, global_index_type>* + result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->copy_from(this->local_mtx_.get()); + result->non_local_mtx_->copy_from(this->non_local_mtx_.get()); + result->gather_idxs_ = this->gather_idxs_; + result->send_offsets_ = this->send_offsets_; + result->recv_offsets_ = this->recv_offsets_; + result->recv_sizes_ = this->recv_sizes_; + result->send_sizes_ = this->send_sizes_; + result->non_local_to_global_ = 
this->non_local_to_global_; + result->set_size(this->get_size()); +} + + +template +void Matrix::move_to( + Matrix, local_index_type, global_index_type>* + result) +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->local_mtx_->move_from(this->local_mtx_.get()); + result->non_local_mtx_->move_from(this->non_local_mtx_.get()); + result->gather_idxs_ = std::move(this->gather_idxs_); + result->send_offsets_ = std::move(this->send_offsets_); + result->recv_offsets_ = std::move(this->recv_offsets_); + result->recv_sizes_ = std::move(this->recv_sizes_); + result->send_sizes_ = std::move(this->send_sizes_); + result->non_local_to_global_ = std::move(this->non_local_to_global_); + result->set_size(this->get_size()); + this->set_size({}); +} + + +template +void Matrix::read_distributed( + const device_matrix_data& data, + const Partition* row_partition, + const Partition* col_partition) +{ + const auto comm = this->get_communicator(); + GKO_ASSERT_EQ(data.get_size()[0], row_partition->get_size()); + GKO_ASSERT_EQ(data.get_size()[1], col_partition->get_size()); + GKO_ASSERT_EQ(comm.size(), row_partition->get_num_parts()); + GKO_ASSERT_EQ(comm.size(), col_partition->get_num_parts()); + auto exec = this->get_executor(); + auto local_part = comm.rank(); + + // set up LinOp sizes + auto num_parts = static_cast(row_partition->get_num_parts()); + auto global_num_rows = row_partition->get_size(); + auto global_num_cols = col_partition->get_size(); + dim<2> global_dim{global_num_rows, global_num_cols}; + this->set_size(global_dim); + + // temporary storage for the output + array local_row_idxs{exec}; + array local_col_idxs{exec}; + array local_values{exec}; + array non_local_row_idxs{exec}; + array non_local_col_idxs{exec}; + array non_local_values{exec}; + array recv_gather_idxs{exec}; + array recv_sizes_array{exec, num_parts}; + + // build local, non-local matrix data and communication structures + 
exec->run(matrix::make_build_local_nonlocal( + data, make_temporary_clone(exec, row_partition).get(), + make_temporary_clone(exec, col_partition).get(), local_part, + local_row_idxs, local_col_idxs, local_values, non_local_row_idxs, + non_local_col_idxs, non_local_values, recv_gather_idxs, + recv_sizes_array, non_local_to_global_)); + + // read the local matrix data + const auto num_local_rows = + static_cast(row_partition->get_part_size(local_part)); + const auto num_local_cols = + static_cast(col_partition->get_part_size(local_part)); + const auto num_non_local_cols = non_local_to_global_.get_num_elems(); + device_matrix_data local_data{ + exec, dim<2>{num_local_rows, num_local_cols}, std::move(local_row_idxs), + std::move(local_col_idxs), std::move(local_values)}; + device_matrix_data non_local_data{ + exec, dim<2>{num_local_rows, num_non_local_cols}, + std::move(non_local_row_idxs), std::move(non_local_col_idxs), + std::move(non_local_values)}; + as>(this->local_mtx_) + ->read(std::move(local_data)); + as>(this->non_local_mtx_) + ->read(std::move(non_local_data)); + + // exchange step 1: determine recv_sizes, send_sizes, send_offsets + exec->get_master()->copy_from(exec.get(), num_parts, + recv_sizes_array.get_const_data(), + recv_sizes_.data()); + std::partial_sum(recv_sizes_.begin(), recv_sizes_.end(), + recv_offsets_.begin() + 1); + comm.all_to_all(exec, recv_sizes_.data(), 1, send_sizes_.data(), 1); + std::partial_sum(send_sizes_.begin(), send_sizes_.end(), + send_offsets_.begin() + 1); + send_offsets_[0] = 0; + recv_offsets_[0] = 0; + + // exchange step 2: exchange gather_idxs from receivers to senders + auto use_host_buffer = exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + recv_gather_idxs.set_executor(exec->get_master()); + gather_idxs_.clear(); + gather_idxs_.set_executor(exec->get_master()); + } + gather_idxs_.resize_and_reset(send_offsets_.back()); + comm.all_to_all_v(use_host_buffer ? 
exec->get_master() : exec, + recv_gather_idxs.get_const_data(), recv_sizes_.data(), + recv_offsets_.data(), gather_idxs_.get_data(), + send_sizes_.data(), send_offsets_.data()); + if (use_host_buffer) { + gather_idxs_.set_executor(exec); + } +} + + +template +void Matrix::read_distributed( + const matrix_data& data, + const Partition* row_partition, + const Partition* col_partition) +{ + this->read_distributed( + device_matrix_data::create_from_host( + this->get_executor(), data), + row_partition, col_partition); +} + + +template +void Matrix::read_distributed( + const matrix_data& data, + const Partition* partition) +{ + this->read_distributed( + device_matrix_data::create_from_host( + this->get_executor(), data), + partition, partition); +} + + +template +void Matrix::read_distributed( + const device_matrix_data& data, + const Partition* partition) +{ + this->read_distributed(data, partition, partition); +} + + +template +mpi::request Matrix::communicate( + const local_vector_type* local_b) const +{ + auto exec = this->get_executor(); + const auto comm = this->get_communicator(); + auto num_cols = local_b->get_size()[1]; + auto send_size = send_offsets_.back(); + auto recv_size = recv_offsets_.back(); + auto send_dim = dim<2>{static_cast(send_size), num_cols}; + auto recv_dim = dim<2>{static_cast(recv_size), num_cols}; + recv_buffer_.init(exec, recv_dim); + send_buffer_.init(exec, send_dim); + + local_b->row_gather(&gather_idxs_, send_buffer_.get()); + + auto use_host_buffer = exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + host_recv_buffer_.init(exec->get_master(), recv_dim); + host_send_buffer_.init(exec->get_master(), send_dim); + host_send_buffer_->copy_from(send_buffer_.get()); + } + + mpi::contiguous_type type(num_cols, mpi::type_impl::get_type()); + auto send_ptr = use_host_buffer ? host_send_buffer_->get_const_values() + : send_buffer_->get_const_values(); + auto recv_ptr = use_host_buffer ? 
host_recv_buffer_->get_values() + : recv_buffer_->get_values(); + exec->synchronize(); +#ifdef GINKGO_FORCE_SPMV_BLOCKING_COMM + comm.all_to_all_v(use_host_buffer ? exec->get_master() : exec, send_ptr, + send_sizes_.data(), send_offsets_.data(), type.get(), + recv_ptr, recv_sizes_.data(), recv_offsets_.data(), + type.get()); + return {}; +#else + return comm.i_all_to_all_v( + use_host_buffer ? exec->get_master() : exec, send_ptr, + send_sizes_.data(), send_offsets_.data(), type.get(), recv_ptr, + recv_sizes_.data(), recv_offsets_.data(), type.get()); +#endif +} + + +template +void Matrix::apply_impl( + const LinOp* b, LinOp* x) const +{ + distributed::precision_dispatch_real_complex( + [this](const auto dense_b, auto dense_x) { + auto x_exec = dense_x->get_executor(); + auto local_x = gko::matrix::Dense::create( + x_exec, dense_x->get_local_vector()->get_size(), + gko::make_array_view( + x_exec, + dense_x->get_local_vector()->get_num_stored_elements(), + dense_x->get_local_values()), + dense_x->get_local_vector()->get_stride()); + + auto req = this->communicate(dense_b->get_local_vector()); + local_mtx_->apply(dense_b->get_local_vector(), local_x.get()); + req.wait(); + + auto exec = this->get_executor(); + auto use_host_buffer = + exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + recv_buffer_->copy_from(host_recv_buffer_.get()); + } + non_local_mtx_->apply(one_scalar_.get(), recv_buffer_.get(), + one_scalar_.get(), local_x.get()); + }, + b, x); +} + + +template +void Matrix::apply_impl( + const LinOp* alpha, const LinOp* b, const LinOp* beta, LinOp* x) const +{ + distributed::precision_dispatch_real_complex( + [this](const auto local_alpha, const auto dense_b, + const auto local_beta, auto dense_x) { + const auto x_exec = dense_x->get_executor(); + auto local_x = gko::matrix::Dense::create( + x_exec, dense_x->get_local_vector()->get_size(), + gko::make_array_view( + x_exec, + dense_x->get_local_vector()->get_num_stored_elements(), + 
dense_x->get_local_values()), + dense_x->get_local_vector()->get_stride()); + + auto req = this->communicate(dense_b->get_local_vector()); + local_mtx_->apply(local_alpha, dense_b->get_local_vector(), + local_beta, local_x.get()); + req.wait(); + + auto exec = this->get_executor(); + auto use_host_buffer = + exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + recv_buffer_->copy_from(host_recv_buffer_.get()); + } + non_local_mtx_->apply(local_alpha, recv_buffer_.get(), + one_scalar_.get(), local_x.get()); + }, + alpha, b, beta, x); +} + + +template +Matrix::Matrix(const Matrix& other) + : EnableLinOp>{other.get_executor()}, + DistributedBase{other.get_communicator()} +{ + *this = other; +} + + +template +Matrix::Matrix( + Matrix&& other) noexcept + : EnableLinOp>{other.get_executor()}, + DistributedBase{other.get_communicator()} +{ + *this = std::move(other); +} + + +template +Matrix& +Matrix::operator=( + const Matrix& other) +{ + if (this != &other) { + GKO_ASSERT_EQ(other.get_communicator().size(), + this->get_communicator().size()); + this->set_size(other.get_size()); + local_mtx_->copy_from(other.local_mtx_.get()); + non_local_mtx_->copy_from(other.non_local_mtx_.get()); + gather_idxs_ = other.gather_idxs_; + send_offsets_ = other.send_offsets_; + recv_offsets_ = other.recv_offsets_; + send_sizes_ = other.send_sizes_; + recv_sizes_ = other.recv_sizes_; + non_local_to_global_ = other.non_local_to_global_; + one_scalar_.init(this->get_executor(), dim<2>{1, 1}); + one_scalar_->fill(one()); + } + return *this; +} + + +template +Matrix& +Matrix::operator=(Matrix&& other) +{ + if (this != &other) { + GKO_ASSERT_EQ(other.get_communicator().size(), + this->get_communicator().size()); + this->set_size(other.get_size()); + other.set_size({}); + local_mtx_->move_from(other.local_mtx_.get()); + non_local_mtx_->move_from(other.non_local_mtx_.get()); + gather_idxs_ = std::move(other.gather_idxs_); + send_offsets_ = std::move(other.send_offsets_); + 
recv_offsets_ = std::move(other.recv_offsets_); + send_sizes_ = std::move(other.send_sizes_); + recv_sizes_ = std::move(other.recv_sizes_); + non_local_to_global_ = std::move(other.non_local_to_global_); + one_scalar_.init(this->get_executor(), dim<2>{1, 1}); + one_scalar_->fill(one()); + } + return *this; +} + + +#define GKO_DECLARE_DISTRIBUTED_MATRIX(ValueType, LocalIndexType, \ + GlobalIndexType) \ + class Matrix +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_DISTRIBUTED_MATRIX); + + +} // namespace distributed +} // namespace experimental +} // namespace gko diff --git a/core/distributed/matrix_kernels.hpp b/core/distributed/matrix_kernels.hpp new file mode 100644 index 00000000000..878e7fe3239 --- /dev/null +++ b/core/distributed/matrix_kernels.hpp @@ -0,0 +1,89 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CORE_DISTRIBUTED_MATRIX_KERNELS_HPP_ +#define GKO_CORE_DISTRIBUTED_MATRIX_KERNELS_HPP_ + + +#include +#include +#include +#include +#include + + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_BUILD_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ + GlobalIndexType) \ + void build_local_nonlocal( \ + std::shared_ptr exec, \ + const device_matrix_data& input, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* row_partition, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* col_partition, \ + comm_index_type local_part, array& local_row_idxs, \ + array& local_col_idxs, array& local_values, \ + array& non_local_row_idxs, \ + array& non_local_col_idxs, \ + array& non_local_values, \ + array& local_gather_idxs, \ + array& recv_offsets, \ + array& non_local_to_global) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + using comm_index_type = experimental::distributed::comm_index_type; \ + template \ + GKO_DECLARE_BUILD_LOCAL_NONLOCAL(ValueType, LocalIndexType, \ + GlobalIndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(distributed_matrix, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_DISTRIBUTED_MATRIX_KERNELS_HPP_ diff --git 
a/core/distributed/partition.cpp b/core/distributed/partition.cpp index 4a58ccdaf7a..7dc634f17a9 100644 --- a/core/distributed/partition.cpp +++ b/core/distributed/partition.cpp @@ -37,6 +37,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { +namespace experimental { namespace distributed { namespace partition { @@ -142,4 +143,5 @@ GKO_INSTANTIATE_FOR_EACH_LOCAL_GLOBAL_INDEX_TYPE(GKO_DECLARE_PARTITION); } // namespace distributed +} // namespace experimental } // namespace gko diff --git a/core/distributed/partition_kernels.hpp b/core/distributed/partition_kernels.hpp index 4a1f76d94c5..9f9d162044a 100644 --- a/core/distributed/partition_kernels.hpp +++ b/core/distributed/partition_kernels.hpp @@ -76,27 +76,26 @@ namespace kernels { comm_index_type& num_empty_parts, \ LocalIndexType* ranks, LocalIndexType* sizes) -#define GKO_DECLARE_PARTITION_IS_ORDERED(LocalIndexType, GlobalIndexType) \ - void has_ordered_parts( \ - std::shared_ptr exec, \ - const distributed::Partition* \ - partition, \ - bool* result) - - -#define GKO_DECLARE_ALL_AS_TEMPLATES \ - using comm_index_type = distributed::comm_index_type; \ - GKO_PARTITION_COUNT_RANGES; \ - template \ - GKO_PARTITION_BUILD_FROM_CONTIGUOUS(GlobalIndexType); \ - template \ - GKO_PARTITION_BUILD_FROM_MAPPING(GlobalIndexType); \ - template \ - GKO_PARTITION_BUILD_FROM_GLOBAL_SIZE(GlobalIndexType); \ - template \ - GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES(LocalIndexType, \ - GlobalIndexType); \ - template \ +#define GKO_DECLARE_PARTITION_IS_ORDERED(LocalIndexType, GlobalIndexType) \ + void has_ordered_parts(std::shared_ptr exec, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* partition, \ + bool* result) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + using comm_index_type = experimental::distributed::comm_index_type; \ + GKO_PARTITION_COUNT_RANGES; \ + template \ + GKO_PARTITION_BUILD_FROM_CONTIGUOUS(GlobalIndexType); \ + template \ + 
GKO_PARTITION_BUILD_FROM_MAPPING(GlobalIndexType); \ + template \ + GKO_PARTITION_BUILD_FROM_GLOBAL_SIZE(GlobalIndexType); \ + template \ + GKO_DECLARE_PARTITION_BUILD_STARTING_INDICES(LocalIndexType, \ + GlobalIndexType); \ + template \ GKO_DECLARE_PARTITION_IS_ORDERED(LocalIndexType, GlobalIndexType) GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(partition, diff --git a/core/distributed/vector.cpp b/core/distributed/vector.cpp new file mode 100644 index 00000000000..4d338d77f05 --- /dev/null +++ b/core/distributed/vector.cpp @@ -0,0 +1,543 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include "core/distributed/vector_kernels.hpp" +#include "core/matrix/dense_kernels.hpp" + + +namespace gko { +namespace experimental { +namespace distributed { +namespace vector { +namespace { + + +GKO_REGISTER_OPERATION(compute_squared_norm2, dense::compute_squared_norm2); +GKO_REGISTER_OPERATION(compute_sqrt, dense::compute_sqrt); +GKO_REGISTER_OPERATION(outplace_absolute_dense, dense::outplace_absolute_dense); +GKO_REGISTER_OPERATION(build_local, distributed_vector::build_local); + + +} // namespace +} // namespace vector + + +dim<2> compute_global_size(std::shared_ptr exec, + mpi::communicator comm, dim<2> local_size) +{ + size_type num_global_rows = local_size[0]; + comm.all_reduce(std::move(exec), &num_global_rows, 1, MPI_SUM); + return {num_global_rows, local_size[1]}; +} + + +template +void Vector::apply_impl(const LinOp* b, LinOp* x) const +{ + GKO_NOT_SUPPORTED(this); +} + + +template +void Vector::apply_impl(const LinOp* alpha, const LinOp* b, + const LinOp* beta, LinOp* x) const +{ + GKO_NOT_SUPPORTED(this); +} + +template +Vector::Vector(std::shared_ptr exec, + mpi::communicator comm, dim<2> global_size, + dim<2> local_size) + : Vector(exec, comm, global_size, local_size, local_size[1]) +{} + + +template +Vector::Vector(std::shared_ptr exec, + mpi::communicator comm, dim<2> global_size, + dim<2> local_size, size_type stride) + : 
EnableLinOp>{exec, global_size}, + DistributedBase{comm}, + local_{exec, local_size, stride} +{ + GKO_ASSERT_EQUAL_COLS(global_size, local_size); +} + +template +Vector::Vector(std::shared_ptr exec, + mpi::communicator comm, dim<2> global_size, + local_vector_type* local_vector) + : EnableLinOp>{exec, global_size}, + DistributedBase{comm}, + local_{exec} +{ + local_vector->move_to(&local_); +} + + +template +Vector::Vector(std::shared_ptr exec, + mpi::communicator comm, + local_vector_type* local_vector) + : EnableLinOp>{exec, {}}, + DistributedBase{comm}, + local_{exec} +{ + this->set_size(compute_global_size(exec, comm, local_vector->get_size())); + local_vector->move_to(&local_); +} + + +template +template +void Vector::read_distributed( + const device_matrix_data& data, + const Partition* partition) +{ + auto exec = this->get_executor(); + auto global_cols = data.get_size()[1]; + this->resize( + dim<2>(partition->get_size(), global_cols), + dim<2>(partition->get_part_size(this->get_communicator().rank()), + global_cols)); + + auto rank = this->get_communicator().rank(); + local_.fill(zero()); + exec->run(vector::make_build_local( + data, make_temporary_clone(exec, partition).get(), rank, &local_)); +} + + +template +template +void Vector::read_distributed( + const matrix_data& data, + const Partition* partition) + +{ + this->read_distributed( + device_matrix_data::create_from_host( + this->get_executor(), data), + std::move(partition)); +} + + +template +void Vector::fill(const ValueType value) +{ + local_.fill(value); +} + + +template +void Vector::convert_to( + Vector>* result) const +{ + GKO_ASSERT(this->get_communicator().size() == + result->get_communicator().size()); + result->set_size(this->get_size()); + this->get_local_vector()->convert_to(&result->local_); +} + + +template +void Vector::move_to(Vector>* result) +{ + this->convert_to(result); +} + + +template +std::unique_ptr::absolute_type> +Vector::compute_absolute() const +{ + auto exec = 
this->get_executor(); + + auto result = + absolute_type::create(exec, this->get_communicator(), this->get_size(), + this->get_local_vector()->get_size()); + + exec->run(vector::make_outplace_absolute_dense(this->get_local_vector(), + &result->local_)); + + return result; +} + + +template +void Vector::compute_absolute_inplace() +{ + local_.compute_absolute_inplace(); +} + + +template +const typename Vector::local_vector_type* +Vector::get_local_vector() const +{ + return &local_; +} + + +template +std::unique_ptr::complex_type> +Vector::make_complex() const +{ + auto result = complex_type::create( + this->get_executor(), this->get_communicator(), this->get_size(), + this->get_local_vector()->get_size(), + this->get_local_vector()->get_stride()); + this->make_complex(result.get()); + return result; +} + + +template +void Vector::make_complex(Vector::complex_type* result) const +{ + this->get_local_vector()->make_complex(&result->local_); +} + + +template +std::unique_ptr::real_type> +Vector::get_real() const +{ + auto result = real_type::create(this->get_executor(), + this->get_communicator(), this->get_size(), + this->get_local_vector()->get_size(), + this->get_local_vector()->get_stride()); + this->get_real(result.get()); + return result; +} + + +template +void Vector::get_real(Vector::real_type* result) const +{ + this->get_local_vector()->get_real(&result->local_); +} + + +template +std::unique_ptr::real_type> +Vector::get_imag() const +{ + auto result = real_type::create(this->get_executor(), + this->get_communicator(), this->get_size(), + this->get_local_vector()->get_size(), + this->get_local_vector()->get_stride()); + this->get_imag(result.get()); + return result; +} + + +template +void Vector::get_imag(Vector::real_type* result) const +{ + this->get_local_vector()->get_imag(&result->local_); +} + + +template +void Vector::scale(const LinOp* alpha) +{ + local_.scale(alpha); +} + + +template +void Vector::inv_scale(const LinOp* alpha) +{ + 
local_.inv_scale(alpha); +} + + +template +void Vector::add_scaled(const LinOp* alpha, const LinOp* b) +{ + auto dense_b = as>(b); + local_.add_scaled(alpha, dense_b->get_local_vector()); +} + + +template +void Vector::sub_scaled(const LinOp* alpha, const LinOp* b) +{ + auto dense_b = as>(b); + local_.sub_scaled(alpha, dense_b->get_local_vector()); +} + + +template +void Vector::compute_dot(const LinOp* b, LinOp* result) const +{ + array tmp{this->get_executor()}; + this->compute_dot(b, result, tmp); +} + + +template +void Vector::compute_dot(const LinOp* b, LinOp* result, + array& tmp) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + const auto comm = this->get_communicator(); + auto dense_res = + make_temporary_clone(exec, as>(result)); + this->get_local_vector()->compute_dot(as(b)->get_local_vector(), + dense_res.get(), tmp); + exec->synchronize(); + auto use_host_buffer = exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + host_reduction_buffer_.init(exec->get_master(), dense_res->get_size()); + host_reduction_buffer_->copy_from(dense_res.get()); + comm.all_reduce(exec->get_master(), + host_reduction_buffer_->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + dense_res->copy_from(host_reduction_buffer_.get()); + } else { + comm.all_reduce(exec, dense_res->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + } +} + + +template +void Vector::compute_conj_dot(const LinOp* b, LinOp* result) const +{ + array tmp{this->get_executor()}; + this->compute_conj_dot(b, result, tmp); +} + + +template +void Vector::compute_conj_dot(const LinOp* b, LinOp* result, + array& tmp) const +{ + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + const auto comm = this->get_communicator(); + auto dense_res = + make_temporary_clone(exec, as>(result)); + this->get_local_vector()->compute_conj_dot( + 
as(b)->get_local_vector(), dense_res.get(), tmp); + exec->synchronize(); + auto use_host_buffer = exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + host_reduction_buffer_.init(exec->get_master(), dense_res->get_size()); + host_reduction_buffer_->copy_from(dense_res.get()); + comm.all_reduce(exec->get_master(), + host_reduction_buffer_->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + dense_res->copy_from(host_reduction_buffer_.get()); + } else { + comm.all_reduce(exec, dense_res->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + } +} + + +template +void Vector::compute_norm2(LinOp* result) const +{ + array tmp{this->get_executor()}; + this->compute_norm2(result, tmp); +} + + +template +void Vector::compute_norm2(LinOp* result, array& tmp) const +{ + using NormVector = typename local_vector_type::absolute_type; + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + const auto comm = this->get_communicator(); + auto dense_res = make_temporary_clone(exec, as(result)); + exec->run(vector::make_compute_squared_norm2(this->get_local_vector(), + dense_res.get(), tmp)); + exec->synchronize(); + auto use_host_buffer = exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + host_norm_buffer_.init(exec->get_master(), dense_res->get_size()); + host_norm_buffer_->copy_from(dense_res.get()); + comm.all_reduce(exec->get_master(), host_norm_buffer_->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + dense_res->copy_from(host_norm_buffer_.get()); + } else { + comm.all_reduce(exec, dense_res->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + } + exec->run(vector::make_compute_sqrt(dense_res.get())); +} + + +template +void Vector::compute_norm1(LinOp* result) const +{ + array tmp{this->get_executor()}; + this->compute_norm1(result, tmp); +} + + +template +void Vector::compute_norm1(LinOp* result, array& tmp) const +{ + using 
NormVector = typename local_vector_type::absolute_type; + GKO_ASSERT_EQUAL_DIMENSIONS(result, dim<2>(1, this->get_size()[1])); + auto exec = this->get_executor(); + const auto comm = this->get_communicator(); + auto dense_res = make_temporary_clone(exec, as(result)); + this->get_local_vector()->compute_norm1(dense_res.get()); + exec->synchronize(); + auto use_host_buffer = exec->get_master() != exec && !mpi::is_gpu_aware(); + if (use_host_buffer) { + host_norm_buffer_.init(exec->get_master(), dense_res->get_size()); + host_norm_buffer_->copy_from(dense_res.get()); + comm.all_reduce(exec->get_master(), host_norm_buffer_->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + dense_res->copy_from(host_norm_buffer_.get()); + } else { + comm.all_reduce(exec, dense_res->get_values(), + static_cast(this->get_size()[1]), MPI_SUM); + } +} + + +template +ValueType& Vector::at_local(size_type row, size_type col) noexcept +{ + return local_.at(row, col); +} + +template +ValueType Vector::at_local(size_type row, size_type col) const + noexcept +{ + return local_.at(row, col); +} + +template +ValueType& Vector::at_local(size_type idx) noexcept +{ + return local_.at(idx); +} + +template +ValueType Vector::at_local(size_type idx) const noexcept +{ + return local_.at(idx); +} + + +template +ValueType* Vector::get_local_values() +{ + return local_.get_values(); +} + + +template +const ValueType* Vector::get_const_local_values() const +{ + return local_.get_const_values(); +} + + +template +void Vector::resize(dim<2> global_size, dim<2> local_size) +{ + if (this->get_size() != global_size) { + this->set_size(global_size); + } + local_.resize(local_size); +} + + +template +std::unique_ptr::real_type> +Vector::create_real_view() const +{ + const auto num_global_rows = this->get_size()[0]; + const auto num_cols = + is_complex() ? 
2 * this->get_size()[1] : this->get_size()[1]; + + return real_type::create(this->get_executor(), this->get_communicator(), + dim<2>{num_global_rows, num_cols}, + const_cast( + local_.create_real_view().get())); +} + + +template +std::unique_ptr::real_type> +Vector::create_real_view() +{ + const auto num_global_rows = this->get_size()[0]; + const auto num_cols = + is_complex() ? 2 * this->get_size()[1] : this->get_size()[1]; + + return real_type::create(this->get_executor(), this->get_communicator(), + dim<2>{num_global_rows, num_cols}, + local_.create_real_view().get()); +} + + +#define GKO_DECLARE_DISTRIBUTED_VECTOR(ValueType) class Vector +GKO_INSTANTIATE_FOR_EACH_VALUE_TYPE(GKO_DECLARE_DISTRIBUTED_VECTOR); + + +#define GKO_DECLARE_DISTRIBUTED_VECTOR_READ_DISTRIBUTED( \ + ValueType, LocalIndexType, GlobalIndexType) \ + void Vector::read_distributed( \ + const device_matrix_data& data, \ + const Partition* partition); \ + template void \ + Vector::read_distributed( \ + const matrix_data& data, \ + const Partition* partition) + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_DISTRIBUTED_VECTOR_READ_DISTRIBUTED); + + +} // namespace distributed +} // namespace experimental +} // namespace gko diff --git a/core/distributed/vector_kernels.hpp b/core/distributed/vector_kernels.hpp new file mode 100644 index 00000000000..91a008f50dd --- /dev/null +++ b/core/distributed/vector_kernels.hpp @@ -0,0 +1,81 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_CORE_DISTRIBUTED_VECTOR_KERNELS_HPP_ +#define GKO_CORE_DISTRIBUTED_VECTOR_KERNELS_HPP_ + + +// can't include ginkgo/core/distributed/vector.hpp since that requires linking +// against MPI +#include +#include +#include +#include + + +#include "core/base/kernel_declaration.hpp" + + +namespace gko { +namespace kernels { + + +#define GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL(ValueType, LocalIndexType, \ + GlobalIndexType) \ + void build_local( \ + std::shared_ptr exec, \ + const device_matrix_data& input, \ + const experimental::distributed::Partition< \ + LocalIndexType, GlobalIndexType>* partition, \ + comm_index_type local_part, matrix::Dense* local_mtx) + + +#define GKO_DECLARE_ALL_AS_TEMPLATES \ + using comm_index_type = experimental::distributed::comm_index_type; \ + template \ + GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL(ValueType, LocalIndexType, \ + GlobalIndexType) + + +GKO_DECLARE_FOR_ALL_EXECUTOR_NAMESPACES(distributed_vector, + GKO_DECLARE_ALL_AS_TEMPLATES); + + +#undef GKO_DECLARE_ALL_AS_TEMPLATES + + +} // namespace kernels +} // namespace gko + + +#endif // GKO_CORE_DISTRIBUTED_VECTOR_KERNELS_HPP_ diff --git a/core/matrix/dense.cpp b/core/matrix/dense.cpp index 4c1027421b7..2969fa4fec1 100644 --- a/core/matrix/dense.cpp +++ b/core/matrix/dense.cpp @@ -80,6 +80,8 @@ GKO_REGISTER_OPERATION(compute_dot, dense::compute_dot_dispatch); GKO_REGISTER_OPERATION(compute_conj_dot, dense::compute_conj_dot_dispatch); GKO_REGISTER_OPERATION(compute_norm2, dense::compute_norm2_dispatch); GKO_REGISTER_OPERATION(compute_norm1, dense::compute_norm1); +GKO_REGISTER_OPERATION(compute_squared_norm2, dense::compute_squared_norm2); +GKO_REGISTER_OPERATION(compute_sqrt, dense::compute_sqrt); GKO_REGISTER_OPERATION(compute_max_nnz_per_row, dense::compute_max_nnz_per_row); GKO_REGISTER_OPERATION(compute_hybrid_coo_row_ptrs, hybrid::compute_coo_row_ptrs); diff --git a/core/matrix/dense_kernels.hpp 
b/core/matrix/dense_kernels.hpp index f6041ece443..c31d00e5337 100644 --- a/core/matrix/dense_kernels.hpp +++ b/core/matrix/dense_kernels.hpp @@ -151,6 +151,16 @@ namespace kernels { const device_matrix_data<_type, _prec>& data, \ matrix::Dense<_type>* output) +#define GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL(_type) \ + void compute_squared_norm2(std::shared_ptr exec, \ + const matrix::Dense<_type>* x, \ + matrix::Dense>* result, \ + array& tmp) + +#define GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL(_type) \ + void compute_sqrt(std::shared_ptr exec, \ + matrix::Dense<_type>* data) + #define GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(_type, _prec) \ void convert_to_coo(std::shared_ptr exec, \ const matrix::Dense<_type>* source, \ @@ -341,6 +351,10 @@ namespace kernels { GKO_DECLARE_DENSE_COMPUTE_NORM1_KERNEL(ValueType); \ template \ GKO_DECLARE_DENSE_FILL_IN_MATRIX_DATA_KERNEL(ValueType, IndexType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_SQUARED_NORM2_KERNEL(ValueType); \ + template \ + GKO_DECLARE_DENSE_COMPUTE_SQRT_KERNEL(ValueType); \ template \ GKO_DECLARE_DENSE_CONVERT_TO_COO_KERNEL(ValueType, IndexType); \ template \ diff --git a/core/matrix/identity.cpp b/core/matrix/identity.cpp index 21c5d91dc3a..66898f2c74d 100644 --- a/core/matrix/identity.cpp +++ b/core/matrix/identity.cpp @@ -54,7 +54,7 @@ template void Identity::apply_impl(const LinOp* alpha, const LinOp* b, const LinOp* beta, LinOp* x) const { - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { dense_x->scale(dense_beta); dense_x->add_scaled(dense_alpha, dense_b); diff --git a/core/solver/bicg.cpp b/core/solver/bicg.cpp index 1808489c1ad..6a1410cbeb2 100644 --- a/core/solver/bicg.cpp +++ b/core/solver/bicg.cpp @@ -39,7 +39,6 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include #include "core/solver/bicg_kernels.hpp" diff --git a/core/solver/bicgstab.cpp b/core/solver/bicgstab.cpp index 820bd51f0e3..1d6a6472048 100644 --- a/core/solver/bicgstab.cpp +++ b/core/solver/bicgstab.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/distributed/helpers.hpp" #include "core/solver/bicgstab_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -95,7 +96,7 @@ void Bicgstab::apply_impl(const LinOp* b, LinOp* x) const if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_b, auto dense_x) { this->apply_dense_impl(dense_b, dense_x); }, @@ -104,12 +105,11 @@ void Bicgstab::apply_impl(const LinOp* b, LinOp* x) const template -void Bicgstab::apply_dense_impl( - const matrix::Dense* dense_b, - matrix::Dense* dense_x) const +template +void Bicgstab::apply_dense_impl(const VectorType* dense_b, + VectorType* dense_x) const { using std::swap; - using Vector = matrix::Dense; constexpr uint8 RelativeStoppingId{1}; @@ -141,9 +141,13 @@ void Bicgstab::apply_dense_impl( // prev_rho = rho = omega = alpha = beta = gamma = 1.0 // rr = v = s = t = z = y = p = 0 // stop_status = 0x00 - exec->run(bicgstab::make_initialize(dense_b, r, rr, y, s, t, z, v, p, - prev_rho, rho, alpha, beta, gamma, - omega, &stop_status)); + exec->run(bicgstab::make_initialize( + gko::detail::get_local(dense_b), gko::detail::get_local(r), + gko::detail::get_local(rr), gko::detail::get_local(y), + gko::detail::get_local(s), gko::detail::get_local(t), + gko::detail::get_local(z), gko::detail::get_local(v), + gko::detail::get_local(p), prev_rho, rho, alpha, beta, gamma, omega, + &stop_status)); // r = b - Ax this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, r); @@ -183,8 +187,10 @@ void Bicgstab::apply_dense_impl( // tmp = rho / prev_rho * alpha / omega // p = r + tmp * (p - 
omega * v) - exec->run(bicgstab::make_step_1(r, p, v, rho, prev_rho, alpha, omega, - &stop_status)); + exec->run(bicgstab::make_step_1(gko::detail::get_local(r), + gko::detail::get_local(p), + gko::detail::get_local(v), rho, + prev_rho, alpha, omega, &stop_status)); // y = preconditioner * p this->get_preconditioner()->apply(p, y); @@ -194,8 +200,9 @@ void Bicgstab::apply_dense_impl( rr->compute_conj_dot(v, beta, reduction_tmp); // alpha = rho / beta // s = r - alpha * v - exec->run( - bicgstab::make_step_2(r, s, v, rho, alpha, beta, &stop_status)); + exec->run(bicgstab::make_step_2( + gko::detail::get_local(r), gko::detail::get_local(s), + gko::detail::get_local(v), rho, alpha, beta, &stop_status)); auto all_converged = stop_criterion->update() @@ -205,7 +212,9 @@ void Bicgstab::apply_dense_impl( // .solution(dense_x) // outdated at this point .check(RelativeStoppingId, false, &stop_status, &one_changed); if (one_changed) { - exec->run(bicgstab::make_finalize(dense_x, y, alpha, &stop_status)); + exec->run(bicgstab::make_finalize(gko::detail::get_local(dense_x), + gko::detail::get_local(y), alpha, + &stop_status)); } if (all_converged) { break; @@ -222,8 +231,11 @@ void Bicgstab::apply_dense_impl( // omega = gamma / beta // x = x + alpha * y + omega * z // r = s - omega * t - exec->run(bicgstab::make_step_3(dense_x, r, s, t, y, z, alpha, beta, - gamma, omega, &stop_status)); + exec->run(bicgstab::make_step_3( + gko::detail::get_local(dense_x), gko::detail::get_local(r), + gko::detail::get_local(s), gko::detail::get_local(t), + gko::detail::get_local(y), gko::detail::get_local(z), alpha, beta, + gamma, omega, &stop_status)); swap(prev_rho, rho); } } @@ -236,7 +248,7 @@ void Bicgstab::apply_impl(const LinOp* alpha, const LinOp* b, if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { auto x_clone = 
dense_x->clone(); this->apply_dense_impl(dense_b, x_clone.get()); diff --git a/core/solver/cg.cpp b/core/solver/cg.cpp index 12d67fbe563..8038a361e86 100644 --- a/core/solver/cg.cpp +++ b/core/solver/cg.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include "core/distributed/helpers.hpp" #include "core/solver/cg_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -93,7 +94,7 @@ void Cg::apply_impl(const LinOp* b, LinOp* x) const if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_b, auto dense_x) { this->apply_dense_impl(dense_b, dense_x); }, @@ -102,11 +103,12 @@ void Cg::apply_impl(const LinOp* b, LinOp* x) const template -void Cg::apply_dense_impl(const matrix::Dense* dense_b, - matrix::Dense* dense_x) const +template +void Cg::apply_dense_impl(const VectorType* dense_b, + VectorType* dense_x) const { using std::swap; - using Vector = matrix::Dense; + using LocalVector = matrix::Dense; constexpr uint8 RelativeStoppingId{1}; @@ -132,8 +134,10 @@ void Cg::apply_dense_impl(const matrix::Dense* dense_b, // rho = 0.0 // prev_rho = 1.0 // z = p = q = 0 - exec->run( - cg::make_initialize(dense_b, r, z, p, q, prev_rho, rho, &stop_status)); + exec->run(cg::make_initialize( + gko::detail::get_local(dense_b), gko::detail::get_local(r), + gko::detail::get_local(z), gko::detail::get_local(p), + gko::detail::get_local(q), prev_rho, rho, &stop_status)); this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, r); auto stop_criterion = this->get_stop_criterion_factory()->generate( @@ -170,7 +174,9 @@ void Cg::apply_dense_impl(const matrix::Dense* dense_b, // tmp = rho / prev_rho // p = z + tmp * p - exec->run(cg::make_step_1(p, z, rho, prev_rho, &stop_status)); + exec->run(cg::make_step_1(gko::detail::get_local(p), + gko::detail::get_local(z), rho, prev_rho, + &stop_status)); // q = A * p 
this->get_system_matrix()->apply(p, q); // beta = dot(p, q) @@ -178,7 +184,10 @@ void Cg::apply_dense_impl(const matrix::Dense* dense_b, // tmp = rho / beta // x = x + tmp * p // r = r - tmp * q - exec->run(cg::make_step_2(dense_x, r, p, q, beta, rho, &stop_status)); + exec->run(cg::make_step_2( + gko::detail::get_local(dense_x), gko::detail::get_local(r), + gko::detail::get_local(p), gko::detail::get_local(q), beta, rho, + &stop_status)); swap(prev_rho, rho); } } @@ -191,7 +200,7 @@ void Cg::apply_impl(const LinOp* alpha, const LinOp* b, if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { auto x_clone = dense_x->clone(); this->apply_dense_impl(dense_b, x_clone.get()); diff --git a/core/solver/cgs.cpp b/core/solver/cgs.cpp index 53ddd773563..abf39f90a7e 100644 --- a/core/solver/cgs.cpp +++ b/core/solver/cgs.cpp @@ -42,6 +42,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/distributed/helpers.hpp" #include "core/solver/cgs_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -94,7 +95,7 @@ void Cgs::apply_impl(const LinOp* b, LinOp* x) const if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_b, auto dense_x) { this->apply_dense_impl(dense_b, dense_x); }, @@ -103,11 +104,12 @@ void Cgs::apply_impl(const LinOp* b, LinOp* x) const template -void Cgs::apply_dense_impl(const matrix::Dense* dense_b, - matrix::Dense* dense_x) const +template +void Cgs::apply_dense_impl(const VectorType* dense_b, + VectorType* dense_x) const { using std::swap; - using Vector = matrix::Dense; + using LocalVector = matrix::Dense; constexpr uint8 RelativeStoppingId{1}; @@ -139,9 +141,13 @@ void Cgs::apply_dense_impl(const matrix::Dense* dense_b, // rho = 0.0 // prev_rho = alpha = beta = gamma = 1.0 // p = q = u = u_hat = v_hat = t = 0 - exec->run(cgs::make_initialize(dense_b, r, r_tld, p, q, u, u_hat, v_hat, t, - alpha, beta, gamma, prev_rho, rho, - &stop_status)); + exec->run(cgs::make_initialize( + gko::detail::get_local(dense_b), gko::detail::get_local(r), + gko::detail::get_local(r_tld), gko::detail::get_local(p), + gko::detail::get_local(q), gko::detail::get_local(u), + gko::detail::get_local(u_hat), gko::detail::get_local(v_hat), + gko::detail::get_local(t), alpha, beta, gamma, prev_rho, rho, + &stop_status)); this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, r); auto stop_criterion = this->get_stop_criterion_factory()->generate( @@ -178,22 +184,29 @@ void Cgs::apply_dense_impl(const matrix::Dense* dense_b, // beta = rho / prev_rho // u = r + beta * q // p = u + beta * ( q + beta * p ) - exec->run( - cgs::make_step_1(r, u, p, q, beta, rho, prev_rho, &stop_status)); + exec->run(cgs::make_step_1( + gko::detail::get_local(r), gko::detail::get_local(u), + gko::detail::get_local(p), gko::detail::get_local(q), 
beta, rho, + prev_rho, &stop_status)); this->get_preconditioner()->apply(p, t); this->get_system_matrix()->apply(t, v_hat); r_tld->compute_conj_dot(v_hat, gamma, reduction_tmp); // alpha = rho / gamma // q = u - alpha * v_hat // t = u + q - exec->run( - cgs::make_step_2(u, v_hat, q, t, alpha, rho, gamma, &stop_status)); + exec->run(cgs::make_step_2( + gko::detail::get_local(u), gko::detail::get_local(v_hat), + gko::detail::get_local(q), gko::detail::get_local(t), alpha, rho, + gamma, &stop_status)); this->get_preconditioner()->apply(t, u_hat); this->get_system_matrix()->apply(u_hat, t); // r = r - alpha * t // x = x + alpha * u_hat - exec->run(cgs::make_step_3(t, u_hat, r, dense_x, alpha, &stop_status)); + exec->run(cgs::make_step_3( + gko::detail::get_local(t), gko::detail::get_local(u_hat), + gko::detail::get_local(r), gko::detail::get_local(dense_x), alpha, + &stop_status)); swap(prev_rho, rho); } @@ -207,7 +220,7 @@ void Cgs::apply_impl(const LinOp* alpha, const LinOp* b, if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { auto x_clone = dense_x->clone(); this->apply_dense_impl(dense_b, x_clone.get()); diff --git a/core/solver/fcg.cpp b/core/solver/fcg.cpp index ce789c75e4d..bf6dda5b036 100644 --- a/core/solver/fcg.cpp +++ b/core/solver/fcg.cpp @@ -41,6 +41,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/distributed/helpers.hpp" #include "core/solver/fcg_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -92,7 +93,7 @@ void Fcg::apply_impl(const LinOp* b, LinOp* x) const if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_b, auto dense_x) { this->apply_dense_impl(dense_b, dense_x); }, @@ -101,11 +102,12 @@ void Fcg::apply_impl(const LinOp* b, LinOp* x) const template -void Fcg::apply_dense_impl(const matrix::Dense* dense_b, - matrix::Dense* dense_x) const +template +void Fcg::apply_dense_impl(const VectorType* dense_b, + VectorType* dense_x) const { using std::swap; - using Vector = matrix::Dense; + using LocalVector = matrix::Dense; constexpr uint8 RelativeStoppingId{1}; @@ -129,15 +131,17 @@ void Fcg::apply_dense_impl(const matrix::Dense* dense_b, bool one_changed{}; GKO_SOLVER_STOP_REDUCTION_ARRAYS(); - // TODO: replace this with automatic merged kernel generator - exec->run(fcg::make_initialize(dense_b, r, z, p, q, t, prev_rho, rho, rho_t, - &stop_status)); // r = dense_b // t = r // rho = 0.0 // prev_rho = 1.0 // rho_t = 1.0 // z = p = q = 0 + exec->run(fcg::make_initialize( + gko::detail::get_local(dense_b), gko::detail::get_local(r), + gko::detail::get_local(z), gko::detail::get_local(p), + gko::detail::get_local(q), gko::detail::get_local(t), prev_rho, rho, + rho_t, &stop_status)); this->get_system_matrix()->apply(neg_one_op, dense_x, one_op, r); auto stop_criterion = this->get_stop_criterion_factory()->generate( @@ -173,7 +177,9 @@ void Fcg::apply_dense_impl(const matrix::Dense* dense_b, // tmp = rho_t / prev_rho // p = z + tmp * p - exec->run(fcg::make_step_1(p, z, rho_t, prev_rho, &stop_status)); + exec->run(fcg::make_step_1( + gko::detail::get_local(p), gko::detail::get_local(z), + gko::detail::get_local(rho_t), prev_rho, &stop_status)); this->get_system_matrix()->apply(p, q); p->compute_conj_dot(q, beta, 
reduction_tmp); // tmp = rho / beta @@ -181,8 +187,10 @@ void Fcg::apply_dense_impl(const matrix::Dense* dense_b, // x = x + tmp * p // r = r - tmp * q // t = r - [prev_r] - exec->run( - fcg::make_step_2(dense_x, r, t, p, q, beta, rho, &stop_status)); + exec->run(fcg::make_step_2( + gko::detail::get_local(dense_x), gko::detail::get_local(r), + gko::detail::get_local(t), gko::detail::get_local(p), + gko::detail::get_local(q), beta, rho, &stop_status)); swap(prev_rho, rho); } } @@ -195,7 +203,7 @@ void Fcg::apply_impl(const LinOp* alpha, const LinOp* b, if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { auto x_clone = dense_x->clone(); this->apply_dense_impl(dense_b, x_clone.get()); diff --git a/core/solver/idr.cpp b/core/solver/idr.cpp index 92c61b062e4..dbc110c07e1 100644 --- a/core/solver/idr.cpp +++ b/core/solver/idr.cpp @@ -38,10 +38,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #include #include -#include #include +#include "core/distributed/helpers.hpp" #include "core/solver/idr_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -90,11 +90,12 @@ std::unique_ptr Idr::conj_transpose() const template -template -void Idr::iterate(const matrix::Dense* dense_b, - matrix::Dense* dense_x) const +template +void Idr::iterate(const VectorType* dense_b, + VectorType* dense_x) const { using std::swap; + using SubspaceType = typename VectorType::value_type; using Vector = matrix::Dense; using AbsType = remove_complex; using ws = workspace_traits; @@ -161,8 +162,9 @@ void Idr::iterate(const matrix::Dense* dense_b, std::default_random_engine(15)); subspace_vectors->read(subspace_vectors_data); } - exec->run(idr::make_initialize(nrhs, m, subspace_vectors, is_deterministic, - &stop_status)); + exec->run(idr::make_initialize(nrhs, gko::detail::get_local(m), + gko::detail::get_local(subspace_vectors), + is_deterministic, &stop_status)); // omega = 1 omega->fill(one()); @@ -221,14 +223,19 @@ void Idr::iterate(const matrix::Dense* dense_b, for (size_type k = 0; k < subspace_dim; k++) { // c = M \ f = (c_1, ..., c_s)^T // v = residual - sum i=[k,s) of (c_i * g_i) - exec->run(idr::make_step_1(nrhs, k, m, f, residual, g, c, v, - &stop_status)); + exec->run(idr::make_step_1( + nrhs, k, gko::detail::get_local(m), gko::detail::get_local(f), + gko::detail::get_local(residual), gko::detail::get_local(g), + gko::detail::get_local(c), gko::detail::get_local(v), + &stop_status)); this->get_preconditioner()->apply(v, helper); // u_k = omega * precond_vector + sum i=[k,s) of (c_i * u_i) - exec->run( - idr::make_step_2(nrhs, k, omega, helper, c, u, &stop_status)); + exec->run(idr::make_step_2( + nrhs, k, gko::detail::get_local(omega), + gko::detail::get_local(helper), gko::detail::get_local(c), + gko::detail::get_local(u), &stop_status)); auto u_k = u->create_submatrix(span{0, problem_size}, span{k * nrhs, (k + 1) * nrhs}); @@ -249,9 +256,13 @@ void 
Idr::iterate(const matrix::Dense* dense_b, // residual -= beta * g_k // dense_x += beta * u_k // f = (0,...,0,f_k+1 - beta * m_k+1,k,...,f_s-1 - beta * m_s-1,k) - exec->run(idr::make_step_3(nrhs, k, subspace_vectors, g, helper, u, - m, f, alpha, residual, dense_x, - &stop_status)); + exec->run(idr::make_step_3( + nrhs, k, gko::detail::get_local(subspace_vectors), + gko::detail::get_local(g), gko::detail::get_local(helper), + gko::detail::get_local(u), gko::detail::get_local(m), + gko::detail::get_local(f), gko::detail::get_local(alpha), + gko::detail::get_local(residual), + gko::detail::get_local(dense_x), &stop_status)); } this->get_preconditioner()->apply(residual, helper); @@ -268,8 +279,10 @@ void Idr::iterate(const matrix::Dense* dense_b, // end if // residual -= omega * t // dense_x += omega * v - exec->run(idr::make_compute_omega(nrhs, kappa, tht, residual_norm, - omega, &stop_status)); + exec->run(idr::make_compute_omega( + nrhs, kappa, gko::detail::get_local(tht), + gko::detail::get_local(residual_norm), + gko::detail::get_local(omega), &stop_status)); t->scale(subspace_neg_one_op); residual->add_scaled(omega, t); diff --git a/core/solver/ir.cpp b/core/solver/ir.cpp index 1a596cf1d0c..75368453b71 100644 --- a/core/solver/ir.cpp +++ b/core/solver/ir.cpp @@ -38,6 +38,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include +#include "core/distributed/helpers.hpp" #include "core/solver/ir_kernels.hpp" #include "core/solver/solver_boilerplate.hpp" @@ -164,7 +165,7 @@ void Ir::apply_impl(const LinOp* b, LinOp* x) const if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_b, auto dense_x) { this->apply_dense_impl(dense_b, dense_x); }, @@ -173,8 +174,9 @@ void Ir::apply_impl(const LinOp* b, LinOp* x) const template -void Ir::apply_dense_impl(const matrix::Dense* dense_b, - matrix::Dense* dense_x) const +template +void Ir::apply_dense_impl(const VectorType* dense_b, + VectorType* dense_x) const { using Vector = matrix::Dense; using ws = workspace_traits; @@ -250,7 +252,7 @@ void Ir::apply_impl(const LinOp* alpha, const LinOp* b, if (!this->get_system_matrix()) { return; } - precision_dispatch_real_complex( + experimental::precision_dispatch_real_complex_distributed( [this](auto dense_alpha, auto dense_b, auto dense_beta, auto dense_x) { auto x_clone = dense_x->clone(); this->apply_dense_impl(dense_b, x_clone.get()); diff --git a/core/stop/residual_norm.cpp b/core/stop/residual_norm.cpp index e699e1cd122..1319ca9b027 100644 --- a/core/stop/residual_norm.cpp +++ b/core/stop/residual_norm.cpp @@ -33,7 +33,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include +#include + + +#include "core/base/dispatch_helper.hpp" #include "core/components/fill_array_kernels.hpp" +#include "core/distributed/helpers.hpp" #include "core/stop/residual_norm_kernels.hpp" @@ -62,6 +67,128 @@ GKO_REGISTER_OPERATION(implicit_residual_norm, } // namespace implicit_residual_norm +template +bool any_is_complex() +{ + return false; +} + + +template +bool any_is_complex(const LinOp* in, Rest&&... 
rest) +{ +#if GINKGO_BUILD_MPI + bool is_complex_distributed = dynamic_cast>>*>(in); +#else + bool is_complex_distributed = false; +#endif + + return is_complex() || is_complex_distributed || + dynamic_cast< + const ConvertibleTo>>*>(in) || + any_is_complex(std::forward(rest)...); +} + + +template +void norm_dispatch(Function&& fn, LinOps*... linops) +{ +#if GINKGO_BUILD_MPI + if (gko::detail::is_distributed(linops...)) { + if (any_is_complex(linops...)) { + experimental::distributed::precision_dispatch< + to_complex>(std::forward(fn), linops...); + } else { + experimental::distributed::precision_dispatch( + std::forward(fn), linops...); + } + } else +#endif + { + if (any_is_complex(linops...)) { + precision_dispatch>( + std::forward(fn), linops...); + } else { + precision_dispatch(std::forward(fn), + linops...); + } + } +} + + +template +ResidualNormBase::ResidualNormBase( + std::shared_ptr exec, const CriterionArgs& args, + remove_complex reduction_factor, mode baseline) + : EnablePolymorphicObject(exec), + device_storage_{exec, 2}, + reduction_factor_{reduction_factor}, + baseline_{baseline}, + system_matrix_{args.system_matrix}, + b_{args.b}, + one_{gko::initialize({1}, exec)}, + neg_one_{gko::initialize({-1}, exec)} +{ + switch (baseline_) { + case mode::initial_resnorm: { + if (args.initial_residual == nullptr) { + if (args.system_matrix == nullptr || args.b == nullptr || + args.x == nullptr) { + GKO_NOT_SUPPORTED(nullptr); + } else { + this->starting_tau_ = + NormVector::create(exec, dim<2>{1, args.b->get_size()[1]}); + auto b_clone = share(args.b->clone()); + args.system_matrix->apply(neg_one_.get(), args.x, one_.get(), + b_clone.get()); + norm_dispatch( + [&](auto dense_r) { + dense_r->compute_norm2(this->starting_tau_.get()); + }, + b_clone.get()); + } + } else { + this->starting_tau_ = NormVector::create( + exec, dim<2>{1, args.initial_residual->get_size()[1]}); + norm_dispatch( + [&](auto dense_r) { + dense_r->compute_norm2(this->starting_tau_.get()); + 
}, + args.initial_residual); + } + break; + } + case mode::rhs_norm: { + if (args.b == nullptr) { + GKO_NOT_SUPPORTED(nullptr); + } + this->starting_tau_ = + NormVector::create(exec, dim<2>{1, args.b->get_size()[1]}); + norm_dispatch( + [&](auto dense_r) { + dense_r->compute_norm2(this->starting_tau_.get()); + }, + args.b.get()); + break; + } + case mode::absolute: { + if (args.b == nullptr) { + GKO_NOT_SUPPORTED(nullptr); + } + this->starting_tau_ = + NormVector::create(exec, dim<2>{1, args.b->get_size()[1]}); + this->starting_tau_->fill(gko::one>()); + break; + } + default: + GKO_NOT_SUPPORTED(nullptr); + } + this->u_dense_tau_ = + NormVector::create_with_config_of(this->starting_tau_.get()); +} + + template bool ResidualNormBase::check_impl( uint8 stopping_id, bool set_finalized, array* stop_status, @@ -71,33 +198,21 @@ bool ResidualNormBase::check_impl( if (updater.residual_norm_ != nullptr) { dense_tau = as(updater.residual_norm_); } else if (updater.residual_ != nullptr) { - if (dynamic_cast(updater.residual_)) { - auto* dense_r = as(updater.residual_); - dense_r->compute_norm2(u_dense_tau_.get()); - } else { - auto* dense_r = as(updater.residual_); - dense_r->compute_norm2(u_dense_tau_.get()); - } + norm_dispatch( + [&](auto dense_r) { dense_r->compute_norm2(u_dense_tau_.get()); }, + updater.residual_); dense_tau = u_dense_tau_.get(); } else if (updater.solution_ != nullptr && system_matrix_ != nullptr && b_ != nullptr) { auto exec = this->get_executor(); - // when LinOp is real but rhs is complex, we use real view on complex, - // so it still uses the same type of scalar in apply. 
- if (auto vec_b = std::dynamic_pointer_cast(b_)) { - auto dense_r = vec_b->clone(); - system_matrix_->apply(neg_one_.get(), updater.solution_, one_.get(), - dense_r.get()); - dense_r->compute_norm2(u_dense_tau_.get()); - } else if (auto vec_b = - std::dynamic_pointer_cast(b_)) { - auto dense_r = vec_b->clone(); - system_matrix_->apply(neg_one_.get(), updater.solution_, one_.get(), - dense_r.get()); - dense_r->compute_norm2(u_dense_tau_.get()); - } else { - GKO_NOT_SUPPORTED(nullptr); - } + norm_dispatch( + [&](auto dense_b, auto dense_x) { + auto dense_r = dense_b->clone(); + system_matrix_->apply(neg_one_.get(), dense_x, one_.get(), + dense_r.get()); + dense_r->compute_norm2(u_dense_tau_.get()); + }, + b_.get(), updater.solution_); dense_tau = u_dense_tau_.get(); } else { GKO_NOT_SUPPORTED(nullptr); diff --git a/core/test/base/CMakeLists.txt b/core/test/base/CMakeLists.txt index b35dcfe723e..aa79ca3ed92 100644 --- a/core/test/base/CMakeLists.txt +++ b/core/test/base/CMakeLists.txt @@ -1,6 +1,7 @@ ginkgo_create_test(abstract_factory) ginkgo_create_test(allocator) ginkgo_create_test(array) +ginkgo_create_test(dense_cache) ginkgo_create_test(combination) ginkgo_create_test(composition) ginkgo_create_test(dim) diff --git a/core/test/base/abstract_factory.cpp b/core/test/base/abstract_factory.cpp index 07d2e490f18..108e32436e7 100644 --- a/core/test/base/abstract_factory.cpp +++ b/core/test/base/abstract_factory.cpp @@ -55,7 +55,7 @@ using base = gko::AbstractFactory; struct IntFactory : gko::EnableDefaultFactory { friend class gko::enable_parameters_type; - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; using gko::EnableDefaultFactory::EnableDefaultFactory; }; diff --git a/core/test/base/dense_cache.cpp b/core/test/base/dense_cache.cpp new file mode 100644 index 00000000000..41bac8c01c6 --- /dev/null +++ b/core/test/base/dense_cache.cpp @@ -0,0 +1,229 @@ +/************************************************************* 
+Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +template +class DenseCache : public ::testing::Test { +protected: + using value_type = ValueType; + + DenseCache() {} + + void SetUp() { ref = gko::ReferenceExecutor::create(); } + + void TearDown() {} + + void gen_cache(gko::dim<2> size) { cache.init(ref, size); } + + std::shared_ptr ref; + gko::detail::DenseCache cache; +}; + + +TYPED_TEST_SUITE(DenseCache, gko::test::ValueTypes, TypenameNameGenerator); + + +TYPED_TEST(DenseCache, CanDefaultConstruct) +{ + using value_type = typename TestFixture::value_type; + gko::detail::DenseCache cache; + + ASSERT_EQ(cache.get(), nullptr); +} + + +TYPED_TEST(DenseCache, CanInitWithSize) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{4, 7}; + + this->cache.init(this->ref, size); + + ASSERT_NE(this->cache.get(), nullptr); + GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), size); + ASSERT_EQ(this->cache->get_executor(), this->ref); +} + + +TYPED_TEST(DenseCache, SecondInitWithSameSizeIsNoOp) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{4, 7}; + this->cache.init(this->ref, size); + auto first_ptr = this->cache.get(); + + this->cache.init(this->ref, size); + + ASSERT_NE(this->cache.get(), nullptr); + ASSERT_EQ(first_ptr, this->cache.get()); +} + + +TYPED_TEST(DenseCache, SecondInitWithDifferentSizeInitializes) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{4, 7}; + gko::dim<2> second_size{7, 4}; + this->cache.init(this->ref, size); + auto first_ptr = this->cache.get(); + + this->cache.init(this->ref, second_size); + + ASSERT_NE(this->cache.get(), nullptr); + ASSERT_NE(first_ptr, this->cache.get()); +} + + +TYPED_TEST(DenseCache, CanInitFromDense) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{5, 2}; + auto dense = 
gko::matrix::Dense::create(this->ref, size); + + this->cache.init_from(dense.get()); + + ASSERT_NE(this->cache.get(), nullptr); + GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), size); + ASSERT_EQ(this->cache->get_executor(), dense->get_executor()); +} + + +TYPED_TEST(DenseCache, SecondInitFromSameDenseIsNoOp) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{4, 7}; + auto dense = gko::matrix::Dense::create(this->ref, size); + this->cache.init_from(dense.get()); + auto first_ptr = this->cache.get(); + + this->cache.init_from(dense.get()); + + ASSERT_NE(this->cache.get(), nullptr); + ASSERT_EQ(first_ptr, this->cache.get()); +} + + +TYPED_TEST(DenseCache, SecondInitFromDifferentDenseWithSameSizeIsNoOp) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{4, 7}; + auto first_dense = gko::matrix::Dense::create(this->ref, size); + auto second_dense = gko::matrix::Dense::create(this->ref, size); + this->cache.init_from(first_dense.get()); + auto first_ptr = this->cache.get(); + + this->cache.init_from(second_dense.get()); + + ASSERT_NE(this->cache.get(), nullptr); + ASSERT_EQ(first_ptr, this->cache.get()); +} + + +TYPED_TEST(DenseCache, SecondInitFromDifferentDenseWithDifferentSizeInitializes) +{ + using value_type = typename TestFixture::value_type; + gko::dim<2> size{4, 7}; + gko::dim<2> second_size{7, 4}; + auto first_dense = gko::matrix::Dense::create(this->ref, size); + auto second_dense = + gko::matrix::Dense::create(this->ref, second_size); + this->cache.init_from(first_dense.get()); + auto first_ptr = this->cache.get(); + + this->cache.init_from(second_dense.get()); + + ASSERT_NE(this->cache.get(), nullptr); + ASSERT_NE(first_ptr, this->cache.get()); +} + + +TYPED_TEST(DenseCache, VectorIsNotCopied) +{ + using value_type = typename TestFixture::value_type; + this->gen_cache({1, 1}); + gko::detail::DenseCache cache(this->cache); + + ASSERT_EQ(cache.get(), nullptr); + 
GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), gko::dim<2>(1, 1)); +} + + +TYPED_TEST(DenseCache, VectorIsNotMoved) +{ + using value_type = typename TestFixture::value_type; + this->gen_cache({1, 1}); + gko::detail::DenseCache cache(std::move(this->cache)); + + ASSERT_EQ(cache.get(), nullptr); + GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), gko::dim<2>(1, 1)); +} + + +TYPED_TEST(DenseCache, VectorIsNotCopyAssigned) +{ + using value_type = typename TestFixture::value_type; + this->gen_cache({1, 1}); + gko::detail::DenseCache cache; + cache = this->cache; + + ASSERT_EQ(cache.get(), nullptr); + GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), gko::dim<2>(1, 1)); +} + + +TYPED_TEST(DenseCache, VectorIsNotMoveAssigned) +{ + using value_type = typename TestFixture::value_type; + this->gen_cache({1, 1}); + gko::detail::DenseCache cache; + cache = std::move(this->cache); + + ASSERT_EQ(cache.get(), nullptr); + GKO_ASSERT_EQUAL_DIMENSIONS(this->cache->get_size(), gko::dim<2>(1, 1)); +} + + +} // namespace diff --git a/core/test/base/mtx_io.cpp b/core/test/base/mtx_io.cpp index 7f492cdc991..79e7ba35b35 100644 --- a/core/test/base/mtx_io.cpp +++ b/core/test/base/mtx_io.cpp @@ -870,7 +870,7 @@ class DummyLinOp public gko::EnableCreateMethod>, public gko::ReadableFromMatrixData, public gko::WritableToMatrixData { - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; friend class gko::EnableCreateMethod; public: diff --git a/core/test/matrix/dense.cpp b/core/test/matrix/dense.cpp index dbf9670c5c2..7493c5eb727 100644 --- a/core/test/matrix/dense.cpp +++ b/core/test/matrix/dense.cpp @@ -430,6 +430,7 @@ TYPED_TEST(Dense, CanMakeConstView) class CustomDense : public gko::EnableLinOp> { friend class gko::EnablePolymorphicObject>; + friend struct gko::polymorphic_object_traits; public: static std::unique_ptr create( diff --git a/core/test/mpi/CMakeLists.txt b/core/test/mpi/CMakeLists.txt index 8edc6781c4e..eb2c9192ebc 100644 --- 
a/core/test/mpi/CMakeLists.txt +++ b/core/test/mpi/CMakeLists.txt @@ -1,7 +1,2 @@ -add_library(gtest_mpi_main "") -target_sources(gtest_mpi_main - PRIVATE - gtest/mpi_listener.cpp) -find_package(MPI REQUIRED) -target_link_libraries(gtest_mpi_main PRIVATE GTest::GTest MPI::MPI_CXX) add_subdirectory(base) +add_subdirectory(distributed) diff --git a/core/test/mpi/base/CMakeLists.txt b/core/test/mpi/base/CMakeLists.txt index 0b22157d269..bdf82c4337c 100644 --- a/core/test/mpi/base/CMakeLists.txt +++ b/core/test/mpi/base/CMakeLists.txt @@ -1,3 +1,7 @@ ginkgo_create_test(communicator MPI_SIZE 8) ginkgo_create_test(exception_helpers MPI_SIZE 2) ginkgo_create_test(bindings MPI_SIZE 4) +if(NOT (MSVC OR WIN32)) + # This test uses some UNIX function so it's disabled on Windows + ginkgo_create_test(rank_mapping MPI_SIZE 4) +endif() diff --git a/core/test/mpi/base/bindings.cpp b/core/test/mpi/base/bindings.cpp index ee34e8aa451..04b90441b5e 100644 --- a/core/test/mpi/base/bindings.cpp +++ b/core/test/mpi/base/bindings.cpp @@ -60,7 +60,7 @@ TYPED_TEST_SUITE(MpiBindings, gko::test::PODTypes, TypenameNameGenerator); TYPED_TEST(MpiBindings, CanSetADefaultwindow) { - gko::mpi::window win; + gko::experimental::mpi::window win; ASSERT_EQ(win.get_window(), MPI_WIN_NULL); } @@ -68,10 +68,10 @@ TYPED_TEST(MpiBindings, CanSetADefaultwindow) TYPED_TEST(MpiBindings, CanCreatewindow) { auto data = std::vector{1, 2, 3, 4}; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); - auto win = - gko::mpi::window(data.data(), 4 * sizeof(TypeParam), comm); + auto win = gko::experimental::mpi::window( + this->ref, data.data(), 4 * sizeof(TypeParam), comm); ASSERT_NE(win.get_window(), MPI_WIN_NULL); win.lock_all(); @@ -81,7 +81,7 @@ TYPED_TEST(MpiBindings, CanCreatewindow) TYPED_TEST(MpiBindings, CanSendAndRecvValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = 
gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto recv_array = gko::array{this->ref}; @@ -90,12 +90,12 @@ TYPED_TEST(MpiBindings, CanSendAndRecvValues) auto send_array = std::vector{1, 2, 3, 4}; for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - comm.send(send_array.data(), 4, rank, 40 + rank); + comm.send(this->ref, send_array.data(), 4, rank, 40 + rank); } } } else { recv_array = gko::array{this->ref, 4}; - comm.recv(recv_array.get_data(), 4, 0, 40 + my_rank); + comm.recv(this->ref, recv_array.get_data(), 4, 0, 40 + my_rank); } if (my_rank != 0) { @@ -107,25 +107,27 @@ TYPED_TEST(MpiBindings, CanSendAndRecvValues) TYPED_TEST(MpiBindings, CanNonBlockingSendAndNonBlockingRecvValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector send_array; auto recv_array = gko::array{this->ref}; TypeParam* data; - auto req1 = std::vector(num_ranks); - auto req2 = gko::mpi::request(); + auto req1 = std::vector(num_ranks); + auto req2 = gko::experimental::mpi::request(); if (my_rank == 0) { send_array = std::vector{1, 2, 3, 4}; for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - req1[rank] = comm.i_send(send_array.data(), 4, rank, 40 + rank); + req1[rank] = comm.i_send(this->ref, send_array.data(), 4, rank, + 40 + rank); } } } else { recv_array = gko::array{this->ref, 4}; - req2 = comm.i_recv(recv_array.get_data(), 4, 0, 40 + my_rank); + req2 = + comm.i_recv(this->ref, recv_array.get_data(), 4, 0, 40 + my_rank); } if (my_rank == 0) { @@ -142,8 +144,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingSendAndNonBlockingRecvValues) TYPED_TEST(MpiBindings, CanPutValuesWithLockAll) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = 
gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -154,12 +156,12 @@ TYPED_TEST(MpiBindings, CanPutValuesWithLockAll) } { - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank == 0) { win.lock_all(); for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - win.put(data.data(), 4, rank, 0, 4); + win.put(this->ref, data.data(), 4, rank, 0, 4); } } win.unlock_all(); @@ -173,8 +175,8 @@ TYPED_TEST(MpiBindings, CanPutValuesWithLockAll) TYPED_TEST(MpiBindings, CanNonBlockingPutValuesWithLockAll) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -185,13 +187,13 @@ TYPED_TEST(MpiBindings, CanNonBlockingPutValuesWithLockAll) } { - gko::mpi::request req; - auto win = window(data.data(), 4, comm); + gko::experimental::mpi::request req; + auto win = window(this->ref, data.data(), 4, comm); if (my_rank == 0) { win.lock_all(); for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - req = win.r_put(data.data(), 4, rank, 0, 4); + req = win.r_put(this->ref, data.data(), 4, rank, 0, 4); } } req.wait(); @@ -206,8 +208,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingPutValuesWithLockAll) TYPED_TEST(MpiBindings, CanPutValuesWithExclusiveLock) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -219,12 +221,12 @@ TYPED_TEST(MpiBindings, CanPutValuesWithExclusiveLock) } { - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, 
comm); if (my_rank == 0) { for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { win.lock(rank, window::lock_type::exclusive); - win.put(data.data(), 4, rank, 0, 4); + win.put(this->ref, data.data(), 4, rank, 0, 4); win.flush(0); win.unlock(rank); } @@ -239,8 +241,8 @@ TYPED_TEST(MpiBindings, CanPutValuesWithExclusiveLock) TYPED_TEST(MpiBindings, CanPutValuesWithSharedLock) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -252,12 +254,12 @@ TYPED_TEST(MpiBindings, CanPutValuesWithSharedLock) } { - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank == 0) { for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { win.lock(rank); - win.put(data.data(), 4, rank, 0, 4); + win.put(this->ref, data.data(), 4, rank, 0, 4); win.flush(0); win.unlock(rank); } @@ -272,8 +274,8 @@ TYPED_TEST(MpiBindings, CanPutValuesWithSharedLock) TYPED_TEST(MpiBindings, CanPutValuesWithFence) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -282,13 +284,13 @@ TYPED_TEST(MpiBindings, CanPutValuesWithFence) } else { data = std::vector{0, 0, 0, 0}; } - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); win.fence(); if (my_rank == 0) { for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - win.put(data.data(), 4, rank, 0, 4); + win.put(this->ref, data.data(), 4, rank, 0, 4); } } } @@ -301,8 +303,8 @@ TYPED_TEST(MpiBindings, CanPutValuesWithFence) TYPED_TEST(MpiBindings, 
CanAccumulateValues) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -317,12 +319,13 @@ TYPED_TEST(MpiBindings, CanAccumulateValues) } { - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank == 0) { win.lock_all(); for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - win.accumulate(data.data(), 4, rank, 0, 4, MPI_SUM); + win.accumulate(this->ref, data.data(), 4, rank, 0, 4, + MPI_SUM); } } win.unlock_all(); @@ -348,8 +351,8 @@ TYPED_TEST(MpiBindings, CanAccumulateValues) TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -363,14 +366,15 @@ TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues) data = std::vector{0, 0, 0, 0}; } - gko::mpi::request req; + gko::experimental::mpi::request req; { - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank == 0) { win.lock_all(); for (auto rank = 0; rank < num_ranks; ++rank) { if (rank != my_rank) { - req = win.r_accumulate(data.data(), 4, rank, 0, 4, MPI_SUM); + req = win.r_accumulate(this->ref, data.data(), 4, rank, 0, + 4, MPI_SUM); } } win.unlock_all(); @@ -397,8 +401,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingAccumulateValues) TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = 
gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -407,11 +411,11 @@ TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) } else { data = std::vector{0, 0, 0, 0}; } - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank != 0) { win.lock_all(); - win.get(data.data(), 4, 0, 0, 4); + win.get(this->ref, data.data(), 4, 0, 0, 4); win.unlock_all(); } @@ -422,8 +426,8 @@ TYPED_TEST(MpiBindings, CanGetValuesWithLockAll) TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -432,12 +436,12 @@ TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) } else { data = std::vector{0, 0, 0, 0}; } - gko::mpi::request req; - auto win = window(data.data(), 4, comm); + gko::experimental::mpi::request req; + auto win = window(this->ref, data.data(), 4, comm); if (my_rank != 0) { win.lock_all(); - req = win.r_get(data.data(), 4, 0, 0, 4); + req = win.r_get(this->ref, data.data(), 4, 0, 0, 4); win.unlock_all(); } @@ -449,8 +453,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingGetValuesWithLockAll) TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -459,11 +463,11 @@ TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) } else { data = std::vector{0, 0, 0, 0}; } - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank != 0) { 
win.lock(0, window::lock_type::exclusive); - win.get(data.data(), 4, 0, 0, 4); + win.get(this->ref, data.data(), 4, 0, 0, 4); win.unlock(0); } @@ -474,8 +478,8 @@ TYPED_TEST(MpiBindings, CanGetValuesWithExclusiveLock) TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -484,11 +488,11 @@ TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) } else { data = std::vector{0, 0, 0, 0}; } - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); if (my_rank != 0) { win.lock(0); - win.get(data.data(), 4, 0, 0, 4); + win.get(this->ref, data.data(), 4, 0, 0, 4); win.unlock(0); } @@ -499,8 +503,8 @@ TYPED_TEST(MpiBindings, CanGetValuesWithSharedLock) TYPED_TEST(MpiBindings, CanGetValuesWithFence) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -509,11 +513,11 @@ TYPED_TEST(MpiBindings, CanGetValuesWithFence) } else { data = std::vector{0, 0, 0, 0}; } - auto win = window(data.data(), 4, comm); + auto win = window(this->ref, data.data(), 4, comm); win.fence(); if (my_rank != 0) { - win.get(data.data(), 4, 0, 0, 4); + win.get(this->ref, data.data(), 4, 0, 0, 4); } win.fence(); @@ -524,8 +528,8 @@ TYPED_TEST(MpiBindings, CanGetValuesWithFence) TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = 
comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -546,12 +550,12 @@ TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll) } { - auto win = window(target.data(), 4, comm); + auto win = window(this->ref, target.data(), 4, comm); if (my_rank == 2) { win.lock_all(); - win.get_accumulate(data.data(), 4, result.data(), 4, 0, 0, 4, - MPI_SUM); + win.get_accumulate(this->ref, data.data(), 4, result.data(), 4, 0, + 0, 4, MPI_SUM); win.unlock_all(); } } @@ -570,8 +574,8 @@ TYPED_TEST(MpiBindings, CanGetAccumulateValuesWithLockAll) TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -590,15 +594,15 @@ TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll) data = std::vector{0, 0, 0, 0}; target = std::vector{0, 0, 0, 0}; } - gko::mpi::request req; + gko::experimental::mpi::request req; { - auto win = window(target.data(), 4, comm); + auto win = window(this->ref, target.data(), 4, comm); if (my_rank == 2) { win.lock_all(); - req = win.r_get_accumulate(data.data(), 4, result.data(), 4, 0, 0, - 4, MPI_SUM); + req = win.r_get_accumulate(this->ref, data.data(), 4, result.data(), + 4, 0, 0, 4, MPI_SUM); win.unlock_all(); } } @@ -623,8 +627,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingGetAccumulateValuesWithLockAll) TYPED_TEST(MpiBindings, CanFetchAndOperate) { - using window = gko::mpi::window; - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + using window = gko::experimental::mpi::window; + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); std::vector data; @@ -645,11 +649,12 @@ TYPED_TEST(MpiBindings, CanFetchAndOperate) } { - auto win = window(target.data(), 4, comm); + 
auto win = window(this->ref, target.data(), 4, comm); if (my_rank == 2) { win.lock_all(); - win.fetch_and_op(data.data(), result.data(), 0, 1, MPI_SUM); + win.fetch_and_op(this->ref, data.data(), result.data(), 0, 1, + MPI_SUM); win.unlock_all(); } } @@ -668,7 +673,7 @@ TYPED_TEST(MpiBindings, CanFetchAndOperate) TYPED_TEST(MpiBindings, CanBroadcastValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto array = gko::array{this->ref, 8}; @@ -676,7 +681,7 @@ TYPED_TEST(MpiBindings, CanBroadcastValues) array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); } - comm.broadcast(array.get_data(), 8, 0); + comm.broadcast(this->ref, array.get_data(), 8, 0); auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); GKO_ASSERT_ARRAY_EQ(ref, array); @@ -685,7 +690,7 @@ TYPED_TEST(MpiBindings, CanBroadcastValues) TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto array = gko::array{this->ref, 8}; @@ -693,7 +698,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues) array = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); } - auto req = comm.i_broadcast(array.get_data(), 8, 0); + auto req = comm.i_broadcast(this->ref, array.get_data(), 8, 0); req.wait(); auto ref = gko::array(this->ref, {2, 3, 1, 3, -1, 0, 3, 1}); @@ -703,7 +708,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingBroadcastValues) TYPED_TEST(MpiBindings, CanReduceValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data, sum, max, min; @@ -717,9 +722,9 @@ TYPED_TEST(MpiBindings, CanReduceValues) data = 6; } - comm.reduce(&data, &sum, 1, 
MPI_SUM, 0); - comm.reduce(&data, &max, 1, MPI_MAX, 0); - comm.reduce(&data, &min, 1, MPI_MIN, 0); + comm.reduce(this->ref, &data, &sum, 1, MPI_SUM, 0); + comm.reduce(this->ref, &data, &max, 1, MPI_MAX, 0); + comm.reduce(this->ref, &data, &min, 1, MPI_MIN, 0); if (my_rank == 0) { EXPECT_EQ(sum, TypeParam{16}); @@ -731,7 +736,7 @@ TYPED_TEST(MpiBindings, CanReduceValues) TYPED_TEST(MpiBindings, CanNonBlockingReduceValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data, sum, max, min; @@ -745,9 +750,9 @@ TYPED_TEST(MpiBindings, CanNonBlockingReduceValues) data = 6; } - auto req1 = comm.i_reduce(&data, &sum, 1, MPI_SUM, 0); - auto req2 = comm.i_reduce(&data, &max, 1, MPI_MAX, 0); - auto req3 = comm.i_reduce(&data, &min, 1, MPI_MIN, 0); + auto req1 = comm.i_reduce(this->ref, &data, &sum, 1, MPI_SUM, 0); + auto req2 = comm.i_reduce(this->ref, &data, &max, 1, MPI_MAX, 0); + auto req3 = comm.i_reduce(this->ref, &data, &min, 1, MPI_MIN, 0); req1.wait(); req2.wait(); @@ -762,7 +767,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingReduceValues) TYPED_TEST(MpiBindings, CanAllReduceValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data, sum; @@ -776,7 +781,7 @@ TYPED_TEST(MpiBindings, CanAllReduceValues) data = 6; } - comm.all_reduce(&data, &sum, 1, MPI_SUM); + comm.all_reduce(this->ref, &data, &sum, 1, MPI_SUM); ASSERT_EQ(sum, TypeParam{16}); } @@ -784,7 +789,7 @@ TYPED_TEST(MpiBindings, CanAllReduceValues) TYPED_TEST(MpiBindings, CanAllReduceValuesInPlace) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data; @@ -798,7 +803,7 @@ 
TYPED_TEST(MpiBindings, CanAllReduceValuesInPlace) data = 6; } - comm.all_reduce(&data, 1, MPI_SUM); + comm.all_reduce(this->ref, &data, 1, MPI_SUM); ASSERT_EQ(data, TypeParam{16}); } @@ -806,7 +811,7 @@ TYPED_TEST(MpiBindings, CanAllReduceValuesInPlace) TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data, sum; @@ -820,7 +825,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValues) data = 6; } - auto req = comm.i_all_reduce(&data, &sum, 1, MPI_SUM); + auto req = comm.i_all_reduce(this->ref, &data, &sum, 1, MPI_SUM); req.wait(); ASSERT_EQ(sum, TypeParam{16}); @@ -829,7 +834,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValues) TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValuesInPlace) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data; @@ -843,7 +848,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValuesInPlace) data = 6; } - auto req = comm.i_all_reduce(&data, 1, MPI_SUM); + auto req = comm.i_all_reduce(this->ref, &data, 1, MPI_SUM); req.wait(); ASSERT_EQ(data, TypeParam{16}); @@ -852,7 +857,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllReduceValuesInPlace) TYPED_TEST(MpiBindings, CanGatherValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data; @@ -868,7 +873,7 @@ TYPED_TEST(MpiBindings, CanGatherValues) auto gather_array = gko::array{ this->ref, static_cast(num_ranks)}; - comm.gather(&data, 1, gather_array.get_data(), 1, 0); + comm.gather(this->ref, &data, 1, gather_array.get_data(), 1, 0); if (my_rank == 0) { auto ref = gko::array(this->ref, {3, 5, 2, 6}); @@ 
-879,7 +884,7 @@ TYPED_TEST(MpiBindings, CanGatherValues) TYPED_TEST(MpiBindings, CanNonBlockingGatherValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data; @@ -895,7 +900,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingGatherValues) auto gather_array = gko::array{ this->ref, static_cast(num_ranks)}; - auto req = comm.i_gather(&data, 1, gather_array.get_data(), 1, 0); + auto req = + comm.i_gather(this->ref, &data, 1, gather_array.get_data(), 1, 0); req.wait(); if (my_rank == 0) { @@ -907,7 +913,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingGatherValues) TYPED_TEST(MpiBindings, CanAllGatherValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data; @@ -923,7 +929,7 @@ TYPED_TEST(MpiBindings, CanAllGatherValues) auto gather_array = gko::array{ this->ref, static_cast(num_ranks)}; - comm.all_gather(&data, 1, gather_array.get_data(), 1); + comm.all_gather(this->ref, &data, 1, gather_array.get_data(), 1); auto ref = gko::array(this->ref, {3, 5, 2, 6}); GKO_ASSERT_ARRAY_EQ(ref, gather_array); @@ -932,7 +938,7 @@ TYPED_TEST(MpiBindings, CanAllGatherValues) TYPED_TEST(MpiBindings, CanNonBlockingAllGatherValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data; @@ -948,7 +954,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingAllGatherValues) auto gather_array = gko::array{ this->ref, static_cast(num_ranks)}; - auto req = comm.i_all_gather(&data, 1, gather_array.get_data(), 1); + auto req = + comm.i_all_gather(this->ref, &data, 1, gather_array.get_data(), 1); req.wait(); auto ref = gko::array(this->ref, {3, 5, 2, 6}); @@ -958,7 +965,7 @@ 
TYPED_TEST(MpiBindings, CanNonBlockingAllGatherValues) TYPED_TEST(MpiBindings, CanGatherValuesWithDisplacements) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto gather_from_array = gko::array{this->ref}; @@ -983,8 +990,8 @@ TYPED_TEST(MpiBindings, CanGatherValuesWithDisplacements) gather_from_array = gko::array{this->ref, {1, -4, 5}}; } - comm.gather(&nelems, 1, r_counts.get_data(), 1, 0); - comm.gather_v(gather_from_array.get_data(), nelems, + comm.gather(this->ref, &nelems, 1, r_counts.get_data(), 1, 0); + comm.gather_v(this->ref, gather_from_array.get_data(), nelems, gather_into_array.get_data(), r_counts.get_data(), displacements.get_data(), 0); @@ -1001,7 +1008,7 @@ TYPED_TEST(MpiBindings, CanGatherValuesWithDisplacements) TYPED_TEST(MpiBindings, CanNonBlockingGatherValuesWithDisplacements) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto gather_from_array = gko::array{this->ref}; @@ -1026,10 +1033,11 @@ TYPED_TEST(MpiBindings, CanNonBlockingGatherValuesWithDisplacements) gather_from_array = gko::array{this->ref, {1, -4, 5}}; } - comm.gather(&nelems, 1, r_counts.get_data(), 1, 0); - auto req = comm.i_gather_v( - gather_from_array.get_data(), nelems, gather_into_array.get_data(), - r_counts.get_data(), displacements.get_data(), 0); + comm.gather(this->ref, &nelems, 1, r_counts.get_data(), 1, 0); + auto req = + comm.i_gather_v(this->ref, gather_from_array.get_data(), nelems, + gather_into_array.get_data(), r_counts.get_data(), + displacements.get_data(), 0); req.wait(); auto comp_data = gather_into_array.get_data(); @@ -1045,7 +1053,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingGatherValuesWithDisplacements) TYPED_TEST(MpiBindings, CanScatterValues) { - auto comm = 
gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto scatter_from_array = gko::array{this->ref}; @@ -1055,7 +1063,7 @@ TYPED_TEST(MpiBindings, CanScatterValues) } auto scatter_into_array = gko::array{this->ref, 2}; - comm.scatter(scatter_from_array.get_data(), 2, + comm.scatter(this->ref, scatter_from_array.get_data(), 2, scatter_into_array.get_data(), 2, 0); auto comp_data = scatter_into_array.get_data(); @@ -1078,7 +1086,7 @@ TYPED_TEST(MpiBindings, CanScatterValues) TYPED_TEST(MpiBindings, CanNonBlockingScatterValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto scatter_from_array = gko::array{this->ref}; @@ -1088,7 +1096,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingScatterValues) } auto scatter_into_array = gko::array{this->ref, 2}; - auto req = comm.i_scatter(scatter_from_array.get_data(), 2, + auto req = comm.i_scatter(this->ref, scatter_from_array.get_data(), 2, scatter_into_array.get_data(), 2, 0); req.wait(); @@ -1112,7 +1120,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingScatterValues) TYPED_TEST(MpiBindings, CanScatterValuesWithDisplacements) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto scatter_from_array = gko::array{this->ref}; @@ -1136,10 +1144,10 @@ TYPED_TEST(MpiBindings, CanScatterValuesWithDisplacements) scatter_into_array = gko::array{this->ref, static_cast(nelems)}; - comm.gather(&nelems, 1, s_counts.get_data(), 1, 0); - comm.scatter_v(scatter_from_array.get_data(), s_counts.get_data(), - displacements.get_data(), scatter_into_array.get_data(), - nelems, 0); + comm.gather(this->ref, &nelems, 1, s_counts.get_data(), 1, 0); + comm.scatter_v(this->ref, 
scatter_from_array.get_data(), + s_counts.get_data(), displacements.get_data(), + scatter_into_array.get_data(), nelems, 0); auto comp_data = scatter_into_array.get_data(); if (my_rank == 0) { @@ -1163,7 +1171,7 @@ TYPED_TEST(MpiBindings, CanScatterValuesWithDisplacements) TYPED_TEST(MpiBindings, CanNonBlockingScatterValuesWithDisplacements) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto scatter_from_array = gko::array{this->ref}; @@ -1187,8 +1195,8 @@ TYPED_TEST(MpiBindings, CanNonBlockingScatterValuesWithDisplacements) scatter_into_array = gko::array{this->ref, static_cast(nelems)}; - comm.gather(&nelems, 1, s_counts.get_data(), 1, 0); - auto req = comm.i_scatter_v(scatter_from_array.get_data(), + comm.gather(this->ref, &nelems, 1, s_counts.get_data(), 1, 0); + auto req = comm.i_scatter_v(this->ref, scatter_from_array.get_data(), s_counts.get_data(), displacements.get_data(), scatter_into_array.get_data(), nelems, 0); @@ -1215,7 +1223,7 @@ TYPED_TEST(MpiBindings, CanNonBlockingScatterValuesWithDisplacements) TYPED_TEST(MpiBindings, AllToAllWorksCorrectly) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto send_array = gko::array{this->ref}; @@ -1236,7 +1244,8 @@ TYPED_TEST(MpiBindings, AllToAllWorksCorrectly) ref_array = gko::array(this->ref, {2, 2, 0, -2}); } - comm.all_to_all(send_array.get_data(), 1, recv_array.get_data(), 1); + comm.all_to_all(this->ref, send_array.get_data(), 1, recv_array.get_data(), + 1); GKO_ASSERT_ARRAY_EQ(recv_array, ref_array); } @@ -1244,7 +1253,7 @@ TYPED_TEST(MpiBindings, AllToAllWorksCorrectly) TYPED_TEST(MpiBindings, NonBlockingAllToAllWorksCorrectly) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = 
gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto send_array = gko::array{this->ref}; @@ -1265,8 +1274,8 @@ TYPED_TEST(MpiBindings, NonBlockingAllToAllWorksCorrectly) ref_array = gko::array(this->ref, {2, 2, 0, -2}); } - auto req = - comm.i_all_to_all(send_array.get_data(), 1, recv_array.get_data(), 1); + auto req = comm.i_all_to_all(this->ref, send_array.get_data(), 1, + recv_array.get_data(), 1); req.wait(); GKO_ASSERT_ARRAY_EQ(recv_array, ref_array); @@ -1275,7 +1284,7 @@ TYPED_TEST(MpiBindings, NonBlockingAllToAllWorksCorrectly) TYPED_TEST(MpiBindings, AllToAllInPlaceWorksCorrectly) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto recv_array = gko::array{this->ref}; @@ -1295,14 +1304,14 @@ TYPED_TEST(MpiBindings, AllToAllInPlaceWorksCorrectly) ref_array = gko::array(this->ref, {2, 2, 0, -2}); } - comm.all_to_all(recv_array.get_data(), 1); + comm.all_to_all(this->ref, recv_array.get_data(), 1); GKO_ASSERT_ARRAY_EQ(recv_array, ref_array); } TYPED_TEST(MpiBindings, NonBlockingAllToAllInPlaceWorksCorrectly) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto recv_array = gko::array{this->ref}; @@ -1322,7 +1331,7 @@ TYPED_TEST(MpiBindings, NonBlockingAllToAllInPlaceWorksCorrectly) ref_array = gko::array(this->ref, {2, 2, 0, -2}); } - auto req = comm.i_all_to_all(recv_array.get_data(), 1); + auto req = comm.i_all_to_all(this->ref, recv_array.get_data(), 1); req.wait(); GKO_ASSERT_ARRAY_EQ(recv_array, ref_array); @@ -1331,7 +1340,7 @@ TYPED_TEST(MpiBindings, NonBlockingAllToAllInPlaceWorksCorrectly) TYPED_TEST(MpiBindings, AllToAllVWorksCorrectly) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = 
gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto send_array = gko::array{this->ref}; @@ -1375,16 +1384,17 @@ TYPED_TEST(MpiBindings, AllToAllVWorksCorrectly) ref_array = gko::array{this->ref, {0, 2, 3, 3}}; } - comm.all_to_all_v(send_array.get_data(), scounts_array.get_data(), - soffset_array.get_data(), recv_array.get_data(), - rcounts_array.get_data(), roffset_array.get_data()); + comm.all_to_all_v(this->ref, send_array.get_data(), + scounts_array.get_data(), soffset_array.get_data(), + recv_array.get_data(), rcounts_array.get_data(), + roffset_array.get_data()); GKO_ASSERT_ARRAY_EQ(recv_array, ref_array); } TYPED_TEST(MpiBindings, NonBlockingAllToAllVWorksCorrectly) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); auto send_array = gko::array{this->ref}; @@ -1428,10 +1438,10 @@ TYPED_TEST(MpiBindings, NonBlockingAllToAllVWorksCorrectly) ref_array = gko::array{this->ref, {0, 2, 3, 3}}; } - auto req = - comm.i_all_to_all_v(send_array.get_data(), scounts_array.get_data(), - soffset_array.get_data(), recv_array.get_data(), - rcounts_array.get_data(), roffset_array.get_data()); + auto req = comm.i_all_to_all_v( + this->ref, send_array.get_data(), scounts_array.get_data(), + soffset_array.get_data(), recv_array.get_data(), + rcounts_array.get_data(), roffset_array.get_data()); req.wait(); GKO_ASSERT_ARRAY_EQ(recv_array, ref_array); @@ -1440,7 +1450,7 @@ TYPED_TEST(MpiBindings, NonBlockingAllToAllVWorksCorrectly) TYPED_TEST(MpiBindings, CanScanValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data, sum, max, min; @@ -1454,9 +1464,9 @@ TYPED_TEST(MpiBindings, CanScanValues) data = 6; } - comm.scan(&data, &sum, 1, MPI_SUM); - 
comm.scan(&data, &max, 1, MPI_MAX); - comm.scan(&data, &min, 1, MPI_MIN); + comm.scan(this->ref, &data, &sum, 1, MPI_SUM); + comm.scan(this->ref, &data, &max, 1, MPI_MAX); + comm.scan(this->ref, &data, &min, 1, MPI_MIN); if (my_rank == 0) { EXPECT_EQ(sum, TypeParam{3}); @@ -1480,7 +1490,7 @@ TYPED_TEST(MpiBindings, CanScanValues) TYPED_TEST(MpiBindings, CanNonBlockingScanValues) { - auto comm = gko::mpi::communicator(MPI_COMM_WORLD); + auto comm = gko::experimental::mpi::communicator(MPI_COMM_WORLD); auto my_rank = comm.rank(); auto num_ranks = comm.size(); TypeParam data, sum, max, min; @@ -1494,9 +1504,9 @@ TYPED_TEST(MpiBindings, CanNonBlockingScanValues) data = 6; } - auto req1 = comm.i_scan(&data, &sum, 1, MPI_SUM); - auto req2 = comm.i_scan(&data, &max, 1, MPI_MAX); - auto req3 = comm.i_scan(&data, &min, 1, MPI_MIN); + auto req1 = comm.i_scan(this->ref, &data, &sum, 1, MPI_SUM); + auto req2 = comm.i_scan(this->ref, &data, &max, 1, MPI_MAX); + auto req3 = comm.i_scan(this->ref, &data, &min, 1, MPI_MIN); req1.wait(); req2.wait(); diff --git a/core/test/mpi/base/communicator.cpp b/core/test/mpi/base/communicator.cpp index d335b84ce09..2121ef41503 100644 --- a/core/test/mpi/base/communicator.cpp +++ b/core/test/mpi/base/communicator.cpp @@ -53,7 +53,7 @@ class Communicator : public ::testing::Test { ASSERT_EQ(comm.size(), 8); } - gko::mpi::communicator comm; + gko::experimental::mpi::communicator comm; int rank; }; @@ -88,7 +88,7 @@ TEST_F(Communicator, CommKnowsItsLocalRank) TEST_F(Communicator, CommunicatorCanBeCopyConstructed) { - gko::mpi::communicator copy(comm); + gko::experimental::mpi::communicator copy(comm); EXPECT_TRUE(copy == comm); } @@ -96,7 +96,7 @@ TEST_F(Communicator, CommunicatorCanBeCopyConstructed) TEST_F(Communicator, CommunicatorCanBeCopyAssigned) { - gko::mpi::communicator copy = comm; + gko::experimental::mpi::communicator copy = comm; EXPECT_TRUE(copy == comm); } @@ -104,8 +104,8 @@ TEST_F(Communicator, CommunicatorCanBeCopyAssigned) 
TEST_F(Communicator, CommunicatorCanBeMoveConstructed) { - gko::mpi::communicator comm2(MPI_COMM_WORLD); - gko::mpi::communicator copy(std::move(comm2)); + gko::experimental::mpi::communicator comm2(MPI_COMM_WORLD); + gko::experimental::mpi::communicator copy(std::move(comm2)); EXPECT_TRUE(copy == comm); } @@ -113,8 +113,8 @@ TEST_F(Communicator, CommunicatorCanBeMoveConstructed) TEST_F(Communicator, CommunicatorCanBeMoveAssigned) { - gko::mpi::communicator comm2(MPI_COMM_WORLD); - gko::mpi::communicator copy(MPI_COMM_NULL); + gko::experimental::mpi::communicator comm2(MPI_COMM_WORLD); + gko::experimental::mpi::communicator copy(MPI_COMM_NULL); copy = std::move(comm2); EXPECT_TRUE(copy == comm); @@ -133,7 +133,8 @@ TEST_F(Communicator, CanSetCustomCommunicator) auto world_size = comm.size(); auto color = world_rank / 4; - auto row_comm = gko::mpi::communicator(comm.get(), color, world_rank); + auto row_comm = + gko::experimental::mpi::communicator(comm.get(), color, world_rank); for (auto i = 0; i < world_size; ++i) { EXPECT_LT(row_comm.rank(), 4); } diff --git a/core/test/mpi/base/rank_mapping.cpp b/core/test/mpi/base/rank_mapping.cpp new file mode 100644 index 00000000000..ab7459de66b --- /dev/null +++ b/core/test/mpi/base/rank_mapping.cpp @@ -0,0 +1,133 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include + + +#include + + +#include + + +#include "core/test/utils.hpp" + + +class MapRankToDevice : public ::testing::Test { +protected: + MapRankToDevice() + : comm(MPI_COMM_WORLD), + rank(gko::experimental::mpi::communicator(comm).rank()), + size(gko::experimental::mpi::communicator(comm).size()), + env({{"MV2_COMM_WORLD_LOCAL_RANK", ""}, + {"OMPI_COMM_WORLD_LOCAL_RANK", ""}, + {"MPI_LOCALRANKID", ""}, + {"SLURM_LOCALID", ""}}) + {} + + void SetUp() override + { + for (auto& it : env) { + const auto& env_name = it.first; + if (auto v = std::getenv(env_name.c_str())) { + env[env_name] = std::string(v); + } + unsetenv(env_name.c_str()); + } + } + + void TearDown() override + { + for (auto& it : env) { + const auto& env_name = it.first; + const auto& env_value = it.second; + setenv(env_name.c_str(), env_value.c_str(), 1); + } + } + + MPI_Comm comm; + int rank; + int size; + std::map env; +}; + + +TEST_F(MapRankToDevice, OneDevice) +{ + 
ASSERT_EQ(gko::experimental::mpi::map_rank_to_device_id(comm, 1), 0); +} + + +TEST_F(MapRankToDevice, EqualDevicesAndRanks) +{ + auto id = gko::experimental::mpi::map_rank_to_device_id(comm, size); + + ASSERT_EQ(id, rank); +} + + +TEST_F(MapRankToDevice, LessDevicesThanRanks) +{ + int target_id[] = {0, 1, 2, 0}; + + auto id = gko::experimental::mpi::map_rank_to_device_id(comm, 3); + + ASSERT_EQ(id, target_id[rank]); +} + + +TEST_F(MapRankToDevice, UsesRankFromEnvironment) +{ + int reordered_rank[] = {2, 3, 1, 0}; + for (const auto& it : env) { + SCOPED_TRACE("Using environment variable " + it.first); + setenv(it.first.c_str(), std::to_string(reordered_rank[rank]).c_str(), + 1); + + auto id = gko::experimental::mpi::map_rank_to_device_id(comm, size); + + ASSERT_EQ(id, reordered_rank[rank]); + unsetenv(it.first.c_str()); + } +} + + +TEST_F(MapRankToDevice, NonCommWorld) +{ + MPI_Comm split; + MPI_Comm_split(comm, static_cast(rank < 3), rank, &split); + int target_id[] = {0, 1, 0, 0}; + + auto id = gko::experimental::mpi::map_rank_to_device_id(split, 2); + + ASSERT_EQ(id, target_id[rank]); +} diff --git a/core/test/mpi/distributed/CMakeLists.txt b/core/test/mpi/distributed/CMakeLists.txt new file mode 100644 index 00000000000..ec4f4dc5954 --- /dev/null +++ b/core/test/mpi/distributed/CMakeLists.txt @@ -0,0 +1 @@ +ginkgo_create_test(matrix MPI_SIZE 3) diff --git a/core/test/mpi/distributed/matrix.cpp b/core/test/mpi/distributed/matrix.cpp new file mode 100644 index 00000000000..98167bd5d1f --- /dev/null +++ b/core/test/mpi/distributed/matrix.cpp @@ -0,0 +1,282 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. 
+ +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include "core/test/utils.hpp" + + +namespace { + + +using comm_index_type = gko::experimental::distributed::comm_index_type; + + +template +class CustomLinOp + : public gko::EnableLinOp>, + public gko::ReadableFromMatrixData, + public gko::EnableCreateMethod> { +public: + void read(const gko::matrix_data& data) override {} + + explicit CustomLinOp(std::shared_ptr exec) + : gko::EnableLinOp(exec) + {} + +protected: + void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override {} + + void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b, + const gko::LinOp* beta, gko::LinOp* x) const override + {} +}; + + +template +class MatrixBuilder : public ::testing::Test { +protected: + using value_type = + typename std::tuple_element<0, decltype( + ValueLocalGlobalIndexType())>::type; + using local_index_type = + typename std::tuple_element<1, decltype( + ValueLocalGlobalIndexType())>::type; + using global_index_type = + typename std::tuple_element<2, decltype( + ValueLocalGlobalIndexType())>::type; + using dist_mtx_type = + gko::experimental::distributed::Matrix; + using dist_vec_type = gko::experimental::distributed::Vector; + + MatrixBuilder() + : ref(gko::ReferenceExecutor::create()), + comm(gko::experimental::mpi::communicator(MPI_COMM_WORLD)) + {} + + void SetUp() override {} + + template + void forall_matrix_types(F&& f) + { + using namespace gko::matrix; + auto empty_test = [](const gko::LinOp*) {}; + { + SCOPED_TRACE("With Coo"); + f(gko::with_matrix_type(), + Coo::create(this->ref), empty_test); + } + { + SCOPED_TRACE("With Csr"); + f(gko::with_matrix_type(), + Csr::create(this->ref), empty_test); + } + { + SCOPED_TRACE("With Csr with strategy"); + using ConcreteCsr = Csr; + f(gko::with_matrix_type( + std::make_shared()), + ConcreteCsr::create(this->ref), 
[](const gko::LinOp* local_mat) { + auto local_csr = gko::as(local_mat); + + ASSERT_NO_THROW(gko::as( + local_csr->get_strategy())); + }); + } + { + SCOPED_TRACE("With Ell"); + f(gko::with_matrix_type(), + Ell::create(this->ref), empty_test); + } + { + SCOPED_TRACE("With Fbcsr"); + f(gko::with_matrix_type(), + Fbcsr::create(this->ref), + empty_test); + } + { + SCOPED_TRACE("With Fbcsr with block_size"); + f(gko::with_matrix_type(5), + Fbcsr::create(this->ref), + [](const gko::LinOp* local_mat) { + auto local_fbcsr = + gko::as>(local_mat); + + ASSERT_EQ(local_fbcsr->get_block_size(), 5); + }); + } + { + SCOPED_TRACE("With Hybrid"); + f(gko::with_matrix_type(), + Hybrid::create(this->ref), + empty_test); + } + { + SCOPED_TRACE("With Hybrid with strategy"); + using Concrete = Hybrid; + f(gko::with_matrix_type( + std::make_shared(11)), + Concrete::create(this->ref), [](const gko::LinOp* local_mat) { + auto local_hy = gko::as(local_mat); + + ASSERT_NO_THROW(gko::as( + local_hy->get_strategy())); + ASSERT_EQ(gko::as( + local_hy->get_strategy()) + ->get_num_columns(), + 11); + }); + } + { + SCOPED_TRACE("With Sellp"); + f(gko::with_matrix_type(), + Sellp::create(this->ref), + empty_test); + } + } + + template + void expected_interface_no_throw(dist_mtx_type* mat, + LocalMatrixType local_matrix_type, + NonLocalMatrixType non_local_matrix_type) + { + auto num_rows = mat->get_size()[0]; + auto a = dist_vec_type::create(ref, comm); + auto b = dist_vec_type::create(ref, comm); + auto convert_result = dist_mtx_type::create( + ref, comm, local_matrix_type, non_local_matrix_type); + auto move_result = dist_mtx_type::create(ref, comm, local_matrix_type, + non_local_matrix_type); + + ASSERT_NO_THROW(mat->apply(a.get(), b.get())); + ASSERT_NO_THROW(mat->convert_to(convert_result.get())); + ASSERT_NO_THROW(mat->move_to(move_result.get())); + } + + + std::shared_ptr ref; + gko::experimental::mpi::communicator comm; +}; + +TYPED_TEST_SUITE(MatrixBuilder, 
gko::test::ValueLocalGlobalIndexTypes); + + +TYPED_TEST(MatrixBuilder, BuildWithLocal) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::local_index_type; + using dist_mat_type = typename TestFixture::dist_mtx_type; + this->template forall_matrix_types([this](auto with_matrix_type, + auto expected_type_ptr, + auto additional_test) { + using expected_type = typename std::remove_pointer::type; + + auto mat = + dist_mat_type ::create(this->ref, this->comm, with_matrix_type); + + ASSERT_NO_THROW(gko::as(mat->get_local_matrix())); + additional_test(mat->get_local_matrix().get()); + additional_test(mat->get_non_local_matrix().get()); + this->expected_interface_no_throw(mat.get(), with_matrix_type, + with_matrix_type); + }); +} + + +TYPED_TEST(MatrixBuilder, BuildWithLocalAndNonLocal) +{ + using value_type = typename TestFixture::value_type; + using index_type = typename TestFixture::local_index_type; + using dist_mat_type = typename TestFixture::dist_mtx_type; + this->template forall_matrix_types([this](auto with_local_matrix_type, + auto expected_local_type_ptr, + auto additional_local_test) { + using expected_local_type = typename std::remove_pointer::type; + this->forall_matrix_types([=](auto with_non_local_matrix_type, + auto expected_non_local_type_ptr, + auto additional_non_local_test) { + using expected_non_local_type = + typename std::remove_pointer::type; + + auto mat = dist_mat_type ::create(this->ref, this->comm, + with_local_matrix_type, + with_non_local_matrix_type); + + ASSERT_NO_THROW( + gko::as(mat->get_local_matrix())); + ASSERT_NO_THROW( + gko::as(mat->get_non_local_matrix())); + additional_local_test(mat->get_local_matrix().get()); + additional_non_local_test(mat->get_non_local_matrix().get()); + this->expected_interface_no_throw(mat.get(), with_local_matrix_type, + with_non_local_matrix_type); + }); + }); +} + + +TYPED_TEST(MatrixBuilder, BuildWithCustomLinOp) +{ + using value_type = typename 
TestFixture::value_type; + using index_type = typename TestFixture::local_index_type; + using dist_mat_type = typename TestFixture::dist_mtx_type; + using custom_type = CustomLinOp; + + auto mat = dist_mat_type::create(this->ref, this->comm, + gko::with_matrix_type()); + + ASSERT_NO_THROW(gko::as(mat->get_local_matrix())); + this->expected_interface_no_throw(mat.get(), + gko::with_matrix_type(), + gko::with_matrix_type()); +} + + +} // namespace diff --git a/core/test/mpi/gtest/mpi_listener.cpp b/core/test/mpi/gtest/mpi_listener.cpp index d74a77040aa..f26f8d5d60b 100644 --- a/core/test/mpi/gtest/mpi_listener.cpp +++ b/core/test/mpi/gtest/mpi_listener.cpp @@ -40,12 +40,14 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ #include -#include #include #include #include +#include + + #include @@ -383,5 +385,5 @@ int main(int argc, char** argv) listeners.Append( new GTestMPIListener::MPIWrapperPrinter(l, MPI_COMM_WORLD)); int result = RUN_ALL_TESTS(); - return 0; + return result; } diff --git a/core/test/stop/criterion.cpp b/core/test/stop/criterion.cpp index 61b5439fe5a..c9776df7da2 100644 --- a/core/test/stop/criterion.cpp +++ b/core/test/stop/criterion.cpp @@ -73,8 +73,7 @@ struct DummyLogger : public gko::log::Logger { class DummyCriterion : public gko::EnablePolymorphicObject { - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; public: explicit DummyCriterion(std::shared_ptr exec) diff --git a/core/test/utils.hpp b/core/test/utils.hpp index c99b323108f..361aaa74e72 100644 --- a/core/test/utils.hpp +++ b/core/test/utils.hpp @@ -185,6 +185,30 @@ using TwoValueIndexType = #endif +using ValueLocalGlobalIndexTypes = +#if GINKGO_DPCPP_SINGLE_MODE + ::testing::Types, + std::tuple, + std::tuple, + std::tuple, gko::int32, int32>, + std::tuple, gko::int32, int64>, + std::tuple, gko::int64, int64>>; +#else + ::testing::Types, + 
std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, + std::tuple, gko::int32, int32>, + std::tuple, gko::int32, int64>, + std::tuple, gko::int64, int64>, + std::tuple, gko::int32, int32>, + std::tuple, gko::int32, int64>, + std::tuple, gko::int64, int64>>; +#endif + + template struct reduction_factor { using nc_output = remove_complex; diff --git a/core/test/utils/matrix_generator.hpp b/core/test/utils/matrix_generator.hpp index cc03ade81c9..f4e6e4e26dc 100644 --- a/core/test/utils/matrix_generator.hpp +++ b/core/test/utils/matrix_generator.hpp @@ -132,6 +132,30 @@ matrix_data generate_random_matrix_data( } +/** + * Generates device matrix data for a random matrix. + * + * @see generate_random_matrix_data + */ +template +gko::device_matrix_data +generate_random_device_matrix_data(gko::size_type num_rows, + gko::size_type num_cols, + NonzeroDistribution&& nonzero_dist, + ValueDistribution&& value_dist, + Engine&& engine, + std::shared_ptr exec) +{ + auto md = gko::test::generate_random_matrix_data( + num_rows, num_cols, std::forward(nonzero_dist), + std::forward(value_dist), + std::forward(engine)); + return gko::device_matrix_data::create_from_host(exec, + md); +} + + /** * Generates a random matrix. 
* diff --git a/cuda/CMakeLists.txt b/cuda/CMakeLists.txt index 11a91936679..ac4e5c5bb8c 100644 --- a/cuda/CMakeLists.txt +++ b/cuda/CMakeLists.txt @@ -5,9 +5,12 @@ target_sources(ginkgo_cuda base/exception.cpp base/executor.cpp base/index_set_kernels.cpp + base/scoped_device_id.cpp base/version.cpp components/prefix_sum_kernels.cu + distributed/matrix_kernels.cu distributed/partition_kernels.cu + distributed/vector_kernels.cu factorization/cholesky_kernels.cu factorization/factorization_kernels.cu factorization/ic_kernels.cu diff --git a/cuda/base/executor.cpp b/cuda/base/executor.cpp index 949fafbe0a2..d5c8cf323eb 100644 --- a/cuda/base/executor.cpp +++ b/cuda/base/executor.cpp @@ -47,7 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "cuda/base/config.hpp" #include "cuda/base/cublas_bindings.hpp" #include "cuda/base/cusparse_handle.hpp" -#include "cuda/base/device_guard.hpp" +#include "cuda/base/scoped_device_id.hpp" namespace gko { @@ -71,7 +71,7 @@ std::shared_ptr CudaExecutor::create( auto& num_execs = nvidia_device::get_num_execs(device_id); num_execs--; if (!num_execs && device_reset) { - cuda::device_guard g(device_id); + detail::cuda_scoped_device_id_guard g(device_id); cudaDeviceReset(); } }); @@ -82,7 +82,7 @@ void CudaExecutor::populate_exec_info(const machine_topology* mach_topo) { if (this->get_device_id() < this->get_num_devices() && this->get_device_id() >= 0) { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS( cudaDeviceGetPCIBusId(&(this->get_exec_info().pci_bus_id.front()), 13, this->get_device_id())); @@ -102,7 +102,7 @@ void OmpExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { if (num_bytes > 0) { - cuda::device_guard g(dest->get_device_id()); + detail::cuda_scoped_device_id_guard g(dest->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS( cudaMemcpy(dest_ptr, 
src_ptr, num_bytes, cudaMemcpyHostToDevice)); } @@ -111,7 +111,7 @@ void OmpExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, void CudaExecutor::raw_free(void* ptr) const noexcept { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); auto error_code = cudaFree(ptr); if (error_code != cudaSuccess) { #if GKO_VERBOSE_LEVEL >= 1 @@ -130,7 +130,7 @@ void CudaExecutor::raw_free(void* ptr) const noexcept void* CudaExecutor::raw_alloc(size_type num_bytes) const { void* dev_ptr = nullptr; - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); int error_code = 0; if (this->alloc_mode_ == allocation_mode::unified_host) { error_code = cudaMallocManaged(&dev_ptr, num_bytes, cudaMemAttachHost); @@ -154,7 +154,7 @@ void CudaExecutor::raw_copy_to(const OmpExecutor*, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { if (num_bytes > 0) { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS( cudaMemcpy(dest_ptr, src_ptr, num_bytes, cudaMemcpyDeviceToHost)); } @@ -166,7 +166,7 @@ void CudaExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, { #if GINKGO_HIP_PLATFORM_NVCC == 1 if (num_bytes > 0) { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS( cudaMemcpyPeer(dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(), num_bytes)); @@ -188,7 +188,7 @@ void CudaExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { if (num_bytes > 0) { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS( cudaMemcpyPeer(dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(), num_bytes)); @@ -198,15 +198,21 @@ void 
CudaExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, void CudaExecutor::synchronize() const { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceSynchronize()); } +scoped_device_id_guard CudaExecutor::get_scoped_device_id_guard() const +{ + return {this, this->get_device_id()}; +} + + void CudaExecutor::run(const Operation& op) const { this->template log(this, &op); - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); op.run( std::static_pointer_cast(this->shared_from_this())); this->template log(this, &op); @@ -229,7 +235,7 @@ void CudaExecutor::set_gpu_property() { if (this->get_device_id() < this->get_num_devices() && this->get_device_id() >= 0) { - cuda::device_guard g(this->get_device_id()); + detail::cuda_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_CUDA_ERRORS(cudaDeviceGetAttribute( &this->get_exec_info().major, cudaDevAttrComputeCapabilityMajor, this->get_device_id())); @@ -270,15 +276,15 @@ void CudaExecutor::init_handles() if (this->get_device_id() < this->get_num_devices() && this->get_device_id() >= 0) { const auto id = this->get_device_id(); - cuda::device_guard g(id); + detail::cuda_scoped_device_id_guard g(id); this->cublas_handle_ = handle_manager( kernels::cuda::cublas::init(), [id](cublasHandle_t handle) { - cuda::device_guard g(id); + detail::cuda_scoped_device_id_guard g(id); kernels::cuda::cublas::destroy(handle); }); this->cusparse_handle_ = handle_manager( kernels::cuda::cusparse::init(), [id](cusparseHandle_t handle) { - cuda::device_guard g(id); + detail::cuda_scoped_device_id_guard g(id); kernels::cuda::cusparse::destroy(handle); }); } diff --git a/cuda/base/scoped_device_id.cpp b/cuda/base/scoped_device_id.cpp new file mode 100644 index 00000000000..f43117c4ca1 --- /dev/null +++ b/cuda/base/scoped_device_id.cpp @@ -0,0 +1,107 @@ 
+/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include +#include + + +#include + + +#include + + +#include "cuda/base/scoped_device_id.hpp" + + +namespace gko { +namespace detail { + + +cuda_scoped_device_id_guard::cuda_scoped_device_id_guard(int device_id) + : original_device_id_{}, need_reset_{} +{ + GKO_ASSERT_NO_CUDA_ERRORS(cudaGetDevice(&original_device_id_)); + if (original_device_id_ != device_id) { + GKO_ASSERT_NO_CUDA_ERRORS(cudaSetDevice(device_id)); + need_reset_ = true; + } +} + + +cuda_scoped_device_id_guard::~cuda_scoped_device_id_guard() +{ + if (need_reset_) { + auto error_code = cudaSetDevice(original_device_id_); + if (error_code != cudaSuccess) { +#if GKO_VERBOSE_LEVEL >= 1 + std::cerr + << "Unrecoverable CUDA error while resetting the device id to " + << original_device_id_ << " in " << __func__ << ": " + << cudaGetErrorName(error_code) << ": " + << cudaGetErrorString(error_code) << std::endl + << "Exiting program" << std::endl; +#endif // GKO_VERBOSE_LEVEL >= 1 + std::exit(error_code); + } + } +} + + +cuda_scoped_device_id_guard::cuda_scoped_device_id_guard( + gko::detail::cuda_scoped_device_id_guard&& other) noexcept +{ + *this = std::move(other); +} + + +cuda_scoped_device_id_guard& cuda_scoped_device_id_guard::operator=( + gko::detail::cuda_scoped_device_id_guard&& other) noexcept +{ + if (this != &other) { + original_device_id_ = std::exchange(other.original_device_id_, 0); + need_reset_ = std::exchange(other.need_reset_, false); + } + return *this; +} + + +} // namespace detail + + +scoped_device_id_guard::scoped_device_id_guard(const CudaExecutor* exec, + int device_id) + : scope_(std::make_unique(device_id)) +{} + + +} // namespace gko diff --git a/cuda/base/scoped_device_id.hpp b/cuda/base/scoped_device_id.hpp new file mode 100644 index 00000000000..1e5f57e122a --- /dev/null +++ b/cuda/base/scoped_device_id.hpp @@ -0,0 +1,77 @@ +/************************************************************* +Copyright (c) 2017-2022, 
the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_CUDA_BASE_SCOPED_DEVICE_ID_HPP_ +#define GKO_CUDA_BASE_SCOPED_DEVICE_ID_HPP_ + + +#include + + +namespace gko { +namespace detail { + + +/** + * A scoped device id for CUDA. + */ +class cuda_scoped_device_id_guard : public generic_scoped_device_id_guard { +public: + /** + * The constructor sets the device id to the passed in value for the + * lifetime of the created object. 
+ * + * @param device_id Set the device id to this. + */ + explicit cuda_scoped_device_id_guard(int device_id); + + /** + * This resets the device id. If this fails, the program is terminated. + */ + ~cuda_scoped_device_id_guard() override; + + cuda_scoped_device_id_guard(cuda_scoped_device_id_guard&& other) noexcept; + + cuda_scoped_device_id_guard& operator=( + cuda_scoped_device_id_guard&& other) noexcept; + +private: + int original_device_id_; + bool need_reset_; +}; + + +} // namespace detail +} // namespace gko + + +#endif // GKO_CUDA_BASE_SCOPED_DEVICE_ID_HPP_ diff --git a/cuda/base/device_guard.hpp b/cuda/distributed/matrix_kernels.cu similarity index 52% rename from cuda/base/device_guard.hpp rename to cuda/distributed/matrix_kernels.cu index e6f885e4c66..f4629799b63 100644 --- a/cuda/base/device_guard.hpp +++ b/cuda/distributed/matrix_kernels.cu @@ -30,70 +30,38 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_CUDA_BASE_DEVICE_GUARD_HPP_ -#define GKO_CUDA_BASE_DEVICE_GUARD_HPP_ +#include "core/distributed/matrix_kernels.hpp" -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#include +#include -#include +#include "cuda/components/atomic.cuh" namespace gko { +namespace kernels { namespace cuda { +namespace distributed_matrix { -/** - * This class defines a device guard for the cuda functions and the cuda module. - * The guard is used to make sure that the device code is run on the correct - * cuda device, when run with multiple devices. The class records the current - * device id and uses `cudaSetDevice` to set the device id to the one being - * passed in. After the scope has been exited, the destructor sets the device_id - * back to the one before entering the scope. 
- */ -class device_guard { -public: - device_guard(int device_id) : original_device_id{}, need_reset{} - { - GKO_ASSERT_NO_CUDA_ERRORS(cudaGetDevice(&original_device_id)); - if (original_device_id != device_id) { - GKO_ASSERT_NO_CUDA_ERRORS(cudaSetDevice(device_id)); - need_reset = true; - } - } - - device_guard(device_guard& other) = delete; - - device_guard& operator=(const device_guard& other) = delete; - - device_guard(device_guard&& other) = delete; - - device_guard const& operator=(device_guard&& other) = delete; - - ~device_guard() noexcept(false) - { - if (need_reset) { - /* Ignore the error during stack unwinding for this call */ - if (std::uncaught_exception()) { - cudaSetDevice(original_device_id); - } else { - GKO_ASSERT_NO_CUDA_ERRORS(cudaSetDevice(original_device_id)); - } - } - } - -private: - int original_device_id; - bool need_reset; -}; +#include "common/cuda_hip/distributed/matrix_kernels.hpp.inc" +} // namespace distributed_matrix } // namespace cuda +} // namespace kernels } // namespace gko - - -#endif // GKO_CUDA_BASE_DEVICE_GUARD_HPP_ diff --git a/cuda/distributed/vector_kernels.cu b/cuda/distributed/vector_kernels.cu new file mode 100644 index 00000000000..def3fc8ec87 --- /dev/null +++ b/cuda/distributed/vector_kernels.cu @@ -0,0 +1,59 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/distributed/vector_kernels.hpp" + + +#include +#include +#include +#include +#include +#include + + +#include + + +namespace gko { +namespace kernels { +namespace cuda { +namespace distributed_vector { + + +#include "common/cuda_hip/distributed/vector_kernels.hpp.inc" + + +} // namespace distributed_vector +} // namespace cuda +} // namespace kernels +} // namespace gko diff --git a/cuda/test/base/CMakeLists.txt b/cuda/test/base/CMakeLists.txt index 483d1a47913..7b0cd28436c 100644 --- a/cuda/test/base/CMakeLists.txt +++ b/cuda/test/base/CMakeLists.txt @@ -10,3 +10,4 @@ ginkgo_create_cuda_test(exception_helpers) ginkgo_create_cuda_test(kernel_launch) ginkgo_create_cuda_test(lin_op) ginkgo_create_cuda_test(math) +ginkgo_create_cuda_test(scoped_device_id) diff --git a/cuda/test/base/scoped_device_id.cu b/cuda/test/base/scoped_device_id.cu new file mode 100644 index 00000000000..608d8fcc3db --- /dev/null +++ b/cuda/test/base/scoped_device_id.cu @@ 
-0,0 +1,90 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// force-top: on +// prevent compilation failure related to disappearing assert(...) 
statements +#include +// force-top: off + + +#include + + +#include + + +#include "cuda/base/scoped_device_id.hpp" + + +namespace { + + +class ScopedDeviceIdGuard : public ::testing::Test { +protected: + ScopedDeviceIdGuard() + : ref(gko::ReferenceExecutor::create()), + cuda(gko::CudaExecutor::create(0, ref)) + {} + + std::shared_ptr ref; + std::shared_ptr cuda; +}; + + +TEST_F(ScopedDeviceIdGuard, SetsId) +{ + auto new_device_id = std::max(cuda->get_num_devices() - 1, 0); + + gko::detail::cuda_scoped_device_id_guard g{new_device_id}; + + int device_id; + cudaGetDevice(&device_id); + ASSERT_EQ(device_id, new_device_id); +} + + +TEST_F(ScopedDeviceIdGuard, ResetsId) +{ + auto old_device_id = cuda->get_device_id(); + + { + auto new_device_id = std::max(cuda->get_num_devices() - 1, 0); + gko::detail::cuda_scoped_device_id_guard g{new_device_id}; + } + + int device_id; + cudaGetDevice(&device_id); + ASSERT_EQ(device_id, old_device_id); +} + + +} // namespace diff --git a/dev_tools/scripts/regroup b/dev_tools/scripts/regroup index 1756481e2e4..25455dada69 100644 --- a/dev_tools/scripts/regroup +++ b/dev_tools/scripts/regroup @@ -2,7 +2,7 @@ IncludeBlocks: Regroup IncludeCategories: - Regex: '^<(rapidjson|gflags|gtest|papi).*' Priority: 3 - - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi).*' + - Regex: '^<(omp|cu|hip|thrust|CL/|cooperative|oneapi|mpi).*' Priority: 2 - Regex: '^ Use multigrid with different precision multigrid_level as a * solver. * + + * + * @ref distributed_solver + * Use a distributed solver to solve a 1D Laplace equation. + * * * * @@ -405,5 +410,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
* @ref mixed_multigrid_solver * * + + * + * Distributed + * + * @ref distributed_solver + * + * * */ diff --git a/doc/scripts/examples.pl b/doc/scripts/examples.pl index 87632d2e721..4194d8ae7a7 100644 --- a/doc/scripts/examples.pl +++ b/doc/scripts/examples.pl @@ -35,6 +35,7 @@ "stopping-criteria" => ',height=.25,width=.25,fillcolor="deepskyblue"', "preconditioners" => ',height=.25,width=.25,fillcolor="crimson"', "mixed-precision" => ',height=.25,width=.25,fillcolor="aquamarine"', + "distributed" => ',height=.25,width=.25,fillcolor="plum"', "unfinished" => ',height=.25,width=.25,style="dashed"', ); @@ -196,6 +197,7 @@ "stopping-criteria" => 'Stopping criteria', "preconditioners" => 'Preconditioners', "mixed-precision" => 'Mixed Precision', + "distributed" => 'Distributed techniques', "unfinished" => 'Unfinished codes', ); @@ -213,12 +215,12 @@ } # now add connections to make sure they appear nicely next to each other # in the legend -print " basic -- techniques -- logging -- stopping_criteria -- preconditioners -- mixed_precision -- unfinished;\n"; +print " basic -- techniques -- logging -- stopping_criteria -- preconditioners -- mixed_precision -- distributed -- unfinished;\n"; # we need to tell 'dot' that all of these are at the same # rank to ensure they appear next to (as opposed to atop) # each other -print " {rank=same; basic, techniques, logging, stopping_criteria, preconditioners, mixed_precision, unfinished}"; +print " {rank=same; basic, techniques, logging, stopping_criteria, preconditioners, mixed_precision, distributed, unfinished}"; # end the graph print "}\n"; diff --git a/dpcpp/CMakeLists.txt b/dpcpp/CMakeLists.txt index e0f9806cf96..c1fae0ede26 100644 --- a/dpcpp/CMakeLists.txt +++ b/dpcpp/CMakeLists.txt @@ -13,9 +13,12 @@ target_sources(ginkgo_dpcpp base/executor.dp.cpp base/helper.dp.cpp base/index_set_kernels.dp.cpp + base/scoped_device_id.dp.cpp base/version.dp.cpp components/prefix_sum_kernels.dp.cpp + distributed/matrix_kernels.dp.cpp 
distributed/partition_kernels.dp.cpp + distributed/vector_kernels.dp.cpp factorization/cholesky_kernels.dp.cpp factorization/ic_kernels.dp.cpp factorization/ilu_kernels.dp.cpp diff --git a/dpcpp/base/executor.dp.cpp b/dpcpp/base/executor.dp.cpp index cf51b504e64..8930d7afe2d 100644 --- a/dpcpp/base/executor.dp.cpp +++ b/dpcpp/base/executor.dp.cpp @@ -187,6 +187,11 @@ void DpcppExecutor::raw_copy_to(const DpcppExecutor* dest, size_type num_bytes, void DpcppExecutor::synchronize() const { queue_->wait_and_throw(); } +scoped_device_id_guard DpcppExecutor::get_scoped_device_id_guard() const +{ + return {this, this->get_device_id()}; +} + void DpcppExecutor::run(const Operation& op) const { diff --git a/dpcpp/base/scoped_device_id.dp.cpp b/dpcpp/base/scoped_device_id.dp.cpp new file mode 100644 index 00000000000..8bcda156266 --- /dev/null +++ b/dpcpp/base/scoped_device_id.dp.cpp @@ -0,0 +1,49 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include +#include + + +#include "core/base/noop_scoped_device_id_guard.hpp" + + +namespace gko { + + +scoped_device_id_guard::scoped_device_id_guard(const DpcppExecutor* exec, + int device_id) + : scope_(std::make_unique()) +{} + + +} // namespace gko diff --git a/dpcpp/distributed/matrix_kernels.dp.cpp b/dpcpp/distributed/matrix_kernels.dp.cpp new file mode 100644 index 00000000000..cf94ec43ef0 --- /dev/null +++ b/dpcpp/distributed/matrix_kernels.dp.cpp @@ -0,0 +1,69 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/distributed/matrix_kernels.hpp" + + +#include + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace distributed_matrix { + + +template +void build_local_nonlocal( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + row_partition, + const experimental::distributed::Partition* + col_partition, + comm_index_type local_part, array& local_row_idxs, + array& local_col_idxs, array& local_values, + array& non_local_row_idxs, + array& non_local_col_idxs, + array& non_local_values, + array& local_gather_idxs, + array& recv_sizes, + array& non_local_to_global) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_BUILD_LOCAL_NONLOCAL); + + +} // namespace distributed_matrix +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/dpcpp/distributed/vector_kernels.dp.cpp b/dpcpp/distributed/vector_kernels.dp.cpp new file mode 100644 index 00000000000..c294ab0c0fb --- /dev/null +++ b/dpcpp/distributed/vector_kernels.dp.cpp @@ -0,0 +1,61 @@ +/************************************************************* 
+Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#include "core/distributed/vector_kernels.hpp" + + +#include + + +namespace gko { +namespace kernels { +namespace dpcpp { +namespace distributed_vector { + + +template +void build_local( + std::shared_ptr exec, + const device_matrix_data& input, + const experimental::distributed::Partition* + partition, + comm_index_type local_part, + matrix::Dense* local_mtx) GKO_NOT_IMPLEMENTED; + +GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( + GKO_DECLARE_DISTRIBUTED_VECTOR_BUILD_LOCAL); + + +} // namespace distributed_vector +} // namespace dpcpp +} // namespace kernels +} // namespace gko diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index b284bd244fe..2f4f1392fe8 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -49,6 +49,10 @@ if(GINKGO_HAVE_PAPI_SDE) list(APPEND EXAMPLES_LIST papi-logging) endif() +if(GINKGO_BUILD_MPI) + list(APPEND EXAMPLES_LIST distributed-solver) +endif() + foreach(example ${EXAMPLES_LIST}) add_subdirectory(${example}) endforeach() diff --git a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp index 3a6291e9e14..fbdb6b01384 100644 --- a/examples/custom-stopping-criterion/custom-stopping-criterion.cpp +++ b/examples/custom-stopping-criterion/custom-stopping-criterion.cpp @@ -47,8 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ class ByInteraction : public gko::EnablePolymorphicObject { - friend class gko::EnablePolymorphicObject; + friend class gko::polymorphic_object_traits; using Criterion = gko::stop::Criterion; public: diff --git a/examples/distributed-solver/CMakeLists.txt b/examples/distributed-solver/CMakeLists.txt new file mode 100644 index 00000000000..9e520b71559 --- /dev/null +++ b/examples/distributed-solver/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(distributed-solver distributed-solver.cpp) +target_link_libraries(distributed-solver Ginkgo::ginkgo) diff --git a/examples/distributed-solver/distributed-solver.cpp b/examples/distributed-solver/distributed-solver.cpp new file mode 100644 index 00000000000..d9a1050f32d --- /dev/null +++ b/examples/distributed-solver/distributed-solver.cpp @@ -0,0 +1,268 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// @sect3{Include files} + +// This is the main ginkgo header file. +#include + +// Add the C++ iostream header to output information to the console. +#include +// Add the STL map header for the executor selection +#include +// Add the string manipulation header to handle strings. +#include + + +int main(int argc, char* argv[]) +{ + // @sect3{Type Definitions} + // Define the needed types. In a parallel program we need to differentiate + // between global and local indices, thus we have two index types. + using GlobalIndexType = gko::int64; + using LocalIndexType = gko::int32; + // The underlying value type. + using ValueType = double; + // As vector type we use the following, which implements a subset of @ref + // gko::matrix::Dense. + using dist_vec = gko::experimental::distributed::Vector; + // As matrix type we simply use the following type, which can read + // distributed data and be applied to a distributed vector. + using dist_mtx = + gko::experimental::distributed::Matrix; + // We still need a localized vector type to be used as scalars in the + // advanced apply operations. + using vec = gko::matrix::Dense; + // The partition type describes how the rows of the matrices are + // distributed. + using part_type = + gko::experimental::distributed::Partition; + // We can use here the same solver type as you would use in a + // non-distributed program. 
Please note that not all solvers support + // distributed systems at the moment. + using solver = gko::solver::Cg; + + // @sect3{Initialization and User Input Handling} + // Since this is an MPI program, we need to initialize and finalize + // MPI at the beginning and end, respectively, of our program. This can be easily + // done with the following helper construct that uses RAII to automate the + // initialization and finalization. + const gko::experimental::mpi::environment env(argc, argv); + + // Create an MPI communicator wrapper and get the rank. + const gko::experimental::mpi::communicator comm{MPI_COMM_WORLD}; + const auto rank = comm.rank(); + + // Print the ginkgo version information and help message. + if (rank == 0) { + std::cout << gko::version_info::get() << std::endl; + } + if (argc == 2 && (std::string(argv[1]) == "--help")) { + if (rank == 0) { + std::cerr << "Usage: " << argv[0] + << " [executor] [num_grid_points] " << std::endl; + } + std::exit(-1); + } + + ValueType t_init = gko::experimental::mpi::get_walltime(); + + // User input settings: + // - The executor, defaults to reference. + // - The number of grid points, defaults to 100. + const auto executor_string = argc >= 2 ? argv[1] : "reference"; + const auto grid_dim = + static_cast(argc >= 3 ? std::atoi(argv[2]) : 100); + + // Pick the requested executor. 
+ std::map()>> + exec_map{ + {"omp", [] { return gko::OmpExecutor::create(); }}, + {"cuda", + [&] { + return gko::CudaExecutor::create( + gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, gko::CudaExecutor::get_num_devices()), + gko::ReferenceExecutor::create(), false, + gko::allocation_mode::device); + }}, + {"hip", + [&] { + return gko::HipExecutor::create( + gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, gko::HipExecutor::get_num_devices()), + gko::ReferenceExecutor::create(), true); + }}, + {"dpcpp", + [&] { + auto ref = gko::ReferenceExecutor::create(); + if (gko::DpcppExecutor::get_num_devices("gpu") > 0) { + return gko::DpcppExecutor::create( + gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, + gko::DpcppExecutor::get_num_devices("gpu")), + ref); + } else if (gko::DpcppExecutor::get_num_devices("cpu") > 0) { + return gko::DpcppExecutor::create( + gko::experimental::mpi::map_rank_to_device_id( + MPI_COMM_WORLD, + gko::DpcppExecutor::get_num_devices("cpu")), + ref); + } else { + throw std::runtime_error("No suitable DPC++ devices"); + } + }}, + {"reference", [] { return gko::ReferenceExecutor::create(); }}}; + const auto exec = exec_map.at(executor_string)(); + + // @sect3{Creating the Distributed Matrix and Vectors} + // As a first step, we create a partition of the rows. The partition + // consists of ranges of consecutive rows which are assigned a part-id. + // These part-ids will be used for the distributed data structures to + // determine which rows will be stored locally. In this example each rank + // has (nearly) the same number of rows, so we can use the following + // specialized constructor. See @ref + // gko::experimental::distributed::Partition for other modes of creating a + // partition. 
+ const auto num_rows = grid_dim; + auto partition = gko::share(part_type::build_from_global_size_uniform( + exec->get_master(), comm.size(), + static_cast(num_rows))); + + // Assemble the matrix using a 3-pt stencil and fill the right-hand-side + // with a sine value. The distributed matrix supports only constructing an + // empty matrix of zero size and filling in the values with + // gko::experimental::distributed::Matrix::read_distributed. Only the data + // that belongs to the rows by this rank will be assembled. + gko::matrix_data A_data; + gko::matrix_data b_data; + gko::matrix_data x_data; + A_data.size = {num_rows, num_rows}; + b_data.size = {num_rows, 1}; + x_data.size = {num_rows, 1}; + const auto range_start = partition->get_range_bounds()[rank]; + const auto range_end = partition->get_range_bounds()[rank + 1]; + for (int i = range_start; i < range_end; i++) { + if (i > 0) { + A_data.nonzeros.emplace_back(i, i - 1, -1); + } + A_data.nonzeros.emplace_back(i, i, 2); + if (i < grid_dim - 1) { + A_data.nonzeros.emplace_back(i, i + 1, -1); + } + b_data.nonzeros.emplace_back(i, 0, std::sin(i * 0.01)); + x_data.nonzeros.emplace_back(i, 0, gko::zero()); + } + + // Take timings. + comm.synchronize(); + ValueType t_init_end = gko::experimental::mpi::get_walltime(); + + // Read the matrix data, currently this is only supported on CPU executors. + // This will also set up the communication pattern needed for the + // distributed matrix-vector multiplication. + auto A_host = gko::share(dist_mtx::create(exec->get_master(), comm)); + auto x_host = dist_vec::create(exec->get_master(), comm); + auto b_host = dist_vec::create(exec->get_master(), comm); + A_host->read_distributed(A_data, partition.get()); + b_host->read_distributed(b_data, partition.get()); + x_host->read_distributed(x_data, partition.get()); + // After reading, the matrix and vector can be moved to the chosen executor, + // since the distributed matrix supports SpMV also on devices. 
+ auto A = gko::share(dist_mtx::create(exec, comm)); + auto x = dist_vec::create(exec, comm); + auto b = dist_vec::create(exec, comm); + A->copy_from(A_host.get()); + b->copy_from(b_host.get()); + x->copy_from(x_host.get()); + + // Take timings. + comm.synchronize(); + ValueType t_read_setup_end = gko::experimental::mpi::get_walltime(); + + // @sect3{Solve the Distributed System} + // Generate the solver, this is the same as in the non-distributed case. + auto Ainv = + solver::build() + .with_criteria( + gko::stop::Iteration::build().with_max_iters(100u).on(exec), + gko::stop::ResidualNorm::build() + .with_baseline(gko::stop::mode::absolute) + .with_reduction_factor(1e-4) + .on(exec)) + .on(exec) + ->generate(A); + + // Take timings. + comm.synchronize(); + ValueType t_solver_generate_end = gko::experimental::mpi::get_walltime(); + + // Apply the distributed solver, this is the same as in the non-distributed + // case. + Ainv->apply(gko::lend(b), gko::lend(x)); + + // Take timings. + comm.synchronize(); + ValueType t_solver_apply_end = gko::experimental::mpi::get_walltime(); + + // Compute the residual, this is done in the same way as in the + // non-distributed case. + x_host->copy_from(x.get()); + auto one = gko::initialize({1.0}, exec); + auto minus_one = gko::initialize({-1.0}, exec); + A_host->apply(gko::lend(minus_one), gko::lend(x_host), gko::lend(one), + gko::lend(b_host)); + auto res_norm = gko::initialize({0.0}, exec->get_master()); + b_host->compute_norm2(gko::lend(res_norm)); + + // Take timings. + comm.synchronize(); + ValueType t_end = gko::experimental::mpi::get_walltime(); + + // @sect3{Printing Results} + // Print the achieved residual norm and timings on rank 0. 
+ if (comm.rank() == 0) { + // clang-format off + std::cout << "\nNum rows in matrix: " << num_rows + << "\nNum ranks: " << comm.size() + << "\nFinal Res norm: " << *res_norm->get_values() + << "\nInit time: " << t_init_end - t_init + << "\nRead time: " << t_read_setup_end - t_init + << "\nSolver generate time: " << t_solver_generate_end - t_read_setup_end + << "\nSolver apply time: " << t_solver_apply_end - t_solver_generate_end + << "\nTotal time: " << t_end - t_init + << std::endl; + // clang-format on + } +} diff --git a/examples/distributed-solver/doc/builds-on b/examples/distributed-solver/doc/builds-on new file mode 100644 index 00000000000..896db74e274 --- /dev/null +++ b/examples/distributed-solver/doc/builds-on @@ -0,0 +1 @@ +simple-solver three-pt-stencil-solver diff --git a/examples/distributed-solver/doc/intro.dox b/examples/distributed-solver/doc/intro.dox new file mode 100644 index 00000000000..4f5e6532b6f --- /dev/null +++ b/examples/distributed-solver/doc/intro.dox @@ -0,0 +1,8 @@ + +

Introduction

+This distributed solver example should help you understand the basics of using Ginkgo in a distributed setting. +The example will solve a simple 1D Laplace equation where the system can be distributed row-wise to multiple processes. +To run the solver with multiple processes, use `mpirun -n NUM_PROCS ./distributed-solver [executor] [num_grid_points]`. + +If you are using GPU devices, please make sure that you run this example with at most as many processes as you have GPU +devices available. diff --git a/examples/distributed-solver/doc/kind b/examples/distributed-solver/doc/kind new file mode 100644 index 00000000000..196aa616342 --- /dev/null +++ b/examples/distributed-solver/doc/kind @@ -0,0 +1 @@ +distributed diff --git a/examples/distributed-solver/doc/results.dox b/examples/distributed-solver/doc/results.dox new file mode 100644 index 00000000000..e888bf14a6f --- /dev/null +++ b/examples/distributed-solver/doc/results.dox @@ -0,0 +1,17 @@ +

Results

+This is the expected output for `mpirun -n 4 ./distributed-solver`: + +@code{.cpp} + +Num rows in matrix: 100 +Num ranks: 4 +Final Res norm: 5.58392e-12 +Init time: 0.0663887 +Read time: 0.0729806 +Solver generate time: 7.6348e-05 +Solver apply time: 0.0680783 +Total time: 0.141351 + +@endcode + +The timings may vary depending on the machine. diff --git a/examples/distributed-solver/doc/short-intro b/examples/distributed-solver/doc/short-intro new file mode 100644 index 00000000000..57a54287458 --- /dev/null +++ b/examples/distributed-solver/doc/short-intro @@ -0,0 +1 @@ +The distributed solver example. diff --git a/examples/distributed-solver/doc/tooltip b/examples/distributed-solver/doc/tooltip new file mode 100644 index 00000000000..3e6cc291852 --- /dev/null +++ b/examples/distributed-solver/doc/tooltip @@ -0,0 +1 @@ +Solves a distributed linear system. diff --git a/hip/CMakeLists.txt b/hip/CMakeLists.txt index 3300e73b483..35f19e77406 100644 --- a/hip/CMakeLists.txt +++ b/hip/CMakeLists.txt @@ -3,9 +3,12 @@ set(GINKGO_HIP_SOURCES base/exception.hip.cpp base/executor.hip.cpp base/index_set_kernels.hip.cpp + base/scoped_device_id.hip.cpp base/version.hip.cpp components/prefix_sum_kernels.hip.cpp + distributed/matrix_kernels.hip.cpp distributed/partition_kernels.hip.cpp + distributed/vector_kernels.hip.cpp factorization/cholesky_kernels.hip.cpp factorization/factorization_kernels.hip.cpp factorization/ic_kernels.hip.cpp diff --git a/hip/base/executor.hip.cpp b/hip/base/executor.hip.cpp index 0832a87a39a..d7fc631bb2d 100644 --- a/hip/base/executor.hip.cpp +++ b/hip/base/executor.hip.cpp @@ -45,9 +45,9 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include "hip/base/config.hip.hpp" -#include "hip/base/device_guard.hip.hpp" #include "hip/base/hipblas_bindings.hip.hpp" #include "hip/base/hipsparse_bindings.hip.hpp" +#include "hip/base/scoped_device_id.hip.hpp" namespace gko { @@ -77,7 +77,7 @@ std::shared_ptr HipExecutor::create( auto& num_execs = hip_device_class::get_num_execs(device_id); num_execs--; if (!num_execs && device_reset) { - hip::device_guard g(device_id); + detail::hip_scoped_device_id_guard g(device_id); hipDeviceReset(); } }); @@ -88,7 +88,7 @@ void HipExecutor::populate_exec_info(const machine_topology* mach_topo) { if (this->get_device_id() < this->get_num_devices() && this->get_device_id() >= 0) { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS( hipDeviceGetPCIBusId(&(this->get_exec_info().pci_bus_id.front()), 13, this->get_device_id())); @@ -108,7 +108,7 @@ void OmpExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { if (num_bytes > 0) { - hip::device_guard g(dest->get_device_id()); + detail::hip_scoped_device_id_guard g(dest->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS( hipMemcpy(dest_ptr, src_ptr, num_bytes, hipMemcpyHostToDevice)); } @@ -117,7 +117,7 @@ void OmpExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, void HipExecutor::raw_free(void* ptr) const noexcept { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); auto error_code = hipFree(ptr); if (error_code != hipSuccess) { #if GKO_VERBOSE_LEVEL >= 1 @@ -136,7 +136,7 @@ void HipExecutor::raw_free(void* ptr) const noexcept void* HipExecutor::raw_alloc(size_type num_bytes) const { void* dev_ptr = nullptr; - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); int error_code = 0; if (this->alloc_mode_ == allocation_mode::device) { error_code = hipMalloc(&dev_ptr, 
num_bytes); @@ -161,7 +161,7 @@ void HipExecutor::raw_copy_to(const OmpExecutor*, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { if (num_bytes > 0) { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS( hipMemcpy(dest_ptr, src_ptr, num_bytes, hipMemcpyDeviceToHost)); } @@ -173,7 +173,7 @@ void HipExecutor::raw_copy_to(const CudaExecutor* dest, size_type num_bytes, { #if GINKGO_HIP_PLATFORM_NVCC == 1 if (num_bytes > 0) { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS(hipMemcpyPeer(dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(), num_bytes)); @@ -195,7 +195,7 @@ void HipExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, const void* src_ptr, void* dest_ptr) const { if (num_bytes > 0) { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS(hipMemcpyPeer(dest_ptr, dest->get_device_id(), src_ptr, this->get_device_id(), num_bytes)); @@ -205,7 +205,7 @@ void HipExecutor::raw_copy_to(const HipExecutor* dest, size_type num_bytes, void HipExecutor::synchronize() const { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS(hipDeviceSynchronize()); } @@ -213,13 +213,19 @@ void HipExecutor::synchronize() const void HipExecutor::run(const Operation& op) const { this->template log(this, &op); - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); op.run( std::static_pointer_cast(this->shared_from_this())); this->template log(this, &op); } +scoped_device_id_guard HipExecutor::get_scoped_device_id_guard() const +{ + return {this, this->get_device_id()}; +} + + int HipExecutor::get_num_devices() { int deviceCount = 0; @@ -236,7 +242,7 @@ void 
HipExecutor::set_gpu_property() { if (this->get_device_id() < this->get_num_devices() && this->get_device_id() >= 0) { - hip::device_guard g(this->get_device_id()); + detail::hip_scoped_device_id_guard g(this->get_device_id()); GKO_ASSERT_NO_HIP_ERRORS(hipDeviceGetAttribute( &this->get_exec_info().num_computing_units, hipDeviceAttributeMultiprocessorCount, this->get_device_id())); @@ -285,15 +291,15 @@ void HipExecutor::init_handles() if (this->get_device_id() < this->get_num_devices() && this->get_device_id() >= 0) { const auto id = this->get_device_id(); - hip::device_guard g(id); + detail::hip_scoped_device_id_guard g(id); this->hipblas_handle_ = handle_manager( kernels::hip::hipblas::init(), [id](hipblasContext* handle) { - hip::device_guard g(id); + detail::hip_scoped_device_id_guard g(id); kernels::hip::hipblas::destroy_hipblas_handle(handle); }); this->hipsparse_handle_ = handle_manager( kernels::hip::hipsparse::init(), [id](hipsparseContext* handle) { - hip::device_guard g(id); + detail::hip_scoped_device_id_guard g(id); kernels::hip::hipsparse::destroy_hipsparse_handle(handle); }); } diff --git a/hip/base/scoped_device_id.hip.cpp b/hip/base/scoped_device_id.hip.cpp new file mode 100644 index 00000000000..a6d59f1122a --- /dev/null +++ b/hip/base/scoped_device_id.hip.cpp @@ -0,0 +1,107 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include +#include + + +#include + + +#include + + +#include "hip/base/scoped_device_id.hip.hpp" + + +namespace gko { +namespace detail { + + +hip_scoped_device_id_guard::hip_scoped_device_id_guard(int device_id) + : original_device_id_{}, need_reset_{} +{ + GKO_ASSERT_NO_HIP_ERRORS(hipGetDevice(&original_device_id_)); + if (original_device_id_ != device_id) { + GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(device_id)); + need_reset_ = true; + } +} + + +hip_scoped_device_id_guard::~hip_scoped_device_id_guard() +{ + if (need_reset_) { + auto error_code = hipSetDevice(original_device_id_); + if (error_code != hipSuccess) { +#if GKO_VERBOSE_LEVEL >= 1 + std::cerr + << "Unrecoverable CUDA error while resetting the device id to " + << original_device_id_ << " in " << __func__ << ": " + << hipGetErrorName(error_code) << ": " + << hipGetErrorString(error_code) << std::endl + << "Exiting program" << std::endl; +#endif // GKO_VERBOSE_LEVEL >= 1 + std::exit(error_code); + } + 
} +} + + +hip_scoped_device_id_guard::hip_scoped_device_id_guard( + hip_scoped_device_id_guard&& other) noexcept +{ + *this = std::move(other); +} + + +hip_scoped_device_id_guard& hip_scoped_device_id_guard::operator=( + gko::detail::hip_scoped_device_id_guard&& other) noexcept +{ + if (this != &other) { + original_device_id_ = std::exchange(other.original_device_id_, 0); + need_reset_ = std::exchange(other.need_reset_, false); + } + return *this; +} + + +} // namespace detail + + +scoped_device_id_guard::scoped_device_id_guard(const HipExecutor* exec, + int device_id) + : scope_(std::make_unique(device_id)) +{} + + +} // namespace gko diff --git a/hip/base/scoped_device_id.hip.hpp b/hip/base/scoped_device_id.hip.hpp new file mode 100644 index 00000000000..b64825998a0 --- /dev/null +++ b/hip/base/scoped_device_id.hip.hpp @@ -0,0 +1,77 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_HIP_BASE_SCOPED_DEVICE_ID_HIP_HPP_ +#define GKO_HIP_BASE_SCOPED_DEVICE_ID_HIP_HPP_ + + +#include + + +namespace gko { +namespace detail { + + +/** + * A scoped device id for HIP. + */ +class hip_scoped_device_id_guard : public generic_scoped_device_id_guard { +public: + /** + * The constructor sets the device id to the passed in value for the + * lifetime of the created object. + * + * @param device_id Set the device id to this. + */ + explicit hip_scoped_device_id_guard(int device_id); + + /** + * This resets the device id. If this fails, the program is terminated. 
+ */ + ~hip_scoped_device_id_guard() override; + + hip_scoped_device_id_guard(hip_scoped_device_id_guard&& other) noexcept; + + hip_scoped_device_id_guard& operator=( + hip_scoped_device_id_guard&& other) noexcept; + +private: + int original_device_id_; + bool need_reset_; +}; + + +} // namespace detail +} // namespace gko + + +#endif // GKO_HIP_BASE_SCOPED_DEVICE_ID_HIP_HPP_ diff --git a/hip/base/device_guard.hip.hpp b/hip/distributed/matrix_kernels.hip.cpp similarity index 52% rename from hip/base/device_guard.hip.hpp rename to hip/distributed/matrix_kernels.hip.cpp index 3999ebb7be8..03d46967831 100644 --- a/hip/base/device_guard.hip.hpp +++ b/hip/distributed/matrix_kernels.hip.cpp @@ -30,70 +30,38 @@ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *************************************************************/ -#ifndef GKO_HIP_BASE_DEVICE_GUARD_HIP_HPP_ -#define GKO_HIP_BASE_DEVICE_GUARD_HIP_HPP_ +#include "core/distributed/matrix_kernels.hpp" -#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include -#include +#include -#include +#include "hip/components/atomic.hip.hpp" namespace gko { +namespace kernels { namespace hip { +namespace distributed_matrix { -/** - * This class defines a device guard for the hip functions and the hip module. - * The guard is used to make sure that the device code is run on the correct - * hip device, when run with multiple devices. The class records the current - * device id and uses `hipSetDevice` to set the device id to the one being - * passed in. After the scope has been exited, the destructor sets the device_id - * back to the one before entering the scope. 
- */ -class device_guard { -public: - device_guard(int device_id) : original_device_id{}, need_reset{} - { - GKO_ASSERT_NO_HIP_ERRORS(hipGetDevice(&original_device_id)); - if (original_device_id != device_id) { - GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(device_id)); - need_reset = true; - } - } - - device_guard(device_guard& other) = delete; - - device_guard& operator=(const device_guard& other) = delete; - - device_guard(device_guard&& other) = delete; - - device_guard const& operator=(device_guard&& other) = delete; - - ~device_guard() noexcept(false) - { - if (need_reset) { - /* Ignore the error during stack unwinding for this call */ - if (std::uncaught_exception()) { - hipSetDevice(original_device_id); - } else { - GKO_ASSERT_NO_HIP_ERRORS(hipSetDevice(original_device_id)); - } - } - } - -private: - int original_device_id; - bool need_reset; -}; +#include "common/cuda_hip/distributed/matrix_kernels.hpp.inc" +} // namespace distributed_matrix } // namespace hip +} // namespace kernels } // namespace gko - - -#endif // GKO_HIP_BASE_DEVICE_GUARD_HIP_HPP_ diff --git a/hip/distributed/vector_kernels.hip.cpp b/hip/distributed/vector_kernels.hip.cpp new file mode 100644 index 00000000000..6cbfa1224e9 --- /dev/null +++ b/hip/distributed/vector_kernels.hip.cpp @@ -0,0 +1,62 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. 
Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#include "core/distributed/vector_kernels.hpp" + + +#include + + +#include +#include +#include +#include +#include +#include + + +#include + + +namespace gko { +namespace kernels { +namespace hip { +namespace distributed_vector { + + +#include "common/cuda_hip/distributed/vector_kernels.hpp.inc" + + +} // namespace distributed_vector +} // namespace hip +} // namespace kernels +} // namespace gko diff --git a/hip/test/base/CMakeLists.txt b/hip/test/base/CMakeLists.txt index 970746acb35..91dd4209d5d 100644 --- a/hip/test/base/CMakeLists.txt +++ b/hip/test/base/CMakeLists.txt @@ -15,3 +15,4 @@ if (GINKGO_HIP_PLATFORM MATCHES "${HIP_PLATFORM_AMD_REGEX}") else() ginkgo_create_hip_test(exception_helpers) endif() +ginkgo_create_hip_test(scoped_device_id) diff --git a/hip/test/base/scoped_device_id.hip.cpp b/hip/test/base/scoped_device_id.hip.cpp new file mode 100644 index 00000000000..032476ab3fb --- /dev/null +++ b/hip/test/base/scoped_device_id.hip.cpp @@ -0,0 
+1,90 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +// force-top: on +// prevent compilation failure related to disappearing assert(...) 
statements +#include +// force-top: off + + +#include + + +#include + + +#include "hip/base/scoped_device_id.hip.hpp" + + +namespace { + + +class ScopedDeviceIdGuard : public ::testing::Test { +protected: + ScopedDeviceIdGuard() + : ref(gko::ReferenceExecutor::create()), + hip(gko::HipExecutor::create(0, ref)) + {} + + std::shared_ptr ref; + std::shared_ptr hip; +}; + + +TEST_F(ScopedDeviceIdGuard, SetsId) +{ + auto new_device_id = std::max(hip->get_num_devices() - 1, 0); + + gko::detail::hip_scoped_device_id_guard g{new_device_id}; + + int device_id; + hipGetDevice(&device_id); + ASSERT_EQ(device_id, new_device_id); +} + + +TEST_F(ScopedDeviceIdGuard, ResetsId) +{ + auto old_device_id = hip->get_device_id(); + + { + auto new_device_id = std::max(hip->get_num_devices() - 1, 0); + gko::detail::hip_scoped_device_id_guard g{new_device_id}; + } + + int device_id; + hipGetDevice(&device_id); + ASSERT_EQ(device_id, old_device_id); +} + + +} // namespace diff --git a/include/ginkgo/config.hpp.in b/include/ginkgo/config.hpp.in index 198f465d4d0..a87ce01d37a 100644 --- a/include/ginkgo/config.hpp.in +++ b/include/ginkgo/config.hpp.in @@ -87,6 +87,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #cmakedefine01 GINKGO_BUILD_MPI // clang-format on +/* Is the MPI implementation GPU aware? */ +// clang-format off +#cmakedefine01 GINKGO_HAVE_GPU_AWARE_MPI +// clang-format on + /* Is HWLOC available ? */ // clang-format off @@ -94,4 +99,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. // clang-format on +/* Do we need to use blocking communication in our SpMV? 
*/ +// clang-format off +#cmakedefine GINKGO_FORCE_SPMV_BLOCKING_COMM +// clang-format on + + #endif // GKO_INCLUDE_CONFIG_H diff --git a/include/ginkgo/core/base/abstract_factory.hpp b/include/ginkgo/core/base/abstract_factory.hpp index 3b7ecd712b6..1fae391d63d 100644 --- a/include/ginkgo/core/base/abstract_factory.hpp +++ b/include/ginkgo/core/base/abstract_factory.hpp @@ -150,7 +150,7 @@ class EnableDefaultFactory : public EnablePolymorphicObject, public EnablePolymorphicAssignment { public: - friend class EnablePolymorphicObject; + friend struct polymorphic_object_traits; using product_type = ProductType; using parameters_type = ParametersType; diff --git a/include/ginkgo/core/base/combination.hpp b/include/ginkgo/core/base/combination.hpp index 8686854720d..9e86dbd80d9 100644 --- a/include/ginkgo/core/base/combination.hpp +++ b/include/ginkgo/core/base/combination.hpp @@ -59,7 +59,7 @@ template class Combination : public EnableLinOp>, public EnableCreateMethod>, public Transposable { - friend class EnablePolymorphicObject; + friend struct polymorphic_object_traits; friend class EnableCreateMethod; public: diff --git a/include/ginkgo/core/base/composition.hpp b/include/ginkgo/core/base/composition.hpp index 6c9b9666323..02bd0ed2431 100644 --- a/include/ginkgo/core/base/composition.hpp +++ b/include/ginkgo/core/base/composition.hpp @@ -67,7 +67,7 @@ template class Composition : public EnableLinOp>, public EnableCreateMethod>, public Transposable { - friend class EnablePolymorphicObject; + friend struct polymorphic_object_traits; friend class EnableCreateMethod; public: diff --git a/include/ginkgo/core/base/dense_cache.hpp b/include/ginkgo/core/base/dense_cache.hpp new file mode 100644 index 00000000000..ea2a29ddf3c --- /dev/null +++ b/include/ginkgo/core/base/dense_cache.hpp @@ -0,0 +1,126 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. 
+ +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_DENSE_CACHE_HPP_ +#define GKO_PUBLIC_CORE_BASE_DENSE_CACHE_HPP_ + + +#include + + +#include + + +namespace gko { +namespace matrix { + + +template +class Dense; + + +} + + +namespace detail { + + +/** + * Manages a Dense vector that is buffered and reused internally to avoid + * repeated allocations. Copying an instance will only yield an empty object + * since copying the cached vector would not make sense. 
The stored object is + * always mutable, so the cache can be used in a const-context. + * + * @internal The struct is present to wrap cache-like buffer storage that will + * not be copied when the outer object gets copied. + */ +template +struct DenseCache { + DenseCache() = default; + ~DenseCache() = default; + DenseCache(const DenseCache&) {} + DenseCache(DenseCache&&) noexcept {} + DenseCache& operator=(const DenseCache&) { return *this; } + DenseCache& operator=(DenseCache&&) noexcept { return *this; } + mutable std::unique_ptr> vec{}; + + + /** + * Initializes the buffered vector with the same configuration as the + * template vector, if + * - the current vector is null, + * - the sizes of the buffered and template vector differ, + * - the executor of the buffered and template vector differ. + * + * @note This does not copy any data from the template vector. + * + * @param template_vec Defines the configuration (executor, size, stride) + * of the buffered vector. + */ + void init_from(const matrix::Dense* template_vec) const; + + /** + * Initializes the buffered vector, if + * - the current vector is null, + * - the sizes differ, + * - the executor differs. + * + * @param exec Executor of the buffered vector. + * @param size Size of the buffered vector. + */ + void init(std::shared_ptr exec, dim<2> size) const; + + /** + * Reference access to the underlying vector. + * @return Reference to the stored vector. + */ + matrix::Dense& operator*() const { return *vec; } + + /** + * Pointer access to the underlying vector. + * @return Pointer to the stored vector. + */ + matrix::Dense* operator->() const { return vec.get(); } + + /** + * Pointer access to the underlying vector. + * @return Pointer to the stored vector. 
+ */ + matrix::Dense* get() const { return vec.get(); } +}; + + +} // namespace detail +} // namespace gko + + +#endif // GKO_PUBLIC_CORE_BASE_DENSE_CACHE_HPP_ diff --git a/include/ginkgo/core/base/executor.hpp b/include/ginkgo/core/base/executor.hpp index 697e86f941c..7623411d657 100644 --- a/include/ginkgo/core/base/executor.hpp +++ b/include/ginkgo/core/base/executor.hpp @@ -47,6 +47,7 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include +#include #include #include #include @@ -796,6 +797,8 @@ class Executor : public log::EnableLogging { return this->verify_memory_from(other.get()); } + virtual scoped_device_id_guard get_scoped_device_id_guard() const = 0; + protected: /** * A struct that abstracts the executor info for different executors @@ -1272,6 +1275,8 @@ class OmpExecutor : public detail::ExecutorBase, return this->get_exec_info().num_pu_per_cu; } + scoped_device_id_guard get_scoped_device_id_guard() const override; + protected: OmpExecutor() { @@ -1327,6 +1332,11 @@ class ReferenceExecutor : public OmpExecutor { this->template log(this, &op); } + scoped_device_id_guard get_scoped_device_id_guard() const override + { + return {this, 0}; + } + protected: ReferenceExecutor() { @@ -1401,6 +1411,8 @@ class CudaExecutor : public detail::ExecutorBase, void run(const Operation& op) const override; + scoped_device_id_guard get_scoped_device_id_guard() const override; + /** * Get the CUDA device id of the device associated to this executor. */ @@ -1606,6 +1618,8 @@ class HipExecutor : public detail::ExecutorBase, void run(const Operation& op) const override; + scoped_device_id_guard get_scoped_device_id_guard() const override; + /** * Get the HIP device id of the device associated to this executor. 
*/ @@ -1807,6 +1821,8 @@ class DpcppExecutor : public detail::ExecutorBase, void run(const Operation& op) const override; + scoped_device_id_guard get_scoped_device_id_guard() const override; + /** * Get the DPCPP device id of the device associated to this executor. * diff --git a/include/ginkgo/core/base/lin_op.hpp b/include/ginkgo/core/base/lin_op.hpp index e29230186f6..f174bfcfe6c 100644 --- a/include/ginkgo/core/base/lin_op.hpp +++ b/include/ginkgo/core/base/lin_op.hpp @@ -1041,8 +1041,7 @@ public: \ class _factory_name \ : public ::gko::EnableDefaultLinOpFactory<_factory_name, _lin_op, \ _parameters_name##_type> { \ - friend class ::gko::EnablePolymorphicObject<_factory_name, \ - ::gko::LinOpFactory>; \ + friend class ::gko::polymorphic_object_traits<_factory_name>; \ friend class ::gko::enable_parameters_type<_parameters_name##_type, \ _factory_name>; \ explicit _factory_name(std::shared_ptr exec) \ diff --git a/include/ginkgo/core/base/mpi.hpp b/include/ginkgo/core/base/mpi.hpp index c25f92d3f34..e41a6dd98ed 100644 --- a/include/ginkgo/core/base/mpi.hpp +++ b/include/ginkgo/core/base/mpi.hpp @@ -35,9 +35,8 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include -#include -#include #include +#include #include @@ -55,15 +54,53 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. namespace gko { +namespace experimental { +/** + * @brief The mpi namespace, contains wrapper for many MPI functions. + * + * @ingroup mpi + * @ingroup distributed + */ namespace mpi { +/** + * Return if GPU aware functionality is available + */ +inline constexpr bool is_gpu_aware() +{ +#if GINKGO_HAVE_GPU_AWARE_MPI + return true; +#else + return false; +#endif +} + + +/** + * Maps each MPI rank to a single device id in a round robin manner. + * @param comm used to determine the node-local rank, if no suitable + * environment variable is available. + * @param num_devices the number of devices per node. 
+ * @return device id that this rank should use. + */ +int map_rank_to_device_id(MPI_Comm comm, int num_devices); + + #define GKO_REGISTER_MPI_TYPE(input_type, mpi_type) \ template <> \ struct type_impl { \ static MPI_Datatype get_type() { return mpi_type; } \ } +/** + * A struct that is used to determine the MPI_Datatype of a specified type. + * + * @tparam T type of which the MPI_Datatype should be inferred. + * + * @note any specialization of this type has to provide a static function + * `get_type()` that returns an MPI_Datatype + */ template struct type_impl {}; @@ -75,6 +112,7 @@ GKO_REGISTER_MPI_TYPE(int, MPI_INT); GKO_REGISTER_MPI_TYPE(unsigned short, MPI_UNSIGNED_SHORT); GKO_REGISTER_MPI_TYPE(unsigned long, MPI_UNSIGNED_LONG); GKO_REGISTER_MPI_TYPE(long, MPI_LONG); +GKO_REGISTER_MPI_TYPE(long long, MPI_LONG_LONG_INT); GKO_REGISTER_MPI_TYPE(float, MPI_FLOAT); GKO_REGISTER_MPI_TYPE(double, MPI_DOUBLE); GKO_REGISTER_MPI_TYPE(long double, MPI_LONG_DOUBLE); @@ -82,11 +120,86 @@ GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_COMPLEX); GKO_REGISTER_MPI_TYPE(std::complex, MPI_C_DOUBLE_COMPLEX); -template -inline const T* in_place() -{ - return reinterpret_cast(MPI_IN_PLACE); -} +/** + * A move-only wrapper for a contiguous MPI_Datatype. + * + * The underlying MPI_Datatype is automatically created and committed when an + * object of this type is constructed, and freed when it is destructed. + */ +class contiguous_type { +public: + /** + * Constructs a wrapper for a contiguous MPI_Datatype. + * + * @param count the number of old_type elements the new datatype contains. + * @param old_type the MPI_Datatype that is contained. + */ + contiguous_type(int count, MPI_Datatype old_type) : type_(MPI_DATATYPE_NULL) + { + GKO_ASSERT_NO_MPI_ERRORS(MPI_Type_contiguous(count, old_type, &type_)); + GKO_ASSERT_NO_MPI_ERRORS(MPI_Type_commit(&type_)); + } + + /** + * Constructs empty wrapper with MPI_DATATYPE_NULL.
+ */ + contiguous_type() : type_(MPI_DATATYPE_NULL) {} + + /** + * Disallow copying of wrapper type. + */ + contiguous_type(const contiguous_type&) = delete; + + /** + * Disallow copying of wrapper type. + */ + contiguous_type& operator=(const contiguous_type&) = delete; + + /** + * Move constructor, leaves other with MPI_DATATYPE_NULL. + * + * @param other to be moved from object. + */ + contiguous_type(contiguous_type&& other) noexcept : type_(MPI_DATATYPE_NULL) + { + *this = std::move(other); + } + + /** + * Move assignment, leaves other with MPI_DATATYPE_NULL. + * + * @param other to be moved from object. + * + * @return this object. + */ + contiguous_type& operator=(contiguous_type&& other) noexcept + { + if (this != &other) { + this->type_ = std::exchange(other.type_, MPI_DATATYPE_NULL); + } + return *this; + } + + /** + * Destructs object by freeing wrapped MPI_Datatype. + */ + ~contiguous_type() + { + if (type_ != MPI_DATATYPE_NULL) { + MPI_Type_free(&type_); + } + } + + /** + * Access the underlying MPI_Datatype. + * + * @return the underlying MPI_Datatype. + */ + MPI_Datatype get() const { return type_; } + +private: + MPI_Datatype type_; +}; /** @@ -228,7 +341,8 @@ struct status { /** - * The request class is a light wrapper around the MPI_Request handle class. + * The request class is a light, move-only wrapper around the MPI_Request + * handle. 
*/ class request { public: @@ -238,6 +352,30 @@ class request { */ request() : req_(MPI_REQUEST_NULL) {} + request(const request&) = delete; + + request& operator=(const request&) = delete; + + request(request&& o) noexcept { *this = std::move(o); } + + request& operator=(request&& o) noexcept + { + if (this != &o) { + this->req_ = std::exchange(o.req_, MPI_REQUEST_NULL); + } + return *this; + } + + ~request() + { + if (req_ != MPI_REQUEST_NULL) { + if (MPI_Request_free(&req_) != MPI_SUCCESS) { + std::terminate(); // since we can't throw in destructors, we + // have to terminate the program + } + } + } + /** * Get a pointer to the underlying MPI_Request handle. * @@ -282,9 +420,18 @@ inline std::vector wait_all(std::vector& req) /** - * A communicator class that takes in the given communicator and duplicates it - * for our purposes. As the class or object goes out of scope, the communicator - * is freed. + * A thin wrapper of MPI_Comm that supports most MPI calls. + * + * A wrapper class that takes in the given MPI communicator. If a bare MPI_Comm + * is provided, the wrapper takes no ownership of the MPI_Comm. Thus the + * MPI_Comm must remain valid throughout the lifetime of the communicator. If + * the communicator was created through splitting, the wrapper takes ownership + * of the MPI_Comm. In this case, as the class or object goes out of scope, the + * underlying MPI_Comm is freed. + * + * @note All MPI calls that work on a buffer take in an Executor as an + * additional argument. This argument specifies the memory space the + * buffer lives in. */ class communicator { public: @@ -389,15 +536,22 @@ class communicator { /** * Send (Blocking) data from calling process to destination rank. * + * @param exec The executor, on which the message buffer is located. 
* @param send_buffer the buffer to send * @param send_count the number of elements to send * @param destination_rank the rank to send the data to * @param send_tag the tag for the send call + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. */ template - void send(const SendType* send_buffer, const int send_count, - const int destination_rank, const int send_tag) const + void send(std::shared_ptr exec, const SendType* send_buffer, + const int send_count, const int destination_rank, + const int send_tag) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS( MPI_Send(send_buffer, send_count, type_impl::get_type(), destination_rank, send_tag, this->get())); @@ -407,17 +561,24 @@ class communicator { * Send (Non-blocking, Immediate return) data from calling process to * destination rank. * + * @param exec The executor, on which the message buffer is located. * @param send_buffer the buffer to send * @param send_count the number of elements to send * @param destination_rank the rank to send the data to * @param send_tag the tag for the send call * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * * @return the request handle for the send call */ template - request i_send(const SendType* send_buffer, const int send_count, + request i_send(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, const int destination_rank, const int send_tag) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS( MPI_Isend(send_buffer, send_count, type_impl::get_type(), @@ -428,17 +589,24 @@ class communicator { /** * Receive data from source rank. * + * @param exec The executor, on which the message buffer is located. 
* @param recv_buffer the buffer to receive * @param recv_count the number of elements to receive * @param source_rank the rank to receive the data from * @param recv_tag the tag for the recv call * + * @tparam RecvType the type of the data to receive. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * * @return the status of completion of this call */ template - status recv(RecvType* recv_buffer, const int recv_count, - const int source_rank, const int recv_tag) const + status recv(std::shared_ptr exec, RecvType* recv_buffer, + const int recv_count, const int source_rank, + const int recv_tag) const { + auto guard = exec->get_scoped_device_id_guard(); status st; GKO_ASSERT_NO_MPI_ERRORS( MPI_Recv(recv_buffer, recv_count, type_impl::get_type(), @@ -449,17 +617,24 @@ class communicator { /** * Receive (Non-blocking, Immediate return) data from source rank. * + * @param exec The executor, on which the message buffer is located. * @param recv_buffer the buffer to send * @param recv_count the number of elements to receive * @param source_rank the rank to receive the data from * @param recv_tag the tag for the recv call * + * @tparam RecvType the type of the data to receive. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * * @return the request handle for the recv call */ template - request i_recv(RecvType* recv_buffer, const int recv_count, - const int source_rank, const int recv_tag) const + request i_recv(std::shared_ptr exec, RecvType* recv_buffer, + const int recv_count, const int source_rank, + const int recv_tag) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS( MPI_Irecv(recv_buffer, recv_count, type_impl::get_type(), @@ -470,13 +645,20 @@ class communicator { /** * Broadcast data from calling process to all ranks in the communicator * + * @param exec The executor, on which the message buffer is located. 
* @param buffer the buffer to broadcast + * @param count the number of elements to broadcast + * @param root_rank the rank to broadcast from + * + * @tparam BroadcastType the type of the data to broadcast. Has to be a + * type which has a specialization of type_impl that + * defines its MPI_Datatype. */ template - void broadcast(BroadcastType* buffer, int count, int root_rank) const + void broadcast(std::shared_ptr exec, BroadcastType* buffer, + int count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Bcast(buffer, count, type_impl::get_type(), root_rank, this->get())); @@ -486,15 +668,22 @@ * (Non-blocking) Broadcast data from calling process to all ranks in the * communicator * + * @param exec The executor, on which the message buffer is located. * @param buffer the buffer to broadcast * @param count the number of elements to broadcast * @param root_rank the rank to broadcast from * + * @tparam BroadcastType the type of the data to broadcast. Has to be a + * type which has a specialization of type_impl that + * defines its MPI_Datatype. + * * @return the request handle for the call */ template - request i_broadcast(BroadcastType* buffer, int count, int root_rank) const + request i_broadcast(std::shared_ptr exec, + BroadcastType* buffer, int count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS( MPI_Ibcast(buffer, count, type_impl::get_type(), @@ -506,15 +695,22 @@ * Reduce data into root from all calling processes on the same * communicator. * + * @param exec The executor, on which the message buffer is located. * @param send_buffer the buffer to reduce * @param recv_buffer the reduced result * @param count the number of elements to reduce * @param operation the MPI_Op type reduce operation. + * + * @tparam ReduceType the type of the data to reduce.
Has to be a type + * which has a specialization of type_impl that defines + * its MPI_Datatype. */ template - void reduce(const ReduceType* send_buffer, ReduceType* recv_buffer, + void reduce(std::shared_ptr exec, + const ReduceType* send_buffer, ReduceType* recv_buffer, int count, MPI_Op operation, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Reduce(send_buffer, recv_buffer, count, type_impl::get_type(), operation, root_rank, this->get())); @@ -524,17 +720,24 @@ class communicator { * (Non-blocking) Reduce data into root from all calling processes on the * same communicator. * + * @param exec The executor, on which the message buffer is located. * @param send_buffer the buffer to reduce * @param recv_buffer the reduced result * @param count the number of elements to reduce * @param operation the MPI_Op type reduce operation. * + * @tparam ReduceType the type of the data to reduce. Has to be a type + * which has a specialization of type_impl that defines + * its MPI_Datatype. + * * @return the request handle for the call */ template - request i_reduce(const ReduceType* send_buffer, ReduceType* recv_buffer, + request i_reduce(std::shared_ptr exec, + const ReduceType* send_buffer, ReduceType* recv_buffer, int count, MPI_Op operation, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Ireduce( send_buffer, recv_buffer, count, type_impl::get_type(), @@ -546,37 +749,50 @@ class communicator { * (In-place) Reduce data from all calling processes from all calling * processes on same communicator. * + * @param exec The executor, on which the message buffer is located. * @param recv_buffer the data to reduce and the reduced result * @param count the number of elements to reduce * @param operation the MPI_Op type reduce operation. + * + * @tparam ReduceType the type of the data to send. 
Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. */ template - void all_reduce(ReduceType* recv_buffer, int count, MPI_Op operation) const + void all_reduce(std::shared_ptr exec, + ReduceType* recv_buffer, int count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( - in_place(), recv_buffer, count, - type_impl::get_type(), operation, this->get())); + MPI_IN_PLACE, recv_buffer, count, type_impl::get_type(), + operation, this->get())); } /** * (In-place, non-blocking) Reduce data from all calling processes from all * calling processes on same communicator. * + * @param exec The executor, on which the message buffer is located. * @param recv_buffer the data to reduce and the reduced result * @param count the number of elements to reduce * @param operation the reduce operation. See @MPI_Op * + * @tparam ReduceType the type of the data to reduce. Has to be a type + * which has a specialization of type_impl that defines + * its MPI_Datatype. + * * @return the request handle for the call */ template - request i_all_reduce(ReduceType* recv_buffer, int count, + request i_all_reduce(std::shared_ptr exec, + ReduceType* recv_buffer, int count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); request req; - GKO_ASSERT_NO_MPI_ERRORS( - MPI_Iallreduce(in_place(), recv_buffer, count, - type_impl::get_type(), operation, - this->get(), req.get())); + GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce( + MPI_IN_PLACE, recv_buffer, count, type_impl::get_type(), + operation, this->get(), req.get())); return req; } @@ -584,15 +800,22 @@ class communicator { * Reduce data from all calling processes from all calling processes on same * communicator. * + * @param exec The executor, on which the message buffers are located. 
* @param send_buffer the data to reduce * @param recv_buffer the reduced result * @param count the number of elements to reduce * @param operation the reduce operation. See @MPI_Op + * + * @tparam ReduceType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. */ template - void all_reduce(const ReduceType* send_buffer, ReduceType* recv_buffer, + void all_reduce(std::shared_ptr exec, + const ReduceType* send_buffer, ReduceType* recv_buffer, int count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Allreduce( send_buffer, recv_buffer, count, type_impl::get_type(), operation, this->get())); @@ -602,17 +825,24 @@ class communicator { * Reduce data from all calling processes from all calling processes on same * communicator. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the data to reduce * @param recv_buffer the reduced result * @param count the number of elements to reduce * @param operation the reduce operation. See @MPI_Op * + * @tparam ReduceType the type of the data to reduce. Has to be a type + * which has a specialization of type_impl that defines + * its MPI_Datatype. + * * @return the request handle for the call */ template - request i_all_reduce(const ReduceType* send_buffer, ReduceType* recv_buffer, + request i_all_reduce(std::shared_ptr exec, + const ReduceType* send_buffer, ReduceType* recv_buffer, int count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallreduce( send_buffer, recv_buffer, count, type_impl::get_type(), @@ -623,17 +853,26 @@ class communicator { /** * Gather data onto the root rank from all ranks in the communicator. * + * @param exec The executor, on which the message buffers are located. 
* @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * @param root_rank the rank to gather into + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. */ template - void gather(const SendType* send_buffer, const int send_count, + void gather(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS( MPI_Gather(send_buffer, send_count, type_impl::get_type(), recv_buffer, recv_count, type_impl::get_type(), @@ -644,19 +883,28 @@ class communicator { * (Non-blocking) Gather data onto the root rank from all ranks in the * communicator. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * @param root_rank the rank to gather into * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. 
+ * * @return the request handle for the call */ template - request i_gather(const SendType* send_buffer, const int send_count, + request i_gather(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Igather( send_buffer, send_count, type_impl::get_type(), @@ -669,18 +917,27 @@ class communicator { * Gather data onto the root rank from all ranks in the communicator with * offsets. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * @param displacements the offsets for the buffer * @param root_rank the rank to gather into + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. */ template - void gather_v(const SendType* send_buffer, const int send_count, + void gather_v(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int* recv_counts, const int* displacements, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Gatherv( send_buffer, send_count, type_impl::get_type(), recv_buffer, recv_counts, displacements, @@ -691,6 +948,7 @@ class communicator { * (Non-blocking) Gather data onto the root rank from all ranks in the * communicator with offsets. * + * @param exec The executor, on which the message buffers are located. 
* @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into @@ -698,13 +956,21 @@ class communicator { * @param displacements the offsets for the buffer * @param root_rank the rank to gather into * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. + * * @return the request handle for the call */ template - request i_gather_v(const SendType* send_buffer, const int send_count, + request i_gather_v(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int* recv_counts, const int* displacements, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Igatherv( send_buffer, send_count, type_impl::get_type(), @@ -717,15 +983,24 @@ class communicator { /** * Gather data onto all ranks from all ranks in the communicator. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. 
*/ template - void all_gather(const SendType* send_buffer, const int send_count, + void all_gather(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Allgather( send_buffer, send_count, type_impl::get_type(), recv_buffer, recv_count, type_impl::get_type(), @@ -736,17 +1011,26 @@ class communicator { * (Non-blocking) Gather data onto all ranks from all ranks in the * communicator. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. + * * @return the request handle for the call */ template - request i_all_gather(const SendType* send_buffer, const int send_count, + request i_all_gather(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Iallgather( send_buffer, send_count, type_impl::get_type(), @@ -758,16 +1042,25 @@ class communicator { /** * Scatter data from root rank to all ranks in the communicator. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive + * + * @tparam SendType the type of the data to send. 
Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. */ template - void scatter(const SendType* send_buffer, const int send_count, + void scatter(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Scatter( send_buffer, send_count, type_impl::get_type(), recv_buffer, recv_count, type_impl::get_type(), root_rank, @@ -778,18 +1071,27 @@ class communicator { * (Non-blocking) Scatter data from root rank to all ranks in the * communicator. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. + * * @return the request handle for the call */ template - request i_scatter(const SendType* send_buffer, const int send_count, + request i_scatter(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Iscatter( send_buffer, send_count, type_impl::get_type(), @@ -802,18 +1104,27 @@ class communicator { * Scatter data from root rank to all ranks in the communicator with * offsets. * + * @param exec The executor, on which the message buffers are located. 
* @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * @param displacements the offsets for the buffer * @param comm the communicator + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. */ template - void scatter_v(const SendType* send_buffer, const int* send_counts, + void scatter_v(std::shared_ptr exec, + const SendType* send_buffer, const int* send_counts, const int* displacements, RecvType* recv_buffer, const int recv_count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Scatterv( send_buffer, send_counts, displacements, type_impl::get_type(), recv_buffer, recv_count, @@ -824,6 +1135,7 @@ class communicator { * (Non-blocking) Scatter data from root rank to all ranks in the * communicator with offsets. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to gather from * @param send_count the number of elements to send * @param recv_buffer the buffer to gather into @@ -831,13 +1143,21 @@ class communicator { * @param displacements the offsets for the buffer * @param comm the communicator * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. 
+ * * @return the request handle for the call */ template - request i_scatter_v(const SendType* send_buffer, const int* send_counts, + request i_scatter_v(std::shared_ptr exec, + const SendType* send_buffer, const int* send_counts, const int* displacements, RecvType* recv_buffer, const int recv_count, int root_rank) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS( MPI_Iscatterv(send_buffer, send_counts, displacements, @@ -851,41 +1171,55 @@ class communicator { * (In-place) Communicate data from all ranks to all other ranks in place * (MPI_Alltoall). See MPI documentation for more details. * + * @param exec The executor, on which the message buffer is located. * @param buffer the buffer to send and the buffer receive * @param recv_count the number of elements to receive * @param comm the communicator * + * @tparam RecvType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * * @note This overload uses MPI_IN_PLACE and the source and destination * buffers are the same. */ template - void all_to_all(RecvType* recv_buffer, const int recv_count) const + void all_to_all(std::shared_ptr exec, RecvType* recv_buffer, + const int recv_count) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoall( - in_place(), recv_count, type_impl::get_type(), + MPI_IN_PLACE, recv_count, type_impl::get_type(), recv_buffer, recv_count, type_impl::get_type(), this->get())); } /** * (In-place, Non-blocking) Communicate data from all ranks to all other - * ranks in place (MPI_Alltoall). See MPI documentation for more details. + * ranks in place (MPI_Ialltoall). See MPI documentation for more details. * + * @param exec The executor, on which the message buffer is located. 
* @param buffer the buffer to send and the buffer receive * @param recv_count the number of elements to receive * @param comm the communicator * + * @tparam RecvType the type of the data to receive. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * * @return the request handle for the call * * @note This overload uses MPI_IN_PLACE and the source and destination * buffers are the same. */ template - request i_all_to_all(RecvType* recv_buffer, const int recv_count) const + request i_all_to_all(std::shared_ptr exec, + RecvType* recv_buffer, const int recv_count) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoall( - in_place(), recv_count, type_impl::get_type(), + MPI_IN_PLACE, recv_count, type_impl::get_type(), recv_buffer, recv_count, type_impl::get_type(), this->get(), req.get())); return req; @@ -895,15 +1229,24 @@ class communicator { * Communicate data from all ranks to all other ranks (MPI_Alltoall). * See MPI documentation for more details. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to send * @param send_count the number of elements to send * @param recv_buffer the buffer to receive * @param recv_count the number of elements to receive + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. 
*/ template - void all_to_all(const SendType* send_buffer, const int send_count, + void all_to_all(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoall( send_buffer, send_count, type_impl::get_type(), recv_buffer, recv_count, type_impl::get_type(), @@ -912,19 +1255,28 @@ class communicator { /** * (Non-blocking) Communicate data from all ranks to all other ranks - * (MPI_Alltoall). See MPI documentation for more details. + * (MPI_Ialltoall). See MPI documentation for more details. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to send * @param send_count the number of elements to send * @param recv_buffer the buffer to receive * @param recv_count the number of elements to receive * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. + * * @return the request handle for the call */ template - request i_all_to_all(const SendType* send_buffer, const int send_count, + request i_all_to_all(std::shared_ptr exec, + const SendType* send_buffer, const int send_count, RecvType* recv_buffer, const int recv_count) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoall( send_buffer, send_count, type_impl::get_type(), @@ -937,6 +1289,7 @@ class communicator { * Communicate data from all ranks to all other ranks with * offsets (MPI_Alltoallv). See MPI documentation for more details. * + * @param exec The executor, on which the message buffers are located. 
* @param send_buffer the buffer to send * @param send_count the number of elements to send * @param send_offsets the offsets for the send buffer @@ -944,59 +1297,138 @@ class communicator { * @param recv_count the number of elements to receive * @param recv_offsets the offsets for the recv buffer * @param comm the communicator + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. */ template - void all_to_all_v(const SendType* send_buffer, const int* send_counts, + void all_to_all_v(std::shared_ptr exec, + const SendType* send_buffer, const int* send_counts, const int* send_offsets, RecvType* recv_buffer, const int* recv_counts, const int* recv_offsets) const { - GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoallv( - send_buffer, send_counts, send_offsets, - type_impl::get_type(), recv_buffer, recv_counts, - recv_offsets, type_impl::get_type(), this->get())); + this->all_to_all_v(std::move(exec), send_buffer, send_counts, + send_offsets, type_impl::get_type(), + recv_buffer, recv_counts, recv_offsets, + type_impl::get_type()); } /** * Communicate data from all ranks to all other ranks with * offsets (MPI_Alltoallv). See MPI documentation for more details. * + * @param exec The executor, on which the message buffers are located. 
+ * @param send_buffer the buffer to send + * @param send_count the number of elements to send + * @param send_offsets the offsets for the send buffer + * @param send_type the MPI_Datatype for the send buffer + * @param recv_buffer the buffer to gather into + * @param recv_count the number of elements to receive + * @param recv_offsets the offsets for the recv buffer + * @param recv_type the MPI_Datatype for the recv buffer + * @param comm the communicator + */ + void all_to_all_v(std::shared_ptr exec, + const void* send_buffer, const int* send_counts, + const int* send_offsets, MPI_Datatype send_type, + void* recv_buffer, const int* recv_counts, + const int* recv_offsets, MPI_Datatype recv_type) const + { + auto guard = exec->get_scoped_device_id_guard(); + GKO_ASSERT_NO_MPI_ERRORS(MPI_Alltoallv( + send_buffer, send_counts, send_offsets, send_type, recv_buffer, + recv_counts, recv_offsets, recv_type, this->get())); + } + + /** + * Communicate data from all ranks to all other ranks with + * offsets (MPI_Ialltoallv). See MPI documentation for more details. + * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to send * @param send_count the number of elements to send * @param send_offsets the offsets for the send buffer + * @param send_type the MPI_Datatype for the send buffer * @param recv_buffer the buffer to gather into * @param recv_count the number of elements to receive * @param recv_offsets the offsets for the recv buffer + * @param recv_type the MPI_Datatype for the recv buffer + * + * @return the request handle for the call + * + * @note This overload allows specifying the MPI_Datatype for both + * the send and received data. 
+ */ + request i_all_to_all_v(std::shared_ptr exec, + const void* send_buffer, const int* send_counts, + const int* send_offsets, MPI_Datatype send_type, + void* recv_buffer, const int* recv_counts, + const int* recv_offsets, + MPI_Datatype recv_type) const + { + auto guard = exec->get_scoped_device_id_guard(); + request req; + GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoallv( + send_buffer, send_counts, send_offsets, send_type, recv_buffer, + recv_counts, recv_offsets, recv_type, this->get(), req.get())); + return req; + } + + /** + * Communicate data from all ranks to all other ranks with + * offsets (MPI_Ialltoallv). See MPI documentation for more details. + * + * @param exec The executor, on which the message buffers are located. + * @param send_buffer the buffer to send + * @param send_count the number of elements to send + * @param send_offsets the offsets for the send buffer + * @param recv_buffer the buffer to gather into + * @param recv_count the number of elements to receive + * @param recv_offsets the offsets for the recv buffer + * + * @tparam SendType the type of the data to send. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * @tparam RecvType the type of the data to receive. The same restrictions + * as for SendType apply. 
* * @return the request handle for the call */ template - request i_all_to_all_v(const SendType* send_buffer, const int* send_counts, + request i_all_to_all_v(std::shared_ptr exec, + const SendType* send_buffer, const int* send_counts, const int* send_offsets, RecvType* recv_buffer, const int* recv_counts, const int* recv_offsets) const { - request req; - GKO_ASSERT_NO_MPI_ERRORS(MPI_Ialltoallv( - send_buffer, send_counts, send_offsets, + return this->i_all_to_all_v( + std::move(exec), send_buffer, send_counts, send_offsets, type_impl::get_type(), recv_buffer, recv_counts, - recv_offsets, type_impl::get_type(), this->get(), - req.get())); - return req; + recv_offsets, type_impl::get_type()); } /** * Does a scan operation with the given operator. * (MPI_Scan). See MPI documentation for more details. * + * @param exec The executor, on which the message buffers are located. * @param send_buffer the buffer to scan from * @param recv_buffer the result buffer * @param recv_count the number of elements to scan * @param operation the operation type to be used for the scan. See @MPI_Op + * + * @tparam ScanType the type of the data to scan. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. */ template - void scan(const ScanType* send_buffer, ScanType* recv_buffer, int count, - MPI_Op operation) const + void scan(std::shared_ptr exec, const ScanType* send_buffer, + ScanType* recv_buffer, int count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Scan(send_buffer, recv_buffer, count, type_impl::get_type(), operation, this->get())); @@ -1004,19 +1436,26 @@ class communicator { /** * Does a scan operation with the given operator. - * (MPI_Scan). See MPI documentation for more details. + * (MPI_Iscan). See MPI documentation for more details. * + * @param exec The executor, on which the message buffers are located. 
* @param send_buffer the buffer to scan from * @param recv_buffer the result buffer * @param recv_count the number of elements to scan * @param operation the operation type to be used for the scan. See @MPI_Op * + * @tparam ScanType the type of the data to scan. Has to be a type which + * has a specialization of type_impl that defines its + * MPI_Datatype. + * * @return the request handle for the call */ template - request i_scan(const ScanType* send_buffer, ScanType* recv_buffer, + request i_scan(std::shared_ptr exec, + const ScanType* send_buffer, ScanType* recv_buffer, int count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Iscan(send_buffer, recv_buffer, count, type_impl::get_type(), @@ -1026,9 +1465,6 @@ class communicator { private: std::shared_ptr comm_; - int size_{}; - int rank_{}; - int node_local_rank_{}; int get_my_rank() const { @@ -1126,6 +1562,7 @@ class window { * Create a window object with a given data pointer and type. A collective * operation. * + * @param exec The executor, on which the base pointer is located. * @param base the base pointer for the window object. * @param num_elems the num_elems of type ValueType the window points to. * @param comm the communicator whose ranks will have windows created. @@ -1133,11 +1570,12 @@ class window { * @param input_info the MPI_Info object used to set certain properties. * @param c_type the type of creation method to use to create the window. 
*/ - window(ValueType* base, int num_elems, const communicator& comm, - const int disp_unit = sizeof(ValueType), + window(std::shared_ptr exec, ValueType* base, int num_elems, + const communicator& comm, const int disp_unit = sizeof(ValueType), MPI_Info input_info = MPI_INFO_NULL, create_type c_type = create_type::create) { + auto guard = exec->get_scoped_device_id_guard(); unsigned size = num_elems * sizeof(ValueType); if (c_type == create_type::create) { GKO_ASSERT_NO_MPI_ERRORS(MPI_Win_create( @@ -1283,6 +1721,7 @@ class window { /** * Put data into the target window. * + * @param exec The executor, on which the message buffer is located. * @param origin_buffer the buffer to send * @param origin_count the number of elements to put * @param target_rank the rank to put the data to @@ -1290,10 +1729,11 @@ class window { * @param target_count the request handle for the send call */ template - void put(const PutType* origin_buffer, const int origin_count, - const int target_rank, const unsigned int target_disp, - const int target_count) const + void put(std::shared_ptr exec, const PutType* origin_buffer, + const int origin_count, const int target_rank, + const unsigned int target_disp, const int target_count) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS( MPI_Put(origin_buffer, origin_count, type_impl::get_type(), target_rank, target_disp, target_count, @@ -1303,6 +1743,7 @@ class window { /** * Put data into the target window. * + * @param exec The executor, on which the message buffer is located. 
* @param origin_buffer the buffer to send * @param origin_count the number of elements to put * @param target_rank the rank to put the data to @@ -1312,10 +1753,12 @@ class window { * @return the request handle for the send call */ template - request r_put(const PutType* origin_buffer, const int origin_count, + request r_put(std::shared_ptr exec, + const PutType* origin_buffer, const int origin_count, const int target_rank, const unsigned int target_disp, const int target_count) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Rput( origin_buffer, origin_count, type_impl::get_type(), @@ -1327,6 +1770,7 @@ class window { /** * Accumulate data into the target window. * + * @param exec The executor, on which the message buffer is located. * @param origin_buffer the buffer to send * @param origin_count the number of elements to put * @param target_rank the rank to put the data to @@ -1335,10 +1779,12 @@ class window { * @param operation the reduce operation. See @MPI_Op */ template - void accumulate(const PutType* origin_buffer, const int origin_count, + void accumulate(std::shared_ptr exec, + const PutType* origin_buffer, const int origin_count, const int target_rank, const unsigned int target_disp, const int target_count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Accumulate( origin_buffer, origin_count, type_impl::get_type(), target_rank, target_disp, target_count, @@ -1348,6 +1794,7 @@ class window { /** * (Non-blocking) Accumulate data into the target window. * + * @param exec The executor, on which the message buffer is located. 
* @param origin_buffer the buffer to send * @param origin_count the number of elements to put * @param target_rank the rank to put the data to @@ -1358,10 +1805,12 @@ class window { * @return the request handle for the send call */ template - request r_accumulate(const PutType* origin_buffer, const int origin_count, + request r_accumulate(std::shared_ptr exec, + const PutType* origin_buffer, const int origin_count, const int target_rank, const unsigned int target_disp, const int target_count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Raccumulate( origin_buffer, origin_count, type_impl::get_type(), @@ -1374,6 +1823,7 @@ class window { /** * Get data from the target window. * + * @param exec The executor, on which the message buffer is located. * @param origin_buffer the buffer to send * @param origin_count the number of elements to get * @param target_rank the rank to get the data from @@ -1381,10 +1831,11 @@ class window { * @param target_count the request handle for the send call */ template - void get(GetType* origin_buffer, const int origin_count, - const int target_rank, const unsigned int target_disp, - const int target_count) const + void get(std::shared_ptr exec, GetType* origin_buffer, + const int origin_count, const int target_rank, + const unsigned int target_disp, const int target_count) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS( MPI_Get(origin_buffer, origin_count, type_impl::get_type(), target_rank, target_disp, target_count, @@ -1394,6 +1845,7 @@ class window { /** * Get data (with handle) from the target window. * + * @param exec The executor, on which the message buffer is located. 
* @param origin_buffer the buffer to send * @param origin_count the number of elements to get * @param target_rank the rank to get the data from @@ -1403,10 +1855,11 @@ class window { * @return the request handle for the send call */ template - request r_get(GetType* origin_buffer, const int origin_count, - const int target_rank, const unsigned int target_disp, - const int target_count) const + request r_get(std::shared_ptr exec, GetType* origin_buffer, + const int origin_count, const int target_rank, + const unsigned int target_disp, const int target_count) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Rget( origin_buffer, origin_count, type_impl::get_type(), @@ -1418,6 +1871,7 @@ class window { /** * Get Accumulate data from the target window. * + * @param exec The executor, on which the message buffers are located. * @param origin_buffer the buffer to send * @param origin_count the number of elements to get * @param result_buffer the buffer to receive the target data @@ -1428,11 +1882,13 @@ class window { * @param operation the reduce operation. See @MPI_Op */ template - void get_accumulate(GetType* origin_buffer, const int origin_count, + void get_accumulate(std::shared_ptr exec, + GetType* origin_buffer, const int origin_count, GetType* result_buffer, const int result_count, const int target_rank, const unsigned int target_disp, const int target_count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Get_accumulate( origin_buffer, origin_count, type_impl::get_type(), result_buffer, result_count, type_impl::get_type(), @@ -1443,6 +1899,7 @@ class window { /** * (Non-blocking) Get Accumulate data (with handle) from the target window. * + * @param exec The executor, on which the message buffers are located. 
* @param origin_buffer the buffer to send * @param origin_count the number of elements to get * @param result_buffer the buffer to receive the target data @@ -1455,12 +1912,14 @@ class window { * @return the request handle for the send call */ template - request r_get_accumulate(GetType* origin_buffer, const int origin_count, + request r_get_accumulate(std::shared_ptr exec, + GetType* origin_buffer, const int origin_count, GetType* result_buffer, const int result_count, const int target_rank, const unsigned int target_disp, const int target_count, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); request req; GKO_ASSERT_NO_MPI_ERRORS(MPI_Rget_accumulate( origin_buffer, origin_count, type_impl::get_type(), @@ -1475,16 +1934,19 @@ class window { * Fetch and operate on data from the target window (An optimized version of * Get_accumulate). * + * @param exec The executor, on which the message buffer is located. * @param origin_buffer the buffer to send * @param target_rank the rank to get the data from * @param target_disp the displacement at the target window * @param operation the reduce operation. 
See @MPI_Op */ template - void fetch_and_op(GetType* origin_buffer, GetType* result_buffer, + void fetch_and_op(std::shared_ptr exec, + GetType* origin_buffer, GetType* result_buffer, const int target_rank, const unsigned int target_disp, MPI_Op operation) const { + auto guard = exec->get_scoped_device_id_guard(); GKO_ASSERT_NO_MPI_ERRORS(MPI_Fetch_and_op( origin_buffer, result_buffer, type_impl::get_type(), target_rank, target_disp, operation, this->get_window())); @@ -1496,6 +1958,7 @@ class window { } // namespace mpi +} // namespace experimental } // namespace gko diff --git a/include/ginkgo/core/base/perturbation.hpp b/include/ginkgo/core/base/perturbation.hpp index 38da78be2cf..0da68a375bc 100644 --- a/include/ginkgo/core/base/perturbation.hpp +++ b/include/ginkgo/core/base/perturbation.hpp @@ -66,7 +66,7 @@ namespace gko { template class Perturbation : public EnableLinOp>, public EnableCreateMethod> { - friend class EnablePolymorphicObject; + friend struct polymorphic_object_traits; friend class EnableCreateMethod; public: diff --git a/include/ginkgo/core/base/polymorphic_object.hpp b/include/ginkgo/core/base/polymorphic_object.hpp index 19af66924bb..281838dfedf 100644 --- a/include/ginkgo/core/base/polymorphic_object.hpp +++ b/include/ginkgo/core/base/polymorphic_object.hpp @@ -609,6 +609,29 @@ std::shared_ptr copy_and_convert_to( } +template +struct polymorphic_object_traits { + static std::unique_ptr create_default_impl( + const ConcreteObject* self, std::shared_ptr exec) + { + return std::unique_ptr{new ConcreteObject(exec)}; + } + + template + static std::unique_ptr create_conversion_target_impl( + const OtherType* self, std::shared_ptr exec) + { + return std::unique_ptr{new ConcreteObject(exec)}; + } + + static PolymorphicObject* clear_impl(ConcreteObject* self) + { + *self = ConcreteObject{self->get_executor()}; + return self; + } +}; + + /** * This mixin inherits from (a subclass of) PolymorphicObject and provides a * base implementation of a new 
concrete polymorphic object. @@ -653,7 +676,8 @@ class EnablePolymorphicObject std::unique_ptr create_default_impl( std::shared_ptr exec) const override { - return std::unique_ptr{new ConcreteObject(exec)}; + return polymorphic_object_traits::create_default_impl( + self(), std::move(exec)); } PolymorphicObject* copy_from_impl(const PolymorphicObject* other) override @@ -684,8 +708,7 @@ class EnablePolymorphicObject PolymorphicObject* clear_impl() override { - *self() = ConcreteObject{this->get_executor()}; - return this; + return polymorphic_object_traits::clear_impl(self()); } private: diff --git a/include/ginkgo/core/base/precision_dispatch.hpp b/include/ginkgo/core/base/precision_dispatch.hpp index a10f1871718..3e03519ff7b 100644 --- a/include/ginkgo/core/base/precision_dispatch.hpp +++ b/include/ginkgo/core/base/precision_dispatch.hpp @@ -34,8 +34,10 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define GKO_PUBLIC_CORE_BASE_PRECISION_DISPATCH_HPP_ +#include #include #include +#include #include @@ -330,6 +332,288 @@ void mixed_precision_dispatch_real_complex(Function fn, const LinOp* in, } +namespace experimental { + + +#if GINKGO_BUILD_MPI + + +namespace distributed { + + +/** + * Convert the given LinOp from experimental::distributed::Vector<...> to + * experimental::distributed::Vector. The conversion tries to convert + * the input LinOp to all Dense types with value type recursively reachable by + * next_precision<...> starting from the ValueType template parameter. This + * means that all real-to-real and complex-to-complex conversions for default + * precisions are being considered. If the input matrix is non-const, the + * contents of the modified converted object will be converted back to the input + * matrix when the returned object is destroyed. This may lead to a loss of + * precision! + * + * @param matrix the input matrix which is supposed to be converted. 
It is + * wrapped unchanged if it is already of type + * experimental::distributed::Vector, otherwise it + * will be converted to this type if possible. + * + * @returns a detail::temporary_conversion pointing to the (potentially + * converted) object. + * + * @throws NotSupported if the input matrix cannot be converted to + * experimental::distributed::Vector + * + * @tparam ValueType the value type into whose associated + * experimental::distributed::Vector type to convert the input LinOp. + */ +template +detail::temporary_conversion> +make_temporary_conversion(LinOp* matrix) +{ + auto result = detail::temporary_conversion< + experimental::distributed::Vector>:: + template create< + experimental::distributed::Vector>>( + matrix); + if (!result) { + GKO_NOT_SUPPORTED(matrix); + } + return result; +} + + +/** + * @copydoc make_temporary_conversion + */ +template +detail::temporary_conversion> +make_temporary_conversion(const LinOp* matrix) +{ + auto result = detail::temporary_conversion< + const experimental::distributed::Vector>:: + template create< + experimental::distributed::Vector>>( + matrix); + if (!result) { + GKO_NOT_SUPPORTED(matrix); + } + return result; +} + + +/** + * Calls the given function with each given argument LinOp temporarily + * converted into experimental::distributed::Vector as parameters. + * + * @param fn the given function. It will be passed one (potentially const) + * experimental::distributed::Vector* parameter per + * parameter in the parameter pack `linops`. + * @param linops the given arguments to be converted and passed on to fn. + * + * @tparam ValueType the value type to use for the parameters of `fn`. + * @tparam Function the function pointer, lambda or other functor type to call + * with the converted arguments. + * @tparam Args the argument type list. + */ +template +void precision_dispatch(Function fn, Args*... 
linops) +{ + fn(distributed::make_temporary_conversion(linops).get()...); +} + + +/** + * Calls the given function with the given LinOps temporarily converted to + * experimental::distributed::Vector* as parameters. + * If ValueType is real and both input vectors are complex, uses + * experimental::distributed::Vector::get_real_view() to convert them into real + * matrices after precision conversion. + * + * @see precision_dispatch() + */ +template +void precision_dispatch_real_complex(Function fn, const LinOp* in, LinOp* out) +{ + auto complex_to_real = !( + is_complex() || + dynamic_cast>*>( + in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>(in); + auto dense_out = + distributed::make_temporary_conversion>(out); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dynamic_cast(dense_in->create_real_view().get()), + dynamic_cast(dense_out->create_real_view().get())); + } else { + distributed::precision_dispatch(fn, in, out); + } +} + + +/** + * @copydoc precision_dispatch_real_complex(Function, const LinOp*, LinOp*) + */ +template +void precision_dispatch_real_complex(Function fn, const LinOp* alpha, + const LinOp* in, LinOp* out) +{ + auto complex_to_real = !( + is_complex() || + dynamic_cast>*>( + in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>(in); + auto dense_out = + distributed::make_temporary_conversion>(out); + auto dense_alpha = gko::make_temporary_conversion(alpha); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dense_alpha.get(), + dynamic_cast(dense_in->create_real_view().get()), + 
dynamic_cast(dense_out->create_real_view().get())); + } else { + fn(gko::make_temporary_conversion(alpha).get(), + distributed::make_temporary_conversion(in).get(), + distributed::make_temporary_conversion(out).get()); + } +} + + +/** + * @copydoc precision_dispatch_real_complex(Function, const LinOp*, LinOp*) + */ +template +void precision_dispatch_real_complex(Function fn, const LinOp* alpha, + const LinOp* in, const LinOp* beta, + LinOp* out) +{ + auto complex_to_real = !( + is_complex() || + dynamic_cast>*>( + in)); + if (complex_to_real) { + auto dense_in = + distributed::make_temporary_conversion>(in); + auto dense_out = + distributed::make_temporary_conversion>(out); + auto dense_alpha = gko::make_temporary_conversion(alpha); + auto dense_beta = gko::make_temporary_conversion(beta); + using Vector = experimental::distributed::Vector; + // These dynamic_casts are only needed to make the code compile + // If ValueType is complex, this branch will never be taken + // If ValueType is real, the cast is a no-op + fn(dense_alpha.get(), + dynamic_cast(dense_in->create_real_view().get()), + dense_beta.get(), + dynamic_cast(dense_out->create_real_view().get())); + } else { + fn(gko::make_temporary_conversion(alpha).get(), + distributed::make_temporary_conversion(in).get(), + gko::make_temporary_conversion(beta).get(), + distributed::make_temporary_conversion(out).get()); + } +} + + +} // namespace distributed + + +/** + * Calls the given function with the given LinOps temporarily converted to + * either experimental::distributed::Vector* or + * matrix::Dense as parameters. The choice depends on the runtime + * type of `in` and `out` is assumed to fall into the same category. If + * ValueType is real and both input vectors are complex, uses + * experimental::distributed::Vector::get_real_view(), or + * matrix::Dense::get_real_view() to convert them into real matrices after + * precision conversion. 
+ * + * @see precision_dispatch() + * @see distributed::precision_dispatch() + */ +template +void precision_dispatch_real_complex_distributed(Function fn, const LinOp* in, + LinOp* out) +{ + if (dynamic_cast(in)) { + experimental::distributed::precision_dispatch_real_complex( + fn, in, out); + } else { + gko::precision_dispatch_real_complex(fn, in, out); + } +} + + +/** + * @copydoc precision_dispatch_real_complex_distributed(Function, const LinOp*, + * LinOp*) + */ +template +void precision_dispatch_real_complex_distributed(Function fn, + const LinOp* alpha, + const LinOp* in, LinOp* out) +{ + if (dynamic_cast(in)) { + experimental::distributed::precision_dispatch_real_complex( + fn, alpha, in, out); + } else { + gko::precision_dispatch_real_complex(fn, alpha, in, out); + } +} + + +/** + * @copydoc precision_dispatch_real_complex_distributed(Function, const LinOp*, + * LinOp*) + */ +template +void precision_dispatch_real_complex_distributed(Function fn, + const LinOp* alpha, + const LinOp* in, + const LinOp* beta, LinOp* out) +{ + if (dynamic_cast(in)) { + experimental::distributed::precision_dispatch_real_complex( + fn, alpha, in, beta, out); + } else { + gko::precision_dispatch_real_complex(fn, alpha, in, beta, + out); + } +} + + +#else + + +/** + * Calls the given function with the given LinOps temporarily converted to + * matrix::Dense as parameters. + * If ValueType is real and both input vectors are complex, uses + * experimental::distributed::Vector::get_real_view(), or + * matrix::Dense::get_real_view() to convert them into real matrices after + * precision conversion. + * + * @see precision_dispatch() + */ +template +void precision_dispatch_real_complex_distributed(Function fn, Args*... 
args) +{ + precision_dispatch_real_complex(fn, args...); +} + + +#endif + + +} // namespace experimental } // namespace gko diff --git a/include/ginkgo/core/base/scoped_device_id_guard.hpp b/include/ginkgo/core/base/scoped_device_id_guard.hpp new file mode 100644 index 00000000000..d68598c73f1 --- /dev/null +++ b/include/ginkgo/core/base/scoped_device_id_guard.hpp @@ -0,0 +1,181 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_BASE_SCOPED_DEVICE_ID_GUARD_HPP_ +#define GKO_PUBLIC_CORE_BASE_SCOPED_DEVICE_ID_GUARD_HPP_ + + +#include + + +namespace gko { + + +class OmpExecutor; +class ReferenceExecutor; +class CudaExecutor; +class HipExecutor; +class DpcppExecutor; + + +namespace detail { + + +/** + * A RAII, move-only base class for the scoped device id used for different + * executors. + */ +class generic_scoped_device_id_guard { +public: + generic_scoped_device_id_guard() = default; + + // TODO: this should be a purely virtual funtion, but somehow that leads to + // linker errors + virtual ~generic_scoped_device_id_guard() = default; + + // Prohibit copy construction + generic_scoped_device_id_guard( + const generic_scoped_device_id_guard& other) = delete; + + // Prohibit copy assignment + generic_scoped_device_id_guard& operator=( + const generic_scoped_device_id_guard& other) = delete; +}; + + +} // namespace detail + + +/** + * This move-only class uses RAII to set the device id within a scoped block, if + * necessary. + * + * The class behaves similar to std::scoped_lock. The scoped guard will make + * sure that the device code is run on the correct device within one scoped + * block, when run with multiple devices. Depending on the executor it will + * record the current device id and set the device id to the one being passed + * in. After the scope has been exited, the destructor sets the device_id back + * to the one before entering the scope. The OmpExecutor and DpcppExecutor don't + * require setting an device id, so in those cases, the class is a no-op. + * + * The device id scope has to be constructed from a executor with concrete type + * (not plain Executor) and a device id. Only the type of the executor object is + * relevant, so the pointer will not be accessed, and may even be a nullptr. 
+ * From the executor type the correct derived class of + * detail::generic_scoped_device_id_guard is picked. The following illustrates + * the usage of this class: + * ``` + * { + * scoped_device_id_guard g{static_cast(nullptr), 1}; + * // now the device id is set to 1 + * } + * // now the device id is reverted again + * ``` + */ +class scoped_device_id_guard { +public: + /** + * Create a scoped device id from an Reference. + * + * The resulting object will be a noop. + * + * @param exec Not used. + * @param device_id Not used. + */ + scoped_device_id_guard(const ReferenceExecutor* exec, int device_id); + + /** + * Create a scoped device id from an OmpExecutor. + * + * The resulting object will be a noop. + * + * @param exec Not used. + * @param device_id Not used. + */ + scoped_device_id_guard(const OmpExecutor* exec, int device_id); + + /** + * Create a scoped device id from an CudaExecutor. + * + * The resulting object will set the cuda device id accordingly. + * + * @param exec Not used. + * @param device_id The device id to use within the scope. + */ + scoped_device_id_guard(const CudaExecutor* exec, int device_id); + + /** + * Create a scoped device id from an HipExecutor. + * + * The resulting object will set the hip device id accordingly. + * + * @param exec Not used. + * @param device_id The device id to use within the scope. + */ + scoped_device_id_guard(const HipExecutor* exec, int device_id); + + /** + * Create a scoped device id from an DpcppExecutor. + * + * The resulting object will be a noop. + * + * @param exec Not used. + * @param device_id Not used. + */ + scoped_device_id_guard(const DpcppExecutor* exec, int device_id); + + scoped_device_id_guard() = default; + + // Prohibit copy construction. + scoped_device_id_guard(const scoped_device_id_guard&) = delete; + + // Allow move construction. + // These are needed, since C++14 does not guarantee copy elision. 
+ scoped_device_id_guard(scoped_device_id_guard&&) = default; + + // Prohibit copy assignment. + scoped_device_id_guard& operator=(const scoped_device_id_guard&) = delete; + + // Allow move construction. + // These are needed, since C++14 does not guarantee copy elision. + scoped_device_id_guard& operator=(scoped_device_id_guard&&) = default; + + ~scoped_device_id_guard() = default; + +private: + std::unique_ptr scope_; +}; + + +} // namespace gko + +#endif // GKO_PUBLIC_CORE_BASE_SCOPED_DEVICE_ID_GUARD_HPP_ diff --git a/include/ginkgo/core/base/temporary_conversion.hpp b/include/ginkgo/core/base/temporary_conversion.hpp index 314e169cc78..a553011c02c 100644 --- a/include/ginkgo/core/base/temporary_conversion.hpp +++ b/include/ginkgo/core/base/temporary_conversion.hpp @@ -146,7 +146,11 @@ struct conversion_helper { if ((cast_obj = dynamic_cast(obj))) { // if the cast is successful, obj is of dynamic type candidate_type // so we can convert from this type to TargetType - auto converted = TargetType::create(obj->get_executor()); + auto converted = + polymorphic_object_traits>:: + create_conversion_target_impl(cast_obj, + cast_obj->get_executor()); + // TargetType::create(obj->get_executor()); cast_obj->convert_to(converted.get()); // Make sure ConvertibleTo is available and symmetric static_assert( diff --git a/include/ginkgo/core/base/types.hpp b/include/ginkgo/core/base/types.hpp index 009c2e182cd..efb6dffe30d 100644 --- a/include/ginkgo/core/base/types.hpp +++ b/include/ginkgo/core/base/types.hpp @@ -152,7 +152,7 @@ using uint64 = std::uint64_t; /** - * + * Unsigned integer type capable of holding a pointer to void */ using uintptr = std::uintptr_t; @@ -559,6 +559,73 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, #endif +/** + * Instantiates a template for each non-complex value, local and global index + * type compiled by Ginkgo. 
+ * + * @param _macro A macro which expands the template instantiation + * (not including the leading `template` specifier). + * Should take three arguments, which are replaced by the + * value, the local and the global index types. + */ +#if GINKGO_DPCPP_SINGLE_MODE +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ + _macro) \ + template _macro(float, int32, int32); \ + template _macro(float, int32, int64); \ + template _macro(float, int64, int64); \ + template <> \ + _macro(double, int32, int32) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(double, int32, int64) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(double, int64, int64) GKO_NOT_IMPLEMENTED +#else +#define GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ + _macro) \ + template _macro(float, int32, int32); \ + template _macro(float, int32, int64); \ + template _macro(float, int64, int64); \ + template _macro(double, int32, int32); \ + template _macro(double, int32, int64); \ + template _macro(double, int64, int64) +#endif + + +/** + * Instantiates a template for each value and index type compiled by Ginkgo. + * + * @param _macro A macro which expands the template instantiation + * (not including the leading `template` specifier). + * Should take two arguments, which are replaced by the + * value and index types. 
+ */ +#if GINKGO_DPCPP_SINGLE_MODE +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ + _macro); \ + template _macro(std::complex, int32, int32); \ + template _macro(std::complex, int32, int64); \ + template _macro(std::complex, int64, int64); \ + template <> \ + _macro(std::complex, int32, int32) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(std::complex, int32, int64) GKO_NOT_IMPLEMENTED; \ + template <> \ + _macro(std::complex, int64, int64) GKO_NOT_IMPLEMENTED +#else +#define GKO_INSTANTIATE_FOR_EACH_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE(_macro) \ + GKO_INSTANTIATE_FOR_EACH_NON_COMPLEX_VALUE_AND_LOCAL_GLOBAL_INDEX_TYPE( \ + _macro); \ + template _macro(std::complex, int32, int32); \ + template _macro(std::complex, int32, int64); \ + template _macro(std::complex, int64, int64); \ + template _macro(std::complex, int32, int32); \ + template _macro(std::complex, int32, int64); \ + template _macro(std::complex, int64, int64) +#endif + + #if GINKGO_DPCPP_SINGLE_MODE #define GKO_INSTANTIATE_FOR_EACH_VALUE_CONVERSION(_macro) \ template <> \ @@ -580,8 +647,6 @@ GKO_ATTRIBUTES constexpr bool operator!=(precision_reduction x, template <> \ _macro(std::complex, std::complex) GKO_NOT_IMPLEMENTED #else - - /** * Instantiates a template for each value type conversion pair compiled by * Ginkgo. 
@@ -699,6 +764,7 @@ inline constexpr GKO_ATTRIBUTES IndexType invalid_index() } +namespace experimental { namespace distributed { @@ -726,6 +792,7 @@ using comm_index_type = int; } // namespace distributed +} // namespace experimental } // namespace gko diff --git a/include/ginkgo/core/distributed/base.hpp b/include/ginkgo/core/distributed/base.hpp new file mode 100644 index 00000000000..70459d9cbf0 --- /dev/null +++ b/include/ginkgo/core/distributed/base.hpp @@ -0,0 +1,106 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_BASE_HPP_ +#define GKO_PUBLIC_CORE_DISTRIBUTED_BASE_HPP_ + + +#include + + +#if GINKGO_BUILD_MPI + + +#include + + +namespace gko { +namespace experimental { +namespace distributed { + + +/** + * A base class for distributed objects. + * + * This class stores and gives access to the used mpi::communicator object. + * + * @note The communicator is not changed on assignment. + * + * @ingroup distributed + */ +class DistributedBase { +public: + virtual ~DistributedBase() = default; + + DistributedBase(const DistributedBase& other) = default; + + DistributedBase(DistributedBase&& other) = default; + + /** + * Copy assignment that doesn't change the used mpi::communicator. + * @return unmodified *this + */ + DistributedBase& operator=(const DistributedBase&) { return *this; } + + /** + * Move assignment that doesn't change the used mpi::communicator. + * @return unmodified *this + */ + DistributedBase& operator=(DistributedBase&&) noexcept { return *this; } + + /** + * Access the used mpi::communicator. + * @return used mpi::communicator + */ + mpi::communicator get_communicator() const { return comm_; } + +protected: + /** + * Creates a new DistributedBase with the specified mpi::communicator. 
+ * @param comm used mpi::communicator + */ + explicit DistributedBase(mpi::communicator comm) : comm_{std::move(comm)} {} + +private: + mpi::communicator comm_; +}; + + +} // namespace distributed +} // namespace experimental +} // namespace gko + + +#endif // GINKGO_BUILD_MPI + + +#endif // GKO_PUBLIC_CORE_DISTRIBUTED_BASE_HPP_ diff --git a/include/ginkgo/core/distributed/matrix.hpp b/include/ginkgo/core/distributed/matrix.hpp new file mode 100644 index 00000000000..0000d56600c --- /dev/null +++ b/include/ginkgo/core/distributed/matrix.hpp @@ -0,0 +1,568 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in the +documentation and/or other materials provided with the distribution. + +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS +IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED +TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*************************************************************/ + +#ifndef GKO_PUBLIC_CORE_DISTRIBUTED_MATRIX_HPP_ +#define GKO_PUBLIC_CORE_DISTRIBUTED_MATRIX_HPP_ + + +#include + + +#if GINKGO_BUILD_MPI + + +#include +#include +#include +#include + + +namespace gko { +namespace matrix { + + +template +class Csr; + + +} + + +namespace detail { + + +template