From 134150046245c74e8f675e3f0721c9375405f08c Mon Sep 17 00:00:00 2001 From: Marcel Koch Date: Wed, 23 Feb 2022 18:16:50 +0100 Subject: [PATCH] adds distributed solver example --- doc/examples/examples.hpp.in | 12 + examples/CMakeLists.txt | 4 + examples/distributed-solver/CMakeLists.txt | 2 + .../distributed-solver-3d.cpp | 241 +++++++++++++++++ .../distributed-solver/distributed-solver.cpp | 256 ++++++++++++++++++ examples/distributed-solver/doc/builds-on | 1 + examples/distributed-solver/doc/intro.dox | 8 + examples/distributed-solver/doc/kind | 1 + examples/distributed-solver/doc/results.dox | 1 + examples/distributed-solver/doc/short-intro | 1 + examples/distributed-solver/doc/tooltip | 1 + 11 files changed, 528 insertions(+) create mode 100644 examples/distributed-solver/CMakeLists.txt create mode 100644 examples/distributed-solver/distributed-solver-3d.cpp create mode 100644 examples/distributed-solver/distributed-solver.cpp create mode 100644 examples/distributed-solver/doc/builds-on create mode 100644 examples/distributed-solver/doc/intro.dox create mode 100644 examples/distributed-solver/doc/kind create mode 100644 examples/distributed-solver/doc/results.dox create mode 100644 examples/distributed-solver/doc/short-intro create mode 100644 examples/distributed-solver/doc/tooltip diff --git a/doc/examples/examples.hpp.in b/doc/examples/examples.hpp.in index 55dfc30a14e..131d6a2ef22 100644 --- a/doc/examples/examples.hpp.in +++ b/doc/examples/examples.hpp.in @@ -251,6 +251,11 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * Use multigrid with different precision multigrid_level as a * solver. * + + * + * @ref distributed_solver + * Use a distributed solver to solve a 1D Laplace equation. + * * * * @@ -405,5 +410,12 @@ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * @ref mixed_multigrid_solver * * + + * + * Distributed + * + * @ref distributed_solver + * + * * */ diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 151430846e1..481e46e8c37 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -50,6 +50,10 @@ if(GINKGO_HAVE_PAPI_SDE) list(APPEND EXAMPLES_LIST papi-logging) endif() +if(GINKGO_BUILD_MPI) + list(APPEND EXAMPLES_LIST distributed-solver) +endif() + foreach(example ${EXAMPLES_LIST}) add_subdirectory(${example}) endforeach() diff --git a/examples/distributed-solver/CMakeLists.txt b/examples/distributed-solver/CMakeLists.txt new file mode 100644 index 00000000000..03338204e20 --- /dev/null +++ b/examples/distributed-solver/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(distributed-solver-1d distributed-solver.cpp) +target_link_libraries(distributed-solver-1d Ginkgo::ginkgo) diff --git a/examples/distributed-solver/distributed-solver-3d.cpp b/examples/distributed-solver/distributed-solver-3d.cpp new file mode 100644 index 00000000000..cbab03106c4 --- /dev/null +++ b/examples/distributed-solver/distributed-solver-3d.cpp @@ -0,0 +1,241 @@ +/************************************************************* +Copyright (c) 2017-2022, the Ginkgo authors +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+// @sect3{Include files}
+
+// This is the main ginkgo header file.
+#include <ginkgo/ginkgo.hpp>
+
+// Add the fstream header to read data from files.
+#include <fstream>
+// Add the C++ iostream header to output information to the console.
+#include <iostream>
+// Add the STL map header for the executor selection.
+#include <map>
+// Add the string manipulation header to handle strings.
+#include <string>
+
+
+int main(int argc, char* argv[])
+{
+    const gko::mpi::environment env(argc, argv);
+    using GlobalIndexType = gko::int64;
+    using LocalIndexType = gko::int32;
+    using ValueType = double;
+    using dist_mtx =
+        gko::distributed::Matrix<ValueType, LocalIndexType, GlobalIndexType>;
+    using dist_vec = gko::distributed::Vector<ValueType>;
+    using vec = gko::matrix::Dense<ValueType>;
+    using part_type =
+        gko::distributed::Partition<LocalIndexType, GlobalIndexType>;
+    using solver = gko::solver::Cg<ValueType>;
+
+    // Print the ginkgo version information.
+    std::cout << gko::version_info::get() << std::endl;
+
+    if (argc == 2 && (std::string(argv[1]) == "--help")) {
+        std::cerr << "Usage: " << argv[0] << " [executor] [num_grid_points] "
+                  << std::endl;
+        std::exit(-1);
+    }
+
+    // @sect3{Where do you want to run your solver ?}
+    // The gko::Executor class is one of the cornerstones of Ginkgo. Currently,
+    // we have support for a gko::OmpExecutor, which uses OpenMP
+    // multi-threading in most of its kernels, a gko::ReferenceExecutor, a
+    // single-threaded specialization of the OpenMP executor, and a
+    // gko::CudaExecutor which runs the code on an NVIDIA GPU if available.
+    // @note With the help of C++, you see that you only ever need to change
+    // the executor and all the other functions/routines within Ginkgo should
+    // automatically work and run on the executor without any other changes.
+    ValueType t_init = MPI_Wtime();
+    const auto executor_string = argc >= 2 ? argv[1] : "reference";
+    const auto grid_dim =
+        static_cast<gko::size_type>(argc >= 3 ? std::atoi(argv[2]) : 10);
+    const auto comm = gko::mpi::communicator(MPI_COMM_WORLD);
+    const auto rank = comm.rank();
+    std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
+        exec_map{
+            {"omp", [] { return gko::OmpExecutor::create(); }},
+            {"cuda",
+             [&] {
+                 if (gko::CudaExecutor::get_num_devices() > 1) {
+                     return gko::CudaExecutor::create(
+                         comm.node_local_rank(),
+                         gko::ReferenceExecutor::create(), true);
+                 } else {
+                     return gko::CudaExecutor::create(
+                         0, gko::ReferenceExecutor::create(), true);
+                 }
+             }},
+            {"hip",
+             [&] {
+                 if (gko::HipExecutor::get_num_devices() > 1) {
+                     std::cout << " Multiple GPUs seen: "
+                               << gko::HipExecutor::get_num_devices()
+                               << std::endl;
+                     return gko::HipExecutor::create(
+                         comm.node_local_rank(),
+                         gko::ReferenceExecutor::create(), true);
+                 } else {
+                     std::cout << " One GPU seen: "
+                               << gko::HipExecutor::get_num_devices()
+                               << std::endl;
+                     return gko::HipExecutor::create(
+                         0, gko::ReferenceExecutor::create(), true);
+                 }
+             }},
+            {"dpcpp",
+             [] {
+                 return gko::DpcppExecutor::create(
+                     0, gko::ReferenceExecutor::create());
+             }},
+            {"reference", [] { return gko::ReferenceExecutor::create(); }}};
+
+    // executor where Ginkgo will perform the computation
+    const auto exec = exec_map.at(executor_string)();  // throws if not valid
+    const auto num_rows = grid_dim * grid_dim * grid_dim;
+
+
+    // Note that all ranks assemble the full global matrix.
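+    // The matrix is a 7-point stencil on a grid_dim^3 grid: every grid point
+    // is coupled to its (up to) six axis-aligned neighbours with an
+    // off-diagonal value of -1 and a diagonal value of 8.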
+    gko::matrix_data<ValueType, GlobalIndexType> A_data;
+    gko::matrix_data<ValueType, GlobalIndexType> b_data;
+    gko::matrix_data<ValueType, GlobalIndexType> x_data;
+    A_data.size = {num_rows, num_rows};
+    b_data.size = {num_rows, 1};
+    x_data.size = {num_rows, 1};
+    for (int i = 0; i < grid_dim; i++) {
+        for (int j = 0; j < grid_dim; j++) {
+            for (int k = 0; k < grid_dim; k++) {
+                auto idx = i * grid_dim * grid_dim + j * grid_dim + k;
+                if (i > 0)
+                    A_data.nonzeros.emplace_back(idx, idx - grid_dim * grid_dim,
+                                                 -1);
+                if (j > 0)
+                    A_data.nonzeros.emplace_back(idx, idx - grid_dim, -1);
+                if (k > 0) A_data.nonzeros.emplace_back(idx, idx - 1, -1);
+                A_data.nonzeros.emplace_back(idx, idx, 8);
+                if (k < grid_dim - 1)
+                    A_data.nonzeros.emplace_back(idx, idx + 1, -1);
+                if (j < grid_dim - 1)
+                    A_data.nonzeros.emplace_back(idx, idx + grid_dim, -1);
+                if (i < grid_dim - 1)
+                    A_data.nonzeros.emplace_back(idx, idx + grid_dim * grid_dim,
+                                                 -1);
+
+                b_data.nonzeros.emplace_back(idx, 0, 1.0);
+                x_data.nonzeros.emplace_back(idx, 0, 1.0);
+            }
+        }
+    }
+
+    // build partition: uniform number of rows per rank
+    gko::Array<GlobalIndexType> ranges_array{
+        exec->get_master(), static_cast<gko::size_type>(comm.size() + 1)};
+    const auto rows_per_rank = num_rows / comm.size();
+    for (int i = 0; i < comm.size(); i++) {
+        ranges_array.get_data()[i] = i * rows_per_rank;
+    }
+    ranges_array.get_data()[comm.size()] =
+        static_cast<GlobalIndexType>(num_rows);
+    auto partition = gko::share(
+        part_type::build_from_contiguous(exec->get_master(), ranges_array));
+
+    // Read the data on the host and then copy it to the chosen executor.
+    auto A_host = gko::share(dist_mtx::create(exec->get_master(), comm));
+    auto b_host = dist_vec::create(exec->get_master(), comm);
+    auto x_host = dist_vec::create(exec->get_master(), comm);
+    A_host->read_distributed(A_data, partition.get());
+    b_host->read_distributed(b_data, partition.get());
+    x_host->read_distributed(x_data, partition.get());
+    auto A = gko::share(dist_mtx::create(exec, comm));
+    auto x = dist_vec::create(exec, comm);
+    auto b = dist_vec::create(exec, comm);
+    A->copy_from(A_host.get());
+    b->copy_from(b_host.get());
+    x->copy_from(x_host.get());
+    ValueType t_init_end = MPI_Wtime();
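+
+    // Compute the initial residual norm on the host. The advanced apply
+    // computes b_host = minus_one * A_host * x_host + one * b_host, i.e. the
+    // residual b - A * x.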
+    x_host->copy_from(x.get());
+    auto one = gko::initialize<vec>({1.0}, exec);
+    auto minus_one = gko::initialize<vec>({-1.0}, exec);
+    A_host->apply(lend(minus_one), lend(x_host), lend(one), lend(b_host));
+    auto initial_resnorm = gko::initialize<vec>({0.0}, exec->get_master());
+    b_host->compute_norm2(gko::lend(initial_resnorm));
+    b_host->copy_from(b.get());
+    comm.synchronize();
+    ValueType t_read_setup_end = MPI_Wtime();
+
+    auto solver_gen =
+        solver::build()
+            .with_criteria(gko::stop::Iteration::build()
+                               .with_max_iters(static_cast<gko::size_type>(100))
+                               .on(exec),
+                           gko::stop::ImplicitResidualNorm<ValueType>::build()
+                               .with_reduction_factor(1e-4)
+                               .on(exec))
+            .on(exec);
+    auto Ainv = solver_gen->generate(A);
+
+    comm.synchronize();
+    ValueType t_solver_generate_end = MPI_Wtime();
+
+    Ainv->apply(lend(b), lend(x));
+    comm.synchronize();
+    ValueType t_solver_apply_end = MPI_Wtime();
+
+    // Compute the final residual norm b - A * x on the device.
+    one = gko::initialize<vec>({1.0}, exec);
+    minus_one = gko::initialize<vec>({-1.0}, exec);
+    A->apply(lend(minus_one), lend(x), lend(one), lend(b));
+    auto result = gko::initialize<vec>({0.0}, exec->get_master());
+    b->compute_norm2(lend(result));
+
+    comm.synchronize();
+    ValueType t_end = MPI_Wtime();
+
+    if (comm.rank() == 0) {
+        // clang-format off
+        std::cout
+              << "\nRunning on: " << executor_string
+              << "\nNum rows in matrix: " << num_rows
+              << "\nNum ranks: " << comm.size()
+              << "\nInitial Res norm: " << *initial_resnorm->get_values()
+              << "\nFinal Res norm: " << *result->get_values()
+              << "\nInit time: " << t_init_end - t_init
+              << "\nRead time: " << t_read_setup_end - t_init
+              << "\nSolver generate time: " << t_solver_generate_end - t_read_setup_end
+              << "\nSolver apply time: " << (t_solver_apply_end - t_solver_generate_end)
+              << "\nTotal time: " << t_end - t_init
+              << std::endl;
+        // clang-format on
+    }
+}
diff --git a/examples/distributed-solver/distributed-solver.cpp b/examples/distributed-solver/distributed-solver.cpp
new file mode 100644
index 00000000000..6c88cc41432
--- /dev/null
+++ b/examples/distributed-solver/distributed-solver.cpp
@@ -0,0 +1,256 @@
+/*************************************************************
+Copyright (c) 2017-2022, the Ginkgo authors
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+
+3. Neither the name of the copyright holder nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*************************************************************/
+
+// @sect3{Include files}
+
+// This is the main ginkgo header file.
+#include <ginkgo/ginkgo.hpp>
+
+// Add the C++ iostream header to output information to the console.
+#include <iostream>
+// Add the STL map header for the executor selection.
+#include <map>
+// Add the string manipulation header to handle strings.
+#include <string>
+// Add the cmath header for the sine used in the right-hand side.
+#include <cmath>
+
+
+int main(int argc, char* argv[])
+{
+    // @sect3{Type Definitions}
+    // Define the needed types. In a parallel program we need to differentiate
+    // between global and local indices, thus we have two index types.
+    using GlobalIndexType = gko::int64;
+    using LocalIndexType = gko::int32;
+    // The underlying value type.
+    using ValueType = double;
+    // As vector type we use the following, which implements a subset of @ref
+    // gko::matrix::Dense.
+    using dist_vec = gko::distributed::Vector<ValueType>;
+    // As matrix type we simply use the following type, which can read
+    // distributed data and be applied to a distributed vector.
+    using dist_mtx =
+        gko::distributed::Matrix<ValueType, LocalIndexType, GlobalIndexType>;
+    // We still need a localized vector type to be used as scalars in the
+    // advanced apply operations.
+    using vec = gko::matrix::Dense<ValueType>;
+    // The partition type describes how the rows of the matrices are
+    // distributed.
+    using part_type =
+        gko::distributed::Partition<LocalIndexType, GlobalIndexType>;
+    // We can use the same solver type here as in a non-distributed program.
+    // Please note that not all solvers support distributed systems at the
+    // moment.
+    using solver = gko::solver::Cg<ValueType>;
+
+    // @sect3{Initialization and User Input Handling}
+    // Since this is an MPI program, we need to initialize and finalize MPI at
+    // the beginning and end of the program, respectively. This can easily be
+    // done with the following helper construct, which uses RAII to automate
+    // the initialization and finalization.
+    const gko::mpi::environment env(argc, argv);
+
+    // Print the ginkgo version information.
+    std::cout << gko::version_info::get() << std::endl;
+    if (argc == 2 && (std::string(argv[1]) == "--help")) {
+        std::cerr << "Usage: " << argv[0] << " [executor] [num_grid_points] "
+                  << std::endl;
+        std::exit(-1);
+    }
+
+    ValueType t_init = gko::mpi::get_walltime();
+
+    // User input settings:
+    // - The executor, defaults to reference.
+    // - The number of grid points, defaults to 100.
+    const auto executor_string = argc >= 2 ? argv[1] : "reference";
+    const auto grid_dim =
+        static_cast<gko::size_type>(argc >= 3 ? std::atoi(argv[2]) : 100);
+
+    // Create an MPI communicator wrapper and get the rank.
+    const auto comm = gko::mpi::communicator(MPI_COMM_WORLD);
+    const auto rank = comm.rank();
+
+    // Pick the requested executor.
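+    // Each rank selects a device based on its node-local rank, modulo the
+    // number of available devices, so ranks sharing a node are mapped to
+    // different GPUs where possible.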
+    std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
+        exec_map{
+            {"omp", [] { return gko::OmpExecutor::create(); }},
+            {"cuda",
+             [&] {
+                 return gko::CudaExecutor::create(
+                     comm.node_local_rank() %
+                         gko::CudaExecutor::get_num_devices(),
+                     gko::ReferenceExecutor::create(), true);
+             }},
+            {"hip",
+             [&] {
+                 return gko::HipExecutor::create(
+                     comm.node_local_rank() %
+                         gko::HipExecutor::get_num_devices(),
+                     gko::ReferenceExecutor::create(), true);
+             }},
+            {"dpcpp",
+             [&] {
+                 auto ref = gko::ReferenceExecutor::create();
+                 if (gko::DpcppExecutor::get_num_devices("gpu") > 0) {
+                     return gko::DpcppExecutor::create(
+                         comm.node_local_rank() %
+                             gko::DpcppExecutor::get_num_devices("gpu"),
+                         ref);
+                 } else if (gko::DpcppExecutor::get_num_devices("cpu") > 0) {
+                     return gko::DpcppExecutor::create(
+                         comm.node_local_rank() %
+                             gko::DpcppExecutor::get_num_devices("cpu"),
+                         ref);
+                 } else {
+                     throw std::runtime_error("No suitable DPC++ devices");
+                 }
+             }},
+            {"reference", [] { return gko::ReferenceExecutor::create(); }}};
+    const auto exec = exec_map.at(executor_string)();
+
+    // @sect3{Creating the Distributed Matrix and Vectors}
+    // As a first step, we create a partition of the rows. The partition
+    // consists of ranges of consecutive rows, each of which is assigned a
+    // part-id. These part-ids are used by the distributed data structures to
+    // determine which rows are stored locally. In this example each rank has
+    // (nearly) the same number of rows, so we can use the following
+    // specialized constructor. See @ref gko::distributed::Partition for other
+    // modes of creating a partition.
+    const auto num_rows = grid_dim;
+    auto partition = gko::share(part_type::build_from_global_size_uniform(
+        exec->get_master(), comm.size(),
+        static_cast<GlobalIndexType>(num_rows)));
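+
+    // For illustration only: a similar contiguous row distribution could also
+    // be described explicitly by listing the range boundaries, one range per
+    // rank, as done in the accompanying 3D example. The partition built below
+    // is not used any further.
+    gko::Array<GlobalIndexType> ranges_array{
+        exec->get_master(), static_cast<gko::size_type>(comm.size() + 1)};
+    for (int i = 0; i < comm.size(); i++) {
+        ranges_array.get_data()[i] =
+            static_cast<GlobalIndexType>(i * (num_rows / comm.size()));
+    }
+    ranges_array.get_data()[comm.size()] =
+        static_cast<GlobalIndexType>(num_rows);
+    auto contiguous_partition = gko::share(
+        part_type::build_from_contiguous(exec->get_master(), ranges_array));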
+
+    // Assemble the matrix using a 3-pt stencil and fill the right-hand side
+    // with a sine value. The distributed matrix can only be constructed empty
+    // (with zero size); its values are filled in with
+    // gko::distributed::Matrix::read_distributed. Only the data that belongs
+    // to the rows owned by this rank will be assembled.
+    gko::matrix_data<ValueType, GlobalIndexType> A_data;
+    gko::matrix_data<ValueType, GlobalIndexType> b_data;
+    gko::matrix_data<ValueType, GlobalIndexType> x_data;
+    A_data.size = {num_rows, num_rows};
+    b_data.size = {num_rows, 1};
+    x_data.size = {num_rows, 1};
+    const auto range_start = partition->get_range_bounds()[rank];
+    const auto range_end = partition->get_range_bounds()[rank + 1];
+    for (int i = range_start; i < range_end; i++) {
+        if (i > 0) {
+            A_data.nonzeros.emplace_back(i, i - 1, -1);
+        }
+        A_data.nonzeros.emplace_back(i, i, 2);
+        if (i < grid_dim - 1) {
+            A_data.nonzeros.emplace_back(i, i + 1, -1);
+        }
+        b_data.nonzeros.emplace_back(i, 0, std::sin(i * 0.01));
+    }
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_init_end = gko::mpi::get_walltime();
+
+    // Read the matrix data; currently, this is only supported on CPU
+    // executors. This will also set up the communication pattern needed for
+    // the distributed matrix-vector multiplication.
+    auto A_host = gko::share(dist_mtx::create(exec->get_master(), comm));
+    auto x_host = dist_vec::create(exec->get_master(), comm);
+    auto b_host = dist_vec::create(exec->get_master(), comm);
+    A_host->read_distributed(A_data, partition.get());
+    b_host->read_distributed(b_data, partition.get());
+    x_host->read_distributed(x_data, partition.get());
+    // After reading, the matrix and vectors can be moved to the chosen
+    // executor, since the distributed matrix supports SpMV also on devices.
+    auto A = gko::share(dist_mtx::create(exec, comm));
+    auto x = dist_vec::create(exec, comm);
+    auto b = dist_vec::create(exec, comm);
+    A->copy_from(A_host.get());
+    b->copy_from(b_host.get());
+    x->copy_from(x_host.get());
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_read_setup_end = gko::mpi::get_walltime();
+
+    // @sect3{Solve the Distributed System}
+    // Generate the solver; this is the same as in the non-distributed case.
+    auto Ainv =
+        solver::build()
+            .with_criteria(
+                gko::stop::Iteration::build().with_max_iters(100u).on(exec),
+                gko::stop::ResidualNorm<ValueType>::build()
+                    .with_baseline(gko::stop::mode::absolute)
+                    .with_reduction_factor(1e-4)
+                    .on(exec))
+            .on(exec)
+            ->generate(A);
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_solver_generate_end = gko::mpi::get_walltime();
+
+    // Apply the distributed solver; this is the same as in the non-distributed
+    // case.
+    Ainv->apply(lend(b), lend(x));
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_solver_apply_end = gko::mpi::get_walltime();
+
+    // Compute the true residual; this is the same as in the non-distributed
+    // case.
+    x_host->copy_from(x.get());
+    auto one = gko::initialize<vec>({1.0}, exec);
+    auto minus_one = gko::initialize<vec>({-1.0}, exec);
+    A_host->apply(lend(minus_one), lend(x_host), lend(one), lend(b_host));
+    auto result = gko::initialize<vec>({0.0}, exec->get_master());
+    b_host->compute_norm2(lend(result));
+
+    // Take timings.
+    comm.synchronize();
+    ValueType t_end = gko::mpi::get_walltime();
+
+    // @sect3{Printing Results}
+    // Print the achieved residual norm and timings on rank 0.
+    if (comm.rank() == 0) {
+        // clang-format off
+        std::cout << "\nNum rows in matrix: " << num_rows
+                  << "\nNum ranks: " << comm.size()
+                  << "\nFinal Res norm: " << *result->get_values()
+                  << "\nInit time: " << t_init_end - t_init
+                  << "\nRead time: " << t_read_setup_end - t_init
+                  << "\nSolver generate time: " << t_solver_generate_end - t_read_setup_end
+                  << "\nSolver apply time: " << t_solver_apply_end - t_solver_generate_end
+                  << "\nTotal time: " << t_end - t_init
+                  << std::endl;
+        // clang-format on
+    }
+}
diff --git a/examples/distributed-solver/doc/builds-on b/examples/distributed-solver/doc/builds-on
new file mode 100644
index 00000000000..896db74e274
--- /dev/null
+++ b/examples/distributed-solver/doc/builds-on
@@ -0,0 +1 @@
+simple-solver three-pt-stencil-solver
diff --git a/examples/distributed-solver/doc/intro.dox b/examples/distributed-solver/doc/intro.dox
new file mode 100644
index 00000000000..1667c8bebdd
--- /dev/null
+++ b/examples/distributed-solver/doc/intro.dox
@@ -0,0 +1,8 @@
+<a name="Intro"></a>
+<h1>Introduction</h1>
+This distributed solver example should help you understand the basics of using Ginkgo in a distributed setting.
+The example solves a simple 1D Laplace equation where the system can be distributed row-wise to multiple processes.
+To run the solver with multiple processes, use `mpirun -n NUM_PROCS ./distributed-solver [executor] [num_grid_points]`.
+
+If you are using GPU devices, please make sure that you run this example with at most as many processes as you have GPU
+devices available.
diff --git a/examples/distributed-solver/doc/kind b/examples/distributed-solver/doc/kind
new file mode 100644
index 00000000000..c1d9154931a
--- /dev/null
+++ b/examples/distributed-solver/doc/kind
@@ -0,0 +1 @@
+techniques
diff --git a/examples/distributed-solver/doc/results.dox b/examples/distributed-solver/doc/results.dox
new file mode 100644
index 00000000000..f4c6feefb5b
--- /dev/null
+++ b/examples/distributed-solver/doc/results.dox
@@ -0,0 +1 @@
+<h1>Results</h1>
diff --git a/examples/distributed-solver/doc/short-intro b/examples/distributed-solver/doc/short-intro
new file mode 100644
index 00000000000..57a54287458
--- /dev/null
+++ b/examples/distributed-solver/doc/short-intro
@@ -0,0 +1 @@
+The distributed solver example.
diff --git a/examples/distributed-solver/doc/tooltip b/examples/distributed-solver/doc/tooltip
new file mode 100644
index 00000000000..6008031e5c7
--- /dev/null
+++ b/examples/distributed-solver/doc/tooltip
@@ -0,0 +1 @@
+Solves a simple linear system that is distributed across multiple processes.