distributed wfc support / misc updates (#20)

* remove enable_language(CUDA)
* enable distributed wave-functions
* update spack recipe for intel-oneapi-mkl
* cmake: simplify blas/mkl
* add check-format
* remove kokkos <4 initialization
simonpintarelli · Jun 6, 2024 · 674039f · 674039f
1 parent 44daefa
commit 674039f
Show file tree

Hide file tree

Showing 63 changed files with 816 additions and 742 deletions.
diff --git a/.github/workflows/check-format.yml b/.github/workflows/check-format.yml
@@ -0,0 +1,15 @@
+name: Check source code formatting
+
+on:
+  push: {}
+  pull_request: {}
+
+jobs:
+  check:
+    runs-on: ubuntu-latest
+    container: zhongruoyu/llvm-ports:17.0.4-slim-focal
+    steps:
+      - uses: actions/checkout@v4
+      - name: Check .cpp and .hpp files
+        run: |
+          ./check_format.sh
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,4 @@ spack-build-*
 *~undo-tree~
 __pycache__/
 compile_commands.json
+build-linux-*
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -7,6 +7,7 @@ set(USE_OPENMP On CACHE BOOL "use OpenMP")
 set(USE_CUDA Off CACHE BOOL "use cuda")
 set(USE_ROCM Off CACHE BOOL "use amd gpus")
 set(USE_MAGMA Off CACHE BOOL "use magma eigensolver for amd gpus")
+set(USE_GPU_DIRECT Off CACHE BOOL "use gpu direct")
 
 set(BUILD_TESTS OFF CACHE BOOL "build tests")
 set(LAPACK_VENDOR "OpenBLAS" CACHE STRING "lapack vendor")
@@ -21,22 +22,19 @@ endif()
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS "YES")
 
-# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall")
 
 include(cmake/nlcglib_macros.cmake)
 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake/modules")
 
-find_package(Kokkos)
 if(USE_CUDA)
   find_package(CUDAToolkit REQUIRED)
-  enable_language(CUDA)
   include(cmake/cudalibs_target.cmake)
 endif()
-
 if(USE_ROCM)
   include(cmake/rocmlibs_target.cmake)
 endif()
+find_package(Kokkos)
 
 if(USE_MAGMA)
   find_package(MAGMA REQUIRED)
@@ -53,12 +51,24 @@ if(LAPACK_VENDOR MATCHES OpenBLAS)
       INTERFACE_INCLUDE_DIRECTORIES "${OpenBLAS_INCLUDE_DIRS}"
       INTERFACE_LINK_LIBRARIES "${OpenBLAS_LIBRARIES}")
   endif()
-elseif(LAPACK_VENDOR MATCHES MKL)
-  message("LAPACK VENDOR MKL")
-  find_package(MKL REQUIRED)
 elseif(LAPACK_VENDOR STREQUAL MKLONEAPI)
-  # set(MKL_THREADING gnu_thread)
+  message("LAPACK VENDOR MKL")
+  set(MKL_INTERFACE "lp64" CACHE STRING "")
+  set(MKL_THREADING "sequential" CACHE STRING "")
+  set(MKL_MPI "mpich" CACHE STRING "")
+  find_package(MKL CONFIG REQUIRED)
+  if(NOT TARGET nlcg::cpu_lapack)
+    add_library(nlcg::cpu_lapack INTERFACE IMPORTED)
+    target_link_libraries(nlcg::cpu_lapack INTERFACE MKL::MKL)
+    target_compile_definitions(nlcg::cpu_lapack INTERFACE __USE_MKL)
+  endif()
+elseif(LAPACK_VENDOR STREQUAL MKL)
   find_package(MKL REQUIRED NO_MODULE)
+  if(NOT TARGET nlcg::cpu_lapack)
+    add_library(nlcg::cpu_lapack INTERFACE IMPORTED)
+    target_link_libraries(nlcg::cpu_lapack INTERFACE mkl::mkl_intel_32bit_omp_dyn)
+    target_compile_definitions(nlcg::cpu_lapack INTERFACE __USE_MKL)
+  endif()
 elseif(LAPACK_VENDOR STREQUAL CRAY_LIBSCI)
   message("LAPACK VENDOR Cray Libsci")
   find_package(SCI REQUIRED)

diff --git a/check_format.sh b/check_format.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+
+check_diff() {
+    local status=0
+	  for file in "$@"; do
+		    if ! diff -q "$file" <(clang-format "$file"); then
+            status=1
+        fi
+	  done
+    return $status
+}
+
+export -f check_diff
+
+find . -type f \( -name "*.cpp" -o -name "*.hpp" \) ! -path "./build-env/*" ! -path "./.*" -exec bash -c 'check_diff "$@"' sh {} +
diff --git a/cmake/nlcglib_macros.cmake b/cmake/nlcglib_macros.cmake
@@ -2,42 +2,30 @@ MACRO(NLCGLIB_SETUP_TARGET _target)
   target_link_libraries(
     ${_target} PUBLIC
     Kokkos::kokkos
-    # ${LAPACK_LIBRARIES}
     MPI::MPI_CXX
-    # $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
+    $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
     $<TARGET_NAME_IF_EXISTS:nlcglib::cudalibs>
     $<TARGET_NAME_IF_EXISTS:nlcglib::rocmlibs>
     $<TARGET_NAME_IF_EXISTS:nlcglib::magma>
     $<TARGET_NAME_IF_EXISTS:roc::hipblas> # only required for magma
     $<TARGET_NAME_IF_EXISTS:roc::hipsparse> # only required for magma
     nlohmann_json::nlohmann_json
-    )
+    nlcg::cpu_lapack
+  )
 
   target_include_directories(${_target} PUBLIC
     ${CMAKE_SOURCE_DIR}/src
     ${CMAKE_SOURCE_DIR}/include
-    )
+  )
 
-    if(USE_ROCM)
-      target_compile_options(${_target} PUBLIC --offload-arch=gfx90a)
-    endif()
-
-  if(LAPACK_VENDOR MATCHES MKL)
-    target_compile_definitions(${_target} PUBLIC __USE_MKL)
-    # if(USE_OPENMP)
-    target_link_libraries(${_target}  PUBLIC mkl::mkl_intel_32bit_omp_dyn)
-    # else()
-    #   target_link_libraries(${_target}  PUBLIC mkl::mkl_intel_32bit_seq_dyn)
-    # endif()
-  elseif(LAPACK_VENDOR STREQUAL MKLONEAPI)
-    target_link_libraries(${_target}  PUBLIC MKL::MKL)
-  else()
-    target_link_libraries(${_target} PRIVATE nlcg::cpu_lapack)
+  if(USE_ROCM)
+    target_compile_options(${_target} PUBLIC --offload-arch=gfx90a)
   endif()
+
   target_compile_definitions(${_target} PUBLIC $<$<BOOL:${USE_OPENMP}>:__USE_OPENMP>)
   target_compile_definitions(${_target} PUBLIC $<$<BOOL:${USE_CUDA}>:__NLCGLIB__CUDA>)
   target_compile_definitions(${_target} PUBLIC $<$<BOOL:${USE_ROCM}>:__NLCGLIB__ROCM>)
   target_compile_definitions(${_target} PUBLIC $<$<BOOL:${USE_MAGMA}>:__NLCGLIB__MAGMA>)
+  target_compile_definitions(${_target} PUBLIC $<$<BOOL:${USE_GPU_DIRECT}>:__NLCGLIB__GPU_DIRECT>)
   target_include_directories(${_target} PUBLIC $<TARGET_PROPERTY:Kokkos::kokkoscore,INTERFACE_INCLUDE_DIRECTORIES>)
-
 ENDMACRO()
diff --git a/include/interface.hpp b/include/interface.hpp
@@ -2,10 +2,10 @@
 
 #include <array>
 #include <complex>
-#include <memory>
+#include <functional>
 #include <map>
+#include <memory>
 #include <stdexcept>
-#include <functional>
 #include <vector>
 #include "mpi.h"
 
@@ -18,9 +18,8 @@ enum class memory_type
   device
 };
 
-static std::map<memory_type, std::string> memory_names = {{memory_type::none, "none"},
-                                                          {memory_type::host, "host"},
-                                                          {memory_type::device, "device"}};
+static std::map<memory_type, std::string> memory_names = {
+    {memory_type::none, "none"}, {memory_type::host, "host"}, {memory_type::device, "device"}};
 
 enum class smearing_type
 {
@@ -50,20 +49,18 @@ struct buffer_protocol
                   std::array<int, d> size,
                   T* data,
                   enum memory_type memtype,
-                  MPI_Comm mpi_comm=MPI_COMM_SELF)
+                  MPI_Comm mpi_comm = MPI_COMM_SELF)
       : stride(std::move(stride))
       , size(std::move(size))
       , data(data)
       , memtype(memtype)
       , mpi_comm(mpi_comm)
-  { /* empty */ }
+  { /* empty */
+  }
 
   // 1d constructor
   // template<int k=dim, class=std::enable_if_t<k==1>>
-  buffer_protocol(int size,
-                  T* data,
-                  enum memory_type memtype,
-                  MPI_Comm mpi_comm= MPI_COMM_SELF)
+  buffer_protocol(int size, T* data, enum memory_type memtype, MPI_Comm mpi_comm = MPI_COMM_SELF)
       : buffer_protocol({1}, {size}, data, memtype, mpi_comm)
   {
     static_assert(d == 1, "not available.");
@@ -79,7 +76,7 @@ struct buffer_protocol
   MPI_Comm mpi_comm{MPI_COMM_SELF};
 };
 
-template<int dim, class numeric_t>
+template <int dim, class numeric_t>
 class BufferBase
 {
 public:
@@ -100,7 +97,7 @@ class BufferBase
   virtual kindex_t kpoint_index(int i) const = 0;
 };
 
-template<class numeric_t>
+template <class numeric_t>
 class BufferBase<0, numeric_t>
 {
 public:
@@ -140,23 +137,24 @@ class EnergyBase
   virtual std::shared_ptr<MatrixBaseZ> get_sphi(memory_type) = 0;
   virtual std::shared_ptr<MatrixBaseZ> get_C(memory_type) = 0;
   virtual std::shared_ptr<VectorBaseZ> get_fn() = 0;
-  virtual void set_fn(const std::vector<std::pair<int, int>>&, const std::vector<std::vector<double>>&) = 0;
+  virtual void set_fn(const std::vector<std::pair<int, int>>&,
+                      const std::vector<std::vector<double>>&) = 0;
   virtual std::shared_ptr<VectorBaseZ> get_ek() = 0;
   virtual std::shared_ptr<VectorBaseZ> get_gkvec_ekin() = 0;
   virtual std::shared_ptr<ScalarBaseZ> get_kpoint_weights() = 0;
   virtual void set_chemical_potential(double) = 0;
   virtual double get_chemical_potential() = 0;
   virtual void print_info() const = 0;
+  virtual MPI_Comm comm_world() const = 0;
 };
 
 class OpBase
 {
 public:
   using key_t = std::pair<int, int>;
+
 public:
-  virtual void apply(const key_t&,
-                     MatrixBaseZ::buffer_t& out,
-                     MatrixBaseZ::buffer_t& in) const = 0;
+  virtual void apply(const key_t&, MatrixBaseZ::buffer_t& out, MatrixBaseZ::buffer_t& in) const = 0;
   virtual std::vector<key_t> get_keys() const = 0;
 };
 

diff --git a/include/nlcglib.hpp b/include/nlcglib.hpp
@@ -4,8 +4,10 @@
 
 namespace nlcglib {
 
-void initialize();
-void finalize();
+void
+initialize();
+void
+finalize();
 
 nlcg_info
 nlcg_mvp2_cpu(EnergyBase& energy_base,
@@ -28,12 +30,12 @@ nlcg_mvp2_device(EnergyBase& energy_base,
 
 nlcg_info
 nlcg_mvp2_cpu_device(EnergyBase& energy_base,
-                      smearing_type smearing,
-                      double temp,
-                      double tol,
-                      double kappa,
-                      double tau,
-                      int maxiter,
+                     smearing_type smearing,
+                     double temp,
+                     double tol,
+                     double kappa,
+                     double tau,
+                     int maxiter,
                      int restart);
 nlcg_info
 nlcg_mvp2_device_cpu(EnergyBase& energy_base,

diff --git a/spack/packages/nlcglib/package.py b/spack/packages/nlcglib/package.py
@@ -28,6 +28,7 @@ class Nlcglib(CMakePackage, CudaPackage, ROCmPackage):
         description="CMake build type",
         values=("Debug", "Release", "RelWithDebInfo"),
     )
+    variant("gpu_direct", default=False)
 
     depends_on("cmake@3.21:", type="build")
     depends_on("mpi")
@@ -38,6 +39,10 @@ class Nlcglib(CMakePackage, CudaPackage, ROCmPackage):
 
     depends_on("googletest", type="build", when="+tests")
     depends_on("nlohmann-json")
+    depends_on("kokkos@4:", when="@1.1:")
+
+    # MKLConfig.cmake introduced in 2021.3
+    conflicts("intel-oneapi-mkl@:2021.2", when="^intel-oneapi-mkl")
 
     with when("@:0.9"):
         conflicts("+rocm")
@@ -59,6 +64,7 @@ def cmake_args(self):
             self.define_from_variant("USE_OPENMP", "openmp"),
             self.define_from_variant("BUILD_TESTS", "tests"),
             self.define_from_variant("USE_ROCM", "rocm"),
+            self.define_from_variant("USE_GPU_DIRECT", "gpu_direct"),
             self.define_from_variant("USE_MAGMA", "magma"),
             self.define_from_variant("USE_CUDA", "cuda"),
         ]
@@ -67,6 +73,29 @@ def cmake_args(self):
             options += [self.define("LAPACK_VENDOR", "MKL")]
         elif self.spec["blas"].name in ["intel-oneapi-mkl"]:
             options += [self.define("LAPACK_VENDOR", "MKLONEAPI")]
+            mkl_mapper = {
+                "threading": {
+                    "none": "sequential",
+                    "openmp": "gnu_thread",
+                    "tbb": "tbb_thread",
+                },
+                "mpi": {"intel-mpi": "intelmpi", "mpich": "mpich", "openmpi": "openmpi"},
+            }
+
+            mkl_threads = mkl_mapper["threading"][self.spec["intel-oneapi-mkl"].variants["threads"].value]
+
+            mpi_provider = self.spec["mpi"].name
+            if mpi_provider in ["mpich", "cray-mpich", "mvapich", "mvapich2"]:
+                mkl_mpi = mkl_mapper["mpi"]["mpich"]
+            else:
+                mkl_mpi = mkl_mapper["mpi"][mpi_provider]
+
+            options.extend([
+                self.define("MKL_INTERFACE", "lp64"),
+                self.define("MKL_THREADING", mkl_threads),
+                self.define("MKL_MPI", mkl_mpi)
+            ])
+
         elif self.spec["blas"].name in ["openblas"]:
             options += [self.define("LAPACK_VENDOR", "OpenBLAS")]
         else:

diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -11,17 +11,6 @@ add_library(nlcglib SHARED nlcglib.cpp)
 target_include_directories(nlcglib PUBLIC $<BUILD_INTERFACE:${CMAKE_SOURCE_DIR}/include> $<INSTALL_INTERFACE:include>)
 target_link_libraries(nlcglib PRIVATE nlcglib_core)
 
-if(LAPACK_VENDOR MATCHES MKL)
-  target_compile_definitions(nlcglib PRIVATE __USE_MKL)
-  if(USE_OPENMP)
-    target_link_libraries(nlcglib PRIVATE mkl::mkl_intel_32bit_omp_dyn)
-  else()
-    target_link_libraries(nlcglib PRIVATE mkl::mkl_intel_32bit_seq_st)
-  endif()
-else()
-  target_link_libraries(nlcglib PRIVATE nlcg::cpu_lapack)
-endif()
-
 set_target_properties(nlcglib PROPERTIES PUBLIC_HEADER
   ${CMAKE_SOURCE_DIR}/include/nlcglib.hpp
 )

diff --git a/src/constants.hpp b/src/constants.hpp
@@ -4,10 +4,10 @@ namespace nlcglib {
 
 namespace constants {
 const double pi{3.1415926535897932385};
-}  // constants
+}  // namespace constants
 
 namespace physical_constants {
 const double kb{0.00000316681156340226};
-}  // physical_constants
+}  // namespace physical_constants
 
-}  // nlcglib
+}  // namespace nlcglib
diff --git a/src/dft/newton_minimization_smearing.hpp b/src/dft/newton_minimization_smearing.hpp
@@ -28,8 +28,8 @@ double
 newton_minimization_chemical_potential(
     Nt&& N, DNt&& dN, D2Nt&& ddN, double mu0, double ne, double tol, int maxstep = 1000)
 {
-  // Newton finds the minimum, not necessarily N(mu) == ne, tolerate up to `tol_ne` difference in number of electrons
-  // if |N(mu_0) -ne| > tol_ne an error is thrown.
+  // Newton finds the minimum, not necessarily N(mu) == ne; tolerate up to `tol_ne` difference in
+  // number of electrons. If |N(mu_0) - ne| > tol_ne, an error is thrown.
   const double tol_ne = 1e-2;
 
   double mu = mu0;

diff --git a/src/exceptions.hpp b/src/exceptions.hpp
@@ -25,4 +25,4 @@ class DescentError : public std::exception
 };
 
 
-}  // nlcglib
+}  // namespace nlcglib
diff --git a/src/exec_space.hpp b/src/exec_space.hpp
@@ -30,7 +30,7 @@ template <>
 struct exec<Kokkos::HostSpace>
 {
 #ifdef __USE_OPENMP
- using type = Kokkos::OpenMP;
+  using type = Kokkos::OpenMP;
 #else
   using type = Kokkos::Serial;
 #endif
@@ -40,4 +40,4 @@ template <class SPACE>
 using exec_t = typename exec<SPACE>::type;
 
 
-}  // nlcglib
+}  // namespace nlcglib