Skip to content

Commit

Permalink
Merge pull request #13446 from ndellingwood/kokkos-promotion-patch-4.…
Browse files Browse the repository at this point in the history
…4.01

Kokkos + KokkosKernels Promotion To 4.4.01
  • Loading branch information
ndellingwood committed Sep 13, 2024
2 parents c0ab196 + 3c7be10 commit 244ca95
Show file tree
Hide file tree
Showing 29 changed files with 486 additions and 80 deletions.
14 changes: 14 additions & 0 deletions packages/kokkos-kernels/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,19 @@
# Change Log

## [4.4.01](https://github.com/kokkos/kokkos-kernels/tree/4.4.01)
[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.4.00...4.4.01)

### Build System:
- Restore size_t as default offset, in Tribits builds [\#2313](https://github.com/kokkos/kokkos-kernels/pull/2313)

### Enhancements:
- Improve crs/bsr sorting performance [\#2293](https://github.com/kokkos/kokkos-kernels/pull/2293)

### Bug Fixes:
- SpAdd handle: delete sort_option getter/setter [\#2296](https://github.com/kokkos/kokkos-kernels/pull/2296)
- Improve GH action to produce release artifacts [\#2312](https://github.com/kokkos/kokkos-kernels/pull/2312)
- coo2csr: add parens to function calls [\#2318](https://github.com/kokkos/kokkos-kernels/pull/2318)

## [4.4.00](https://github.com/kokkos/kokkos-kernels/tree/4.4.00)
[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.3.01...4.4.00)

Expand Down
2 changes: 1 addition & 1 deletion packages/kokkos-kernels/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})

SET(KokkosKernels_VERSION_MAJOR 4)
SET(KokkosKernels_VERSION_MINOR 4)
SET(KokkosKernels_VERSION_PATCH 0)
SET(KokkosKernels_VERSION_PATCH 1)
SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}")

#Set variables for config file
Expand Down
1 change: 1 addition & 0 deletions packages/kokkos-kernels/master_history.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,4 @@ tag: 4.2.01 date: 01/30/2024 master: f429f6ec release: bcf9854b
tag: 4.3.00 date: 04/03/2024 master: afd65f03 release: ebbf4b78
tag: 4.3.01 date: 05/07/2024 master: 1b0a15f5 release: 58785c1b
tag: 4.4.00 date: 08/08/2024 master: d1a91b8a release: 1145f529
tag: 4.4.01 date: 09/12/2024 master: 0608a337 release: 6b340287
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ auto coo2crs(DimType m, DimType n, RowViewType row, ColViewType col, DataViewTyp
// clang-format on
template <typename ScalarType, typename OrdinalType, class DeviceType, class MemoryTraitsType, typename SizeType>
auto coo2crs(KokkosSparse::CooMatrix<ScalarType, OrdinalType, DeviceType, MemoryTraitsType, SizeType> &cooMatrix) {
return coo2crs(cooMatrix.numRows(), cooMatrix.numCols(), cooMatrix.row, cooMatrix.col, cooMatrix.data);
return coo2crs(cooMatrix.numRows(), cooMatrix.numCols(), cooMatrix.row(), cooMatrix.col(), cooMatrix.data());
}
} // namespace KokkosSparse
#endif // _KOKKOSSPARSE_COO2CRS_HPP
1 change: 1 addition & 0 deletions packages/kokkos/.jenkins
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,7 @@ pipeline {
-DKokkos_ENABLE_CUDA=ON \
-DKokkos_ENABLE_CUDA_LAMBDA=ON \
-DKokkos_ENABLE_LIBDL=OFF \
-DKokkos_ENABLE_OPENMP=ON \
-DKokkos_ENABLE_IMPL_MDSPAN=OFF \
-DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF \
.. && \
Expand Down
15 changes: 15 additions & 0 deletions packages/kokkos/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,20 @@
# CHANGELOG

## [4.4.01](https://github.com/kokkos/kokkos/tree/4.4.01)
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.4.01)

### Features:
* Introduce new SequentialHostInit view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229)

### Backend and Architecture Enhancements:

#### CUDA:
* Experimental support for unified memory mode (intended for Grace-Hopper etc.) [\#6823](https://github.com/kokkos/kokkos/pull/6823)

### Bug Fixes
* OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284](https://github.com/kokkos/kokkos/pull/7284)
* Fix implicit copy assignment operators in few AVX2 masks being deleted [#7296](https://github.com/kokkos/kokkos/pull/7296)

## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00)
[Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00)

Expand Down
2 changes: 1 addition & 1 deletion packages/kokkos/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,7 @@ ENDIF()

set(Kokkos_VERSION_MAJOR 4)
set(Kokkos_VERSION_MINOR 4)
set(Kokkos_VERSION_PATCH 0)
set(Kokkos_VERSION_PATCH 1)
set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}")
message(STATUS "Kokkos version: ${Kokkos_VERSION}")
math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}")
Expand Down
2 changes: 1 addition & 1 deletion packages/kokkos/Makefile.kokkos
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

KOKKOS_VERSION_MAJOR = 4
KOKKOS_VERSION_MINOR = 4
KOKKOS_VERSION_PATCH = 0
KOKKOS_VERSION_PATCH = 1
KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc)

# Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial
Expand Down
1 change: 1 addition & 0 deletions packages/kokkos/cmake/KokkosCore_config.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
#cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA // deprecated
#cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR
#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC
#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
#cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE
#cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS
#cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY
Expand Down
4 changes: 3 additions & 1 deletion packages/kokkos/cmake/kokkos_enable_options.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda
# resolved but we keep the option around a bit longer to be safe.
KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)")
KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler")
KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA")

KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" )
KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" )
KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP")
Expand Down Expand Up @@ -135,7 +137,7 @@ FUNCTION(check_device_specific_options)
ENDIF()
ENDFUNCTION()

CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC)
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC IMPL_CUDA_UNIFIED_MEMORY)
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE)
CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH)

Expand Down
12 changes: 12 additions & 0 deletions packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,17 @@
#endif
///@}

/// Some tests are skipped for unified memory space
#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \
if constexpr (std::is_same_v<typename TEST_EXECSPACE::memory_space, \
Kokkos::CudaSpace>) \
GTEST_SKIP() << "skipping since unified memory requires additional " \
"fences";
#else
#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE
#endif

TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) {
using namespace Kokkos::Test::Tools;
listen_tool_events(Config::DisableAll(), Config::EnableKernels());
Expand Down Expand Up @@ -657,6 +668,7 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) {

TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) {
GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE
GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE

using namespace Kokkos::Test::Tools;
listen_tool_events(Config::DisableAll(), Config::EnableKernels(),
Expand Down
39 changes: 36 additions & 3 deletions packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
#include <algorithm>
#include <atomic>

//#include <Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp>
#include <impl/Kokkos_Error.hpp>

#include <impl/Kokkos_Tools.hpp>
Expand Down Expand Up @@ -178,6 +177,29 @@ void *impl_allocate_common(const int device_id,
cudaError_t error_code = cudaSuccess;
#ifndef CUDART_VERSION
#error CUDART_VERSION undefined!
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
// This is intended for Grace-Hopper (and future unified memory architectures)
// The idea is to use host allocator and then advise to keep it in HBM on the
// device, but that requires CUDA 12.2
static_assert(CUDART_VERSION >= 12020,
"CUDA runtime version >=12.2 required when "
"Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY is set. "
"Please update your CUDA runtime version or "
"reconfigure with "
"-D Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF");
if (arg_alloc_size) { // cudaMemAdvise_v2 does not work with nullptr
error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal);
if (error_code == cudaSuccess) {
// One would think cudaMemLocation{device_id,
// cudaMemLocationTypeDevice} would work but it doesn't. I.e. the order of
// members doesn't seem to be defined.
cudaMemLocation loc;
loc.id = device_id;
loc.type = cudaMemLocationTypeDevice;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemAdvise_v2(
ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, loc));
}
}
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
if (arg_alloc_size >= memory_threshold_g) {
error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream);
Expand All @@ -190,9 +212,13 @@ void *impl_allocate_common(const int device_id,
"Kokkos::Cuda: backend fence after async malloc");
}
}
} else
} else {
error_code = cudaMalloc(&ptr, arg_alloc_size);
}
#else
error_code = cudaMalloc(&ptr, arg_alloc_size);
#endif
{ error_code = cudaMalloc(&ptr, arg_alloc_size); }

if (error_code != cudaSuccess) { // TODO tag as unlikely branch
// This is the only way to clear the last error, which
// we should do here since we're turning it into an
Expand Down Expand Up @@ -326,6 +352,9 @@ void CudaSpace::impl_deallocate(
}
#ifndef CUDART_VERSION
#error CUDART_VERSION undefined!
#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr));
#elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020)
if (arg_alloc_size >= memory_threshold_g) {
Impl::cuda_device_synchronize(
Expand Down Expand Up @@ -436,8 +465,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes,

#include <impl/Kokkos_SharedAlloc_timpl.hpp>

#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
Kokkos::CudaSpace);
#else
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::CudaSpace);
#endif
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
Kokkos::CudaUVMSpace);
KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(
Expand Down
23 changes: 22 additions & 1 deletion packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,19 @@ class CudaSpace {
void* allocate(const char* arg_label, const size_t arg_alloc_size,
const size_t arg_logical_size = 0) const;

#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
template <typename ExecutionSpace>
void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const {
return allocate(arg_alloc_size);
}
template <typename ExecutionSpace>
void* allocate(const ExecutionSpace&, const char* arg_label,
const size_t arg_alloc_size,
const size_t arg_logical_size = 0) const {
return allocate(arg_label, arg_alloc_size, arg_logical_size);
}
#endif

/**\brief Deallocate untracked memory in the cuda space */
void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const;
void deallocate(const char* arg_label, void* const arg_alloc_ptr,
Expand Down Expand Up @@ -337,7 +350,11 @@ static_assert(
template <>
struct MemorySpaceAccess<Kokkos::HostSpace, Kokkos::CudaSpace> {
enum : bool { assignable = false };
enum : bool { accessible = false };
#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
enum : bool{accessible = false};
#else
enum : bool { accessible = true };
#endif
enum : bool { deepcopy = true };
};

Expand Down Expand Up @@ -558,8 +575,12 @@ struct DeepCopy<HostSpace, MemSpace, ExecutionSpace,
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY)
KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION(
Kokkos::CudaSpace);
#else
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaSpace);
#endif
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaUVMSpace);
KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::CudaHostPinnedSpace);

Expand Down
20 changes: 20 additions & 0 deletions packages/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,22 @@ Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default

//----------------------------------

#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
// Check if unified memory is available
int cuda_result;
cudaDeviceGetAttribute(&cuda_result, cudaDevAttrConcurrentManagedAccess,
cuda_device_id);
if (cuda_result == 0) {
Kokkos::abort(
"Kokkos::Cuda::initialize ERROR: Unified memory is not available on "
"this device\n"
"Please recompile Kokkos with "
"-DKokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF\n");
}
#endif

//----------------------------------

cudaStream_t singleton_stream;
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(cuda_device_id));
KOKKOS_IMPL_CUDA_SAFE_CALL(cudaStreamCreate(&singleton_stream));
Expand Down Expand Up @@ -705,6 +721,10 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const {
#else
os << "no\n";
#endif
#ifdef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY
os << " KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY: ";
os << "yes\n";
#endif

os << "\nCuda Runtime Configuration:\n";

Expand Down
2 changes: 2 additions & 0 deletions packages/kokkos/core/src/Kokkos_View.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -571,6 +571,8 @@ inline constexpr Kokkos::ALL_t ALL{};
#pragma omp end declare target
#endif

inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{};

inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{};

inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{};
Expand Down
2 changes: 1 addition & 1 deletion packages/kokkos/core/src/OpenMP/Kokkos_OpenMP.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ int OpenMP::impl_thread_pool_size() const noexcept {
}

int OpenMP::impl_max_hardware_threads() noexcept {
return Impl::g_openmp_hardware_max_threads;
return Impl::OpenMPInternal::max_hardware_threads();
}

namespace Impl {
Expand Down
Loading

0 comments on commit 244ca95

Please sign in to comment.