diff --git a/packages/kokkos-kernels/CHANGELOG.md b/packages/kokkos-kernels/CHANGELOG.md index cefc116c83a0..343f815ed721 100644 --- a/packages/kokkos-kernels/CHANGELOG.md +++ b/packages/kokkos-kernels/CHANGELOG.md @@ -1,5 +1,19 @@ # Change Log +## [4.4.01](https://github.com/kokkos/kokkos-kernels/tree/4.4.01) +[Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.4.00...4.4.01) + +### Build System: +- Restore size_t as default offset, in Tribits builds [\#2313](https://github.com/kokkos/kokkos-kernels/pull/2313) + +### Enhancements: +- Improve crs/bsr sorting performance [\#2293](https://github.com/kokkos/kokkos-kernels/pull/2293) + +### Bug Fixes: +- SpAdd handle: delete sort_option getter/setter [\#2296](https://github.com/kokkos/kokkos-kernels/pull/2296) +- Improve GH action to produce release artifacts [\#2312](https://github.com/kokkos/kokkos-kernels/pull/2312) +- coo2csr: add parens to function calls [\#2318](https://github.com/kokkos/kokkos-kernels/pull/2318) + ## [4.4.00](https://github.com/kokkos/kokkos-kernels/tree/4.4.00) [Full Changelog](https://github.com/kokkos/kokkos-kernels/compare/4.3.01...4.4.00) diff --git a/packages/kokkos-kernels/CMakeLists.txt b/packages/kokkos-kernels/CMakeLists.txt index 48608e756911..fd3515e0c44a 100644 --- a/packages/kokkos-kernels/CMakeLists.txt +++ b/packages/kokkos-kernels/CMakeLists.txt @@ -11,7 +11,7 @@ SET(KOKKOSKERNELS_TOP_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) SET(KokkosKernels_VERSION_MAJOR 4) SET(KokkosKernels_VERSION_MINOR 4) -SET(KokkosKernels_VERSION_PATCH 0) +SET(KokkosKernels_VERSION_PATCH 1) SET(KokkosKernels_VERSION "${KokkosKernels_VERSION_MAJOR}.${KokkosKernels_VERSION_MINOR}.${KokkosKernels_VERSION_PATCH}") #Set variables for config file diff --git a/packages/kokkos-kernels/master_history.txt b/packages/kokkos-kernels/master_history.txt index 6a546fb885ef..c712462dd81e 100644 --- a/packages/kokkos-kernels/master_history.txt +++ b/packages/kokkos-kernels/master_history.txt @@ -27,3 +27,4 @@ tag: 4.2.01 date: 01/30/2024 master: f429f6ec release: bcf9854b tag: 4.3.00 date: 04/03/2024 master: afd65f03 release: ebbf4b78 tag: 4.3.01 date: 05/07/2024 master: 1b0a15f5 release: 58785c1b tag: 4.4.00 date: 08/08/2024 master: d1a91b8a release: 1145f529 +tag: 4.4.01 date: 09/12/2024 master: 0608a337 release: 6b340287 diff --git a/packages/kokkos-kernels/sparse/src/KokkosSparse_coo2crs.hpp b/packages/kokkos-kernels/sparse/src/KokkosSparse_coo2crs.hpp index d10ef9974c19..d9964e18b729 100644 --- a/packages/kokkos-kernels/sparse/src/KokkosSparse_coo2crs.hpp +++ b/packages/kokkos-kernels/sparse/src/KokkosSparse_coo2crs.hpp @@ -79,7 +79,7 @@ auto coo2crs(DimType m, DimType n, RowViewType row, ColViewType col, DataViewTyp // clang-format on template auto coo2crs(KokkosSparse::CooMatrix &cooMatrix) { - return coo2crs(cooMatrix.numRows(), cooMatrix.numCols(), cooMatrix.row, cooMatrix.col, cooMatrix.data); + return coo2crs(cooMatrix.numRows(), cooMatrix.numCols(), cooMatrix.row(), cooMatrix.col(), cooMatrix.data()); } } // namespace KokkosSparse #endif // _KOKKOSSPARSE_COO2CRS_HPP diff --git a/packages/kokkos/.jenkins b/packages/kokkos/.jenkins index 0393ff06fb5e..1635a69f298f 100644 --- a/packages/kokkos/.jenkins +++ b/packages/kokkos/.jenkins @@ -461,6 +461,7 @@ pipeline { -DKokkos_ENABLE_CUDA=ON \ -DKokkos_ENABLE_CUDA_LAMBDA=ON \ -DKokkos_ENABLE_LIBDL=OFF \ + -DKokkos_ENABLE_OPENMP=ON \ -DKokkos_ENABLE_IMPL_MDSPAN=OFF \ -DKokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC=OFF \ .. && \ diff --git a/packages/kokkos/CHANGELOG.md b/packages/kokkos/CHANGELOG.md index 78225f9e6c27..7b1d69e56630 100644 --- a/packages/kokkos/CHANGELOG.md +++ b/packages/kokkos/CHANGELOG.md @@ -1,5 +1,20 @@ # CHANGELOG +## [4.4.01](https://github.com/kokkos/kokkos/tree/4.4.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.4.01) + +### Features: +* Introduce new SequentialHostInit view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229) + +### Backend and Architecture Enhancements: + +#### CUDA: +* Experimental support for unified memory mode (intended for Grace-Hopper etc.) [\#6823](https://github.com/kokkos/kokkos/pull/6823) + +### Bug Fixes +* OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284](https://github.com/kokkos/kokkos/pull/7284) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [#7296](https://github.com/kokkos/kokkos/pull/7296) + ## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00) diff --git a/packages/kokkos/CMakeLists.txt b/packages/kokkos/CMakeLists.txt index 054de2c1dae8..736cbac218c2 100644 --- a/packages/kokkos/CMakeLists.txt +++ b/packages/kokkos/CMakeLists.txt @@ -151,7 +151,7 @@ ENDIF() set(Kokkos_VERSION_MAJOR 4) set(Kokkos_VERSION_MINOR 4) -set(Kokkos_VERSION_PATCH 0) +set(Kokkos_VERSION_PATCH 1) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") diff --git a/packages/kokkos/Makefile.kokkos b/packages/kokkos/Makefile.kokkos index 15f24f30732a..ccb568a553ce 100644 --- a/packages/kokkos/Makefile.kokkos +++ b/packages/kokkos/Makefile.kokkos @@ -2,7 +2,7 @@ KOKKOS_VERSION_MAJOR = 4 KOKKOS_VERSION_MINOR = 4 -KOKKOS_VERSION_PATCH = 0 +KOKKOS_VERSION_PATCH = 1 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,SYCL,OpenMPTarget,OpenMP,Threads,Serial diff --git a/packages/kokkos/cmake/KokkosCore_config.h.in b/packages/kokkos/cmake/KokkosCore_config.h.in index 7997aa3707c6..a93007ff83f6 100644 --- a/packages/kokkos/cmake/KokkosCore_config.h.in +++ b/packages/kokkos/cmake/KokkosCore_config.h.in @@ -37,6 +37,7 @@ #cmakedefine KOKKOS_ENABLE_CUDA_LAMBDA // deprecated #cmakedefine KOKKOS_ENABLE_CUDA_CONSTEXPR #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC +#cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS #cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY diff --git a/packages/kokkos/cmake/kokkos_enable_options.cmake b/packages/kokkos/cmake/kokkos_enable_options.cmake index b900c4a232ea..53764b0c6848 100644 --- a/packages/kokkos/cmake/kokkos_enable_options.cmake +++ b/packages/kokkos/cmake/kokkos_enable_options.cmake @@ -48,6 +48,8 @@ KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda # resolved but we keep the option around a bit longer to be safe. KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") +KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") + KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") @@ -135,7 +137,7 @@ FUNCTION(check_device_specific_options) ENDIF() ENDFUNCTION() -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC) +CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC IMPL_CUDA_UNIFIED_MEMORY) CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE) CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) diff --git a/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp b/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp index 7201cd402a95..e8558628dc84 100644 --- a/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp +++ b/packages/kokkos/containers/unit_tests/TestWithoutInitializing.hpp @@ -37,6 +37,17 @@ #endif ///@} +/// Some tests are skipped for unified memory space +#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) +#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \ + if constexpr (std::is_same_v) \ + GTEST_SKIP() << "skipping since unified memory requires additional " \ + "fences"; +#else +#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE +#endif + TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); @@ -657,6 +668,7 @@ TEST(TEST_CATEGORY, create_mirror_no_init_dynamicview) { TEST(TEST_CATEGORY, create_mirror_view_and_copy_dynamicview) { GTEST_SKIP_IF_CUDAUVM_MEMORY_SPACE + GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 75318aff7781..6ae24022c8fd 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -31,7 +31,6 @@ #include #include -//#include #include #include @@ -178,6 +177,29 @@ void *impl_allocate_common(const int device_id, cudaError_t error_code = cudaSuccess; #ifndef CUDART_VERSION #error CUDART_VERSION undefined! +#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) + // This is intended for Grace-Hopper (and future unified memory architectures) + // The idea is to use host allocator and then advise to keep it in HBM on the + // device, but that requires CUDA 12.2 + static_assert(CUDART_VERSION >= 12020, + "CUDA runtime version >=12.2 required when " + "Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY is set. " + "Please update your CUDA runtime version or " + "reconfigure with " + "-D Kokkos_ENABLE_IMPL_CUDA_UNIFIED_MEMORY=OFF"); + if (arg_alloc_size) { // cudaMemAdvise_v2 does not work with nullptr + error_code = cudaMallocManaged(&ptr, arg_alloc_size, cudaMemAttachGlobal); + if (error_code == cudaSuccess) { + // One would think cudaMemLocation{device_id, + // cudaMemLocationTypeDevice} would work but it doesn't. I.e. the order of + // members doesn't seem to be defined. + cudaMemLocation loc; + loc.id = device_id; + loc.type = cudaMemLocationTypeDevice; + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaMemAdvise_v2( + ptr, arg_alloc_size, cudaMemAdviseSetPreferredLocation, loc)); + } + } #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) if (arg_alloc_size >= memory_threshold_g) { error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); @@ -190,9 +212,13 @@ void *impl_allocate_common(const int device_id, "Kokkos::Cuda: backend fence after async malloc"); } } - } else + } else { + error_code = cudaMalloc(&ptr, arg_alloc_size); + } +#else + error_code = cudaMalloc(&ptr, arg_alloc_size); #endif - { error_code = cudaMalloc(&ptr, arg_alloc_size); } + if (error_code != cudaSuccess) { // TODO tag as unlikely branch // This is the only way to clear the last error, which // we should do here since we're turning it into an @@ -326,6 +352,9 @@ void CudaSpace::impl_deallocate( } #ifndef CUDART_VERSION #error CUDART_VERSION undefined! +#elif defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaSetDevice(m_device)); + KOKKOS_IMPL_CUDA_SAFE_CALL(cudaFree(arg_alloc_ptr)); #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) if (arg_alloc_size >= memory_threshold_g) { Impl::cuda_device_synchronize( @@ -436,8 +465,12 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, #include +#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::CudaSpace); +#else +KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION(Kokkos::CudaSpace); +#endif KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::CudaUVMSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( diff --git a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp index 0e20193e8b42..e1d062d72d5a 100644 --- a/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/packages/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -88,6 +88,19 @@ class CudaSpace { void* allocate(const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size = 0) const; +#if defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) + template + void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { + return allocate(arg_alloc_size); + } + template + void* allocate(const ExecutionSpace&, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const { + return allocate(arg_label, arg_alloc_size, arg_logical_size); + } +#endif + /**\brief Deallocate untracked memory in the cuda space */ void deallocate(void* const arg_alloc_ptr, const size_t arg_alloc_size) const; void deallocate(const char* arg_label, void* const arg_alloc_ptr, @@ -337,7 +350,11 @@ static_assert( template <> struct MemorySpaceAccess { enum : bool { assignable = false }; - enum : bool { accessible = false }; +#if !defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) + enum : bool{accessible = false}; +#else + enum : bool { accessible = true }; +#endif enum : bool { deepcopy = true }; }; @@ -558,8 +575,12 @@ struct DeepCopy #include +namespace { +int g_openmp_hardware_max_threads = 1; +} + namespace Kokkos { namespace Impl { std::vector OpenMPInternal::all_instances; std::mutex OpenMPInternal::all_instances_mutex; +int OpenMPInternal::max_hardware_threads() noexcept { + return g_openmp_hardware_max_threads; +} + void OpenMPInternal::clear_thread_data() { const size_t member_bytes = sizeof(int64_t) * @@ -188,9 +196,9 @@ void OpenMPInternal::initialize(int thread_count) { // Before any other call to OMP query the maximum number of threads // and save the value for re-initialization unit testing. - Impl::g_openmp_hardware_max_threads = get_current_max_threads(); + g_openmp_hardware_max_threads = get_current_max_threads(); - int process_num_threads = Impl::g_openmp_hardware_max_threads; + int process_num_threads = g_openmp_hardware_max_threads; if (Kokkos::hwloc::available()) { process_num_threads = Kokkos::hwloc::get_available_numa_count() * @@ -203,11 +211,11 @@ void OpenMPInternal::initialize(int thread_count) { // process_num_threads if thread_count > 0, set // g_openmp_hardware_max_threads to thread_count if (thread_count < 0) { - thread_count = Impl::g_openmp_hardware_max_threads; + thread_count = g_openmp_hardware_max_threads; } else if (thread_count == 0) { - if (Impl::g_openmp_hardware_max_threads != process_num_threads) { - Impl::g_openmp_hardware_max_threads = process_num_threads; - omp_set_num_threads(Impl::g_openmp_hardware_max_threads); + if (g_openmp_hardware_max_threads != process_num_threads) { + g_openmp_hardware_max_threads = process_num_threads; + omp_set_num_threads(g_openmp_hardware_max_threads); } } else { if (Kokkos::show_warnings() && thread_count > process_num_threads) { @@ -218,16 +226,16 @@ void OpenMPInternal::initialize(int thread_count) { << ", requested thread : " << std::setw(3) << thread_count << std::endl; } - Impl::g_openmp_hardware_max_threads = thread_count; - omp_set_num_threads(Impl::g_openmp_hardware_max_threads); + g_openmp_hardware_max_threads = thread_count; + omp_set_num_threads(g_openmp_hardware_max_threads); } // setup thread local -#pragma omp parallel num_threads(Impl::g_openmp_hardware_max_threads) +#pragma omp parallel num_threads(g_openmp_hardware_max_threads) { Impl::SharedAllocationRecord::tracking_enable(); } auto &instance = OpenMPInternal::singleton(); - instance.m_pool_size = Impl::g_openmp_hardware_max_threads; + instance.m_pool_size = g_openmp_hardware_max_threads; // New, unified host thread team data: { @@ -272,10 +280,9 @@ void OpenMPInternal::finalize() { if (this == &singleton()) { auto const &instance = singleton(); // Silence Cuda Warning - const int nthreads = - instance.m_pool_size <= Impl::g_openmp_hardware_max_threads - ? Impl::g_openmp_hardware_max_threads - : instance.m_pool_size; + const int nthreads = instance.m_pool_size <= g_openmp_hardware_max_threads + ? g_openmp_hardware_max_threads + : instance.m_pool_size; (void)nthreads; #pragma omp parallel num_threads(nthreads) @@ -284,7 +291,7 @@ void OpenMPInternal::finalize() { // allow main thread to track Impl::SharedAllocationRecord::tracking_enable(); - Impl::g_openmp_hardware_max_threads = 1; + g_openmp_hardware_max_threads = 1; } m_initialized = false; @@ -307,7 +314,7 @@ void OpenMPInternal::print_configuration(std::ostream &s) const { if (m_initialized) { const int numa_count = 1; - const int core_per_numa = Impl::g_openmp_hardware_max_threads; + const int core_per_numa = g_openmp_hardware_max_threads; const int thread_per_core = 1; s << " thread_pool_topology[ " << numa_count << " x " << core_per_numa diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp index f4a0d3e20123..2aed723b18ff 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_Instance.hpp @@ -47,8 +47,6 @@ namespace Impl { class OpenMPInternal; -inline int g_openmp_hardware_max_threads = 1; - struct OpenMPTraits { static constexpr int MAX_THREAD_COUNT = 512; }; @@ -86,6 +84,8 @@ class OpenMPInternal { void clear_thread_data(); + static int max_hardware_threads() noexcept; + int thread_pool_size() const { return m_pool_size; } void resize_thread_data(size_t pool_reduce_bytes, size_t team_reduce_bytes, diff --git a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp index a37e1758a261..5937c093ba17 100644 --- a/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp +++ b/packages/kokkos/core/src/OpenMP/Kokkos_OpenMP_UniqueToken.hpp @@ -105,7 +105,8 @@ class UniqueToken { /// \brief upper bound for acquired values, i.e. 0 <= value < size() KOKKOS_INLINE_FUNCTION int size() const noexcept { - KOKKOS_IF_ON_HOST((return Kokkos::Impl::g_openmp_hardware_max_threads;)) + KOKKOS_IF_ON_HOST( + (return Kokkos::Impl::OpenMPInternal::max_hardware_threads();)) KOKKOS_IF_ON_DEVICE((return 0;)) } diff --git a/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp b/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp index 95cb6f619cce..1ade75692f1f 100644 --- a/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp +++ b/packages/kokkos/core/src/View/Kokkos_ViewAlloc.hpp @@ -313,6 +313,51 @@ struct ViewValueFunctor { void destroy_shared_allocation() {} }; + +template +struct ViewValueFunctorSequentialHostInit { + using ExecSpace = typename DeviceType::execution_space; + using MemSpace = typename DeviceType::memory_space; + static_assert(SpaceAccessibility::accessible); + + ValueType* ptr; + size_t n; + + ViewValueFunctorSequentialHostInit() = default; + + ViewValueFunctorSequentialHostInit(ExecSpace const& /*arg_space*/, + ValueType* const arg_ptr, + size_t const arg_n, + std::string /*arg_name*/) + : ptr(arg_ptr), n(arg_n) {} + + ViewValueFunctorSequentialHostInit(ValueType* const arg_ptr, + size_t const arg_n, + std::string /*arg_name*/) + : ptr(arg_ptr), n(arg_n) {} + + void construct_shared_allocation() { + if constexpr (std::is_trivial_v) { + // value-initialization is equivalent to filling with zeros + std::memset(static_cast(ptr), 0, n * sizeof(ValueType)); + } else { + for (size_t i = 0; i < n; ++i) { + new (ptr + i) ValueType(); + } + } + } + + void destroy_shared_allocation() { + if constexpr (std::is_trivially_destructible_v) { + // do nothing, don't bother calling the destructor + } else { + for (size_t i = 0; i < n; ++i) { + (ptr + i)->~ValueType(); + } + } + } +}; + } // namespace Kokkos::Impl #endif // KOKKOS_VIEW_ALLOC_HPP diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp index e1b8ba86a5b5..379180ae6435 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ViewCtor.hpp @@ -23,12 +23,16 @@ namespace Kokkos { namespace Impl { +struct SequentialHostInit_t {}; struct WithoutInitializing_t {}; struct AllowPadding_t {}; template struct is_view_ctor_property : public std::false_type {}; +template <> +struct is_view_ctor_property : public std::true_type {}; + template <> struct is_view_ctor_property : public std::true_type {}; @@ -84,10 +88,10 @@ struct ViewCtorProp> { /* Property flags have constexpr value */ template -struct ViewCtorProp< - std::enable_if_t::value || - std::is_same::value>, - P> { +struct ViewCtorProp || + std::is_same_v || + std::is_same_v>, + P> { ViewCtorProp() = default; ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; @@ -199,6 +203,11 @@ struct ViewCtorProp : public ViewCtorProp... { Kokkos::Impl::has_type::value; static constexpr bool initialize = !Kokkos::Impl::has_type::value; + static constexpr bool sequential_host_init = + Kokkos::Impl::has_type::value; + static_assert(initialize || !sequential_host_init, + "Incompatible WithoutInitializing and SequentialHostInit view " + "alloc properties"); using memory_space = typename var_memory_space::type; using execution_space = typename var_execution_space::type; @@ -251,7 +260,9 @@ auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, (is_view_label::value && !ViewCtorProp::has_label) || (std::is_same_v && - ViewCtorProp::initialize)) { + ViewCtorProp::initialize) || + (std::is_same_v && + !ViewCtorProp::sequential_host_init)) { using NewViewCtorProp = ViewCtorProp; NewViewCtorProp new_view_ctor_prop(view_ctor_prop); static_cast &>(new_view_ctor_prop).value = @@ -299,7 +310,9 @@ struct WithPropertiesIfUnset, Property, Properties...> { (is_view_label::value && !ViewCtorProp::has_label) || (std::is_same_v && - ViewCtorProp::initialize)) { + ViewCtorProp::initialize) || + (std::is_same_v && + !ViewCtorProp::sequential_host_init)) { using NewViewCtorProp = ViewCtorProp; NewViewCtorProp new_view_ctor_prop(view_ctor_prop); static_cast &>(new_view_ctor_prop).value = diff --git a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp index 8919dccdb7a4..10aaa63b7c82 100644 --- a/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/packages/kokkos/core/src/impl/Kokkos_ViewMapping.hpp @@ -2825,10 +2825,12 @@ class ViewMapping< using memory_space = typename Traits::memory_space; static_assert( SpaceAccessibility::accessible); - using value_type = typename Traits::value_type; - using functor_type = - ViewValueFunctor, - value_type>; + using device_type = Kokkos::Device; + using value_type = typename Traits::value_type; + using functor_type = std::conditional_t< + alloc_prop::sequential_host_init, + ViewValueFunctorSequentialHostInit, + ViewValueFunctor>; using record_type = Kokkos::Impl::SharedAllocationRecord; diff --git a/packages/kokkos/core/unit_test/TestViewOfViews.hpp b/packages/kokkos/core/unit_test/TestViewOfViews.hpp index a87c829bb73c..1d53bca336d4 100644 --- a/packages/kokkos/core/unit_test/TestViewOfViews.hpp +++ b/packages/kokkos/core/unit_test/TestViewOfViews.hpp @@ -20,7 +20,7 @@ namespace { -// User-defined type with a View data member +// User-defined types with a View data member template class S { V v_; @@ -28,48 +28,102 @@ class S { public: template S(std::string label, Extents... extents) : v_(std::move(label), extents...) {} - S() = default; + KOKKOS_DEFAULTED_FUNCTION S() = default; }; template -void test_view_of_views() { +class N { // not default constructible + V v_; + + public: + template + N(std::string label, Extents... extents) : v_(std::move(label), extents...) {} +}; + +template +class H { // constructible and destructible only from on the host side + V v_; + + public: + template + H(std::string label, Extents... extents) : v_(std::move(label), extents...) {} + H() {} + ~H() {} +}; + +template +void test_view_of_views_default() { + // assigning a default-constructed view to destruct the inner objects using VoV = Kokkos::View; - { // assigning a default-constructed view to destruct the inner objects - VoV vov("vov", 2, 3); - V a("a"); - V b("b"); - vov(0, 0) = a; - vov(1, 0) = a; - vov(0, 1) = b; + VoV vov("vov", 2, 3); + V a("a"); + V b("b"); + vov(0, 0) = a; + vov(1, 0) = a; + vov(0, 1) = b; #ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND - vov(0, 0) = V(); - vov(1, 0) = V(); - vov(0, 1) = V(); + vov(0, 0) = V(); + vov(1, 0) = V(); + vov(0, 1) = V(); #endif - } - { // using placement new to construct the inner objects and explicitly - // calling the destructor - VoV vov(Kokkos::view_alloc("vov", Kokkos::WithoutInitializing), 2, 3); - V a("a"); - V b("b"); - new (&vov(0, 0)) V(a); - new (&vov(1, 0)) V(a); - new (&vov(0, 1)) V(b); +} + +template +void test_view_of_views_without_initializing() { + // using placement new to construct the inner objects and explicitly + // calling the destructor + using VoV = Kokkos::View; + VoV vov(Kokkos::view_alloc("vov", Kokkos::WithoutInitializing), 2, 3); + V a("a"); + V b("b"); + new (&vov(0, 0)) V(a); + new (&vov(1, 0)) V(a); + new (&vov(0, 1)) V(b); #ifndef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND - vov(0, 0).~V(); - vov(1, 0).~V(); - vov(0, 1).~V(); + vov(0, 0).~V(); + vov(1, 0).~V(); + vov(0, 1).~V(); #else - // leaks memory + // leaks memory #endif - } } -TEST(TEST_CATEGORY, view_of_views) { - test_view_of_views>(); - test_view_of_views>(); +template +void test_view_of_views_sequential_host_init() { + // inner views value-initialized sequentially on the host, and also + // sequentially destructed on the host, without the need to cleanup + using VoV = Kokkos::View; + VoV vov(Kokkos::view_alloc("vov", Kokkos::SequentialHostInit), 2, 3); + V a("a"); + V b("b"); + vov(0, 0) = a; + vov(1, 0) = a; + vov(0, 1) = b; +} + +TEST(TEST_CATEGORY, view_of_views_default) { + test_view_of_views_default>(); + test_view_of_views_default>(); // User-defined type with View data member - test_view_of_views>>(); + test_view_of_views_default>>(); +} + +TEST(TEST_CATEGORY, view_of_views_without_initializing) { + test_view_of_views_without_initializing>(); + test_view_of_views_without_initializing< + S>>(); + test_view_of_views_without_initializing< + N>>(); + test_view_of_views_without_initializing< + H>>(); +} + +TEST(TEST_CATEGORY, test_view_of_views_sequential_host_init) { + test_view_of_views_sequential_host_init>(); + test_view_of_views_sequential_host_init< + S>>(); + test_view_of_views_sequential_host_init< + H>>(); } } // namespace diff --git a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp index 11fe6b8555b8..f40af99e7c28 100644 --- a/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp +++ b/packages/kokkos/core/unit_test/cuda/TestCuda_Spaces.cpp @@ -39,9 +39,14 @@ TEST(cuda, space_access) { !Kokkos::Impl::MemorySpaceAccess::assignable); +#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible); +#else + static_assert(Kokkos::Impl::MemorySpaceAccess::accessible); +#endif static_assert( !Kokkos::Impl::MemorySpaceAccess::accessible); +#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY static_assert(!Kokkos::SpaceAccessibility::accessible); +#else + static_assert(Kokkos::SpaceAccessibility::accessible); +#endif static_assert(Kokkos::SpaceAccessibility::accessible); @@ -157,8 +167,14 @@ TEST(cuda, space_access) { Kokkos::SpaceAccessibility::accessible); +#ifndef KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY static_assert(std::is_same::Space, Kokkos::HostSpace>::value); +#else + static_assert(std::is_same::Space, + Kokkos::Device>::value); +#endif static_assert( std::is_same::Space, diff --git a/packages/kokkos/master_history.txt b/packages/kokkos/master_history.txt index a0e83bef237d..f2a41636101d 100644 --- a/packages/kokkos/master_history.txt +++ b/packages/kokkos/master_history.txt @@ -38,3 +38,4 @@ tag: 4.2.01 date: 01:30:2024 master: 71a9bcae release: 221e5f7a tag: 4.3.00 date: 04:03:2024 master: e0dc0128 release: f08217a4 tag: 4.3.01 date: 05:07:2024 master: 486cc745 release: 262d2d6e tag: 4.4.00 date: 08:08:2024 master: 6ecdf605 release: 6068673c +tag: 4.4.01 date: 09:12:2024 master: 08ceff92 release: 2d60c039 diff --git a/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp b/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp index 27c8af79abd6..0525dc8887a7 100644 --- a/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp +++ b/packages/kokkos/simd/src/Kokkos_SIMD_AVX2.hpp @@ -361,9 +361,7 @@ class simd_mask> { }; using value_type = bool; using abi_type = simd_abi::avx2_fixed_size<4>; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) : m_value(_mm_set1_epi32(-std::int32_t(value))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { @@ -460,9 +458,7 @@ class simd_mask> { }; using value_type = bool; using abi_type = simd_abi::avx2_fixed_size<8>; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) : m_value(_mm256_set1_epi32(-std::int32_t(value))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { @@ -561,9 +557,7 @@ class simd_mask> { }; using value_type = bool; using abi_type = simd_abi::avx2_fixed_size<4>; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask const&) = default; - KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask(simd_mask&&) = default; + KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION simd_mask() = default; KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION explicit simd_mask(value_type value) : m_value(_mm256_set1_epi64x(-std::int64_t(value))) {} KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION static constexpr std::size_t size() { diff --git a/packages/kokkos/simd/unit_tests/TestSIMD.cpp b/packages/kokkos/simd/unit_tests/TestSIMD.cpp index 7a1f9be2a0f9..df18b43c4e35 100644 --- a/packages/kokkos/simd/unit_tests/TestSIMD.cpp +++ b/packages/kokkos/simd/unit_tests/TestSIMD.cpp @@ -22,3 +22,4 @@ #include #include #include +#include diff --git a/packages/kokkos/simd/unit_tests/include/TestSIMD_Construction.hpp b/packages/kokkos/simd/unit_tests/include/TestSIMD_Construction.hpp new file mode 100644 index 000000000000..0ceb1496c47d --- /dev/null +++ b/packages/kokkos/simd/unit_tests/include/TestSIMD_Construction.hpp @@ -0,0 +1,150 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TEST_SIMD_CONSTRUCTION_HPP +#define KOKKOS_TEST_SIMD_CONSTRUCTION_HPP + +#include +#include + +template +inline void host_test_simd_traits() { + using simd_type = Kokkos::Experimental::simd; + + static_assert(std::is_nothrow_default_constructible_v); + static_assert(std::is_nothrow_copy_assignable_v); + static_assert(std::is_nothrow_copy_constructible_v); + static_assert(std::is_nothrow_move_assignable_v); + static_assert(std::is_nothrow_move_constructible_v); + + simd_type default_simd, result; + simd_type test_simd(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); }); + simd_type copy_simd(test_simd); + simd_type move_simd(std::move(copy_simd)); + default_simd = std::move(move_simd); + result = default_simd; + EXPECT_TRUE(all_of(test_simd == result)); +} + +template +inline void host_test_mask_traits() { + using mask_type = Kokkos::Experimental::simd_mask; + + static_assert(std::is_nothrow_default_constructible_v); + static_assert(std::is_nothrow_copy_assignable_v); + static_assert(std::is_nothrow_copy_constructible_v); + static_assert(std::is_nothrow_move_assignable_v); + static_assert(std::is_nothrow_move_constructible_v); + + mask_type default_mask, result; + mask_type test_mask(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); }); + mask_type copy_mask(test_mask); + mask_type move_mask(std::move(copy_mask)); + default_mask = std::move(move_mask); + result = default_mask; + EXPECT_EQ(test_mask, result); +} + +template +inline void host_check_construction() { + if constexpr (is_type_v>) { + host_test_simd_traits(); + host_test_mask_traits(); + } +} + +template +inline void host_check_construction_all_types( + Kokkos::Experimental::Impl::data_types) { + (host_check_construction(), ...); +} + +template +inline void host_check_construction_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (host_check_construction_all_types(DataTypes()), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_test_simd_traits() { + using simd_type = Kokkos::Experimental::simd; + + simd_type default_simd, result; + simd_type test_simd(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); }); + simd_type copy_simd(test_simd); + simd_type move_simd(std::move(copy_simd)); + default_simd = std::move(move_simd); + result = default_simd; + + kokkos_checker checker; + checker.truth(all_of(test_simd == result)); +} + +template +KOKKOS_INLINE_FUNCTION void device_test_mask_traits() { + using mask_type = Kokkos::Experimental::simd_mask; + + mask_type default_mask, result; + mask_type test_mask(KOKKOS_LAMBDA(std::size_t i) { return (i % 2 == 0); }); + mask_type copy_mask(test_mask); + mask_type move_mask(std::move(copy_mask)); + default_mask = std::move(move_mask); + result = default_mask; + + kokkos_checker checker; + checker.truth(test_mask == result); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_construction() { + if constexpr (is_type_v>) { + device_test_simd_traits(); + device_test_mask_traits(); + } +} + +template +KOKKOS_INLINE_FUNCTION void device_check_construction_all_types( + Kokkos::Experimental::Impl::data_types) { + (device_check_construction(), ...); +} + +template +KOKKOS_INLINE_FUNCTION void device_check_construction_all_abis( + Kokkos::Experimental::Impl::abi_set) { + using DataTypes = Kokkos::Experimental::Impl::data_type_set; + (device_check_construction_all_types(DataTypes()), ...); +} + +class simd_device_construction_functor { + public: + KOKKOS_INLINE_FUNCTION void operator()(int) const { + device_check_construction_all_abis( + Kokkos::Experimental::Impl::device_abi_set()); + } +}; + +TEST(simd, host_construction) { + host_check_construction_all_abis(Kokkos::Experimental::Impl::host_abi_set()); +} + +TEST(simd, device_construction) { + Kokkos::parallel_for(Kokkos::RangePolicy>(0, 1), + simd_device_construction_functor()); +} + +#endif diff --git a/packages/tpetra/CMakeLists.txt b/packages/tpetra/CMakeLists.txt index 6dbaa6f6485b..eb0c52d686a6 100644 --- a/packages/tpetra/CMakeLists.txt +++ b/packages/tpetra/CMakeLists.txt @@ -24,7 +24,7 @@ TRIBITS_ADD_OPTION_AND_DEFINE( # Supported Kokkos version in Trilinos # NOTE: When we snapshot Kokkos into Trilinos, we have to update these numbers to maintain # compatibility with external Kokkos -SET(Tpetra_SUPPORTED_KOKKOS_VERSION "4.4.0") +SET(Tpetra_SUPPORTED_KOKKOS_VERSION "4.4.1") # Option to allow developers to ignore incompatible Kokkos versions