From c8fedfff8a37b702b5530c4c7c40b27bb37b430a Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Fri, 15 Mar 2024 12:02:36 -0400
Subject: [PATCH 01/60] DOC v24.06 Updates [skip ci]

---
 .../cuda11.8-conda/devcontainer.json          |  4 +--
 .devcontainer/cuda11.8-pip/devcontainer.json  |  8 +++---
 .../cuda12.2-conda/devcontainer.json          |  4 +--
 .devcontainer/cuda12.2-pip/devcontainer.json  |  8 +++---
 .github/workflows/build.yaml                  | 16 ++++++------
 .github/workflows/pr.yaml                     | 26 +++++++++----------
 .github/workflows/test.yaml                   | 10 +++----
 README.md                                     |  2 +-
 VERSION                                       |  2 +-
 .../all_cuda-118_arch-aarch64.yaml            |  8 +++---
 .../all_cuda-118_arch-x86_64.yaml             |  8 +++---
 .../all_cuda-122_arch-aarch64.yaml            |  8 +++---
 .../all_cuda-122_arch-x86_64.yaml             |  8 +++---
 .../bench_ann_cuda-118_arch-aarch64.yaml      |  2 +-
 .../bench_ann_cuda-118_arch-x86_64.yaml       |  2 +-
 .../bench_ann_cuda-120_arch-aarch64.yaml      |  2 +-
 .../bench_ann_cuda-120_arch-x86_64.yaml       |  2 +-
 .../recipes/raft-dask/conda_build_config.yaml |  2 +-
 .../cmake/thirdparty/fetch_rapids.cmake       |  2 +-
 dependencies.yaml                             | 24 ++++++++---------
 docs/source/build.md                          |  2 +-
 docs/source/developer_guide.md                |  8 +++---
 docs/source/raft_ann_benchmarks.md            | 12 ++++-----
 python/pylibraft/pyproject.toml               |  4 +--
 python/raft-ann-bench/pyproject.toml          |  2 +-
 python/raft-dask/pyproject.toml               |  8 +++---
 26 files changed, 92 insertions(+), 92 deletions(-)

diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 2682510ed1..3f84407d41 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,12 +5,12 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index de039eeb11..c24cddd78e 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -5,22 +5,22 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-cuda11.8-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/ucx:24.4": {
+    "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
       "version": "1.14.1"
     },
-    "ghcr.io/rapidsai/devcontainers/features/cuda:24.4": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": {
       "version": "11.8",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
       "installcuSPARSE": true
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json
index 4b24d94dd1..1846d0eac3 100644
--- a/.devcontainer/cuda12.2-conda/devcontainer.json
+++ b/.devcontainer/cuda12.2-conda/devcontainer.json
@@ -5,12 +5,12 @@
     "args": {
       "CUDA": "12.2",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils"
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json
index 489546cb21..291ee56e7f 100644
--- a/.devcontainer/cuda12.2-pip/devcontainer.json
+++ b/.devcontainer/cuda12.2-pip/devcontainer.json
@@ -5,22 +5,22 @@
     "args": {
       "CUDA": "12.2",
       "PYTHON_PACKAGE_MANAGER": "pip",
-      "BASE": "rapidsai/devcontainers:24.04-cpp-cuda12.2-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04"
     }
   },
   "hostRequirements": {"gpu": "optional"},
   "features": {
-    "ghcr.io/rapidsai/devcontainers/features/ucx:24.4": {
+    "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
       "version": "1.14.1"
     },
-    "ghcr.io/rapidsai/devcontainers/features/cuda:24.4": {
+    "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": {
       "version": "12.2",
       "installcuBLAS": true,
       "installcuSOLVER": true,
       "installcuRAND": true,
       "installcuSPARSE": true
     },
-    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.4": {}
+    "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
   },
   "overrideFeatureInstallOrder": [
     "ghcr.io/rapidsai/devcontainers/features/ucx",
diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index bd8b13d21e..c8837afba7 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -28,7 +28,7 @@ concurrency:
 jobs:
   cpp-build:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -37,7 +37,7 @@ jobs:
   python-build:
     needs: [cpp-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -46,7 +46,7 @@ jobs:
   upload-conda:
     needs: [cpp-build, python-build]
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-upload-packages.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -57,7 +57,7 @@ jobs:
     if: github.ref_type == 'branch'
     needs: python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
     with:
       arch: "amd64"
       branch: ${{ inputs.branch }}
@@ -69,7 +69,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-build-pylibraft:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -79,7 +79,7 @@ jobs:
   wheel-publish-pylibraft:
     needs: wheel-build-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -89,7 +89,7 @@ jobs:
   wheel-build-raft-dask:
     needs: wheel-publish-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
@@ -99,7 +99,7 @@ jobs:
   wheel-publish-raft-dask:
     needs: wheel-build-raft-dask
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index ada46141a7..c2d9556859 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -25,29 +25,29 @@ jobs:
       - wheel-tests-raft-dask
       - devcontainer
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/pr-builder.yaml@branch-24.06
   checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.06
     with:
       enable_check_generated_files: false
   conda-cpp-build:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-build.yaml@branch-24.06
     with:
       build_type: pull-request
       node_type: cpu16
   conda-cpp-tests:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06
     with:
       build_type: pull-request
   conda-cpp-checks:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06
     with:
       build_type: pull-request
       enable_check_symbols: true
@@ -55,19 +55,19 @@ jobs:
   conda-python-build:
     needs: conda-cpp-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-build.yaml@branch-24.06
     with:
       build_type: pull-request
   conda-python-tests:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06
     with:
       build_type: pull-request
   docs-build:
     needs: conda-python-build
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/custom-job.yaml@branch-24.06
     with:
       build_type: pull-request
       node_type: "gpu-v100-latest-1"
@@ -77,34 +77,34 @@ jobs:
   wheel-build-pylibraft:
     needs: checks
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
       build_type: pull-request
       script: ci/build_wheel_pylibraft.sh
   wheel-tests-pylibraft:
     needs: wheel-build-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
       build_type: pull-request
       script: ci/test_wheel_pylibraft.sh
   wheel-build-raft-dask:
     needs: wheel-tests-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
       build_type: pull-request
       script: "ci/build_wheel_raft_dask.sh"
   wheel-tests-raft-dask:
     needs: wheel-build-raft-dask
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
       build_type: pull-request
       script: ci/test_wheel_raft_dask.sh
   devcontainer:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@fix/devcontainer-json-location
+    uses: rapidsai/shared-workflows/.github/workflows/build-in-devcontainer.yaml@branch-24.06
     with:
       arch: '["amd64"]'
       cuda: '["12.2"]'
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 2a557a8b84..18094cc05a 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -16,7 +16,7 @@ on:
 jobs:
   conda-cpp-checks:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-post-build-checks.yaml@branch-24.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -26,7 +26,7 @@ jobs:
       symbol_exclusions: _ZN\d+raft_cutlass
   conda-cpp-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-cpp-tests.yaml@branch-24.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -34,7 +34,7 @@ jobs:
       sha: ${{ inputs.sha }}
   conda-python-tests:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/conda-python-tests.yaml@branch-24.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -42,7 +42,7 @@ jobs:
       sha: ${{ inputs.sha }}
   wheel-tests-pylibraft:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
@@ -51,7 +51,7 @@ jobs:
       script: ci/test_wheel_pylibraft.sh
   wheel-tests-raft-dask:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.04
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
       build_type: nightly
       branch: ${{ inputs.branch }}
diff --git a/README.md b/README.md
index 7833a5cfa3..c501c37b2f 100755
--- a/README.md
+++ b/README.md
@@ -293,7 +293,7 @@ You can also install the conda packages individually using the `mamba` command a
 mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0
 ```
 
-If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.04/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above.
+If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.06/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above.
 
 ### Installing Python through Pip
 
diff --git a/VERSION b/VERSION
index 4a2fe8aa57..0bff6981a3 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-24.04.00
+24.06.00
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index e27532a489..cc09d56057 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -20,7 +20,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-cuda==24.4.*
+- dask-cuda==24.6.*
 - doxygen>=1.8.20
 - gcc_linux-aarch64=11.*
 - gmock>=1.13.0
@@ -46,9 +46,9 @@ dependencies:
 - pydata-sphinx-theme
 - pytest-cov
 - pytest==7.*
-- rapids-dask-dependency==24.4.*
+- rapids-dask-dependency==24.6.*
 - recommonmark
-- rmm==24.4.*
+- rmm==24.6.*
 - scikit-build-core>=0.7.0
 - scikit-learn
 - scipy
@@ -56,6 +56,6 @@ dependencies:
 - sphinx-markdown-tables
 - sysroot_linux-aarch64==2.17
 - ucx-proc=*=gpu
-- ucx-py==0.37.*
+- ucx-py==0.38.*
 - ucx>=1.15.0,<1.16.0
 name: all_cuda-118_arch-aarch64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index bf535c5c04..9fbdcdaad4 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -20,7 +20,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-cuda==24.4.*
+- dask-cuda==24.6.*
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - gmock>=1.13.0
@@ -46,9 +46,9 @@ dependencies:
 - pydata-sphinx-theme
 - pytest-cov
 - pytest==7.*
-- rapids-dask-dependency==24.4.*
+- rapids-dask-dependency==24.6.*
 - recommonmark
-- rmm==24.4.*
+- rmm==24.6.*
 - scikit-build-core>=0.7.0
 - scikit-learn
 - scipy
@@ -56,6 +56,6 @@ dependencies:
 - sphinx-markdown-tables
 - sysroot_linux-64==2.17
 - ucx-proc=*=gpu
-- ucx-py==0.37.*
+- ucx-py==0.38.*
 - ucx>=1.15.0,<1.16.0
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-122_arch-aarch64.yaml b/conda/environments/all_cuda-122_arch-aarch64.yaml
index 8ea3843841..1e78e7deca 100644
--- a/conda/environments/all_cuda-122_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-122_arch-aarch64.yaml
@@ -21,7 +21,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-cuda==24.4.*
+- dask-cuda==24.6.*
 - doxygen>=1.8.20
 - gcc_linux-aarch64=11.*
 - gmock>=1.13.0
@@ -42,9 +42,9 @@ dependencies:
 - pydata-sphinx-theme
 - pytest-cov
 - pytest==7.*
-- rapids-dask-dependency==24.4.*
+- rapids-dask-dependency==24.6.*
 - recommonmark
-- rmm==24.4.*
+- rmm==24.6.*
 - scikit-build-core>=0.7.0
 - scikit-learn
 - scipy
@@ -52,6 +52,6 @@ dependencies:
 - sphinx-markdown-tables
 - sysroot_linux-aarch64==2.17
 - ucx-proc=*=gpu
-- ucx-py==0.37.*
+- ucx-py==0.38.*
 - ucx>=1.15.0,<1.16.0
 name: all_cuda-122_arch-aarch64
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index a3f6f7e99f..6d88855f30 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -21,7 +21,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.0
-- dask-cuda==24.4.*
+- dask-cuda==24.6.*
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - gmock>=1.13.0
@@ -42,9 +42,9 @@ dependencies:
 - pydata-sphinx-theme
 - pytest-cov
 - pytest==7.*
-- rapids-dask-dependency==24.4.*
+- rapids-dask-dependency==24.6.*
 - recommonmark
-- rmm==24.4.*
+- rmm==24.6.*
 - scikit-build-core>=0.7.0
 - scikit-learn
 - scipy
@@ -52,6 +52,6 @@ dependencies:
 - sphinx-markdown-tables
 - sysroot_linux-64==2.17
 - ucx-proc=*=gpu
-- ucx-py==0.37.*
+- ucx-py==0.38.*
 - ucx>=1.15.0,<1.16.0
 name: all_cuda-122_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index 0e0385ceeb..b5f662ebc1 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -38,7 +38,7 @@ dependencies:
 - openblas
 - pandas
 - pyyaml
-- rmm==24.4.*
+- rmm==24.6.*
 - scikit-build-core>=0.7.0
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-118_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index dfe76a2948..6c56cb688c 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -38,7 +38,7 @@ dependencies:
 - openblas
 - pandas
 - pyyaml
-- rmm==24.4.*
+- rmm==24.6.*
 - scikit-build-core>=0.7.0
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-118_arch-x86_64
diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
index 0a6567c646..7f3107e5d6 100644
--- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
@@ -34,7 +34,7 @@ dependencies:
 - openblas
 - pandas
 - pyyaml
-- rmm==24.4.*
+- rmm==24.6.*
 - scikit-build-core>=0.7.0
 - sysroot_linux-aarch64==2.17
 name: bench_ann_cuda-120_arch-aarch64
diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
index a89d5317b6..62739354a5 100644
--- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
@@ -34,7 +34,7 @@ dependencies:
 - openblas
 - pandas
 - pyyaml
-- rmm==24.4.*
+- rmm==24.6.*
 - scikit-build-core>=0.7.0
 - sysroot_linux-64==2.17
 name: bench_ann_cuda-120_arch-x86_64
diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml
index d2bdcbb351..7db48fb684 100644
--- a/conda/recipes/raft-dask/conda_build_config.yaml
+++ b/conda/recipes/raft-dask/conda_build_config.yaml
@@ -17,7 +17,7 @@ ucx_version:
   - ">=1.15.0,<1.16.0"
 
 ucx_py_version:
-  - "0.37.*"
+  - "0.38.*"
 
 cmake_version:
   - ">=3.26.4"
diff --git a/cpp/template/cmake/thirdparty/fetch_rapids.cmake b/cpp/template/cmake/thirdparty/fetch_rapids.cmake
index aadfdb0028..11d2403963 100644
--- a/cpp/template/cmake/thirdparty/fetch_rapids.cmake
+++ b/cpp/template/cmake/thirdparty/fetch_rapids.cmake
@@ -12,7 +12,7 @@
 # the License.
 
 # Use this variable to update RAPIDS and RAFT versions
-set(RAPIDS_VERSION "24.04")
+set(RAPIDS_VERSION "24.06")
 
 if(NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/RAFT_RAPIDS.cmake)
     file(DOWNLOAD https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-${RAPIDS_VERSION}/RAPIDS.cmake
diff --git a/dependencies.yaml b/dependencies.yaml
index 836775a5a3..658b08421d 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -180,7 +180,7 @@ dependencies:
     common:
       - output_types: [conda]
         packages:
-          - &rmm_conda rmm==24.4.*
+          - &rmm_conda rmm==24.6.*
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -201,10 +201,10 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - &rmm_cu12 rmm-cu12==24.4.*
+              - &rmm_cu12 rmm-cu12==24.6.*
           - matrix: {cuda: "11.*"}
             packages:
-              - &rmm_cu11 rmm-cu11==24.4.*
+              - &rmm_cu11 rmm-cu11==24.6.*
           - {matrix: null, packages: [*rmm_conda] }
   checks:
     common:
@@ -435,20 +435,20 @@ dependencies:
     common:
       - output_types: [conda, pyproject]
         packages:
-          - dask-cuda==24.4.*
+          - dask-cuda==24.6.*
           - joblib>=0.11
           - numba>=0.57
           - *numpy
-          - rapids-dask-dependency==24.4.*
-          - ucx-py==0.37.*
+          - rapids-dask-dependency==24.6.*
+          - ucx-py==0.38.*
       - output_types: conda
         packages:
           - ucx>=1.15.0,<1.16.0
           - ucx-proc=*=gpu
-          - &ucx_py_conda ucx-py==0.37.*
+          - &ucx_py_conda ucx-py==0.38.*
       - output_types: pyproject
         packages:
-          - &pylibraft_conda pylibraft==24.4.*
+          - &pylibraft_conda pylibraft==24.6.*
       - output_types: requirements
         packages:
           # pip recognizes the index as a global option for the requirements.txt file
@@ -460,12 +460,12 @@ dependencies:
         matrices:
           - matrix: {cuda: "12.*"}
             packages:
-              - &pylibraft_cu12 pylibraft-cu12==24.4.*
-              - &ucx_py_cu12 ucx-py-cu12==0.37.*
+              - &pylibraft_cu12 pylibraft-cu12==24.6.*
+              - &ucx_py_cu12 ucx-py-cu12==0.38.*
           - matrix: {cuda: "11.*"}
             packages:
-              - &pylibraft_cu11 pylibraft-cu11==24.4.*
-              - &ucx_py_cu11 ucx-py-cu11==0.37.*
+              - &pylibraft_cu11 pylibraft-cu11==24.6.*
+              - &ucx_py_cu11 ucx-py-cu11==0.38.*
           - {matrix: null, packages: [*pylibraft_conda, *ucx_py_conda]}
   test_python_common:
     common:
diff --git a/docs/source/build.md b/docs/source/build.md
index 7bb6cf515a..c0abf3f995 100644
--- a/docs/source/build.md
+++ b/docs/source/build.md
@@ -56,7 +56,7 @@ You can also install the conda packages individually using the `mamba` command a
 mamba install -c rapidsai -c conda-forge -c nvidia libraft libraft-headers cuda-version=12.0
 ```
 
-If installing the C++ APIs Please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.04/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above.
+If installing the C++ APIs please see [using libraft](https://docs.rapids.ai/api/raft/nightly/using_libraft/) for more information on using the pre-compiled shared library. You can also refer to the [example C++ template project](https://github.com/rapidsai/raft/tree/branch-24.06/cpp/template) for a ready-to-go CMake configuration that you can drop into your project and build against installed RAFT development artifacts above.
 
 ## Installing Python through Pip
 
diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index d29130add0..e10e8987af 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -187,7 +187,7 @@ RAFT relies on `clang-format` to enforce code style across all C++ and CUDA sour
 1. Do not split empty functions/records/namespaces.
 2. Two-space indentation everywhere, including the line continuations.
 3. Disable reflowing of comments.
-   The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.04/cpp/.clang-format).
+   The reasons behind these deviations from the Google style guide are given in comments [here](https://github.com/rapidsai/raft/blob/branch-24.06/cpp/.clang-format).
 
 [`doxygen`](https://doxygen.nl/) is used as documentation generator and also as a documentation linter.
 In order to run doxygen as a linter on C++/CUDA code, run
@@ -205,7 +205,7 @@ you can run  `codespell -i 3 -w .` from the repository root directory.
 This will bring up an interactive prompt to select which spelling fixes to apply.
 
 ### #include style
-[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.04/cpp/scripts/include_checker.py) is used to enforce the include style as follows:
+[include_checker.py](https://github.com/rapidsai/raft/blob/branch-24.06/cpp/scripts/include_checker.py) is used to enforce the include style as follows:
 1. `#include "..."` should be used for referencing local files only. It is acceptable to be used for referencing files in a sub-folder/parent-folder of the same algorithm, but should never be used to include files in other algorithms or between algorithms and the primitives or other dependencies.
 2. `#include <...>` should be used for referencing everything else
 
@@ -215,7 +215,7 @@ python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/test ... list
 ```
 
 ### Copyright header
-[copyright.py](https://github.com/rapidsai/raft/blob/branch-24.04/ci/checks/copyright.py) checks the Copyright header for all git-modified files
+[copyright.py](https://github.com/rapidsai/raft/blob/branch-24.06/ci/checks/copyright.py) checks the Copyright header for all git-modified files
 
 Manually, you can run the following to bulk-fix the header if only the years need to be updated:
 ```bash
@@ -229,7 +229,7 @@ Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY`
 ## Logging
 
 ### Introduction
-Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.04/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
+Anything and everything about logging is defined inside [logger.hpp](https://github.com/rapidsai/raft/blob/branch-24.06/cpp/include/raft/core/logger.hpp). It uses [spdlog](https://github.com/gabime/spdlog) underneath, but this information is transparent to all.
 
 ### Usage
 ```cpp
diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index 68fe80f9ce..3eaa72beae 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -62,7 +62,7 @@ Nightly images are located in [dockerhub](https://hub.docker.com/r/rapidsai/raft
 - The following command pulls the nightly container for python version 10, cuda version 12, and RAFT version 23.10:
 
 ```bash
-docker pull rapidsai/raft-ann-bench:24.04a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container.
+docker pull rapidsai/raft-ann-bench:24.06a-cuda12.0-py3.10 #substitute raft-ann-bench for the exact desired container.
 ```
 
 The CUDA and python versions can be changed for the supported values:
@@ -83,7 +83,7 @@ You can see the exact versions as well in the dockerhub site:
 [//]: # ()
 [//]: # (```bash)
 
-[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.04-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.)
+[//]: # (docker pull nvcr.io/nvidia/rapidsai/raft-ann-bench:24.06-cuda11.8-py3.10 #substitute raft-ann-bench for the exact desired container.)
 
 [//]: # (```)
 
@@ -344,7 +344,7 @@ For GPU-enabled systems, the `DATA_FOLDER` variable should be a local folder whe
 export DATA_FOLDER=path/to/store/datasets/and/results
 docker run --gpus all --rm -it -u $(id -u)                      \
     -v $DATA_FOLDER:/data/benchmarks                            \
-    rapidsai/raft-ann-bench:24.04a-cuda11.8-py3.10              \
+    rapidsai/raft-ann-bench:24.06a-cuda11.8-py3.10              \
     "--dataset deep-image-96-angular"                           \
     "--normalize"                                               \
     "--algorithms raft_cagra,raft_ivf_pq --batch-size 10 -k 10" \
@@ -355,7 +355,7 @@ Usage of the above command is as follows:
 
 | Argument                                                  | Description                                                                                        |
 |-----------------------------------------------------------|----------------------------------------------------------------------------------------------------|
-| `rapidsai/raft-ann-bench:24.04a-cuda11.8-py3.10`          | Image to use. Can be either `raft-ann-bench` or `raft-ann-bench-datasets`                          |
+| `rapidsai/raft-ann-bench:24.06a-cuda11.8-py3.10`          | Image to use. Can be either `raft-ann-bench` or `raft-ann-bench-datasets`                          |
 | `"--dataset deep-image-96-angular"`                       | Dataset name                                                                                       |
 | `"--normalize"`                                           | Whether to normalize the dataset                                                                   |
 | `"--algorithms raft_cagra,hnswlib --batch-size 10 -k 10"` | Arguments passed to the `run` script, such as the algorithms to benchmark, the batch size, and `k` |
@@ -372,7 +372,7 @@ The container arguments in the above section also be used for the CPU-only conta
 export DATA_FOLDER=path/to/store/datasets/and/results
 docker run  --rm -it -u $(id -u)                  \
     -v $DATA_FOLDER:/data/benchmarks              \
-    rapidsai/raft-ann-bench-cpu:24.04a-py3.10     \
+    rapidsai/raft-ann-bench-cpu:24.06a-py3.10     \
      "--dataset deep-image-96-angular"            \
      "--normalize"                                \
      "--algorithms hnswlib --batch-size 10 -k 10" \
@@ -389,7 +389,7 @@ docker run --gpus all --rm -it -u $(id -u)          \
     --entrypoint /bin/bash                          \
     --workdir /data/benchmarks                      \
     -v $DATA_FOLDER:/data/benchmarks                \
-    rapidsai/raft-ann-bench:24.04a-cuda11.8-py3.10 
+    rapidsai/raft-ann-bench:24.06a-cuda11.8-py3.10 
 ```
 
 This will drop you into a command line in the container, with the `raft-ann-bench` python package ready to use, as described in the [Running the benchmarks](#running-the-benchmarks) section above:
diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index d687f70cf5..3e8ca0b6d3 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -19,7 +19,7 @@ requires = [
     "cuda-python>=11.7.1,<12.0a0",
     "cython>=3.0.0",
     "ninja",
-    "rmm==24.4.*",
+    "rmm==24.6.*",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 build-backend = "scikit_build_core.build"
@@ -37,7 +37,7 @@ requires-python = ">=3.9"
 dependencies = [
     "cuda-python>=11.7.1,<12.0a0",
     "numpy>=1.23,<2.0a0",
-    "rmm==24.4.*",
+    "rmm==24.6.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",
diff --git a/python/raft-ann-bench/pyproject.toml b/python/raft-ann-bench/pyproject.toml
index 4a185b22ca..ba336d841c 100644
--- a/python/raft-ann-bench/pyproject.toml
+++ b/python/raft-ann-bench/pyproject.toml
@@ -9,7 +9,7 @@ requires = [
 
 [project]
 name = "raft-ann-bench"
-version = "24.04.00"
+version = "24.06.00"
 description = "RAFT ANN benchmarks"
 authors = [
     { name = "NVIDIA Corporation" },
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 07e2463c5c..815f6b277c 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -33,13 +33,13 @@ authors = [
 license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
-    "dask-cuda==24.4.*",
+    "dask-cuda==24.6.*",
     "joblib>=0.11",
     "numba>=0.57",
     "numpy>=1.23,<2.0a0",
-    "pylibraft==24.4.*",
-    "rapids-dask-dependency==24.4.*",
-    "ucx-py==0.37.*",
+    "pylibraft==24.6.*",
+    "rapids-dask-dependency==24.6.*",
+    "ucx-py==0.38.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",

From 52e0d7331cb533955f479d82e4656253eaa9ef6f Mon Sep 17 00:00:00 2001
From: Michael Schellenberger Costa <miscco@nvidia.com>
Date: Thu, 21 Mar 2024 16:15:41 -0700
Subject: [PATCH 02/60] Replace usages of raw `get_upstream` with
 `get_upstream_resource()` (#2207)

We want to get rid of raw memory resources, so move to the new interface instead.

Authors:
  - Michael Schellenberger Costa (https://github.com/miscco)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2207
---
 cpp/test/core/device_resources_manager.cpp | 16 ++++++++--------
 cpp/test/core/handle.cpp                   |  8 +++++---
 2 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/cpp/test/core/device_resources_manager.cpp b/cpp/test/core/device_resources_manager.cpp
index c7c9e175ea..b9b8996a09 100644
--- a/cpp/test/core/device_resources_manager.cpp
+++ b/cpp/test/core/device_resources_manager.cpp
@@ -21,6 +21,7 @@
 #include <rmm/mr/device/limiting_resource_adaptor.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda_runtime_api.h>
 
@@ -114,17 +115,16 @@ TEST(DeviceResourcesManager, ObeysSetters)
 
     auto* mr = dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>*>(
       rmm::mr::get_current_device_resource());
-    auto* workspace_mr =
-      dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>*>(
-        dynamic_cast<rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>*>(
-          res.get_workspace_resource())
-          ->get_upstream());
+    rmm::device_async_resource_ref workspace_mr =
+      dynamic_cast<rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>*>(
+        res.get_workspace_resource())
+        ->get_upstream_resource();
     if (upstream_mrs[i % devices.size()] != nullptr) {
       // Expect that the current memory resource is a pool memory resource as requested
       EXPECT_NE(mr, nullptr);
-      // Expect that the upstream workspace memory resource is a pool memory
-      // resource as requested
-      EXPECT_NE(workspace_mr, nullptr);
+
+      // We cannot easily check the type of a resource_ref
+      (void)workspace_mr;
     }
 
     {
diff --git a/cpp/test/core/handle.cpp b/cpp/test/core/handle.cpp
index 0b0b4b54ab..be18b0d5b4 100644
--- a/cpp/test/core/handle.cpp
+++ b/cpp/test/core/handle.cpp
@@ -25,6 +25,7 @@
 #include <rmm/device_buffer.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda_runtime.h>
 
@@ -281,7 +282,8 @@ TEST(Raft, WorkspaceResource)
   raft::handle_t handle;
 
   // The returned resource is always a limiting adaptor
-  auto* orig_mr = resource::get_workspace_resource(handle)->get_upstream();
+  rmm::device_async_resource_ref orig_mr{
+    resource::get_workspace_resource(handle)->get_upstream_resource()};
 
   // Let's create a pooled resource
   auto pool_mr = std::shared_ptr<rmm::mr::device_memory_resource>{new rmm::mr::pool_memory_resource(
@@ -295,8 +297,8 @@ TEST(Raft, WorkspaceResource)
   auto new_mr = resource::get_workspace_resource(handle);
 
   // By this point, the orig_mr likely points to a non-existent resource; don't dereference!
-  ASSERT_NE(orig_mr, new_mr);
-  ASSERT_EQ(pool_mr.get(), new_mr->get_upstream());
+  ASSERT_NE(orig_mr, rmm::device_async_resource_ref{new_mr});
+  ASSERT_EQ(rmm::device_async_resource_ref{pool_mr.get()}, new_mr->get_upstream_resource());
   // We can safely reset pool_mr, because the shared_ptr to the pool memory stays in the resource
   pool_mr.reset();
 

From 03b24cf01c1d583d1352e3607e664dae7c89cb1a Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Tue, 26 Mar 2024 10:13:08 -0700
Subject: [PATCH 03/60] Get rid of `cuco::sentinel` namespace (#2243)

This PR removes the use of the deprecated `cuco::sentinel` namespace.

Needed by https://github.com/rapidsai/rapids-cmake/pull/569

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/raft/pull/2243
---
 .../detail/coo_spmv_strategies/hash_strategy.cuh       | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh
index e271f2cdbe..8c267c5e63 100644
--- a/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh
+++ b/cpp/include/raft/sparse/distance/detail/coo_spmv_strategies/hash_strategy.cuh
@@ -236,8 +236,8 @@ class hash_strategy : public coo_spmv_strategy<value_idx, value_t, tpb> {
     return insert_type::make_from_uninitialized_slots(cooperative_groups::this_thread_block(),
                                                       cache,
                                                       cache_size,
-                                                      cuco::sentinel::empty_key{value_idx{-1}},
-                                                      cuco::sentinel::empty_value{value_t{0}});
+                                                      cuco::empty_key{value_idx{-1}},
+                                                      cuco::empty_value{value_t{0}});
   }
 
   __device__ inline void insert(insert_type cache, const value_idx& key, const value_t& value)
@@ -247,10 +247,8 @@ class hash_strategy : public coo_spmv_strategy<value_idx, value_t, tpb> {
 
   __device__ inline find_type init_find(smem_type cache, const value_idx& cache_size)
   {
-    return find_type(cache,
-                     cache_size,
-                     cuco::sentinel::empty_key{value_idx{-1}},
-                     cuco::sentinel::empty_value{value_t{0}});
+    return find_type(
+      cache, cache_size, cuco::empty_key{value_idx{-1}}, cuco::empty_value{value_t{0}});
   }
 
   __device__ inline value_t find(find_type cache, const value_idx& key)

From eabe3b00dad4225b00cb93d16cf1918b213b0ae3 Mon Sep 17 00:00:00 2001
From: tsuki <12711693+enp1s0@users.noreply.github.com>
Date: Thu, 4 Apr 2024 02:11:48 +0900
Subject: [PATCH 04/60] Add CAGRA-Q subspace dim = 4 support (#2244)

This PR adds support for subspace dimension (`pq_len`) = 4 in CAGRA-Q

Authors:
  - tsuki (https://github.com/enp1s0)

Approvers:
  - Artem M. Chirkin (https://github.com/achirkin)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2244
---
 .../neighbors/detail/cagra/cagra_search.cuh   |  3 +-
 .../detail/cagra/compute_distance_vpq.cuh     | 29 ++++++++++---------
 .../raft/neighbors/detail/vpq_dataset.cuh     |  2 +-
 cpp/test/neighbors/ann_cagra_vpq.cuh          |  4 +--
 4 files changed, 21 insertions(+), 17 deletions(-)
 mode change 100755 => 100644 cpp/test/neighbors/ann_cagra_vpq.cuh

diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
index d30f69ddcd..ccfe3c7e2d 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
@@ -166,7 +166,8 @@ void launch_vpq_search_main_core(
   CagraSampleFilterT sample_filter)
 {
   RAFT_EXPECTS(vpq_dset->pq_bits() == 8, "Only pq_bits = 8 is supported for now");
-  RAFT_EXPECTS(vpq_dset->pq_len() == 2, "Only pq_len 2 is supported for now");
+  RAFT_EXPECTS(vpq_dset->pq_len() == 2 || vpq_dset->pq_len() == 4,
+               "Only pq_len 2 or 4 is supported for now");
   RAFT_EXPECTS(vpq_dset->dim() % vpq_dset->pq_dim() == 0,
                "dim must be a multiple of pq_dim at the moment");
 
diff --git a/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
index 0204addba7..e73d24bfb6 100644
--- a/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
@@ -33,6 +33,8 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<half, DIS
   using CODE_BOOK_T = CODE_BOOK_T_;
   using QUERY_T     = typename dataset_descriptor_base_t<half, DISTANCE_T, INDEX_T>::QUERY_T;
 
+  static_assert(std::is_same_v<CODE_BOOK_T, half>, "Only CODE_BOOK_T = `half` is supported now");
+
   const std::uint8_t* encoded_dataset_ptr;
   const std::uint32_t encoded_dataset_dim;
   const std::uint32_t n_subspace;
@@ -53,18 +55,19 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<half, DIS
     smem_pq_code_book_ptr = reinterpret_cast<CODE_BOOK_T*>(smem_ptr);
 
     // Copy PQ table
-    if constexpr (std::is_same<CODE_BOOK_T, half>::value) {
-      for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) {
-        half2 buf2;
-        buf2.x                                                   = pq_code_book_ptr[i];
-        buf2.y                                                   = pq_code_book_ptr[i + 1];
-        (reinterpret_cast<half2*>(smem_pq_code_book_ptr + i))[0] = buf2;
-      }
-    } else {
-      for (unsigned i = threadIdx.x; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x) {
-        // TODO: vectorize
-        smem_pq_code_book_ptr[i] = pq_code_book_ptr[i];
-      }
+    for (unsigned i = threadIdx.x * 2; i < (1 << PQ_BITS) * PQ_LEN; i += blockDim.x * 2) {
+      half2 buf2;
+      buf2.x = pq_code_book_ptr[i];
+      buf2.y = pq_code_book_ptr[i + 1];
+
+      // Change the order of PQ code book array to reduce the
+      // frequency of bank conflicts.
+      constexpr auto num_elements_per_bank  = 4 / utils::size_of<CODE_BOOK_T>();
+      constexpr auto num_banks_per_subspace = PQ_LEN / num_elements_per_bank;
+      const auto j                          = i / num_elements_per_bank;
+      const auto smem_index =
+        (j / num_banks_per_subspace) + (j % num_banks_per_subspace) * (1 << PQ_BITS);
+      reinterpret_cast<half2*>(smem_pq_code_book_ptr)[smem_index] = buf2;
     }
   }
 
@@ -136,7 +139,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<half, DIS
               4 + k));
           }
           //
-          if constexpr ((std::is_same<CODE_BOOK_T, half>::value) && (PQ_LEN % 2 == 0)) {
+          if constexpr (PQ_LEN % 2 == 0) {
             // **** Use half2 for distance computation ****
             half2 norm2{0, 0};
 #pragma unroll
diff --git a/cpp/include/raft/neighbors/detail/vpq_dataset.cuh b/cpp/include/raft/neighbors/detail/vpq_dataset.cuh
index f1321ba343..f6cd2a1ceb 100644
--- a/cpp/include/raft/neighbors/detail/vpq_dataset.cuh
+++ b/cpp/include/raft/neighbors/detail/vpq_dataset.cuh
@@ -81,7 +81,7 @@ auto fill_missing_params_heuristics(const vpq_params& params, const DatasetT& da
   vpq_params r  = params;
   double n_rows = dataset.extent(0);
   size_t dim    = dataset.extent(1);
-  if (r.pq_dim == 0) { r.pq_dim = raft::div_rounding_up_safe(dim, size_t{2}); }
+  if (r.pq_dim == 0) { r.pq_dim = raft::div_rounding_up_safe(dim, size_t{4}); }
   if (r.pq_bits == 0) { r.pq_bits = 8; }
   if (r.vq_n_centers == 0) { r.vq_n_centers = raft::round_up_safe<uint32_t>(std::sqrt(n_rows), 8); }
   if (r.vq_kmeans_trainset_fraction == 0) {
diff --git a/cpp/test/neighbors/ann_cagra_vpq.cuh b/cpp/test/neighbors/ann_cagra_vpq.cuh
old mode 100755
new mode 100644
index 503b1a413a..6b24bca921
--- a/cpp/test/neighbors/ann_cagra_vpq.cuh
+++ b/cpp/test/neighbors/ann_cagra_vpq.cuh
@@ -158,7 +158,7 @@ class AnnCagraVpqTest : public ::testing::TestWithParam<AnnCagraVpqInputs> {
       resource::sync_stream(handle_);
     }
 
-    const auto vpq_k = ps.k * 16;
+    const auto vpq_k = ps.k * 4;
     {
       rmm::device_uvector<DistanceT> distances_dev(vpq_k * ps.n_queries, stream_);
       rmm::device_uvector<IdxT> indices_dev(vpq_k * ps.n_queries, stream_);
@@ -319,7 +319,7 @@ const std::vector<AnnCagraVpqInputs> vpq_inputs = raft::util::itertools::product
   {1000, 10000},                                      // n_rows
   {128, 132, 192, 256, 512, 768},                     // dim
   {8, 12},                                            // k
-  {2},                                                // pq_len
+  {2, 4},                                             // pq_len
   {8},                                                // pq_bits
   {graph_build_algo::NN_DESCENT},                     // build_algo
   {search_algo::SINGLE_CTA, search_algo::MULTI_CTA},  // algo

From 8a68518fd5a0ae0e750cd8f77b02f73efc111f5c Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Thu, 4 Apr 2024 19:26:04 +0200
Subject: [PATCH 05/60] Fix time computation in CAGRA notebook (#2231)

Closes #2230.
I am also adding `nn_descent` to the build parameters of cagra

Authors:
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2231
---
 .../VectorSearch_QuestionRetrieval.ipynb      | 52 ++++++++++---------
 1 file changed, 28 insertions(+), 24 deletions(-)

diff --git a/notebooks/VectorSearch_QuestionRetrieval.ipynb b/notebooks/VectorSearch_QuestionRetrieval.ipynb
index b3a15d3a08..33a2f60228 100644
--- a/notebooks/VectorSearch_QuestionRetrieval.ipynb
+++ b/notebooks/VectorSearch_QuestionRetrieval.ipynb
@@ -89,7 +89,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": 2,
    "id": "eb1e81c3",
    "metadata": {},
    "outputs": [
@@ -154,7 +154,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 3,
    "id": "ee4c5cc0",
    "metadata": {},
    "outputs": [
@@ -184,7 +184,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
+   "execution_count": 4,
    "id": "0a1a6307",
    "metadata": {},
    "outputs": [
@@ -249,7 +249,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 5,
    "id": "ad90b4be",
    "metadata": {},
    "outputs": [
@@ -292,7 +292,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
    "id": "724dcacb",
    "metadata": {
     "scrolled": true
@@ -320,7 +320,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
    "id": "c27d4715",
    "metadata": {},
    "outputs": [
@@ -347,7 +347,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 8,
    "id": "bc375518",
    "metadata": {},
    "outputs": [
@@ -373,7 +373,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 9,
    "id": "ab154181",
    "metadata": {},
    "outputs": [
@@ -399,7 +399,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 10,
    "id": "2d6017ed",
    "metadata": {},
    "outputs": [
@@ -435,7 +435,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 11,
    "id": "f5cfb644",
    "metadata": {},
    "outputs": [
@@ -462,7 +462,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 12,
    "id": "b5694d00",
    "metadata": {},
    "outputs": [
@@ -489,7 +489,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 13,
    "id": "fcfc3c5b",
    "metadata": {},
    "outputs": [
@@ -528,7 +528,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 14,
    "id": "50df1f43-c580-4019-949a-06bdc7185536",
    "metadata": {},
    "outputs": [],
@@ -538,7 +538,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 15,
    "id": "091cde52-4652-4230-af2b-75c35357f833",
    "metadata": {},
    "outputs": [
@@ -546,21 +546,21 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 1min 23s, sys: 2min 7s, total: 3min 31s\n",
-      "Wall time: 4min 43s\n"
+      "CPU times: user 35.3 s, sys: 4.5 s, total: 39.8 s\n",
+      "Wall time: 2.16 s\n"
      ]
     }
    ],
    "source": [
     "%%time\n",
-    "params = cagra.IndexParams(intermediate_graph_degree=128, graph_degree=64)\n",
+    "params = cagra.IndexParams(intermediate_graph_degree=32, graph_degree=16, build_algo=\"nn_descent\")\n",
     "cagra_index = cagra.build(params, corpus_embeddings)\n",
-    "search_params = cagra.SearchParams()"
+    "search_params = cagra.SearchParams(algo=\"multi_cta\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 16,
    "id": "df229e21-f6b6-4d6c-ad54-2724f8738934",
    "metadata": {},
    "outputs": [],
@@ -569,9 +569,12 @@
     "    # Encode the query using the bi-encoder and find potentially relevant passages\n",
     "    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)\n",
     "\n",
+    "    start_time = time.time()\n",
     "    hits = cagra.search(search_params, cagra_index, question_embedding[None], top_k)\n",
+    "    end_time = time.time()\n",
     "\n",
     "    # Output of top-k hits\n",
+    "    print(\"Results (after {:.3f} seconds):\".format(end_time - start_time))\n",
     "    print(\"Input question:\", query)\n",
     "    for k in range(top_k):\n",
     "        print(\"\\t{:.3f}\\t{}\".format(hits[0][0, k], passages[hits[1][0, k]]))"
@@ -587,19 +590,20 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 16 µs, sys: 25 µs, total: 41 µs\n",
-      "Wall time: 83.7 µs\n",
+      "Results (after 0.005 seconds):\n",
       "Input question: Who was Grace Hopper?\n",
       "\t181.649\t['Grace Hopper', 'Hopper was born in New York, USA. Hopper graduated from Vassar College in 1928 and Yale University in 1934 with a Ph.D degree in mathematics. She joined the US Navy during the World War II in 1943. She worked on computers in the Navy for 43 years. She then worked in other private industry companies after 1949. She retired from the Navy in 1986 and died on January 1, 1992.']\n",
       "\t192.946\t['Leona Helmsley', 'Leona Helmsley (July 4, 1920 – August 20, 2007) was an American businesswoman. She was known for having a flamboyant personality. She had a reputation for tyrannical behavior; she was nicknamed the Queen of Mean.']\n",
       "\t194.951\t['Grace Hopper', 'Grace Murray Hopper (December 9 1906 – January 1 1992) was an American computer scientist and United States Navy officer.']\n",
       "\t202.192\t['Nellie Bly', 'Elizabeth Cochrane Seaman (born Elizabeth Jane Cochran; May 5, 1864 – January 27, 1922), better known by her pen name Nellie Bly, was an American journalist, novelist and inventor. She was a newspaper reporter, who worked at various jobs for exposing poor working conditions. Nellie Bly, also, fought for women\\'s right and was known for investigative reporting. She best known for her record-breaking trip around the world in 72 days, inspired by the adventure novel \"Around the World in Eighty Days\" by Jules Verne. In the 1880s, she went undercover as a mentally ill patient in a psychiatric hospital for ten days, with the report being made public in a book called \"\"Ten Days in a Mad-House\"\". She was added to the National Women\\'s Hall of Fame in 1998.']\n",
-      "\t205.038\t['Abbie Hoffman', 'Abbot Howard \"Abbie\" Hoffman (November 30, 1936 – April 12, 1989) was an American social and political activist.']\n"
+      "\t205.038\t['Abbie Hoffman', 'Abbot Howard \"Abbie\" Hoffman (November 30, 1936 – April 12, 1989) was an American social and political activist.']\n",
+      "CPU times: user 4.18 ms, sys: 3.88 ms, total: 8.07 ms\n",
+      "Wall time: 9.97 ms\n"
      ]
     }
    ],
    "source": [
-    "%time \n",
+    "%%time \n",
     "search_raft_cagra(query=\"Who was Grace Hopper?\")"
    ]
   }
@@ -620,7 +624,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.10.13"
   }
  },
  "nbformat": 4,

From 4a20d03af7f6181e3083bc3b65522d7f2c3b6218 Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Mon, 8 Apr 2024 09:36:00 -0700
Subject: [PATCH 06/60] [FEA] Add support for `select_k` on CSR matrix (#2140)

- This PR is one part of the feature of #1969
- Add a `select_k` API that accepts a CSR matrix as input
Authors:
  - James Rong (https://github.com/rhdong)

Approvers:
  - Ben Frederickson (https://github.com/benfred)
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)

Authors:
  - rhdong (https://github.com/rhdong)

Approvers:
  - Artem M. Chirkin (https://github.com/achirkin)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2140
---
 cpp/bench/prims/CMakeLists.txt                |   1 +
 cpp/bench/prims/sparse/select_k_csr.cu        | 287 ++++++++++++
 .../raft/matrix/detail/select_radix.cuh       | 427 ++++++++++--------
 .../raft/matrix/detail/select_warpsort.cuh    |  55 ++-
 .../sparse/matrix/detail/select_k-ext.cuh     |  67 +++
 .../sparse/matrix/detail/select_k-inl.cuh     | 225 +++++++++
 .../raft/sparse/matrix/detail/select_k.cuh    |  24 +
 cpp/include/raft/sparse/matrix/select_k.cuh   |  87 ++++
 .../matrix/detail/select_k_double_int64_t.cu  |  32 ++
 .../matrix/detail/select_k_double_uint32_t.cu |  34 ++
 .../matrix/detail/select_k_float_int32.cu     |  32 ++
 .../matrix/detail/select_k_float_int64_t.cu   |  32 ++
 .../matrix/detail/select_k_float_uint32_t.cu  |  32 ++
 .../matrix/detail/select_k_half_int64_t.cu    |  32 ++
 .../matrix/detail/select_k_half_uint32_t.cu   |  32 ++
 cpp/test/CMakeLists.txt                       |   1 +
 cpp/test/sparse/select_k_csr.cu               | 398 ++++++++++++++++
 17 files changed, 1600 insertions(+), 198 deletions(-)
 create mode 100644 cpp/bench/prims/sparse/select_k_csr.cu
 create mode 100644 cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
 create mode 100644 cpp/include/raft/sparse/matrix/detail/select_k-inl.cuh
 create mode 100644 cpp/include/raft/sparse/matrix/detail/select_k.cuh
 create mode 100644 cpp/include/raft/sparse/matrix/select_k.cuh
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_float_int32.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu
 create mode 100644 cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu
 create mode 100644 cpp/test/sparse/select_k_csr.cu

diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index 9f23c44a5c..0c5521d447 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -137,6 +137,7 @@ if(BUILD_PRIMS_BENCH)
     PATH
     bench/prims/sparse/bitmap_to_csr.cu
     bench/prims/sparse/convert_csr.cu
+    bench/prims/sparse/select_k_csr.cu
     bench/prims/main.cpp
   )
 
diff --git a/cpp/bench/prims/sparse/select_k_csr.cu b/cpp/bench/prims/sparse/select_k_csr.cu
new file mode 100644
index 0000000000..a91e6c8514
--- /dev/null
+++ b/cpp/bench/prims/sparse/select_k_csr.cu
@@ -0,0 +1,287 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <common/benchmark.hpp>
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/copy.cuh>
+#include <raft/random/make_blobs.cuh>
+#include <raft/random/rng_state.hpp>
+#include <raft/sparse/convert/csr.cuh>
+#include <raft/sparse/matrix/select_k.cuh>
+#include <raft/util/cuda_utils.cuh>
+#include <raft/util/itertools.hpp>
+
+#include <rmm/device_uvector.hpp>
+
+#include <random>
+#include <sstream>
+#include <unordered_set>
+#include <vector>
+
+namespace raft::bench::sparse {
+
+template <typename index_t>
+struct bench_param {
+  index_t n_rows;  // rows of the benchmarked CSR matrix
+  index_t n_cols;  // columns of the benchmarked CSR matrix
+  index_t top_k;   // width of each output row (output buffers hold n_rows * top_k entries)
+  float sparsity;  // fraction of the n_rows * n_cols cells that hold a value
+  bool select_min         = true;   // forwarded to select_k as its select_min argument
+  bool customized_indices = false;  // when true, an explicit index vector is passed to select_k
+};
+
+template <typename index_t>
+inline auto operator<<(std::ostream& os, const bench_param<index_t>& params) -> std::ostream&
+{
+  os << params.n_rows << "#" << params.n_cols;  // label: n_rows#n_cols#top_k#sparsity
+  return os << "#" << params.top_k << "#" << params.sparsity;
+}
+
+template <typename value_t, typename index_t>
+struct SelectKCsrTest : public fixture {
+  // Builds a random CSR matrix on the device plus the output buffers consumed by select_k.
+  SelectKCsrTest(const bench_param<index_t>& p)
+    : fixture(true),
+      handle(stream),
+      params(p),
+      values_d(0, stream),
+      indptr_d(0, stream),
+      indices_d(0, stream),
+      customized_indices_d(0, stream),
+      dst_values_d(0, stream),
+      dst_indices_d(0, stream)
+  {
+    std::vector<bool> dense_values_h(params.n_rows * params.n_cols);
+    nnz = create_sparse_matrix(params.n_rows, params.n_cols, params.sparsity, dense_values_h);
+
+    std::vector<index_t> indices_h(nnz);
+    // Stays zero-filled; it is only uploaded when params.customized_indices is set.
+    std::vector<index_t> customized_indices_h(nnz);
+    std::vector<index_t> indptr_h(params.n_rows + 1);
+
+    convert_to_csr(dense_values_h, params.n_rows, params.n_cols, indices_h, indptr_h);
+
+    dst_values_d.resize(params.n_rows * params.top_k, stream);
+    dst_indices_d.resize(params.n_rows * params.top_k, stream);
+    values_d.resize(nnz, stream);
+
+    if (nnz) {
+      auto blobs_values = raft::make_device_matrix<value_t, index_t>(handle, 1, nnz);
+      auto labels       = raft::make_device_vector<index_t, index_t>(handle, 1);
+
+      raft::random::make_blobs<value_t, index_t>(blobs_values.data_handle(),
+                                                 labels.data_handle(),
+                                                 1,
+                                                 nnz,
+                                                 1,
+                                                 stream,
+                                                 false,
+                                                 nullptr,
+                                                 nullptr,
+                                                 value_t(1.0),
+                                                 false,
+                                                 value_t(-10.0f),
+                                                 value_t(10.0f),
+                                                 uint64_t(2024));
+      raft::copy(values_d.data(), blobs_values.data_handle(), nnz, stream);
+      resource::sync_stream(handle);
+    }
+
+    indices_d.resize(nnz, stream);
+    indptr_d.resize(params.n_rows + 1, stream);
+
+    update_device(indices_d.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(indptr_d.data(), indptr_h.data(), indptr_h.size(), stream);
+
+    if (params.customized_indices) {
+      customized_indices_d.resize(nnz, stream);
+      update_device(customized_indices_d.data(),
+                    customized_indices_h.data(),
+                    customized_indices_h.size(),
+                    stream);
+    }
+  }
+
+  // Marks trunc(m * n * sparsity) randomly chosen cells of the dense m x n mask as non-zero
+  // and returns how many cells were requested (the nnz of the resulting matrix).
+  index_t create_sparse_matrix(index_t m, index_t n, float sparsity, std::vector<bool>& matrix)
+  {
+    const index_t total_elements = static_cast<index_t>(m * n);
+    index_t num_ones             = static_cast<index_t>((total_elements * 1.0f) * sparsity);
+    // More ones than cells (e.g. through float rounding) would make the loop below spin forever.
+    if (num_ones > total_elements) { num_ones = total_elements; }
+    const index_t res = num_ones;
+
+    matrix.assign(total_elements, false);
+    // Nothing to place; this also avoids building a distribution over an empty range.
+    if (num_ones == 0) { return res; }
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    // Draw in index_t so that positions beyond the range of int remain reachable.
+    std::uniform_int_distribution<index_t> dis_idx(0, total_elements - 1);
+
+    while (num_ones > 0) {
+      const index_t index = dis_idx(gen);
+      if (!matrix[index]) {
+        matrix[index] = true;
+        num_ones--;
+      }
+    }
+    return res;
+  }
+
+  // Writes the CSR structure (column indices and row pointers) of the dense boolean mask.
+  // `indices` must hold one entry per set cell and `indptr` must hold rows + 1 entries.
+  void convert_to_csr(std::vector<bool>& matrix,
+                      index_t rows,
+                      index_t cols,
+                      std::vector<index_t>& indices,
+                      std::vector<index_t>& indptr)
+  {
+    index_t n_written = 0;
+    indptr[0]         = 0;
+
+    for (index_t i = 0; i < rows; ++i) {
+      for (index_t j = 0; j < cols; ++j) {
+        if (matrix[i * cols + j]) { indices[n_written++] = j; }
+      }
+      indptr[i + 1] = n_written;
+    }
+  }
+
+  // Wraps x in an optional that is engaged only when customized indices are requested.
+  template <typename data_t>
+  std::optional<data_t> get_opt_var(data_t x)
+  {
+    if (params.customized_indices) { return x; }
+    return std::nullopt;
+  }
+
+  // Runs select_k once as a warm-up, then times repeated calls on the same views.
+  void run_benchmark(::benchmark::State& state) override
+  {
+    std::ostringstream label_stream;
+    label_stream << params;
+    state.SetLabel(label_stream.str());
+
+    auto in_val_structure = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+      indptr_d.data(),
+      indices_d.data(),
+      params.n_rows,
+      params.n_cols,
+      static_cast<index_t>(indices_d.size()));
+
+    auto in_val =
+      raft::make_device_csr_matrix_view<const value_t>(values_d.data(), in_val_structure);
+
+    std::optional<raft::device_vector_view<const index_t, index_t>> in_idx;
+
+    in_idx = get_opt_var(
+      raft::make_device_vector_view<const index_t, index_t>(customized_indices_d.data(), nnz));
+
+    auto out_val = raft::make_device_matrix_view<value_t, index_t, raft::row_major>(
+      dst_values_d.data(), params.n_rows, params.top_k);
+    auto out_idx = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
+      dst_indices_d.data(), params.n_rows, params.top_k);
+
+    raft::sparse::matrix::select_k(handle, in_val, in_idx, out_val, out_idx, params.select_min);
+    resource::sync_stream(handle);
+    loop_on_state(state, [this, &in_val, &in_idx, &out_val, &out_idx]() {
+      raft::sparse::matrix::select_k(
+        handle, in_val, in_idx, out_val, out_idx, params.select_min, false);
+      resource::sync_stream(handle);
+    });
+  }
+
+ protected:
+  const raft::device_resources handle;
+
+  bench_param<index_t> params;
+  index_t nnz;
+
+  rmm::device_uvector<value_t> values_d;
+  rmm::device_uvector<index_t> indptr_d;
+  rmm::device_uvector<index_t> indices_d;
+  rmm::device_uvector<index_t> customized_indices_d;
+
+  rmm::device_uvector<value_t> dst_values_d;
+  rmm::device_uvector<index_t> dst_indices_d;
+};  // struct SelectKCsrTest
+
+// Benchmark inputs: every shape below is paired with sparsity 0.1, 0.2 and 0.5.
+template <typename index_t>
+const std::vector<bench_param<index_t>> getInputs()
+{
+  std::vector<bench_param<index_t>> param_vec;
+  struct TestParams {
+    index_t m;
+    index_t n;
+    index_t k;
+  };
+
+  // NOTE(review): some shapes are listed more than once and therefore run repeatedly.
+  const std::vector<TestParams> params_group{
+    {20000, 500, 1},    {20000, 500, 2},    {20000, 500, 4},   {20000, 500, 8},
+    {20000, 500, 16},   {20000, 500, 32},   {20000, 500, 64},  {20000, 500, 128},
+    {20000, 500, 256},
+
+    {1000, 10000, 1},   {1000, 10000, 2},   {1000, 10000, 4},  {1000, 10000, 8},
+    {1000, 10000, 16},  {1000, 10000, 32},  {1000, 10000, 64}, {1000, 10000, 128},
+    {1000, 10000, 256},
+
+    {100, 100000, 1},   {100, 100000, 2},   {100, 100000, 4},  {100, 100000, 8},
+    {100, 100000, 16},  {100, 100000, 32},  {100, 100000, 64}, {100, 100000, 128},
+    {100, 100000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 2},   {10, 1000000, 4},  {10, 1000000, 8},
+    {10, 1000000, 16},  {10, 1000000, 32},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 2},   {10, 1000000, 4},  {10, 1000000, 8},
+    {10, 1000000, 16},  {10, 1000000, 32},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 16},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 16},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256}, {1000, 10000, 1},   {1000, 10000, 16}, {1000, 10000, 64},
+    {1000, 10000, 128}, {1000, 10000, 256},
+
+    {10, 1000000, 1},   {10, 1000000, 16},  {10, 1000000, 64}, {10, 1000000, 128},
+    {10, 1000000, 256}, {1000, 10000, 1},   {1000, 10000, 16}, {1000, 10000, 64},
+    {1000, 10000, 128}, {1000, 10000, 256}};
+
+  const float sparsities[] = {0.1f, 0.2f, 0.5f};
+  // Reserve for all sparsity levels, not just for one pass over params_group.
+  param_vec.reserve(params_group.size() * (sizeof(sparsities) / sizeof(sparsities[0])));
+  for (const float sparsity : sparsities) {
+    for (const TestParams& params : params_group) {
+      param_vec.push_back(bench_param<index_t>({params.m, params.n, params.k, sparsity}));
+    }
+  }
+  return param_vec;
+}
+
+RAFT_BENCH_REGISTER((SelectKCsrTest<float, uint32_t>), "", getInputs<uint32_t>());
+
+}  // namespace raft::bench::sparse
diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh
index 36a346fda3..83d4845c31 100644
--- a/cpp/include/raft/matrix/detail/select_radix.cuh
+++ b/cpp/include/raft/matrix/detail/select_radix.cuh
@@ -442,14 +442,76 @@ _RAFT_DEVICE void last_filter(const T* in_buf,
   }
 }
 
-template <typename T, typename IdxT, int BitsPerPass>
+// Selects the input and output buffers of a given pass; they are returned through the
+// in_buf / in_idx_buf / out_buf / out_idx_buf reference parameters.
+template <typename T, typename IdxT>
+_RAFT_DEVICE void set_buf_pointers(const T* in,
+                                   const IdxT* in_idx,
+                                   char* bufs,
+                                   IdxT buf_len,
+                                   int pass,
+                                   const T*& in_buf,
+                                   const IdxT*& in_idx_buf,
+                                   T*& out_buf,
+                                   IdxT*& out_idx_buf)
+{
+  // bufs layout, in order: buf1, buf2, idx_buf1, idx_buf2 (each holding buf_len elements)
+  T* const buf1        = reinterpret_cast<T*>(bufs);
+  IdxT* const idx_buf1 = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
+  T* const buf2        = buf1 + buf_len;
+  IdxT* const idx_buf2 = idx_buf1 + buf_len;
+
+  if (pass == 0 || pass == 1) {
+    // pass 0 gets no index input and no output buffer; pass 1 reads the raw input, writes buf1
+    in_buf      = in;
+    in_idx_buf  = (pass == 0) ? nullptr : in_idx;
+    out_buf     = (pass == 0) ? nullptr : buf1;
+    out_idx_buf = (pass == 0) ? nullptr : idx_buf1;
+    return;
+  }
+  // afterwards the two halves alternate: even passes read buf1, odd passes read buf2
+  const bool even = (pass % 2 == 0);
+  in_buf          = even ? buf1 : buf2;
+  in_idx_buf      = even ? idx_buf1 : idx_buf2;
+  out_buf         = even ? buf2 : buf1;
+  out_idx_buf     = even ? idx_buf2 : idx_buf1;
+}
+
+// Variant that yields only the buffers written by `pass` (in and in_idx are not read here).
+template <typename T, typename IdxT>
+_RAFT_DEVICE void set_buf_pointers(const T* in,
+                                   const IdxT* in_idx,
+                                   char* bufs,
+                                   IdxT buf_len,
+                                   const int pass,
+                                   const T*& out_buf,
+                                   const IdxT*& out_idx_buf)
+{
+  // bufs layout, in order: buf1, buf2, idx_buf1, idx_buf2 (each holding buf_len elements)
+  if (pass == 0) {
+    // pass 0 writes no buffer
+    out_buf     = nullptr;
+    out_idx_buf = nullptr;
+    return;
+  }
+  const T* buf1        = reinterpret_cast<T*>(bufs);
+  const IdxT* idx_buf1 = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
+
+  // pass 1 and every later odd pass use the first half, the remaining even passes the second
+  const bool second_half = (pass % 2 == 0);
+  out_buf                = second_half ? buf1 + buf_len : buf1;
+  out_idx_buf            = second_half ? idx_buf1 + buf_len : idx_buf1;
+}
+
+template <typename T, typename IdxT, int BitsPerPass, bool len_or_indptr = true>
 RAFT_KERNEL last_filter_kernel(const T* in,
                                const IdxT* in_idx,
-                               const T* in_buf,
-                               const IdxT* in_idx_buf,
+                               char* bufs,
+                               size_t offset,
                                T* out,
                                IdxT* out_idx,
                                const IdxT len,
+                               const IdxT* len_i,
                                const IdxT k,
                                Counter<T, IdxT>* counters,
                                const bool select_min)
@@ -458,22 +520,31 @@ RAFT_KERNEL last_filter_kernel(const T* in,
 
   Counter<T, IdxT>* counter = counters + batch_id;
   IdxT previous_len         = counter->previous_len;
+
   if (previous_len == 0) { return; }
+
+  const IdxT l_len = len_or_indptr ? len : (len_i[batch_id + 1] - len_i[batch_id]);
+  // size_t offset: (offset + batch_id) * len may not fit into a 32-bit IdxT.
+  const size_t l_offset = len_or_indptr ? (offset + batch_id) * len : len_i[batch_id];
   const IdxT buf_len = calc_buf_len<T>(len);
-  if (previous_len > buf_len || in_buf == in) {
-    in_buf       = in + batch_id * len;
-    in_idx_buf   = in_idx ? (in_idx + batch_id * len) : nullptr;
-    previous_len = len;
-  } else {
-    in_buf += batch_id * buf_len;
-    in_idx_buf += batch_id * buf_len;
-  }
-  out += batch_id * k;
-  out_idx += batch_id * k;
+
+  const T* in_buf        = nullptr;
+  const IdxT* in_idx_buf = nullptr;
+  bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT));
 
   constexpr int pass      = calc_num_passes<T, BitsPerPass>() - 1;
   constexpr int start_bit = calc_start_bit<T, BitsPerPass>(pass);
 
+  set_buf_pointers(in + l_offset, in_idx + l_offset, bufs, buf_len, pass, in_buf, in_idx_buf);
+
+  if (previous_len > buf_len || in_buf == in + l_offset) {
+    in_buf       = in + l_offset;
+    in_idx_buf   = in_idx ? (in_idx + l_offset) : nullptr;
+    previous_len = l_len;
+  }
+  out += batch_id * k;
+  out_idx += batch_id * k;
+
   const auto kth_value_bits    = counter->kth_value_bits;
   const IdxT num_of_kth_needed = counter->k;
   IdxT* p_out_cnt              = &counter->out_cnt;
@@ -510,6 +581,29 @@ RAFT_KERNEL last_filter_kernel(const T* in,
                      f);
 }
 
+// Copies src[0, len) into dest[0, k); positions at or past len receive the filler value.
+template <typename T, typename IdxT, typename S>
+_RAFT_DEVICE _RAFT_FORCEINLINE void copy_in_val(
+  T* dest, const T* src, S len, IdxT k, const bool select_min)
+{
+  const T filler = select_min ? upper_bound<T>() : lower_bound<T>();
+  const S step   = S(blockDim.x);
+  for (S pos = S(threadIdx.x); pos < k; pos += step) {
+    dest[pos] = (pos < len) ? src[pos] : filler;
+  }
+}
+
+// Fills dest[0, len) from src, or with the positions 0..len-1 themselves when src is null.
+template <typename T, typename S>
+_RAFT_DEVICE _RAFT_FORCEINLINE void copy_in_idx(T* dest, const T* src, S len)
+{
+  // every thread covers positions threadIdx.x, threadIdx.x + blockDim.x, ...
+  const S step = S(blockDim.x);
+  for (S pos = S(threadIdx.x); pos < len; pos += step) {
+    dest[pos] = src ? src[pos] : pos;
+  }
+}
+
 /**
  *
  * It is expected to call this kernel multiple times (passes), in each pass we process a radix,
@@ -545,13 +639,16 @@ RAFT_KERNEL last_filter_kernel(const T* in,
  * rather than from `in_buf`. The benefit is that we can save the cost of writing candidates and
  * their indices.
  */
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool fused_last_filter>
+template <typename T,
+          typename IdxT,
+          int BitsPerPass,
+          int BlockSize,
+          bool fused_last_filter,
+          bool len_or_indptr>
 RAFT_KERNEL radix_kernel(const T* in,
                          const IdxT* in_idx,
-                         const T* in_buf,
-                         const IdxT* in_idx_buf,
-                         T* out_buf,
-                         IdxT* out_idx_buf,
+                         char* bufs,
+                         size_t offset,
                          T* out,
                          IdxT* out_idx,
                          Counter<T, IdxT>* counters,
@@ -567,21 +664,38 @@ RAFT_KERNEL radix_kernel(const T* in,
   IdxT current_k;
   IdxT previous_len;
   IdxT current_len;
+
+  const IdxT l_len = len_or_indptr ? len : (len_i[batch_id + 1] - len_i[batch_id]);
+  // size_t offset: (offset + batch_id) * len may not fit into a 32-bit IdxT.
+  const size_t l_offset = len_or_indptr ? (offset + batch_id) * len : len_i[batch_id];
   if (pass == 0) {
     current_k    = k;
-    previous_len = len;
+    previous_len = l_len;
     // Need to do this so setting counter->previous_len for the next pass is correct.
     // This value is meaningless for pass 0, but it's fine because pass 0 won't be the
     // last pass in this implementation so pass 0 won't hit the "if (pass ==
     // num_passes - 1)" branch.
     // Maybe it's better to reload counter->previous_len and use it rather than
     // current_len in last_filter()
-    current_len = len;
+    current_len = l_len;
   } else {
     current_k    = counter->k;
     current_len  = counter->len;
     previous_len = counter->previous_len;
   }
+  if constexpr (!len_or_indptr) {
+    if (pass == 0 && l_len <= k) {
+      copy_in_val(out + batch_id * k, in + l_offset, l_len, k, select_min);
+      copy_in_idx(out_idx + batch_id * k, (in_idx ? (in_idx + l_offset) : nullptr), l_len);
+      if (threadIdx.x == 0) {
+        counter->previous_len = 0;
+        counter->len          = 0;
+      }
+      __syncthreads();
+      return;
+    }
+  }
+
   if (current_len == 0) { return; }
 
   // When k=len, early_stop will be true at pass 0. It means filter_and_histogram() should handle
@@ -590,20 +704,33 @@ RAFT_KERNEL radix_kernel(const T* in,
   const bool early_stop = (current_len == current_k);
   const IdxT buf_len    = calc_buf_len<T>(len);
 
+  const T* in_buf;
+  const IdxT* in_idx_buf;
+  T* out_buf;
+  IdxT* out_idx_buf;
+  bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT));
+
+  set_buf_pointers(in + l_offset,
+                   (in_idx ? (in_idx + l_offset) : nullptr),
+                   bufs,
+                   buf_len,
+                   pass,
+                   in_buf,
+                   in_idx_buf,
+                   out_buf,
+                   out_idx_buf);
+
   // "previous_len > buf_len" means previous pass skips writing buffer
   if (pass == 0 || pass == 1 || previous_len > buf_len) {
-    in_buf       = in + batch_id * len;
-    in_idx_buf   = in_idx ? (in_idx + batch_id * len) : nullptr;
-    previous_len = len;
-  } else {
-    in_buf += batch_id * buf_len;
-    in_idx_buf += batch_id * buf_len;
+    in_buf       = in + l_offset;
+    in_idx_buf   = in_idx ? (in_idx + l_offset) : nullptr;
+    previous_len = l_len;
   }
 
   // in case we have individual len for each query defined we want to make sure
   // that we only iterate valid elements.
   if (len_i != nullptr) {
-    const IdxT max_len = max(len_i[batch_id], k);
+    const IdxT max_len = max(l_len, k);
     if (max_len < previous_len) previous_len = max_len;
   }
 
@@ -611,9 +738,6 @@ RAFT_KERNEL radix_kernel(const T* in,
   if (pass == 0 || current_len > buf_len) {
     out_buf     = nullptr;
     out_idx_buf = nullptr;
-  } else {
-    out_buf += batch_id * buf_len;
-    out_idx_buf += batch_id * buf_len;
   }
   out += batch_id * k;
   out_idx += batch_id * k;
@@ -640,7 +764,6 @@ RAFT_KERNEL radix_kernel(const T* in,
     unsigned int finished = atomicInc(&counter->finished_block_cnt, gridDim.x - 1);
     isLastBlock           = (finished == (gridDim.x - 1));
   }
-
   if (__syncthreads_or(isLastBlock)) {
     if (early_stop) {
       if (threadIdx.x == 0) {
@@ -676,7 +799,7 @@ RAFT_KERNEL radix_kernel(const T* in,
                                           out_idx_buf ? out_idx_buf : in_idx_buf,
                                           out,
                                           out_idx,
-                                          out_buf ? current_len : len,
+                                          out_buf ? current_len : l_len,
                                           k,
                                           counter,
                                           select_min,
@@ -726,7 +849,7 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt)
 
   int active_blocks;
   RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-    &active_blocks, radix_kernel<T, IdxT, BitsPerPass, BlockSize, false>, BlockSize, 0));
+    &active_blocks, radix_kernel<T, IdxT, BitsPerPass, BlockSize, false, true>, BlockSize, 0));
   active_blocks *= sm_cnt;
 
   IdxT best_num_blocks         = 0;
@@ -757,78 +880,7 @@ unsigned calc_grid_dim(int batch_size, IdxT len, int sm_cnt)
   return best_num_blocks;
 }
 
-template <typename T, typename IdxT>
-_RAFT_HOST void set_buf_pointers(const T* in,
-                                 const IdxT* in_idx,
-                                 T* buf1,
-                                 IdxT* idx_buf1,
-                                 T* buf2,
-                                 IdxT* idx_buf2,
-                                 int pass,
-                                 const T*& in_buf,
-                                 const IdxT*& in_idx_buf,
-                                 T*& out_buf,
-                                 IdxT*& out_idx_buf)
-{
-  if (pass == 0) {
-    in_buf      = in;
-    in_idx_buf  = nullptr;
-    out_buf     = nullptr;
-    out_idx_buf = nullptr;
-  } else if (pass == 1) {
-    in_buf      = in;
-    in_idx_buf  = in_idx;
-    out_buf     = buf1;
-    out_idx_buf = idx_buf1;
-  } else if (pass % 2 == 0) {
-    in_buf      = buf1;
-    in_idx_buf  = idx_buf1;
-    out_buf     = buf2;
-    out_idx_buf = idx_buf2;
-  } else {
-    in_buf      = buf2;
-    in_idx_buf  = idx_buf2;
-    out_buf     = buf1;
-    out_idx_buf = idx_buf1;
-  }
-}
-
-template <typename T, typename IdxT>
-_RAFT_DEVICE void set_buf_pointers(const T* in,
-                                   const IdxT* in_idx,
-                                   char* bufs,
-                                   IdxT buf_len,
-                                   int pass,
-                                   const T*& in_buf,
-                                   const IdxT*& in_idx_buf,
-                                   T*& out_buf,
-                                   IdxT*& out_idx_buf)
-{
-  // bufs consists of 4 pieces in order: buf1, buf2, idx_buf1, idx_buf2
-  if (pass == 0) {
-    in_buf      = in;
-    in_idx_buf  = nullptr;
-    out_buf     = nullptr;
-    out_idx_buf = nullptr;
-  } else if (pass == 1) {
-    in_buf      = in;
-    in_idx_buf  = in_idx;
-    out_buf     = reinterpret_cast<T*>(bufs);
-    out_idx_buf = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
-  } else if (pass % 2 == 0) {
-    in_buf      = reinterpret_cast<T*>(bufs);
-    in_idx_buf  = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
-    out_buf     = const_cast<T*>(in_buf + buf_len);
-    out_idx_buf = const_cast<IdxT*>(in_idx_buf + buf_len);
-  } else {
-    out_buf     = reinterpret_cast<T*>(bufs);
-    out_idx_buf = reinterpret_cast<IdxT*>(bufs + sizeof(T) * 2 * buf_len);
-    in_buf      = out_buf + buf_len;
-    in_idx_buf  = out_idx_buf + buf_len;
-  }
-}
-
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
+template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool len_or_indptr>
 void radix_topk(const T* in,
                 const IdxT* in_idx,
                 int batch_size,
@@ -850,7 +902,7 @@ void radix_topk(const T* in,
 
   if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
 
-  auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false>;
+  auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false, len_or_indptr>;
   const size_t max_chunk_size =
     calc_chunk_size<T, IdxT, BlockSize>(batch_size, len, sm_cnt, kernel, false);
   if (max_chunk_size != static_cast<size_t>(batch_size)) {
@@ -862,55 +914,33 @@ void radix_topk(const T* in,
 
   rmm::device_uvector<Counter<T, IdxT>> counters(max_chunk_size, stream, mr);
   rmm::device_uvector<IdxT> histograms(max_chunk_size * num_buckets, stream, mr);
-  rmm::device_uvector<T> buf1(max_chunk_size * buf_len, stream, mr);
-  rmm::device_uvector<IdxT> idx_buf1(max_chunk_size * buf_len, stream, mr);
-  rmm::device_uvector<T> buf2(max_chunk_size * buf_len, stream, mr);
-  rmm::device_uvector<IdxT> idx_buf2(max_chunk_size * buf_len, stream, mr);
+
+  rmm::device_uvector<char> bufs(
+    max_chunk_size * buf_len * 2 * (sizeof(T) + sizeof(IdxT)), stream, mr);
 
   for (size_t offset = 0; offset < static_cast<size_t>(batch_size); offset += max_chunk_size) {
     int chunk_size = std::min(max_chunk_size, batch_size - offset);
     RAFT_CUDA_TRY(
       cudaMemsetAsync(counters.data(), 0, counters.size() * sizeof(Counter<T, IdxT>), stream));
     RAFT_CUDA_TRY(cudaMemsetAsync(histograms.data(), 0, histograms.size() * sizeof(IdxT), stream));
-    auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false>;
+    auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false, len_or_indptr>;
 
-    const T* chunk_in        = in + offset * len;
-    const IdxT* chunk_in_idx = in_idx ? (in_idx + offset * len) : nullptr;
-    T* chunk_out             = out + offset * k;
-    IdxT* chunk_out_idx      = out_idx + offset * k;
-    const IdxT* chunk_len_i  = len_i ? (len_i + offset) : nullptr;
-
-    const T* in_buf        = nullptr;
-    const IdxT* in_idx_buf = nullptr;
-    T* out_buf             = nullptr;
-    IdxT* out_idx_buf      = nullptr;
+    T* chunk_out            = out + offset * k;
+    IdxT* chunk_out_idx     = out_idx + offset * k;
+    const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr;
 
     dim3 blocks(grid_dim, chunk_size);
     constexpr int num_passes = calc_num_passes<T, BitsPerPass>();
 
     for (int pass = 0; pass < num_passes; ++pass) {
-      set_buf_pointers(chunk_in,
-                       chunk_in_idx,
-                       buf1.data(),
-                       idx_buf1.data(),
-                       buf2.data(),
-                       idx_buf2.data(),
-                       pass,
-                       in_buf,
-                       in_idx_buf,
-                       out_buf,
-                       out_idx_buf);
-
       if (fused_last_filter && pass == num_passes - 1) {
-        kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, true>;
+        kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, true, len_or_indptr>;
       }
 
-      kernel<<<blocks, BlockSize, 0, stream>>>(chunk_in,
-                                               chunk_in_idx,
-                                               in_buf,
-                                               in_idx_buf,
-                                               out_buf,
-                                               out_idx_buf,
+      kernel<<<blocks, BlockSize, 0, stream>>>(in,
+                                               in_idx,
+                                               bufs.data(),
+                                               offset,
                                                chunk_out,
                                                chunk_out_idx,
                                                counters.data(),
@@ -924,16 +954,18 @@ void radix_topk(const T* in,
     }
 
     if (!fused_last_filter) {
-      last_filter_kernel<T, IdxT, BitsPerPass><<<blocks, BlockSize, 0, stream>>>(chunk_in,
-                                                                                 chunk_in_idx,
-                                                                                 out_buf,
-                                                                                 out_idx_buf,
-                                                                                 chunk_out,
-                                                                                 chunk_out_idx,
-                                                                                 len,
-                                                                                 k,
-                                                                                 counters.data(),
-                                                                                 select_min);
+      last_filter_kernel<T, IdxT, BitsPerPass, len_or_indptr>
+        <<<blocks, BlockSize, 0, stream>>>(in,
+                                           in_idx,
+                                           bufs.data(),
+                                           offset,
+                                           chunk_out,
+                                           chunk_out_idx,
+                                           len,
+                                           chunk_len_i,
+                                           k,
+                                           counters.data(),
+                                           select_min);
       RAFT_CUDA_TRY(cudaPeekAtLastError());
     }
   }
@@ -1015,7 +1047,7 @@ _RAFT_DEVICE void filter_and_histogram_for_one_block(const T* in_buf,
   }
 }
 
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
+template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool len_or_indptr>
 RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
                                         const IdxT* in_idx,
                                         const IdxT len,
@@ -1024,30 +1056,48 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
                                         T* out,
                                         IdxT* out_idx,
                                         const bool select_min,
-                                        char* bufs)
+                                        char* bufs,
+                                        size_t offset)
 {
   constexpr int num_buckets = calc_num_buckets<BitsPerPass>();
   __shared__ Counter<T, IdxT> counter;
   __shared__ IdxT histogram[num_buckets];
 
+  const size_t batch_id = blockIdx.x;  // size_t to avoid multiplication overflow
+
+  IdxT l_len      = len;
+  size_t l_offset = (offset + batch_id) * len;  // size_t: the product may exceed 32-bit IdxT
+  if constexpr (!len_or_indptr) {
+    l_offset = len_i[batch_id];
+    l_len    = len_i[batch_id + 1] - len_i[batch_id];
+  }
+
   if (threadIdx.x == 0) {
     counter.k              = k;
-    counter.len            = len;
-    counter.previous_len   = len;
+    counter.len            = l_len;
+    counter.previous_len   = l_len;
     counter.kth_value_bits = 0;
     counter.out_cnt        = 0;
     counter.out_back_cnt   = 0;
   }
   __syncthreads();
 
-  const size_t batch_id = blockIdx.x;  // size_t to avoid multiplication overflow
-  in += batch_id * len;
-  if (in_idx) { in_idx += batch_id * len; }
+  in += l_offset;
+  if (in_idx) { in_idx += l_offset; }
   out += batch_id * k;
   out_idx += batch_id * k;
   const IdxT buf_len = calc_buf_len<T, IdxT, unsigned>(len);
   bufs += batch_id * buf_len * 2 * (sizeof(T) + sizeof(IdxT));
 
+  if constexpr (!len_or_indptr) {
+    if (l_len <= k) {
+      copy_in_val(out, in, l_len, k, select_min);
+      copy_in_idx(out_idx, in_idx, l_len);
+      __syncthreads();
+      return;
+    }
+  }
+
   constexpr int num_passes = calc_num_passes<T, BitsPerPass>();
   for (int pass = 0; pass < num_passes; ++pass) {
     const T* in_buf;
@@ -1073,7 +1123,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
     // in case we have individual len for each query defined we want to make sure
     // that we only iterate valid elements.
     if (len_i != nullptr) {
-      const IdxT max_len = max(len_i[batch_id], k);
+      const IdxT max_len = max(l_len, k);
       if (max_len < previous_len) previous_len = max_len;
     }
 
@@ -1102,7 +1152,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
                                         out_buf ? out_idx_buf : in_idx,
                                         out,
                                         out_idx,
-                                        out_buf ? current_len : len,
+                                        out_buf ? current_len : l_len,
                                         k,
                                         &counter,
                                         select_min,
@@ -1117,7 +1167,7 @@ RAFT_KERNEL radix_topk_one_block_kernel(const T* in,
 // counters and global histograms, can be kept in shared memory and cheap sync operations can be
 // used. It's used when len is relatively small or when the number of blocks per row calculated by
 // `calc_grid_dim()` is 1.
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
+template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool len_or_indptr>
 void radix_topk_one_block(const T* in,
                           const IdxT* in_idx,
                           int batch_size,
@@ -1133,7 +1183,7 @@ void radix_topk_one_block(const T* in,
 {
   static_assert(calc_num_passes<T, BitsPerPass>() > 1);
 
-  auto kernel        = radix_topk_one_block_kernel<T, IdxT, BitsPerPass, BlockSize>;
+  auto kernel        = radix_topk_one_block_kernel<T, IdxT, BitsPerPass, BlockSize, len_or_indptr>;
   const IdxT buf_len = calc_buf_len<T, IdxT, unsigned>(len);
   const size_t max_chunk_size =
     calc_chunk_size<T, IdxT, BlockSize>(batch_size, len, sm_cnt, kernel, true);
@@ -1144,15 +1194,16 @@ void radix_topk_one_block(const T* in,
   for (size_t offset = 0; offset < static_cast<size_t>(batch_size); offset += max_chunk_size) {
     int chunk_size          = std::min(max_chunk_size, batch_size - offset);
     const IdxT* chunk_len_i = len_i ? (len_i + offset) : nullptr;
-    kernel<<<chunk_size, BlockSize, 0, stream>>>(in + offset * len,
-                                                 in_idx ? (in_idx + offset * len) : nullptr,
+    kernel<<<chunk_size, BlockSize, 0, stream>>>(in,
+                                                 in_idx,
                                                  len,
                                                  chunk_len_i,
                                                  k,
                                                  out + offset * k,
                                                  out_idx + offset * k,
                                                  select_min,
-                                                 bufs.data());
+                                                 bufs.data(),
+                                                 offset);
   }
 }
 
@@ -1182,6 +1233,10 @@ void radix_topk_one_block(const T* in,
  *   it affects the number of passes and number of buckets.
  * @tparam BlockSize
  *   Number of threads in a kernel thread block.
+ * @tparam len_or_indptr
+ *   Flag to interpret `len_i` as either direct row lengths (true) or CSR format
+ *   index pointers (false). When true, each `len_i` element denotes the length of a row. When
+ *   false, `len_i` represents the index pointers for a CSR matrix with shape of `batch_size + 1`.
  *
  * @param[in] res container of reusable resources
  * @param[in] in
@@ -1212,9 +1267,12 @@ void radix_topk_one_block(const T* in,
  *   same. That is, when the value range of input data is narrow. In such case, there could be a
  *   large number of inputs for the last filter, hence using multiple thread blocks is beneficial.
  * @param len_i
- *   optional array of size (batch_size) providing lengths for each individual row
+ *   Optional array whose meaning depends on `len_or_indptr`:
+ *   When `len_or_indptr` is true, `len_i` holds the length of each row; its size is `batch_size`.
+ *   When `len_or_indptr` is false, `len_i` acts as the indptr of a CSR matrix: the length of row
+ *   `row_id` is (`len_i[row_id + 1] - len_i[row_id]`), and `len_i` has size `batch_size + 1`.
  */
-template <typename T, typename IdxT, int BitsPerPass, int BlockSize>
+template <typename T, typename IdxT, int BitsPerPass, int BlockSize, bool len_or_indptr = true>
 void select_k(raft::resources const& res,
               const T* in,
               const IdxT* in_idx,
@@ -1227,9 +1285,12 @@ void select_k(raft::resources const& res,
               bool fused_last_filter,
               const IdxT* len_i)
 {
+  RAFT_EXPECTS(!(!len_or_indptr && (len_i == nullptr)),
+               "When `len_or_indptr` is false, `len_i` must not be nullptr!");
+
   auto stream = resource::get_cuda_stream(res);
   auto mr     = resource::get_workspace_resource(res);
-  if (k == len) {
+  if (k == len && len_or_indptr) {
     RAFT_CUDA_TRY(
       cudaMemcpyAsync(out, in, sizeof(T) * batch_size * len, cudaMemcpyDeviceToDevice, stream));
     if (in_idx) {
@@ -1248,29 +1309,29 @@ void select_k(raft::resources const& res,
   constexpr int items_per_thread = 32;
 
   if (len <= BlockSize * items_per_thread) {
-    impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize>(
+    impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize, len_or_indptr>(
       in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr);
   } else {
     unsigned grid_dim =
       impl::calc_grid_dim<T, IdxT, BitsPerPass, BlockSize>(batch_size, len, sm_cnt);
     if (grid_dim == 1) {
-      impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize>(
+      impl::radix_topk_one_block<T, IdxT, BitsPerPass, BlockSize, len_or_indptr>(
         in, in_idx, batch_size, len, k, out, out_idx, select_min, len_i, sm_cnt, stream, mr);
     } else {
-      impl::radix_topk<T, IdxT, BitsPerPass, BlockSize>(in,
-                                                        in_idx,
-                                                        batch_size,
-                                                        len,
-                                                        k,
-                                                        out,
-                                                        out_idx,
-                                                        select_min,
-                                                        fused_last_filter,
-                                                        len_i,
-                                                        grid_dim,
-                                                        sm_cnt,
-                                                        stream,
-                                                        mr);
+      impl::radix_topk<T, IdxT, BitsPerPass, BlockSize, len_or_indptr>(in,
+                                                                       in_idx,
+                                                                       batch_size,
+                                                                       len,
+                                                                       k,
+                                                                       out,
+                                                                       out_idx,
+                                                                       select_min,
+                                                                       fused_last_filter,
+                                                                       len_i,
+                                                                       grid_dim,
+                                                                       sm_cnt,
+                                                                       stream,
+                                                                       mr);
     }
   }
 }
diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh
index 572558153d..2cb32585d5 100644
--- a/cpp/include/raft/matrix/detail/select_warpsort.cuh
+++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh
@@ -754,22 +754,32 @@ template <template <int, bool, typename, typename> class WarpSortClass,
           bool Ascending,
           typename T,
           typename IdxT>
-__launch_bounds__(256) RAFT_KERNEL
-  block_kernel(const T* in, const IdxT* in_idx, IdxT len, int k, T* out, IdxT* out_idx)
+__launch_bounds__(256) RAFT_KERNEL block_kernel(const T* in,
+                                                const IdxT* in_idx,
+                                                const IdxT* in_indptr,
+                                                size_t offset,
+                                                IdxT len,
+                                                int k,
+                                                T* out,
+                                                IdxT* out_idx)
 {
   extern __shared__ __align__(256) uint8_t smem_buf_bytes[];
   using bq_t         = block_sort<WarpSortClass, Capacity, Ascending, T, IdxT>;
   uint8_t* warp_smem = bq_t::queue_t::mem_required(blockDim.x) > 0 ? smem_buf_bytes : nullptr;
   bq_t queue(k, warp_smem);
+  const size_t batch_id = blockIdx.y;
 
-  in += blockIdx.y * len;
-  if (in_idx != nullptr) { in_idx += blockIdx.y * len; }
+  const IdxT l_len    = in_indptr ? (in_indptr[batch_id + 1] - in_indptr[batch_id]) : len;
+  const IdxT l_offset = in_indptr ? in_indptr[batch_id] : (offset + batch_id) * len;
+
+  in += l_offset;
+  if (in_idx != nullptr) { in_idx += l_offset; }
 
   const IdxT stride         = gridDim.x * blockDim.x;
-  const IdxT per_thread_lim = len + laneId();
+  const IdxT per_thread_lim = l_len + laneId();
   for (IdxT i = threadIdx.x + blockIdx.x * blockDim.x; i < per_thread_lim; i += stride) {
-    queue.add(i < len ? __ldcs(in + i) : WarpSortClass<Capacity, Ascending, T, IdxT>::kDummy,
-              (i < len && in_idx != nullptr) ? __ldcs(in_idx + i) : i);
+    queue.add(i < l_len ? __ldcs(in + i) : WarpSortClass<Capacity, Ascending, T, IdxT>::kDummy,
+              (i < l_len && in_idx != nullptr) ? __ldcs(in_idx + i) : i);
   }
 
   queue.done(smem_buf_bytes);
@@ -832,6 +842,7 @@ struct launch_setup {
                      int smem_size,
                      const T* in_key,
                      const IdxT* in_idx,
+                     const IdxT* in_indptr,
                      T* out_key,
                      IdxT* out_idx,
                      rmm::cuda_stream_view stream)
@@ -848,6 +859,7 @@ struct launch_setup {
                                                                           smem_size,
                                                                           in_key,
                                                                           in_idx,
+                                                                          in_indptr,
                                                                           out_key,
                                                                           out_idx,
                                                                           stream);
@@ -858,21 +870,23 @@ struct launch_setup {
     // This is less than cuda's max block dim along Y axis (65535), but it's a
     // power-of-two, which ensures the alignment of batches in memory.
     constexpr size_t kMaxGridDimY = 32768;
+    size_t g_offset               = 0;
     for (size_t offset = 0; offset < batch_size; offset += kMaxGridDimY) {
       size_t batch_chunk = std::min<size_t>(kMaxGridDimY, batch_size - offset);
       dim3 gs(num_blocks, batch_chunk, 1);
       if (select_min) {
-        block_kernel<WarpSortClass, Capacity, true, T, IdxT>
-          <<<gs, block_dim, smem_size, stream>>>(in_key, in_idx, IdxT(len), k, out_key, out_idx);
+        block_kernel<WarpSortClass, Capacity, true, T, IdxT><<<gs, block_dim, smem_size, stream>>>(
+          in_key, in_idx, in_indptr, g_offset, IdxT(len), k, out_key, out_idx);
       } else {
-        block_kernel<WarpSortClass, Capacity, false, T, IdxT>
-          <<<gs, block_dim, smem_size, stream>>>(in_key, in_idx, IdxT(len), k, out_key, out_idx);
+        block_kernel<WarpSortClass, Capacity, false, T, IdxT><<<gs, block_dim, smem_size, stream>>>(
+          in_key, in_idx, in_indptr, g_offset, IdxT(len), k, out_key, out_idx);
       }
       RAFT_CUDA_TRY(cudaPeekAtLastError());
       out_key += batch_chunk * num_blocks * k;
       out_idx += batch_chunk * num_blocks * k;
-      in_key += batch_chunk * len;
-      if (in_idx != nullptr) { in_idx += batch_chunk * len; }
+
+      if (in_indptr != nullptr) { in_indptr += batch_chunk; };
+      g_offset += batch_chunk;
     }
   }
 };
@@ -1010,6 +1024,7 @@ void select_k_(int num_of_block,
                int num_of_warp,
                const T* in,
                const IdxT* in_idx,
+               const IdxT* in_indptr,
                size_t batch_size,
                size_t len,
                int k,
@@ -1041,6 +1056,7 @@ void select_k_(int num_of_block,
                                                smem_size,
                                                in,
                                                in_idx,
+                                               in_indptr,
                                                result_val,
                                                result_idx,
                                                stream);
@@ -1056,6 +1072,7 @@ void select_k_(int num_of_block,
                                                  smem_size,
                                                  tmp_val.data(),
                                                  tmp_idx.data(),
+                                                 nullptr,
                                                  out,
                                                  out_idx,
                                                  stream);
@@ -1071,7 +1088,8 @@ void select_k_impl(raft::resources const& res,
                    int k,
                    T* out,
                    IdxT* out_idx,
-                   bool select_min)
+                   bool select_min,
+                   const IdxT* in_indptr = nullptr)
 {
   int num_of_block = 0;
   int num_of_warp  = 0;
@@ -1082,6 +1100,7 @@ void select_k_impl(raft::resources const& res,
                                     num_of_warp,
                                     in,
                                     in_idx,
+                                    in_indptr,
                                     batch_size,
                                     len,
                                     k,
@@ -1126,6 +1145,9 @@ void select_k_impl(raft::resources const& res,
  *   the payload selected together with `out`.
  * @param select_min
  *   whether to select k smallest (true) or largest (false) keys.
+ * @param[in] in_indptr
+ *   Optional CSR indptr array; row `i` spans `in_indptr[i]` to `in_indptr[i + 1]` of the input.
+ *   Defaults to `nullptr`, in which case every row has the fixed length @p len.
  */
 template <typename T, typename IdxT>
 void select_k(raft::resources const& res,
@@ -1136,7 +1158,8 @@ void select_k(raft::resources const& res,
               int k,
               T* out,
               IdxT* out_idx,
-              bool select_min)
+              bool select_min,
+              const IdxT* in_indptr = nullptr)
 {
   ASSERT(k <= kMaxCapacity, "Current max k is %d (requested %d)", kMaxCapacity, k);
   ASSERT(len <= size_t(std::numeric_limits<IdxT>::max()),
@@ -1155,6 +1178,7 @@ void select_k(raft::resources const& res,
                                             num_of_warp,
                                             in,
                                             in_idx,
+                                            in_indptr,
                                             batch_size,
                                             len,
                                             k,
@@ -1170,6 +1194,7 @@ void select_k(raft::resources const& res,
                                            num_of_warp,
                                            in,
                                            in_idx,
+                                           in_indptr,
                                            batch_size,
                                            len,
                                            k,
diff --git a/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh b/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
new file mode 100644
index 0000000000..08bdfa6f30
--- /dev/null
+++ b/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_resources.hpp>
+#include <raft/matrix/select_k_types.hpp>
+#include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
+
+#include <rmm/cuda_stream_view.hpp>                  // rmm:cuda_stream_view
+#include <rmm/mr/device/device_memory_resource.hpp>  // rmm::mr::device_memory_resource
+
+#include <cuda_fp16.h>  // __half
+
+#include <cstdint>  // uint32_t
+
+#ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
+
+namespace raft::sparse::matrix::detail {
+
+template <typename T, typename IdxT>  // declaration only; the definition is in select_k-inl.cuh
+void select_k(raft::resources const& handle,
+              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
+              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
+              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
+              bool select_min,
+              bool sorted                   = false,
+              raft::matrix::SelectAlgo algo = raft::matrix::SelectAlgo::kAuto) RAFT_EXPLICIT;
+}  // namespace raft::sparse::matrix::detail
+
+#endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  extern template void raft::sparse::matrix::detail::select_k(        \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(__half, uint32_t);
+instantiate_raft_sparse_matrix_detail_select_k(__half, int64_t);
+instantiate_raft_sparse_matrix_detail_select_k(float, int64_t);
+instantiate_raft_sparse_matrix_detail_select_k(float, uint32_t);
+instantiate_raft_sparse_matrix_detail_select_k(float, int);
+instantiate_raft_sparse_matrix_detail_select_k(double, int64_t);
+instantiate_raft_sparse_matrix_detail_select_k(double, uint32_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/include/raft/sparse/matrix/detail/select_k-inl.cuh b/cpp/include/raft/sparse/matrix/detail/select_k-inl.cuh
new file mode 100644
index 0000000000..5f39affce6
--- /dev/null
+++ b/cpp/include/raft/sparse/matrix/detail/select_k-inl.cuh
@@ -0,0 +1,225 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/nvtx.hpp>
+#include <raft/core/operators.hpp>
+#include <raft/core/resource/device_memory_resource.hpp>
+#include <raft/linalg/map.cuh>
+#include <raft/matrix/detail/select_k-inl.cuh>
+#include <raft/matrix/select_k_types.hpp>
+
+#include <cub/cub.cuh>
+
+#include <type_traits>
+
+namespace raft::sparse::matrix::detail {
+
+using namespace raft::matrix::detail;
+using raft::matrix::SelectAlgo;
+
+/**
+ * Selects the k smallest or largest keys/values from each row of the input CSR matrix.
+ *
+ * This function operates on a CSR matrix `in_val` with a logical dense shape of [batch_size, len],
+ * selecting the k smallest or largest elements from each row. The selected elements are then stored
+ * in a row-major output matrix `out_val` with dimensions `batch_size` x k.
+ * If `in_val` holds no non-zero elements, the function returns immediately and leaves `out_val`
+ * and `out_idx` untouched.
+ *
+ * @tparam T
+ *   Type of the elements being compared (keys).
+ * @tparam IdxT
+ *   Type of the indices associated with the keys.
+ *
+ * @param[in] handle
+ *   Container for managing reusable resources.
+ * @param[in] in_val
+ *   Input matrix in CSR format with a logical dense shape of [batch_size, len],
+ *   containing the elements to be compared and selected.
+ * @param[in] in_idx
+ *   Optional input indices [in_val.nnz] associated with `in_val.values`.
+ *   If `in_idx` is `std::nullopt`, the column indices of `in_val` are used instead.
+ * @param[out] out_val
+ *   Output matrix [batch_size, k] storing the selected k smallest/largest elements
+ *   from each row of `in_val`.
+ * @param[out] out_idx
+ *   Output indices [batch_size, k] corresponding to the selected elements in `out_val`.
+ * @param[in] select_min
+ *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
+ * @param[in] sorted
+ *   whether to make sure selected pairs are sorted by value
+ * @param[in] algo
+ *   the selection algorithm to use
+ */
+template <typename T, typename IdxT>
+void select_k(raft::resources const& handle,
+              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
+              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
+              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
+              bool select_min,
+              bool sorted     = false,
+              SelectAlgo algo = SelectAlgo::kAuto)
+{
+  auto csr_view = in_val.structure_view();
+  auto nnz      = csr_view.get_nnz();
+  if (nnz == 0) return;  // nothing to select from; outputs are left untouched
+
+  auto batch_size = csr_view.get_n_rows();
+  auto len        = csr_view.get_n_cols();
+  auto k          = IdxT(out_val.extent(1));
+
+  common::nvtx::range<common::nvtx::domain::raft> fun_scope(
+    "sparse::matrix::select_k(batch_size = %zu, len = %zu, k = %d)",
+    size_t(batch_size),  // explicit casts: IdxT need not match the %zu / %d specifiers
+    size_t(len),
+    int(k));
+
+  RAFT_EXPECTS(out_val.extent(1) <= int64_t(std::numeric_limits<int>::max()),
+               "output k must fit the int type.");
+
+  RAFT_EXPECTS(batch_size == out_val.extent(0), "batch sizes must be equal");
+  RAFT_EXPECTS(batch_size == out_idx.extent(0), "batch sizes must be equal");
+
+  if (in_idx.has_value()) {
+    RAFT_EXPECTS(size_t(nnz) == in_idx->size(),
+                 "nnz of in_val must be equal to the length of in_idx");
+  }
+  RAFT_EXPECTS(IdxT(k) == out_idx.extent(1), "value and index output lengths must be equal");
+
+  if (algo == SelectAlgo::kAuto) { algo = choose_select_k_algorithm(batch_size, len, k); }
+
+  auto indptr = csr_view.get_indptr().data();
+
+  switch (algo) {
+    case SelectAlgo::kRadix8bits:
+    case SelectAlgo::kRadix11bits:
+    case SelectAlgo::kRadix11bitsExtraPass: {
+      if (algo == SelectAlgo::kRadix8bits) {
+        select::radix::select_k<T, IdxT, 8, 512, false>(
+          handle,
+          in_val.get_elements().data(),
+          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+          batch_size,
+          len,
+          k,
+          out_val.data_handle(),
+          out_idx.data_handle(),
+          select_min,
+          true,  // fused_last_filter
+          indptr);
+      } else {
+        bool fused_last_filter = algo == SelectAlgo::kRadix11bits;
+        select::radix::select_k<T, IdxT, 11, 512, false>(
+          handle,
+          in_val.get_elements().data(),
+          (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+          batch_size,
+          len,
+          k,
+          out_val.data_handle(),
+          out_idx.data_handle(),
+          select_min,
+          fused_last_filter,
+          indptr);
+      }
+
+      if (sorted) {  // explicit per-row sort of the radix results
+        auto offsets = make_device_mdarray<IdxT, IdxT>(
+          handle, resource::get_workspace_resource(handle), make_extents<IdxT>(batch_size + 1));
+        raft::linalg::map_offset(handle, offsets.view(), mul_const_op<IdxT>(k));
+
+        auto keys =
+          raft::make_device_vector_view<T, IdxT>(out_val.data_handle(), (IdxT)(batch_size * k));
+        auto vals =
+          raft::make_device_vector_view<IdxT, IdxT>(out_idx.data_handle(), (IdxT)(batch_size * k));
+
+        segmented_sort_by_key<T, IdxT>(
+          handle, raft::make_const_mdspan(offsets.view()), keys, vals, select_min);
+      }
+
+      return;
+    }
+    case SelectAlgo::kWarpDistributed:
+      return select::warpsort::select_k_impl<T, IdxT, select::warpsort::warp_sort_distributed>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpDistributedShm:
+      return select::warpsort::select_k_impl<T, IdxT, select::warpsort::warp_sort_distributed_ext>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpAuto:
+      return select::warpsort::select_k<T, IdxT>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpImmediate:
+      return select::warpsort::select_k_impl<T, IdxT, select::warpsort::warp_sort_immediate>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    case SelectAlgo::kWarpFiltered:
+      return select::warpsort::select_k_impl<T, IdxT, select::warpsort::warp_sort_filtered>(
+        handle,
+        in_val.get_elements().data(),
+        (in_idx.has_value() ? in_idx->data_handle() : csr_view.get_indices().data()),
+        batch_size,
+        len,
+        k,
+        out_val.data_handle(),
+        out_idx.data_handle(),
+        select_min,
+        indptr);
+    default: RAFT_FAIL("K-selection Algorithm not supported.");
+  }
+}
+
+}  // namespace raft::sparse::matrix::detail
diff --git a/cpp/include/raft/sparse/matrix/detail/select_k.cuh b/cpp/include/raft/sparse/matrix/detail/select_k.cuh
new file mode 100644
index 0000000000..711169984b
--- /dev/null
+++ b/cpp/include/raft/sparse/matrix/detail/select_k.cuh
@@ -0,0 +1,24 @@
+/*
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY
+#include "select_k-inl.cuh"
+#endif
+
+#ifdef RAFT_COMPILED
+#include "select_k-ext.cuh"
+#endif
diff --git a/cpp/include/raft/sparse/matrix/select_k.cuh b/cpp/include/raft/sparse/matrix/select_k.cuh
new file mode 100644
index 0000000000..3f97e60c99
--- /dev/null
+++ b/cpp/include/raft/sparse/matrix/select_k.cuh
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/nvtx.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/select_k_types.hpp>
+#include <raft/sparse/matrix/detail/select_k.cuh>
+
+#include <optional>
+
+namespace raft::sparse::matrix {
+
+using SelectAlgo = raft::matrix::SelectAlgo;
+
+/**
+ * @defgroup select_k Batched-select k smallest or largest key/values
+ * @{
+ */
+
+/**
+ * Selects the k smallest or largest keys/values from each row of the input CSR matrix.
+ *
+ * This function operates on a CSR matrix `in_val` with a logical dense shape of [batch_size, len],
+ * selecting the k smallest or largest elements from each row. The selected elements are then stored
+ * in a row-major output matrix `out_val` with dimensions `batch_size` x k.
+ * If the total number of values in a row is less than k, then the extra positions in the
+ * corresponding row of `out_val` keep their original values. The same applies to `out_idx`.
+ *
+ * @tparam T
+ *   Type of the elements being compared (keys).
+ * @tparam IdxT
+ *   Type of the indices associated with the keys.
+ *
+ * @param[in] handle
+ *   Container for managing reusable resources.
+ * @param[in] in_val
+ *   Input matrix in CSR format with a logical dense shape of [batch_size, len],
+ *   containing the elements to be compared and selected.
+ * @param[in] in_idx
+ *   Optional input indices [in_val.nnz] associated with `in_val.values`.
+ *   If `in_idx` is `std::nullopt`, the column indices of `in_val` are used instead.
+ * @param[out] out_val
+ *   Output matrix [batch_size, k] storing the selected k smallest/largest elements
+ *   from each row of `in_val`.
+ * @param[out] out_idx
+ *   Output indices [batch_size, k] corresponding to the selected elements in `out_val`.
+ * @param[in] select_min
+ *   Flag indicating whether to select the k smallest (true) or largest (false) elements.
+ * @param[in] sorted
+ *   Whether to make sure selected pairs are sorted by value.
+ * @param[in] algo
+ *   The selection algorithm to use.
+ */
+template <typename T, typename IdxT>
+void select_k(raft::resources const& handle,
+              raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,
+              std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx,
+              raft::device_matrix_view<T, IdxT, raft::row_major> out_val,
+              raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
+              bool select_min,
+              bool sorted     = false,
+              SelectAlgo algo = SelectAlgo::kAuto)
+{
+  return detail::select_k<T, IdxT>(
+    handle, in_val, in_idx, out_val, out_idx, select_min, sorted, algo);
+}
+/** @} */  // end of group select_k
+
+}  // namespace raft::sparse::matrix
diff --git a/cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu b/cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu
new file mode 100644
index 0000000000..c784b50dad
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(double, int64_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
\ No newline at end of file
diff --git a/cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu b/cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu
new file mode 100644
index 0000000000..98bab9a504
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#include <cstdint>  // uint32_t
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(double, uint32_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
\ No newline at end of file
diff --git a/cpp/src/sparse/matrix/detail/select_k_float_int32.cu b/cpp/src/sparse/matrix/detail/select_k_float_int32.cu
new file mode 100644
index 0000000000..bff213ae69
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_float_int32.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(float, int);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu b/cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu
new file mode 100644
index 0000000000..412b06e587
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(float, int64_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu b/cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu
new file mode 100644
index 0000000000..8ba3f0e22b
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(float, uint32_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu b/cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu
new file mode 100644
index 0000000000..24c844f8c8
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(__half, int64_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu b/cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu
new file mode 100644
index 0000000000..d63dc64933
--- /dev/null
+++ b/cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <raft/sparse/matrix/detail/select_k-inl.cuh>
+
+#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
+  template void raft::sparse::matrix::detail::select_k(               \
+    raft::resources const& handle,                                    \
+    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
+    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
+    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
+    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
+    bool select_min,                                                  \
+    bool sorted,                                                      \
+    raft::matrix::SelectAlgo algo)
+
+instantiate_raft_sparse_matrix_detail_select_k(__half, uint32_t);
+
+#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 20ed3bacc7..4d17aacffd 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -320,6 +320,7 @@ if(BUILD_TESTS)
     test/sparse/reduce.cu
     test/sparse/row_op.cu
     test/sparse/sddmm.cu
+    test/sparse/select_k_csr.cu
     test/sparse/sort.cu
     test/sparse/spgemmi.cu
     test/sparse/spmm.cu
diff --git a/cpp/test/sparse/select_k_csr.cu b/cpp/test/sparse/select_k_csr.cu
new file mode 100644
index 0000000000..fc1061d7bb
--- /dev/null
+++ b/cpp/test/sparse/select_k_csr.cu
@@ -0,0 +1,398 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "../test_utils.cuh"
+
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/device_mdspan.hpp>
+#include <raft/core/resource/cuda_stream.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/matrix/copy.cuh>
+#include <raft/random/make_blobs.cuh>
+#include <raft/random/rng_state.hpp>
+#include <raft/sparse/matrix/select_k.cuh>
+#include <raft/util/cuda_utils.cuh>
+
+#include <gtest/gtest.h>
+
+#include <algorithm>
+#include <cmath>
+#include <limits>
+#include <optional>
+#include <queue>
+#include <random>
+#include <unordered_set>
+#include <vector>
+
+namespace raft {
+namespace sparse {
+
+template <typename index_t>
+struct SelectKCsrInputs {
+  index_t n_rows;
+  index_t n_cols;
+  index_t top_k;
+  float sparsity;
+  bool select_min;
+  bool customized_indices;
+};
+
+template <typename T>
+struct CompareApproxWithInf {  // approximate equality that also accepts a pair of infinities
+  CompareApproxWithInf(T eps_) : eps(eps_) {}
+  bool operator()(const T& a, const T& b) const
+  {
+    if (std::isinf(a) && std::isinf(b)) return true;  // NOTE(review): sign of inf is not compared
+    T diff  = std::abs(a - b);
+    T m     = std::max(std::abs(a), std::abs(b));
+    T ratio = diff > eps ? diff / m : diff;  // relative error when diff > eps, else absolute
+
+    return (ratio <= eps);
+  }
+
+ private:
+  T eps;  // tolerance applied to the relative or absolute difference
+};
+
+template <typename value_t, typename index_t>
+class SelectKCsrTest : public ::testing::TestWithParam<SelectKCsrInputs<index_t>> {
+ public:
+  SelectKCsrTest()
+    : stream(resource::get_cuda_stream(handle)),
+      params(::testing::TestWithParam<SelectKCsrInputs<index_t>>::GetParam()),
+      indices_d(0, stream),
+      customized_indices_d(0, stream),
+      indptr_d(0, stream),
+      values_d(0, stream),
+      dst_values_d(0, stream),
+      dst_values_expected_d(0, stream),
+      dst_indices_d(0, stream),
+      dst_indices_expected_d(0, stream)
+  {
+  }
+
+ protected:
+  index_t create_sparse_matrix(index_t m, index_t n, value_t sparsity, std::vector<bool>& matrix)
+  {
+    index_t total_elements = static_cast<index_t>(m * n);
+    index_t num_ones       = static_cast<index_t>((total_elements * 1.0f) * sparsity);
+    index_t res            = num_ones;  // number of non-zeros to place, returned to the caller
+
+    for (index_t i = 0; i < total_elements; ++i) {
+      matrix[i] = false;
+    }
+
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<index_t> dis_idx(0, total_elements - 1);  // no int narrowing
+
+    while (num_ones > 0) {  // rejection sampling: retry until an unset cell is hit
+      size_t index = dis_idx(gen);
+      if (matrix[index] == false) {
+        matrix[index] = true;
+        num_ones--;
+      }
+    }
+    return res;
+  }
+
+  void convert_to_csr(std::vector<bool>& matrix,
+                      index_t rows,
+                      index_t cols,
+                      std::vector<index_t>& indices,
+                      std::vector<index_t>& indptr)
+  {
+    index_t offset_indptr   = 0;
+    index_t offset_values   = 0;
+    indptr[offset_indptr++] = 0;
+
+    for (index_t i = 0; i < rows; ++i) {
+      for (index_t j = 0; j < cols; ++j) {
+        if (matrix[i * cols + j]) {
+          indices[offset_values] = static_cast<index_t>(j);
+          offset_values++;
+        }
+      }
+      indptr[offset_indptr++] = static_cast<index_t>(offset_values);
+    }
+  }
+
+  void cpu_select_k(const std::vector<index_t>& indptr_h,  // host reference for CSR select_k
+                    const std::vector<index_t>& indices_h,
+                    const std::vector<value_t>& values_h,
+                    std::optional<std::vector<index_t>>& in_idx_h,
+                    index_t n_rows,
+                    index_t n_cols,
+                    index_t top_k,
+                    std::vector<value_t>& out_values_h,
+                    std::vector<index_t>& out_indices_h,
+                    bool select_min = true)
+  {
+    auto comp = [select_min](const std::pair<value_t, index_t>& a,
+                             const std::pair<value_t, index_t>& b) {
+      return select_min ? a.first < b.first : a.first > b.first;  // strict weak ordering
+    };
+
+    for (index_t row = 0; row < n_rows; ++row) {
+      std::priority_queue<std::pair<value_t, index_t>,
+                          std::vector<std::pair<value_t, index_t>>,
+                          decltype(comp)>
+        pq(comp);
+
+      for (index_t idx = indptr_h[row]; idx < indptr_h[row + 1]; ++idx) {
+        pq.push({values_h[idx], (in_idx_h.has_value()) ? (*in_idx_h)[idx] : indices_h[idx]});
+        if (pq.size() > size_t(top_k)) { pq.pop(); }  // keep at most top_k candidates
+      }
+
+      std::vector<std::pair<value_t, index_t>> row_pairs;
+      while (!pq.empty()) {
+        row_pairs.push_back(pq.top());
+        pq.pop();
+      }
+
+      if (select_min) {
+        std::sort(row_pairs.begin(), row_pairs.end(), [](const auto& a, const auto& b) {
+          return a.first < b.first;  // ascending
+        });
+      } else {
+        std::sort(row_pairs.begin(), row_pairs.end(), [](const auto& a, const auto& b) {
+          return a.first > b.first;  // descending
+        });
+      }
+      for (index_t col = 0; col < top_k; col++) {
+        if (col < index_t(row_pairs.size())) {  // slots past the row's nnz keep their old value
+          out_values_h[row * top_k + col]  = row_pairs[col].first;
+          out_indices_h[row * top_k + col] = row_pairs[col].second;
+        }
+      }
+    }
+  }
+
+  void random_array(value_t* array, size_t size)
+  {
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_real_distribution<value_t> dis(-10.0, 10.0);
+    std::unordered_set<value_t> uset;
+
+    while (uset.size() < size) {
+      uset.insert(dis(gen));
+    }
+    typename std::unordered_set<value_t>::iterator it = uset.begin();
+    for (size_t i = 0; i < size; ++i) {
+      array[i] = *(it++);
+    }
+  }
+
+  template <typename data_t>
+  std::optional<data_t> get_opt_var(data_t x)
+  {
+    if (params.customized_indices) {
+      return x;
+    } else {
+      return std::nullopt;
+    }
+  }
+
+  void SetUp() override  // builds the random CSR input and the host reference result
+  {
+    std::vector<bool> dense_values_h(params.n_rows * params.n_cols, false);
+    nnz = create_sparse_matrix(params.n_rows, params.n_cols, params.sparsity, dense_values_h);
+
+    std::vector<value_t> values_h(nnz);
+    std::vector<index_t> indices_h(nnz);
+    std::vector<index_t> customized_indices_h(nnz);  // NOTE(review): never written, all zeros
+    std::vector<index_t> indptr_h(params.n_rows + 1);
+
+    convert_to_csr(dense_values_h, params.n_rows, params.n_cols, indices_h, indptr_h);
+
+    std::vector<value_t> dst_values_h(params.n_rows * params.top_k,
+                                      std::numeric_limits<value_t>::infinity());  // +inf fill
+    std::vector<index_t> dst_indices_h(params.n_rows * params.top_k, static_cast<index_t>(0));
+
+    dst_values_d.resize(params.n_rows * params.top_k, stream);
+    dst_indices_d.resize(params.n_rows * params.top_k, stream);
+    values_d.resize(nnz, stream);
+
+    update_device(dst_values_d.data(), dst_values_h.data(), dst_values_h.size(), stream);
+    update_device(dst_indices_d.data(), dst_indices_h.data(), dst_indices_h.size(), stream);
+
+    if (params.customized_indices) {
+      customized_indices_d.resize(nnz, stream);
+      update_device(customized_indices_d.data(),
+                    customized_indices_h.data(),
+                    customized_indices_h.size(),
+                    stream);
+    }
+
+    resource::sync_stream(handle);
+
+    if (values_h.size()) {  // skip value generation when the matrix has no non-zeros
+      random_array(values_h.data(), values_h.size());
+      raft::copy(values_d.data(), values_h.data(), values_h.size(), stream);
+      resource::sync_stream(handle);
+    }
+
+    auto optional_indices_h = get_opt_var(customized_indices_h);  // may be std::nullopt
+
+    cpu_select_k(indptr_h,  // fills dst_values_h / dst_indices_h with the expected output
+                 indices_h,
+                 values_h,
+                 optional_indices_h,
+                 params.n_rows,
+                 params.n_cols,
+                 params.top_k,
+                 dst_values_h,
+                 dst_indices_h,
+                 params.select_min);
+
+    indices_d.resize(nnz, stream);
+    indptr_d.resize(params.n_rows + 1, stream);
+
+    dst_values_expected_d.resize(params.n_rows * params.top_k, stream);
+    dst_indices_expected_d.resize(params.n_rows * params.top_k, stream);
+
+    update_device(values_d.data(), values_h.data(), values_h.size(), stream);  // second copy
+    update_device(indices_d.data(), indices_h.data(), indices_h.size(), stream);
+    update_device(indptr_d.data(), indptr_h.data(), indptr_h.size(), stream);
+    update_device(dst_values_expected_d.data(), dst_values_h.data(), dst_values_h.size(), stream);
+    update_device(
+      dst_indices_expected_d.data(), dst_indices_h.data(), dst_indices_h.size(), stream);
+
+    resource::sync_stream(handle);
+  }
+
+  void Run()
+  {
+    auto in_val_structure = raft::make_device_compressed_structure_view<index_t, index_t, index_t>(
+      indptr_d.data(),
+      indices_d.data(),
+      params.n_rows,
+      params.n_cols,
+      static_cast<index_t>(indices_d.size()));
+
+    auto in_val =
+      raft::make_device_csr_matrix_view<const value_t>(values_d.data(), in_val_structure);
+
+    std::optional<raft::device_vector_view<const index_t, index_t>> in_idx;
+
+    in_idx = get_opt_var(
+      raft::make_device_vector_view<const index_t, index_t>(customized_indices_d.data(), nnz));
+
+    auto out_val = raft::make_device_matrix_view<value_t, index_t, raft::row_major>(
+      dst_values_d.data(), params.n_rows, params.top_k);
+    auto out_idx = raft::make_device_matrix_view<index_t, index_t, raft::row_major>(
+      dst_indices_d.data(), params.n_rows, params.top_k);
+
+    raft::sparse::matrix::select_k(
+      handle, in_val, in_idx, out_val, out_idx, params.select_min, true);
+
+    ASSERT_TRUE(raft::devArrMatch<index_t>(dst_indices_expected_d.data(),
+                                           out_idx.data_handle(),
+                                           params.n_rows * params.top_k,
+                                           raft::Compare<index_t>(),
+                                           stream));
+
+    ASSERT_TRUE(raft::devArrMatch<value_t>(dst_values_expected_d.data(),
+                                           out_val.data_handle(),
+                                           params.n_rows * params.top_k,
+                                           CompareApproxWithInf<value_t>(1e-6f),
+                                           stream));
+  }
+
+ protected:
+  raft::resources handle;
+  cudaStream_t stream;
+
+  SelectKCsrInputs<index_t> params;
+
+  index_t nnz;
+
+  rmm::device_uvector<value_t> values_d;
+  rmm::device_uvector<index_t> indptr_d;
+  rmm::device_uvector<index_t> indices_d;
+  rmm::device_uvector<index_t> customized_indices_d;
+
+  rmm::device_uvector<value_t> dst_values_d;
+  rmm::device_uvector<value_t> dst_values_expected_d;
+
+  rmm::device_uvector<index_t> dst_indices_d;
+  rmm::device_uvector<index_t> dst_indices_expected_d;
+};
+
+using SelectKCsrTest_float_int = SelectKCsrTest<float, int>;
+TEST_P(SelectKCsrTest_float_int, Result) { Run(); }
+
+using SelectKCsrTest_double_int64 = SelectKCsrTest<double, int64_t>;
+TEST_P(SelectKCsrTest_double_int64, Result) { Run(); }
+
+template <typename index_t>
+const std::vector<SelectKCsrInputs<index_t>> selectk_inputs = {
+  {10, 32, 10, 0.0, true, false},
+  {10, 32, 10, 0.0, true, true},
+  {10, 32, 10, 0.01, true, false},  // kWarpImmediate
+  {10, 32, 10, 0.1, true, true},
+  {10, 32, 251, 0.1, true, false},  // kWarpImmediate
+  {10, 32, 251, 0.6, true, true},
+  {1000, 1024 * 100, 1, 0.1, true, false},  // kWarpImmediate
+  {1000, 1024 * 100, 1, 0.2, true, true},
+  {1024, 1024, 258, 0.3, true, false},  // kRadix11bitsExtraPass
+  {1024, 1024, 600, 0.2, true, true},
+  {1024, 1024, 1024, 0.3, true, false},  // kRadix11bitsExtraPass
+  {1024, 1024, 1024, 0.2, true, true},
+  {100, 1024 * 1000, 251, 0.1, true, false},  // kWarpDistributedShm
+  {100, 1024 * 1000, 251, 0.2, true, true},
+  {1024, 1024 * 10, 251, 0.3, true, false},  // kWarpImmediate
+  {1024, 1024 * 10, 251, 0.2, true, true},
+  {1000, 1024 * 20, 1000, 0.2, true, false},  // kRadix11bits
+  {1000, 1024 * 20, 1000, 0.3, true, true},
+  {2048, 1024 * 10, 1000, 0.2, true, false},  // kRadix11bitsExtraPass
+  {2048, 1024 * 10, 1000, 0.3, true, true},
+  {2048, 1024 * 10, 2100, 0.1, true, false},  // kRadix11bitsExtraPass
+  {2048, 1024 * 10, 2100, 0.2, true, true},
+  {10, 32, 10, 0.0, false, false},
+  {10, 32, 10, 0.0, false, true},
+  {10, 32, 10, 0.01, false, false},  // kWarpImmediate
+  {10, 32, 10, 0.1, false, true},
+  {10, 32, 251, 0.1, false, false},  // kWarpImmediate
+  {10, 32, 251, 0.6, false, true},
+  {1000, 1024 * 100, 1, 0.1, false, false},  // kWarpImmediate
+  {1000, 1024 * 100, 1, 0.2, false, true},
+  {1024, 1024, 258, 0.3, false, false},  // kRadix11bitsExtraPass
+  {1024, 1024, 600, 0.2, false, true},
+  {1024, 1024, 1024, 0.3, false, false},  // kRadix11bitsExtraPass
+  {1024, 1024, 1024, 0.2, false, true},
+  {100, 1024 * 1000, 251, 0.1, false, false},  // kWarpDistributedShm
+  {100, 1024 * 1000, 251, 0.2, false, true},
+  {1024, 1024 * 10, 251, 0.3, false, false},  // kWarpImmediate
+  {1024, 1024 * 10, 251, 0.2, false, true},
+  {1000, 1024 * 20, 1000, 0.2, false, false},  // kRadix11bits
+  {1000, 1024 * 20, 1000, 0.3, false, true},
+  {2048, 1024 * 10, 1000, 0.2, false, false},  // kRadix11bitsExtraPass
+  {2048, 1024 * 10, 1000, 0.3, false, true},
+  {2048, 1024 * 10, 2100, 0.1, false, false},  // kRadix11bitsExtraPass
+  {2048, 1024 * 10, 2100, 0.2, false, true}};
+
+INSTANTIATE_TEST_CASE_P(SelectKCsrTest,
+                        SelectKCsrTest_float_int,
+                        ::testing::ValuesIn(selectk_inputs<int>));
+INSTANTIATE_TEST_CASE_P(SelectKCsrTest,
+                        SelectKCsrTest_double_int64,
+                        ::testing::ValuesIn(selectk_inputs<int64_t>));
+
+}  // namespace sparse
+}  // namespace raft

From 139bfd960cf52d1bea36560c543001a2cd948cf7 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 10 Apr 2024 11:18:35 -0500
Subject: [PATCH 07/60] Enable all tests for `arm64` jobs (#2248)

* Enable all pytests for arm jobs

* drop files

* Update test_wheel_pylibraft.sh

* Update test_wheel_raft_dask.sh
---
 ci/test_wheel_pylibraft.sh       |   9 +--
 ci/test_wheel_raft_dask.sh       |   9 +--
 ci/wheel_smoke_test_pylibraft.py |  53 ---------------
 ci/wheel_smoke_test_raft_dask.py | 107 -------------------------------
 4 files changed, 4 insertions(+), 174 deletions(-)
 delete mode 100644 ci/wheel_smoke_test_pylibraft.py
 delete mode 100644 ci/wheel_smoke_test_raft_dask.py

diff --git a/ci/test_wheel_pylibraft.sh b/ci/test_wheel_pylibraft.sh
index d990a0e6c2..b38f5a690b 100755
--- a/ci/test_wheel_pylibraft.sh
+++ b/ci/test_wheel_pylibraft.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -10,9 +10,4 @@ RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/pylibraft*.whl)[test]
 
-# Run smoke tests for aarch64 pull requests
-if [[ "$(arch)" == "aarch64" && "${RAPIDS_BUILD_TYPE}" == "pull-request" ]]; then
-    python ./ci/wheel_smoke_test_pylibraft.py
-else
-    python -m pytest ./python/pylibraft/pylibraft/test
-fi
+python -m pytest ./python/pylibraft/pylibraft/test
diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh
index b70563b7a1..76bb62e859 100755
--- a/ci/test_wheel_raft_dask.sh
+++ b/ci/test_wheel_raft_dask.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -14,9 +14,4 @@ python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/raft_dask*.whl)[test]
 
-# Run smoke tests for aarch64 pull requests
-if [[ "$(arch)" == "aarch64" && "${RAPIDS_BUILD_TYPE}" == "pull-request" ]]; then
-    python ./ci/wheel_smoke_test_raft_dask.py
-else
-    python -m pytest ./python/raft-dask/raft_dask/test
-fi
+python -m pytest ./python/raft-dask/raft_dask/test
diff --git a/ci/wheel_smoke_test_pylibraft.py b/ci/wheel_smoke_test_pylibraft.py
deleted file mode 100644
index c0df2fe45c..0000000000
--- a/ci/wheel_smoke_test_pylibraft.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import numpy as np
-from scipy.spatial.distance import cdist
-
-from pylibraft.common import Handle, Stream, device_ndarray
-from pylibraft.distance import pairwise_distance
-
-
-if __name__ == "__main__":
-    metric = "euclidean"
-    n_rows = 1337
-    n_cols = 1337
-
-    input1 = np.random.random_sample((n_rows, n_cols))
-    input1 = np.asarray(input1, order="C").astype(np.float64)
-
-    output = np.zeros((n_rows, n_rows), dtype=np.float64)
-
-    expected = cdist(input1, input1, metric)
-
-    expected[expected <= 1e-5] = 0.0
-
-    input1_device = device_ndarray(input1)
-    output_device = None
-
-    s2 = Stream()
-    handle = Handle(stream=s2)
-    ret_output = pairwise_distance(
-        input1_device, input1_device, output_device, metric, handle=handle
-    )
-    handle.sync()
-
-    output_device = ret_output
-
-    actual = output_device.copy_to_host()
-
-    actual[actual <= 1e-5] = 0.0
-
-    assert np.allclose(expected, actual, rtol=1e-4)
diff --git a/ci/wheel_smoke_test_raft_dask.py b/ci/wheel_smoke_test_raft_dask.py
deleted file mode 100644
index 5709ac901c..0000000000
--- a/ci/wheel_smoke_test_raft_dask.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from dask.distributed import Client, get_worker, wait
-from dask_cuda import LocalCUDACluster, initialize
-
-from raft_dask.common import (
-    Comms,
-    local_handle,
-    perform_test_comm_split,
-    perform_test_comms_allgather,
-    perform_test_comms_allreduce,
-    perform_test_comms_bcast,
-    perform_test_comms_device_multicast_sendrecv,
-    perform_test_comms_device_send_or_recv,
-    perform_test_comms_device_sendrecv,
-    perform_test_comms_gather,
-    perform_test_comms_gatherv,
-    perform_test_comms_reduce,
-    perform_test_comms_reducescatter,
-    perform_test_comms_send_recv,
-)
-
-import os
-os.environ["UCX_LOG_LEVEL"] = "error"
-
-
-def func_test_send_recv(sessionId, n_trials):
-    handle = local_handle(sessionId, dask_worker=get_worker())
-    return perform_test_comms_send_recv(handle, n_trials)
-
-
-def func_test_collective(func, sessionId, root):
-    handle = local_handle(sessionId, dask_worker=get_worker())
-    return func(handle, root)
-
-
-if __name__ == "__main__":
-    # initial setup
-    cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0)
-    client = Client(cluster)
-
-    n_trials = 5
-    root_location = "client"
-
-    # p2p test for ucx
-    cb = Comms(comms_p2p=True, verbose=True)
-    cb.init()
-
-    dfs = [
-        client.submit(
-            func_test_send_recv,
-            cb.sessionId,
-            n_trials,
-            pure=False,
-            workers=[w],
-        )
-        for w in cb.worker_addresses
-    ]
-
-    wait(dfs, timeout=5)
-
-    assert list(map(lambda x: x.result(), dfs))
-
-    cb.destroy()
-
-    # collectives test for nccl
-
-    cb = Comms(
-        verbose=True, client=client, nccl_root_location=root_location
-    )
-    cb.init()
-
-    for k, v in cb.worker_info(cb.worker_addresses).items():
-
-        dfs = [
-            client.submit(
-                func_test_collective,
-                perform_test_comms_allgather,
-                cb.sessionId,
-                v["rank"],
-                pure=False,
-                workers=[w],
-            )
-            for w in cb.worker_addresses
-        ]
-        wait(dfs, timeout=5)
-
-        assert all([x.result() for x in dfs])
-
-    cb.destroy()
-
-    # final client and cluster teardown
-    client.close()
-    cluster.close()

From 316a06583717c0228a9cc66557c5bed16332365d Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Wed, 10 Apr 2024 14:22:34 -0400
Subject: [PATCH 08/60] Correct member initialization order (#2254)

Class members are initialized in the order in which they are declared in the class, so the constructor initializer lists should follow that same order. This corrects the ordering for the CAGRA index type, which was found to be wrong via https://github.com/rapidsai/cuvs/pull/66.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2254
---
 cpp/include/raft/neighbors/cagra_types.hpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/include/raft/neighbors/cagra_types.hpp b/cpp/include/raft/neighbors/cagra_types.hpp
index 807f89fd65..97c9c0d098 100644
--- a/cpp/include/raft/neighbors/cagra_types.hpp
+++ b/cpp/include/raft/neighbors/cagra_types.hpp
@@ -205,8 +205,8 @@ struct index : ann::index {
         raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded)
     : ann::index(),
       metric_(metric),
-      dataset_(new neighbors::empty_dataset<int64_t>(0)),
-      graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0))
+      graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0)),
+      dataset_(new neighbors::empty_dataset<int64_t>(0))
   {
   }
 
@@ -271,8 +271,8 @@ struct index : ann::index {
         mdspan<const IdxT, matrix_extent<int64_t>, row_major, graph_accessor> knn_graph)
     : ann::index(),
       metric_(metric),
-      dataset_(make_aligned_dataset(res, dataset, 16)),
-      graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0))
+      graph_(make_device_matrix<IdxT, int64_t>(res, 0, 0)),
+      dataset_(make_aligned_dataset(res, dataset, 16))
   {
     RAFT_EXPECTS(dataset.extent(0) == knn_graph.extent(0),
                  "Dataset and knn_graph must have equal number of rows");

From da6e2a4034b82d165e39127bc696ab8e42853fba Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Mon, 15 Apr 2024 18:41:59 -0400
Subject: [PATCH 09/60] Update the developer's guide with new copyright hook
 (#2266)

Issue: https://github.com/rapidsai/build-planning/issues/30

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/raft/pull/2266
---
 docs/source/developer_guide.md | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/docs/source/developer_guide.md b/docs/source/developer_guide.md
index e10e8987af..5e288e9f2f 100644
--- a/docs/source/developer_guide.md
+++ b/docs/source/developer_guide.md
@@ -215,13 +215,14 @@ python ./cpp/scripts/include_checker.py --inplace [cpp/include cpp/test ... list
 ```
 
 ### Copyright header
-[copyright.py](https://github.com/rapidsai/raft/blob/branch-24.06/ci/checks/copyright.py) checks the Copyright header for all git-modified files
+RAPIDS [pre-commit-hooks](https://github.com/rapidsai/pre-commit-hooks) checks the Copyright
+header for all git-modified files.
 
-Manually, you can run the following to bulk-fix the header if only the years need to be updated:
+Manually, you can run the following to bulk-fix the header on all files in the repository:
 ```bash
-python ./ci/checks/copyright.py --update-current-year
+pre-commit run -a verify-copyright
 ```
-Keep in mind that this only applies to files tracked by git and having been modified.
+Keep in mind that this only applies to files tracked by git that have been modified.
 
 ## Error handling
 Call CUDA APIs via the provided helper macros `RAFT_CUDA_TRY`, `RAFT_CUBLAS_TRY` and `RAFT_CUSOLVER_TRY`. These macros take care of checking the return values of the used API calls and generate an exception when the command is not successful. If you need to avoid an exception, e.g. inside a destructor, use `RAFT_CUDA_TRY_NO_THROW`, `RAFT_CUBLAS_TRY_NO_THROW ` and `RAFT_CUSOLVER_TRY_NO_THROW`. These macros log the error but do not throw an exception.

From e977d2e27c80747d34253f746381bbdd6a230f38 Mon Sep 17 00:00:00 2001
From: tsuki <12711693+enp1s0@users.noreply.github.com>
Date: Tue, 16 Apr 2024 08:21:34 +0900
Subject: [PATCH 10/60] Fix a compilation error in CAGRA when enabling log
 output (#2262)

This PR fixes a compilation error in CAGRA when enabling log output.

Authors:
  - tsuki (https://github.com/enp1s0)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2262
---
 cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
index ccfe3c7e2d..b9edbbfc4a 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
@@ -90,8 +90,8 @@ void search_main_core(
   CagraSampleFilterT sample_filter = CagraSampleFilterT())
 {
   RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n",
-                 static_cast<size_t>(index.data().n_rows()),
-                 static_cast<size_t>(index.data().dim()));
+                 static_cast<size_t>(dataset_desc.size),
+                 static_cast<size_t>(dataset_desc.dim));
   RAFT_LOG_DEBUG("# query size = %lu, dim = %lu\n",
                  static_cast<size_t>(queries.extent(0)),
                  static_cast<size_t>(queries.extent(1)));

From 39f3854fc0baf5837db19286d8461c2fd6296cdd Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 16 Apr 2024 10:05:25 -0400
Subject: [PATCH 11/60] Always use a static gtest and gbench (#2265)

Always build GTest and GBench statically through rapids-cmake, which removes the need to install them in the RAFT testing CI.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/raft/pull/2265
---
 .../all_cuda-118_arch-aarch64.yaml            |  2 --
 .../all_cuda-118_arch-x86_64.yaml             |  2 --
 .../all_cuda-122_arch-aarch64.yaml            |  2 --
 .../all_cuda-122_arch-x86_64.yaml             |  2 --
 conda/recipes/libraft/conda_build_config.yaml |  6 -----
 conda/recipes/libraft/meta.yaml               |  6 -----
 .../raft-ann-bench/conda_build_config.yaml    |  3 ---
 cpp/CMakeLists.txt                            |  5 +++--
 cpp/cmake/thirdparty/get_gtest.cmake          | 22 -------------------
 dependencies.yaml                             |  2 --
 10 files changed, 3 insertions(+), 49 deletions(-)
 delete mode 100644 cpp/cmake/thirdparty/get_gtest.cmake

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index cc09d56057..189f8268df 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -23,9 +23,7 @@ dependencies:
 - dask-cuda==24.6.*
 - doxygen>=1.8.20
 - gcc_linux-aarch64=11.*
-- gmock>=1.13.0
 - graphviz
-- gtest>=1.13.0
 - ipython
 - joblib>=0.11
 - libcublas-dev=11.11.3.6
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 9fbdcdaad4..e604705112 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -23,9 +23,7 @@ dependencies:
 - dask-cuda==24.6.*
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
-- gmock>=1.13.0
 - graphviz
-- gtest>=1.13.0
 - ipython
 - joblib>=0.11
 - libcublas-dev=11.11.3.6
diff --git a/conda/environments/all_cuda-122_arch-aarch64.yaml b/conda/environments/all_cuda-122_arch-aarch64.yaml
index 1e78e7deca..49c53b4cfe 100644
--- a/conda/environments/all_cuda-122_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-122_arch-aarch64.yaml
@@ -24,9 +24,7 @@ dependencies:
 - dask-cuda==24.6.*
 - doxygen>=1.8.20
 - gcc_linux-aarch64=11.*
-- gmock>=1.13.0
 - graphviz
-- gtest>=1.13.0
 - ipython
 - joblib>=0.11
 - libcublas-dev
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 6d88855f30..6f782175dd 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -24,9 +24,7 @@ dependencies:
 - dask-cuda==24.6.*
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
-- gmock>=1.13.0
 - graphviz
-- gtest>=1.13.0
 - ipython
 - joblib>=0.11
 - libcublas-dev
diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml
index 9c39da4507..385cd831fc 100644
--- a/conda/recipes/libraft/conda_build_config.yaml
+++ b/conda/recipes/libraft/conda_build_config.yaml
@@ -19,12 +19,6 @@ cmake_version:
 nccl_version:
   - ">=2.9.9"
 
-gbench_version:
-  - "==1.8.0"
-
-gtest_version:
-  - ">=1.13.0"
-
 glog_version:
   - ">=0.6.0"
 
diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml
index 55f326dc53..c4fd0aa0b6 100644
--- a/conda/recipes/libraft/meta.yaml
+++ b/conda/recipes/libraft/meta.yaml
@@ -304,9 +304,6 @@ outputs:
         - libcusolver-dev
         - libcusparse-dev
         {% endif %}
-        - benchmark {{ gbench_version }}
-        - gmock {{ gtest_version }}
-        - gtest {{ gtest_version }}
       run:
         - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
         {% if cuda_major == "11" %}
@@ -319,9 +316,6 @@ outputs:
         - libcusparse
         {% endif %}
         - {{ pin_subpackage('libraft', exact=True) }}
-        - benchmark {{ gbench_version }}
-        - gmock {{ gtest_version }}
-        - gtest {{ gtest_version }}
     about:
       home: https://rapids.ai/
       license: Apache-2.0
diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml
index da0b893c1d..6aa6f3d47d 100644
--- a/conda/recipes/raft-ann-bench/conda_build_config.yaml
+++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml
@@ -19,9 +19,6 @@ cmake_version:
 nccl_version:
   - ">=2.9.9"
 
-gtest_version:
-  - ">=1.13.0"
-
 glog_version:
   - ">=0.6.0"
 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index cbae4bfb3f..25475fc6f2 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -185,12 +185,13 @@ if(NOT BUILD_CPU_ONLY)
 endif()
 
 if(BUILD_TESTS)
-  include(cmake/thirdparty/get_gtest.cmake)
+  include(${rapids-cmake-dir}/cpm/gtest.cmake)
+  rapids_cpm_gtest(BUILD_STATIC)
 endif()
 
 if(BUILD_PRIMS_BENCH OR BUILD_ANN_BENCH)
   include(${rapids-cmake-dir}/cpm/gbench.cmake)
-  rapids_cpm_gbench()
+  rapids_cpm_gbench(BUILD_STATIC)
 endif()
 
 if(BUILD_CAGRA_HNSWLIB)
diff --git a/cpp/cmake/thirdparty/get_gtest.cmake b/cpp/cmake/thirdparty/get_gtest.cmake
deleted file mode 100644
index 7efad7886c..0000000000
--- a/cpp/cmake/thirdparty/get_gtest.cmake
+++ /dev/null
@@ -1,22 +0,0 @@
-#=============================================================================
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#=============================================================================
-
-function(find_and_configure_gtest )
-    include(${rapids-cmake-dir}/cpm/gtest.cmake)
-    rapids_cpm_gtest()
-endfunction()
-
-find_and_configure_gtest()
diff --git a/dependencies.yaml b/dependencies.yaml
index 658b08421d..a83cd003d6 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -358,8 +358,6 @@ dependencies:
       - output_types: [conda]
         packages:
           - *cmake_ver
-          - gtest>=1.13.0
-          - gmock>=1.13.0
   docs:
     common:
       - output_types: [conda]

From febc004c4fe1b593ee292d767a45c3d684064714 Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Thu, 18 Apr 2024 17:17:52 -0700
Subject: [PATCH 12/60] `#ifdef` out pragma deprecation warning messages
 (#2271)

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2271
---
 cpp/cmake/modules/ConfigureCUDA.cmake                       | 6 +++---
 cpp/include/raft/cluster/specializations.cuh                | 4 +++-
 cpp/include/raft/common/cub_wrappers.cuh                    | 4 +++-
 cpp/include/raft/common/device_loads_stores.cuh             | 4 +++-
 cpp/include/raft/common/scatter.cuh                         | 4 +++-
 cpp/include/raft/common/seive.hpp                           | 4 +++-
 cpp/include/raft/core/detail/logger.hpp                     | 4 +++-
 cpp/include/raft/distance/specializations.cuh               | 4 +++-
 cpp/include/raft/distance/specializations/distance.cuh      | 4 +++-
 .../raft/distance/specializations/fused_l2_nn_min.cuh       | 4 +++-
 cpp/include/raft/lap/lap.cuh                                | 4 +++-
 cpp/include/raft/lap/lap.hpp                                | 4 +++-
 cpp/include/raft/linalg/detail/gemm.hpp                     | 2 ++
 cpp/include/raft/linalg/gemm.cuh                            | 2 ++
 cpp/include/raft/linalg/lanczos.cuh                         | 4 +++-
 cpp/include/raft/matrix/math.cuh                            | 6 ++++--
 cpp/include/raft/matrix/matrix.cuh                          | 2 ++
 cpp/include/raft/matrix/matrix.hpp                          | 4 +++-
 cpp/include/raft/matrix/specializations.cuh                 | 4 +++-
 cpp/include/raft/matrix/specializations/detail/select_k.cuh | 4 +++-
 cpp/include/raft/neighbors/specializations.cuh              | 4 +++-
 cpp/include/raft/neighbors/specializations/ball_cover.cuh   | 4 +++-
 cpp/include/raft/neighbors/specializations/brute_force.cuh  | 4 +++-
 .../specializations/detail/ivf_pq_compute_similarity.cuh    | 4 +++-
 cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh | 4 +++-
 cpp/include/raft/neighbors/specializations/ivf_flat.cuh     | 4 +++-
 cpp/include/raft/neighbors/specializations/ivf_pq.cuh       | 4 +++-
 cpp/include/raft/neighbors/specializations/refine.cuh       | 4 +++-
 cpp/include/raft/sparse/hierarchy/common.h                  | 6 ++++--
 cpp/include/raft/sparse/hierarchy/single_linkage.cuh        | 6 ++++--
 cpp/include/raft/sparse/linalg/spmm.cuh                     | 2 ++
 cpp/include/raft/sparse/mst/mst.cuh                         | 6 ++++--
 cpp/include/raft/sparse/mst/mst.hpp                         | 4 +++-
 cpp/include/raft/sparse/mst/mst_solver.cuh                  | 6 ++++--
 cpp/include/raft/sparse/neighbors/knn.cuh                   | 4 +++-
 cpp/include/raft/sparse/neighbors/specializations.cuh       | 4 +++-
 cpp/include/raft/sparse/selection/cross_component_nn.cuh    | 6 ++++--
 cpp/include/raft/sparse/selection/knn.cuh                   | 6 ++++--
 cpp/include/raft/sparse/selection/knn_graph.cuh             | 4 +++-
 cpp/include/raft/spatial/knn/ann_common.h                   | 2 ++
 cpp/include/raft/spatial/knn/ball_cover.cuh                 | 4 +++-
 cpp/include/raft/spatial/knn/ball_cover_types.hpp           | 4 +++-
 cpp/include/raft/spatial/knn/epsilon_neighborhood.cuh       | 4 +++-
 cpp/include/raft/spatial/knn/ivf_flat.cuh                   | 4 +++-
 cpp/include/raft/spatial/knn/ivf_flat_types.hpp             | 4 +++-
 cpp/include/raft/spatial/knn/ivf_pq.cuh                     | 4 +++-
 cpp/include/raft/spatial/knn/ivf_pq_types.hpp               | 4 +++-
 cpp/include/raft/spatial/knn/specializations.cuh            | 4 +++-
 cpp/include/raft/spatial/knn/specializations/knn.cuh        | 4 +++-
 cpp/include/raft/spectral/specializations.cuh               | 4 +++-
 cpp/include/raft/stats/specializations.cuh                  | 4 +++-
 51 files changed, 155 insertions(+), 55 deletions(-)

diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake
index ea8a077b0c..b364d8418d 100644
--- a/cpp/cmake/modules/ConfigureCUDA.cmake
+++ b/cpp/cmake/modules/ConfigureCUDA.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -13,8 +13,8 @@
 # =============================================================================
 
 if(DISABLE_DEPRECATION_WARNINGS)
-  list(APPEND RAFT_CXX_FLAGS -Wno-deprecated-declarations)
-  list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations)
+  list(APPEND RAFT_CXX_FLAGS -Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS)
+  list(APPEND RAFT_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations -DRAFT_HIDE_DEPRECATION_WARNINGS)
 endif()
 
 # Be very strict when compiling with GCC as host compiler (and thus more lenient when compiling with
diff --git a/cpp/include/raft/cluster/specializations.cuh b/cpp/include/raft/cluster/specializations.cuh
index 9588a7f329..e85b05575f 100644
--- a/cpp/include/raft/cluster/specializations.cuh
+++ b/cpp/include/raft/cluster/specializations.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/common/cub_wrappers.cuh b/cpp/include/raft/common/cub_wrappers.cuh
index dd8fc2d103..239d6e08f6 100644
--- a/cpp/include/raft/common/cub_wrappers.cuh
+++ b/cpp/include/raft/common/cub_wrappers.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please note that there is no equivalent in RAFT's public API"
                 " so this file will eventually be removed altogether.")
+#endif
 
 #include <raft/util/detail/cub_wrappers.cuh>
diff --git a/cpp/include/raft/common/device_loads_stores.cuh b/cpp/include/raft/common/device_loads_stores.cuh
index 6c62cd70cc..53724f4ae1 100644
--- a/cpp/include/raft/common/device_loads_stores.cuh
+++ b/cpp/include/raft/common/device_loads_stores.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,8 +24,10 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft/util version instead.")
+#endif
 
 #include <raft/util/device_loads_stores.cuh>
diff --git a/cpp/include/raft/common/scatter.cuh b/cpp/include/raft/common/scatter.cuh
index 72de79a596..dcbd46b236 100644
--- a/cpp/include/raft/common/scatter.cuh
+++ b/cpp/include/raft/common/scatter.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,8 +24,10 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft/matrix version instead.")
+#endif
 
 #include <raft/util/scatter.cuh>
diff --git a/cpp/include/raft/common/seive.hpp b/cpp/include/raft/common/seive.hpp
index 433b032b0f..56b41a41f4 100644
--- a/cpp/include/raft/common/seive.hpp
+++ b/cpp/include/raft/common/seive.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,8 +24,10 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft/util version instead.")
+#endif
 
 #include <raft/util/seive.hpp>
diff --git a/cpp/include/raft/core/detail/logger.hpp b/cpp/include/raft/core/detail/logger.hpp
index 532aee4d90..f3f52b46ae 100644
--- a/cpp/include/raft/core/detail/logger.hpp
+++ b/cpp/include/raft/core/detail/logger.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                   \
                   " is deprecated and will be removed in future releases." \
                   " Please use the <raft/core/logger.hpp> version instead.")
+#endif
 
 #include <raft/core/logger.hpp>
diff --git a/cpp/include/raft/distance/specializations.cuh b/cpp/include/raft/distance/specializations.cuh
index ed0b6848ae..cba059154f 100644
--- a/cpp/include/raft/distance/specializations.cuh
+++ b/cpp/include/raft/distance/specializations.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/distance/specializations/distance.cuh b/cpp/include/raft/distance/specializations/distance.cuh
index ed0b6848ae..cba059154f 100644
--- a/cpp/include/raft/distance/specializations/distance.cuh
+++ b/cpp/include/raft/distance/specializations/distance.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh
index 9588a7f329..e85b05575f 100644
--- a/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh
+++ b/cpp/include/raft/distance/specializations/fused_l2_nn_min.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/lap/lap.cuh b/cpp/include/raft/lap/lap.cuh
index f7828294cd..b06cd113c1 100644
--- a/cpp/include/raft/lap/lap.cuh
+++ b/cpp/include/raft/lap/lap.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft/solver version instead.")
+#endif
 
 #include <raft/solver/linear_assignment.cuh>
 
diff --git a/cpp/include/raft/lap/lap.hpp b/cpp/include/raft/lap/lap.hpp
index 5472422053..0f1ad14ed5 100644
--- a/cpp/include/raft/lap/lap.hpp
+++ b/cpp/include/raft/lap/lap.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,8 +24,10 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the cuh version instead.")
+#endif
 
 #include <raft/solver/linear_assignment.cuh>
diff --git a/cpp/include/raft/linalg/detail/gemm.hpp b/cpp/include/raft/linalg/detail/gemm.hpp
index 245f8eb4b0..236c840040 100644
--- a/cpp/include/raft/linalg/detail/gemm.hpp
+++ b/cpp/include/raft/linalg/detail/gemm.hpp
@@ -15,9 +15,11 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Use cublaslt_wrappers.hpp if you really need this low-level api.")
+#endif
 
 #include "cublaslt_wrappers.hpp"
 
diff --git a/cpp/include/raft/linalg/gemm.cuh b/cpp/include/raft/linalg/gemm.cuh
index c9dcbda5cc..7b8d35706b 100644
--- a/cpp/include/raft/linalg/gemm.cuh
+++ b/cpp/include/raft/linalg/gemm.cuh
@@ -18,9 +18,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Use raft/linalg/gemm.hpp instead.")
+#endif
 
 #include "detail/gemm.hpp"
 #include "gemm.hpp"  // Part of the API transferred to the non-deprecated file
diff --git a/cpp/include/raft/linalg/lanczos.cuh b/cpp/include/raft/linalg/lanczos.cuh
index 04e9980583..0117a8e1d4 100644
--- a/cpp/include/raft/linalg/lanczos.cuh
+++ b/cpp/include/raft/linalg/lanczos.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the sparse solvers version instead.")
+#endif
 
 #include <raft/sparse/solver/lanczos.cuh>
 
diff --git a/cpp/include/raft/matrix/math.cuh b/cpp/include/raft/matrix/math.cuh
index 598ac60faf..d1e8586a24 100644
--- a/cpp/include/raft/matrix/math.cuh
+++ b/cpp/include/raft/matrix/math.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,9 +19,11 @@
  * Please use versions in individual header files instead.
  */
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use versions in individual header files instead.")
+#endif
 
 #ifndef __MATH_H
 #define __MATH_H
@@ -488,4 +490,4 @@ void matrixVectorBinarySub(Type* data,
 };  // end namespace matrix
 };  // end namespace raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/matrix/matrix.cuh b/cpp/include/raft/matrix/matrix.cuh
index 29cc6fb75b..096de1b42d 100644
--- a/cpp/include/raft/matrix/matrix.cuh
+++ b/cpp/include/raft/matrix/matrix.cuh
@@ -19,9 +19,11 @@
  * Please use versions in individual header files instead.
  */
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use versions in individual header files instead.")
+#endif
 
 #ifndef __MATRIX_H
 #define __MATRIX_H
diff --git a/cpp/include/raft/matrix/matrix.hpp b/cpp/include/raft/matrix/matrix.hpp
index 53bd30d2eb..f6b06264ae 100644
--- a/cpp/include/raft/matrix/matrix.hpp
+++ b/cpp/include/raft/matrix/matrix.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,8 +24,10 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the cuh version instead.")
+#endif
 
 #include "matrix.cuh"
diff --git a/cpp/include/raft/matrix/specializations.cuh b/cpp/include/raft/matrix/specializations.cuh
index ac3b80e8d9..c61d65dcaf 100644
--- a/cpp/include/raft/matrix/specializations.cuh
+++ b/cpp/include/raft/matrix/specializations.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/matrix/specializations/detail/select_k.cuh b/cpp/include/raft/matrix/specializations/detail/select_k.cuh
index ac3b80e8d9..c61d65dcaf 100644
--- a/cpp/include/raft/matrix/specializations/detail/select_k.cuh
+++ b/cpp/include/raft/matrix/specializations/detail/select_k.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/neighbors/specializations.cuh b/cpp/include/raft/neighbors/specializations.cuh
index ed0b6848ae..cba059154f 100644
--- a/cpp/include/raft/neighbors/specializations.cuh
+++ b/cpp/include/raft/neighbors/specializations.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/neighbors/specializations/ball_cover.cuh b/cpp/include/raft/neighbors/specializations/ball_cover.cuh
index ed0b6848ae..cba059154f 100644
--- a/cpp/include/raft/neighbors/specializations/ball_cover.cuh
+++ b/cpp/include/raft/neighbors/specializations/ball_cover.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/neighbors/specializations/brute_force.cuh b/cpp/include/raft/neighbors/specializations/brute_force.cuh
index ed0b6848ae..cba059154f 100644
--- a/cpp/include/raft/neighbors/specializations/brute_force.cuh
+++ b/cpp/include/raft/neighbors/specializations/brute_force.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/neighbors/specializations/detail/ivf_pq_compute_similarity.cuh b/cpp/include/raft/neighbors/specializations/detail/ivf_pq_compute_similarity.cuh
index 9588a7f329..e85b05575f 100644
--- a/cpp/include/raft/neighbors/specializations/detail/ivf_pq_compute_similarity.cuh
+++ b/cpp/include/raft/neighbors/specializations/detail/ivf_pq_compute_similarity.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh b/cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh
index ed0b6848ae..cba059154f 100644
--- a/cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh
+++ b/cpp/include/raft/neighbors/specializations/fused_l2_knn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/neighbors/specializations/ivf_flat.cuh b/cpp/include/raft/neighbors/specializations/ivf_flat.cuh
index ac3b80e8d9..c61d65dcaf 100644
--- a/cpp/include/raft/neighbors/specializations/ivf_flat.cuh
+++ b/cpp/include/raft/neighbors/specializations/ivf_flat.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/neighbors/specializations/ivf_pq.cuh b/cpp/include/raft/neighbors/specializations/ivf_pq.cuh
index 9588a7f329..e85b05575f 100644
--- a/cpp/include/raft/neighbors/specializations/ivf_pq.cuh
+++ b/cpp/include/raft/neighbors/specializations/ivf_pq.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/neighbors/specializations/refine.cuh b/cpp/include/raft/neighbors/specializations/refine.cuh
index 9588a7f329..e85b05575f 100644
--- a/cpp/include/raft/neighbors/specializations/refine.cuh
+++ b/cpp/include/raft/neighbors/specializations/refine.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/sparse/hierarchy/common.h b/cpp/include/raft/sparse/hierarchy/common.h
index 01ebfd04df..6ac0fc3b4b 100644
--- a/cpp/include/raft/sparse/hierarchy/common.h
+++ b/cpp/include/raft/sparse/hierarchy/common.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,9 +20,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use raft/cluster/single_linkage_types.hpp instead.")
+#endif
 
 #include <raft/cluster/single_linkage_types.hpp>
 
@@ -31,4 +33,4 @@ using raft::cluster::linkage_output;
 using raft::cluster::linkage_output_int;
 using raft::cluster::linkage_output_int64;
 using raft::cluster::LinkageDistance;
-}  // namespace raft::hierarchy
\ No newline at end of file
+}  // namespace raft::hierarchy
diff --git a/cpp/include/raft/sparse/hierarchy/single_linkage.cuh b/cpp/include/raft/sparse/hierarchy/single_linkage.cuh
index 7f990ff44b..d21b2a87a6 100644
--- a/cpp/include/raft/sparse/hierarchy/single_linkage.cuh
+++ b/cpp/include/raft/sparse/hierarchy/single_linkage.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,13 +20,15 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft/cluster version instead.")
+#endif
 
 #include <raft/cluster/single_linkage.cuh>
 #include <raft/sparse/hierarchy/common.h>
 
 namespace raft::hierarchy {
 using raft::cluster::single_linkage;
-}
\ No newline at end of file
+}
diff --git a/cpp/include/raft/sparse/linalg/spmm.cuh b/cpp/include/raft/sparse/linalg/spmm.cuh
index 439ed8c341..31d84d1b75 100644
--- a/cpp/include/raft/sparse/linalg/spmm.cuh
+++ b/cpp/include/raft/sparse/linalg/spmm.cuh
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the spmm.hpp at the same path instead.")
+#endif
 
 #include <raft/sparse/linalg/detail/spmm.hpp>
diff --git a/cpp/include/raft/sparse/mst/mst.cuh b/cpp/include/raft/sparse/mst/mst.cuh
index eb6de1c0a1..f8aeff23f9 100644
--- a/cpp/include/raft/sparse/mst/mst.cuh
+++ b/cpp/include/raft/sparse/mst/mst.cuh
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,13 +21,15 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft/sparse/solver version instead.")
+#endif
 
 #include <raft/sparse/mst/mst_solver.cuh>
 #include <raft/sparse/solver/mst.cuh>
 
 namespace raft::mst {
 using raft::sparse::solver::mst;
-}
\ No newline at end of file
+}
diff --git a/cpp/include/raft/sparse/mst/mst.hpp b/cpp/include/raft/sparse/mst/mst.hpp
index 5fbd264c6f..a0c1db5906 100644
--- a/cpp/include/raft/sparse/mst/mst.hpp
+++ b/cpp/include/raft/sparse/mst/mst.hpp
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,9 +21,11 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft/sparse/solver version instead.")
+#endif
 
 #include <raft/sparse/mst/mst.cuh>
 #include <raft/sparse/mst/mst_solver.cuh>
diff --git a/cpp/include/raft/sparse/mst/mst_solver.cuh b/cpp/include/raft/sparse/mst/mst_solver.cuh
index 76667396c3..d92d609be1 100644
--- a/cpp/include/raft/sparse/mst/mst_solver.cuh
+++ b/cpp/include/raft/sparse/mst/mst_solver.cuh
@@ -1,6 +1,6 @@
 
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,9 +21,11 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft/sparse/solver version instead.")
+#endif
 
 #include <raft/sparse/solver/mst_solver.cuh>
 
@@ -33,4 +35,4 @@ using raft::sparse::solver::Graph_COO;
 
 namespace raft::mst {
 using raft::sparse::solver::MST_solver;
-}
\ No newline at end of file
+}
diff --git a/cpp/include/raft/sparse/neighbors/knn.cuh b/cpp/include/raft/sparse/neighbors/knn.cuh
index 9dea2f5d52..2cf68818aa 100644
--- a/cpp/include/raft/sparse/neighbors/knn.cuh
+++ b/cpp/include/raft/sparse/neighbors/knn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the sparse/spatial version instead.")
+#endif
 
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/sparse/neighbors/brute_force.cuh>
diff --git a/cpp/include/raft/sparse/neighbors/specializations.cuh b/cpp/include/raft/sparse/neighbors/specializations.cuh
index 9588a7f329..e85b05575f 100644
--- a/cpp/include/raft/sparse/neighbors/specializations.cuh
+++ b/cpp/include/raft/sparse/neighbors/specializations.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/sparse/selection/cross_component_nn.cuh b/cpp/include/raft/sparse/selection/cross_component_nn.cuh
index e115d6c061..2874f0bc5e 100644
--- a/cpp/include/raft/sparse/selection/cross_component_nn.cuh
+++ b/cpp/include/raft/sparse/selection/cross_component_nn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2018-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the sparse/spatial version instead.")
+#endif
 
 #include <raft/sparse/neighbors/cross_component_nn.cuh>
 
@@ -34,4 +36,4 @@ namespace raft::linkage {
 using raft::sparse::neighbors::cross_component_nn;
 using raft::sparse::neighbors::FixConnectivitiesRedOp;
 using raft::sparse::neighbors::get_n_components;
-}  // namespace raft::linkage
\ No newline at end of file
+}  // namespace raft::linkage
diff --git a/cpp/include/raft/sparse/selection/knn.cuh b/cpp/include/raft/sparse/selection/knn.cuh
index 0258335941..e320d03478 100644
--- a/cpp/include/raft/sparse/selection/knn.cuh
+++ b/cpp/include/raft/sparse/selection/knn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,12 +24,14 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the sparse/spatial version instead.")
+#endif
 
 #include <raft/sparse/neighbors/knn.cuh>
 
 namespace raft::sparse::selection {
 using raft::sparse::neighbors::brute_force_knn;
-}
\ No newline at end of file
+}
diff --git a/cpp/include/raft/sparse/selection/knn_graph.cuh b/cpp/include/raft/sparse/selection/knn_graph.cuh
index 942213e6c1..e10bfe526f 100644
--- a/cpp/include/raft/sparse/selection/knn_graph.cuh
+++ b/cpp/include/raft/sparse/selection/knn_graph.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the sparse/spatial version instead.")
+#endif
 
 #include <raft/sparse/neighbors/knn_graph.cuh>
 
diff --git a/cpp/include/raft/spatial/knn/ann_common.h b/cpp/include/raft/spatial/knn/ann_common.h
index 722c01d561..4bcc3328a6 100644
--- a/cpp/include/raft/spatial/knn/ann_common.h
+++ b/cpp/include/raft/spatial/knn/ann_common.h
@@ -14,10 +14,12 @@
  * limitations under the License.
  */
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                              \
     __FILE__                                                  \
     " is deprecated and will be removed in a future release." \
     " Please use the other approximate KNN implementations defined in spatial/knn/*.")
+#endif
 
 #pragma once
 
diff --git a/cpp/include/raft/spatial/knn/ball_cover.cuh b/cpp/include/raft/spatial/knn/ball_cover.cuh
index f8c3fbd3c0..d08621030b 100644
--- a/cpp/include/raft/spatial/knn/ball_cover.cuh
+++ b/cpp/include/raft/spatial/knn/ball_cover.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft::neighbors version instead.")
+#endif
 
 #include <raft/neighbors/ball_cover.cuh>
 #include <raft/spatial/knn/ball_cover_types.hpp>
diff --git a/cpp/include/raft/spatial/knn/ball_cover_types.hpp b/cpp/include/raft/spatial/knn/ball_cover_types.hpp
index 31062ff364..5203d9afe6 100644
--- a/cpp/include/raft/spatial/knn/ball_cover_types.hpp
+++ b/cpp/include/raft/spatial/knn/ball_cover_types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft::neighbors version instead.")
+#endif
 
 #include <raft/neighbors/ball_cover_types.hpp>
 
diff --git a/cpp/include/raft/spatial/knn/epsilon_neighborhood.cuh b/cpp/include/raft/spatial/knn/epsilon_neighborhood.cuh
index d516743115..9fcb9323ab 100644
--- a/cpp/include/raft/spatial/knn/epsilon_neighborhood.cuh
+++ b/cpp/include/raft/spatial/knn/epsilon_neighborhood.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft::neighbors version instead.")
+#endif
 
 #include <raft/neighbors/epsilon_neighborhood.cuh>
 
diff --git a/cpp/include/raft/spatial/knn/ivf_flat.cuh b/cpp/include/raft/spatial/knn/ivf_flat.cuh
index e63dcff475..6b968e9118 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat.cuh
+++ b/cpp/include/raft/spatial/knn/ivf_flat.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft::neighbors version instead.")
+#endif
 
 #include <raft/neighbors/ivf_flat.cuh>
 
diff --git a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
index 9546e62be0..e882139187 100644
--- a/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
+++ b/cpp/include/raft/spatial/knn/ivf_flat_types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft::neighbors version instead.")
+#endif
 
 #include <raft/neighbors/ivf_flat_types.hpp>
 
diff --git a/cpp/include/raft/spatial/knn/ivf_pq.cuh b/cpp/include/raft/spatial/knn/ivf_pq.cuh
index a89968bd80..ae4c0f914e 100644
--- a/cpp/include/raft/spatial/knn/ivf_pq.cuh
+++ b/cpp/include/raft/spatial/knn/ivf_pq.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft::neighbors version instead.")
+#endif
 
 #include <raft/neighbors/ivf_pq.cuh>
 
diff --git a/cpp/include/raft/spatial/knn/ivf_pq_types.hpp b/cpp/include/raft/spatial/knn/ivf_pq_types.hpp
index 168a75034f..dc97ab20a2 100644
--- a/cpp/include/raft/spatial/knn/ivf_pq_types.hpp
+++ b/cpp/include/raft/spatial/knn/ivf_pq_types.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -24,9 +24,11 @@
 
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(__FILE__                                                    \
                   " is deprecated and will be removed in a future release." \
                   " Please use the raft::neighbors version instead.")
+#endif
 
 #include <raft/neighbors/ivf_pq_types.hpp>
 
diff --git a/cpp/include/raft/spatial/knn/specializations.cuh b/cpp/include/raft/spatial/knn/specializations.cuh
index ed0b6848ae..cba059154f 100644
--- a/cpp/include/raft/spatial/knn/specializations.cuh
+++ b/cpp/include/raft/spatial/knn/specializations.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/spatial/knn/specializations/knn.cuh b/cpp/include/raft/spatial/knn/specializations/knn.cuh
index ed0b6848ae..cba059154f 100644
--- a/cpp/include/raft/spatial/knn/specializations/knn.cuh
+++ b/cpp/include/raft/spatial/knn/specializations/knn.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/spectral/specializations.cuh b/cpp/include/raft/spectral/specializations.cuh
index 9588a7f329..e85b05575f 100644
--- a/cpp/include/raft/spectral/specializations.cuh
+++ b/cpp/include/raft/spectral/specializations.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif
diff --git a/cpp/include/raft/stats/specializations.cuh b/cpp/include/raft/stats/specializations.cuh
index 9588a7f329..e85b05575f 100644
--- a/cpp/include/raft/stats/specializations.cuh
+++ b/cpp/include/raft/stats/specializations.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,8 +15,10 @@
  */
 #pragma once
 
+#ifndef RAFT_HIDE_DEPRECATION_WARNINGS
 #pragma message(                                            \
     __FILE__                                                \
     " is deprecated and will be removed."                   \
     " Including specializations is not necessary any more." \
     " For more information, see: https://docs.rapids.ai/api/raft/nightly/using_libraft.html")
+#endif

From 317a61cad71b5f83424d2481eb23b1d1b5817f40 Mon Sep 17 00:00:00 2001
From: Louis Sugy <lsugy@nvidia.com>
Date: Mon, 22 Apr 2024 18:37:41 +0200
Subject: [PATCH 13/60] Improve coalesced reduction performance for tall and
 thin matrices (up to 2.6x faster) (#2259)

This PR implements two optimizations to `coalescedReductionThinKernel`, which is used for coalesced reductions of matrices that are tall (many rows) and/or thin (few columns):

1. Process multiple rows per warp to increase bytes in flight and amortize load latencies.
2. Use a vectorized reduction to avoid the LSU bottleneck and have fewer global stores (and at least partially coalesced).

The benchmark below shows the achieved SOL percentage on A30. I also measured that on H200, it achieved 84% SOL for 32 columns and up to 94% for 512 columns.

![2024-04-09_coalesced_reduction_vec](https://github.com/rapidsai/raft/assets/17441062/73dabe9a-e3ad-4708-9ef8-77ca4a4c9166)

Authors:
  - Louis Sugy (https://github.com/Nyrio)
  - Tamas Bela Feher (https://github.com/tfeher)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2259
---
 .../linalg/detail/coalesced_reduction-inl.cuh | 117 ++++++++++++++----
 cpp/include/raft/util/pow2_utils.cuh          |  13 +-
 cpp/include/raft/util/reduction.cuh           | 104 +++++++++++++++-
 cpp/test/linalg/coalesced_reduction.cu        |  46 +++++--
 4 files changed, 244 insertions(+), 36 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
index d580ea72c1..9f3be7ce0e 100644
--- a/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
+++ b/cpp/include/raft/linalg/detail/coalesced_reduction-inl.cuh
@@ -28,11 +28,18 @@ namespace raft {
 namespace linalg {
 namespace detail {
 
-template <int warpSize, int rpb>
+template <int warpSize, int tpb, int rpw, bool noLoop = false>
 struct ReductionThinPolicy {
-  static constexpr int LogicalWarpSize = warpSize;
-  static constexpr int RowsPerBlock    = rpb;
-  static constexpr int ThreadsPerBlock = LogicalWarpSize * RowsPerBlock;
+  static_assert(tpb % warpSize == 0);
+
+  static constexpr int LogicalWarpSize    = warpSize;
+  static constexpr int ThreadsPerBlock    = tpb;
+  static constexpr int RowsPerLogicalWarp = rpw;
+  static constexpr int NumLogicalWarps    = ThreadsPerBlock / LogicalWarpSize;
+  static constexpr int RowsPerBlock       = NumLogicalWarps * RowsPerLogicalWarp;
+
+  // Whether D (run-time arg) will be smaller than warpSize (compile-time parameter)
+  static constexpr bool NoSequentialReduce = noLoop;
 };
 
 template <typename Policy,
@@ -53,19 +60,72 @@ RAFT_KERNEL __launch_bounds__(Policy::ThreadsPerBlock)
                                FinalLambda final_op,
                                bool inplace = false)
 {
-  IdxType i = threadIdx.y + (Policy::RowsPerBlock * static_cast<IdxType>(blockIdx.x));
-  if (i >= N) return;
+  /* The strategy to achieve near-SOL memory bandwidth differs based on D:
+   *  - For small D, we need to process multiple rows per logical warp in order to have
+   *    multiple loads per thread and increase bytes in flight and amortize latencies.
+   *  - For large D, we start with a sequential reduction. The compiler partially unrolls
+   *    that loop (e.g. first a loop of stride 16, then 8, 4, and 1).
+   */
+  IdxType i0 = threadIdx.y + (Policy::RowsPerBlock * static_cast<IdxType>(blockIdx.x));
+  if (i0 >= N) return;
 
-  OutType acc = init;
-  for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) {
-    acc = reduce_op(acc, main_op(data[j + (D * i)], j));
+  OutType acc[Policy::RowsPerLogicalWarp];
+#pragma unroll
+  for (int k = 0; k < Policy::RowsPerLogicalWarp; k++) {
+    acc[k] = init;
   }
-  acc = raft::logicalWarpReduce<Policy::LogicalWarpSize>(acc, reduce_op);
-  if (threadIdx.x == 0) {
+
+  if constexpr (Policy::NoSequentialReduce) {
+    IdxType j = threadIdx.x;
+    if (j < D) {
+#pragma unroll
+      for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) {
+        // Only the first row is known to be within bounds. Clamp to avoid out-of-mem read.
+        const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1);
+        acc[k]          = reduce_op(acc[k], main_op(data[j + (D * i)], j));
+      }
+    }
+  } else {
+    for (IdxType j = threadIdx.x; j < D; j += Policy::LogicalWarpSize) {
+#pragma unroll
+      for (IdxType k = 0; k < Policy::RowsPerLogicalWarp; k++) {
+        const IdxType i = raft::min(i0 + k * Policy::NumLogicalWarps, N - 1);
+        acc[k]          = reduce_op(acc[k], main_op(data[j + (D * i)], j));
+      }
+    }
+  }
+
+  /* This vector reduction has two benefits compared to naive separate reductions:
+   * - It avoids the LSU bottleneck when the number of columns is around 32 (e.g. for 32, 5 shuffles
+   *   are required and there is no initial sequential reduction to amortize that cost).
+   * - It distributes the outputs to multiple threads, enabling a coalesced store when the number of
+   *   rows per logical warp and logical warp size are equal.
+   */
+  raft::logicalWarpReduceVector<Policy::LogicalWarpSize, Policy::RowsPerLogicalWarp>(
+    acc, threadIdx.x, reduce_op);
+
+  constexpr int reducOutVecWidth =
+    std::max(1, Policy::RowsPerLogicalWarp / Policy::LogicalWarpSize);
+  constexpr int reducOutGroupSize =
+    std::max(1, Policy::LogicalWarpSize / Policy::RowsPerLogicalWarp);
+  constexpr int reducNumGroups = Policy::LogicalWarpSize / reducOutGroupSize;
+
+  if (threadIdx.x % reducOutGroupSize == 0) {
+    const int groupId = threadIdx.x / reducOutGroupSize;
     if (inplace) {
-      dots[i] = final_op(reduce_op(dots[i], acc));
+#pragma unroll
+      for (int k = 0; k < reducOutVecWidth; k++) {
+        const int reductionId = k * reducNumGroups + groupId;
+        const IdxType i       = i0 + reductionId * Policy::NumLogicalWarps;
+        if (i < N) { dots[i] = final_op(reduce_op(dots[i], acc[k])); }
+      }
     } else {
-      dots[i] = final_op(acc);
+#pragma unroll
+      for (int k = 0; k < reducOutVecWidth; k++) {
+        const int reductionId = k * reducNumGroups + groupId;
+        const IdxType i       = i0 + reductionId * Policy::NumLogicalWarps;
+        if (i < N) { dots[i] = final_op(acc[k]); }
+      }
     }
   }
 }
@@ -89,8 +149,12 @@ void coalescedReductionThin(OutType* dots,
                             FinalLambda final_op   = raft::identity_op())
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
-    "coalescedReductionThin<%d,%d>", Policy::LogicalWarpSize, Policy::RowsPerBlock);
-  dim3 threads(Policy::LogicalWarpSize, Policy::RowsPerBlock, 1);
+    "coalescedReductionThin<%d,%d,%d,%d>",
+    Policy::LogicalWarpSize,
+    Policy::ThreadsPerBlock,
+    Policy::RowsPerLogicalWarp,
+    static_cast<int>(Policy::NoSequentialReduce));
+  dim3 threads(Policy::LogicalWarpSize, Policy::NumLogicalWarps, 1);
   dim3 blocks(ceildiv<IdxType>(N, Policy::RowsPerBlock), 1, 1);
   coalescedReductionThinKernel<Policy>
     <<<blocks, threads, 0, stream>>>(dots, data, D, N, init, main_op, reduce_op, final_op, inplace);
@@ -115,19 +179,28 @@ void coalescedReductionThinDispatcher(OutType* dots,
                                       FinalLambda final_op   = raft::identity_op())
 {
   if (D <= IdxType(2)) {
-    coalescedReductionThin<ReductionThinPolicy<2, 64>>(
+    coalescedReductionThin<ReductionThinPolicy<2, 128, 8, true>>(
       dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else if (D <= IdxType(4)) {
-    coalescedReductionThin<ReductionThinPolicy<4, 32>>(
+    coalescedReductionThin<ReductionThinPolicy<4, 128, 8, true>>(
       dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else if (D <= IdxType(8)) {
-    coalescedReductionThin<ReductionThinPolicy<8, 16>>(
+    coalescedReductionThin<ReductionThinPolicy<8, 128, 8, true>>(
       dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else if (D <= IdxType(16)) {
-    coalescedReductionThin<ReductionThinPolicy<16, 8>>(
+    coalescedReductionThin<ReductionThinPolicy<16, 128, 8, true>>(
+      dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+  } else if (D <= IdxType(32)) {
+    coalescedReductionThin<ReductionThinPolicy<32, 128, 8, true>>(
+      dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
+  } else if (D < IdxType(128)) {
+    coalescedReductionThin<ReductionThinPolicy<32, 128, 4, false>>(
       dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else {
-    coalescedReductionThin<ReductionThinPolicy<32, 4>>(
+    // For D=128 (included) and above, the 4x-unrolled loading loop is used
+    // and multiple rows per warp are counter-productive in terms of cache-friendliness
+    // and register use.
+    coalescedReductionThin<ReductionThinPolicy<32, 128, 1, false>>(
       dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   }
 }
@@ -319,10 +392,10 @@ void coalescedReductionThickDispatcher(OutType* dots,
   // Note: multiple elements per thread to take advantage of the sequential reduction and loop
   // unrolling
   if (D < IdxType(32768)) {
-    coalescedReductionThick<ReductionThickPolicy<256, 32>, ReductionThinPolicy<32, 4>>(
+    coalescedReductionThick<ReductionThickPolicy<256, 32>, ReductionThinPolicy<32, 128, 1>>(
       dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   } else {
-    coalescedReductionThick<ReductionThickPolicy<256, 64>, ReductionThinPolicy<32, 4>>(
+    coalescedReductionThick<ReductionThickPolicy<256, 64>, ReductionThinPolicy<32, 128, 1>>(
       dots, data, D, N, init, stream, inplace, main_op, reduce_op, final_op);
   }
 }
diff --git a/cpp/include/raft/util/pow2_utils.cuh b/cpp/include/raft/util/pow2_utils.cuh
index 68b35837b6..0c740ac5f6 100644
--- a/cpp/include/raft/util/pow2_utils.cuh
+++ b/cpp/include/raft/util/pow2_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,6 +20,15 @@
 
 namespace raft {
 
+/**
+ * Checks whether an integer is a power of 2.
+ */
+template <typename T>
+constexpr HDI std::enable_if_t<std::is_integral_v<T>, bool> is_pow2(T v)
+{
+  return (v && !(v & (v - 1)));
+}
+
 /**
  * @brief Fast arithmetics and alignment checks for power-of-two values known at compile time.
  *
@@ -33,7 +42,7 @@ struct Pow2 {
   static constexpr Type Mask  = Value - 1;
 
   static_assert(std::is_integral<Type>::value, "Value must be integral.");
-  static_assert(Value && !(Value & Mask), "Value must be power of two.");
+  static_assert(is_pow2(Value), "Value must be power of two.");
 
 #define Pow2_FUNC_QUALIFIER         static constexpr __host__ __device__ __forceinline__
 #define Pow2_WHEN_INTEGRAL(I)       std::enable_if_t<Pow2_IS_REPRESENTABLE_AS(I), I>
diff --git a/cpp/include/raft/util/reduction.cuh b/cpp/include/raft/util/reduction.cuh
index 2c2b1aa228..c0d3da7609 100644
--- a/cpp/include/raft/util/reduction.cuh
+++ b/cpp/include/raft/util/reduction.cuh
@@ -39,8 +39,8 @@ DI T logicalWarpReduce(T val, ReduceLambda reduce_op)
 {
 #pragma unroll
   for (int i = logicalWarpSize / 2; i > 0; i >>= 1) {
-    T tmp = shfl_xor(val, i);
-    val   = reduce_op(val, tmp);
+    const T tmp = shfl_xor(val, i, logicalWarpSize);
+    val         = reduce_op(val, tmp);
   }
   return val;
 }
@@ -197,4 +197,104 @@ DI i_t binaryBlockReduce(i_t val, i_t* shmem)
   }
 }
 
+/**
+ * @brief Executes a collaborative vector reduction per sub-warp
+ *
+ * This uses fewer shuffles than naively reducing each element independently.
+ * Better performance is achieved with a larger vector width, up to vecWidth == warpSize/2.
+ * For example, for logicalWarpSize == 32 and vecWidth == 16, the naive method requires 80
+ * shuffles, this one only 31, 2.58x fewer.
+ *
+ * However, the output of the reduction is not broadcasted. The vector is modified in place and
+ * each thread holds a part of the output vector. The outputs are distributed in a round-robin
+ * pattern between the threads to facilitate coalesced IO. There are 2 possible layouts based on
+ * which of logicalWarpSize and vecWidth is larger:
+ * - If vecWidth >= logicalWarpSize, each thread has vecWidth/logicalWarpSize outputs.
+ * - If logicalWarpSize > vecWidth, logicalWarpSize/vecWidth threads have a copy of the same output.
+ *
+ * Example 1: logicalWarpSize == 4, vecWidth == 8, v = a+b+c+d
+ *           IN                        OUT
+ *  lane 0 | a0 a1 a2 a3 a4 a5 a6 a7 | v0 v4 - - - - - -
+ *  lane 1 | b0 b1 b2 b3 b4 b5 b6 b7 | v1 v5 - - - - - -
+ *  lane 2 | c0 c1 c2 c3 c4 c5 c6 c7 | v2 v6 - - - - - -
+ *  lane 3 | d0 d1 d2 d3 d4 d5 d6 d7 | v3 v7 - - - - - -
+ *
+ * Example 2: logicalWarpSize == 8, vecWidth == 4, v = a+b+c+d+e+f+g+h
+ *           IN            OUT
+ *  lane 0 | a0 a1 a2 a3 | v0 - - -
+ *  lane 1 | b0 b1 b2 b3 | v0 - - -
+ *  lane 2 | c0 c1 c2 c3 | v1 - - -
+ *  lane 3 | d0 d1 d2 d3 | v1 - - -
+ *  lane 4 | e0 e1 e2 e3 | v2 - - -
+ *  lane 5 | f0 f1 f2 f3 | v2 - - -
+ *  lane 6 | g0 g1 g2 g3 | v3 - - -
+ *  lane 7 | h0 h1 h2 h3 | v3 - - -
+ *
+ * @tparam logicalWarpSize Sub-warp size. Must be 2, 4, 8, 16 or 32.
+ * @tparam vecWidth Vector width. Must be a power of two.
+ * @tparam T Vector element type.
+ * @tparam ReduceLambda Reduction operator type.
+ * @param[in,out] acc Pointer to a vector of size vecWidth or more in registers
+ * @param[in] lane_id Lane id between 0 and logicalWarpSize-1
+ * @param[in] reduce_op Reduction operator, assumed to be commutative and associative.
+ */
+template <int logicalWarpSize, int vecWidth, typename T, typename ReduceLambda>
+DI void logicalWarpReduceVector(T* acc, int lane_id, ReduceLambda reduce_op)
+{
+  static_assert(vecWidth > 0, "Vec width must be strictly positive.");
+  static_assert(!(vecWidth & (vecWidth - 1)), "Vec width must be a power of two.");
+  static_assert(logicalWarpSize >= 2 && logicalWarpSize <= 32,
+                "Logical warp size must be between 2 and 32");
+  static_assert(!(logicalWarpSize & (logicalWarpSize - 1)),
+                "Logical warp size must be a power of two.");
+
+  constexpr int shflStride   = logicalWarpSize / 2;
+  constexpr int nextWarpSize = logicalWarpSize / 2;
+
+  // One step of the butterfly reduction, applied to each element of the vector.
+#pragma unroll
+  for (int k = 0; k < vecWidth; k++) {
+    const T tmp = shfl_xor(acc[k], shflStride, logicalWarpSize);
+    acc[k]      = reduce_op(acc[k], tmp);
+  }
+
+  constexpr int nextVecWidth = std::max(1, vecWidth / 2);
+
+  /* Split into 2 smaller logical warps and distribute half of the data to each for the next step.
+   * The distribution pattern is designed so that at the end the outputs are coalesced/round-robin.
+   * The idea is to distribute contiguous "chunks" of the vectors based on the new warp size. These
+   * chunks will be halved in the next step and so on.
+   *
+   * Example for logicalWarpSize == 4, vecWidth == 8:
+   *  lane 0 | 0 1 2 3 4 5 6 7 | [0 1] [4 5] - - - - | [0] [4] - - - - - -
+   *  lane 1 | 0 1 2 3 4 5 6 7 | [0 1] [4 5] - - - - | [1] [5] - - - - - -
+   *  lane 2 | 0 1 2 3 4 5 6 7 | [2 3] [6 7] - - - - | [2] [6] - - - - - -
+   *  lane 3 | 0 1 2 3 4 5 6 7 | [2 3] [6 7] - - - - | [3] [7] - - - - - -
+   *                      chunkSize=2           chunkSize=1
+   */
+  if constexpr (nextVecWidth < vecWidth) {
+    T tmp[nextVecWidth];
+    const bool firstHalf    = (lane_id % logicalWarpSize) < nextWarpSize;
+    constexpr int chunkSize = std::min(nextVecWidth, nextWarpSize);
+    constexpr int numChunks = nextVecWidth / chunkSize;
+#pragma unroll
+    for (int c = 0; c < numChunks; c++) {
+#pragma unroll
+      for (int i = 0; i < chunkSize; i++) {
+        const int k = c * chunkSize + i;
+        tmp[k]      = firstHalf ? acc[2 * c * chunkSize + i] : acc[(2 * c + 1) * chunkSize + i];
+      }
+    }
+#pragma unroll
+    for (int k = 0; k < nextVecWidth; k++) {
+      acc[k] = tmp[k];
+    }
+  }
+
+  // Recursively call with smaller sub-warps and possibly smaller vector width.
+  if constexpr (nextWarpSize > 1) {
+    logicalWarpReduceVector<nextWarpSize, nextVecWidth>(acc, lane_id % nextWarpSize, reduce_op);
+  }
+}
+
 }  // namespace raft
diff --git a/cpp/test/linalg/coalesced_reduction.cu b/cpp/test/linalg/coalesced_reduction.cu
index 2061f28d36..28f5ff5f60 100644
--- a/cpp/test/linalg/coalesced_reduction.cu
+++ b/cpp/test/linalg/coalesced_reduction.cu
@@ -39,7 +39,8 @@ struct coalescedReductionInputs {
 template <typename T>
 ::std::ostream& operator<<(::std::ostream& os, const coalescedReductionInputs<T>& dims)
 {
-  return os;
+  return os << "{ " << dims.tolerance << ", " << dims.rows << ", " << dims.cols << ", "
+            << dims.seed;
 }
 
 // Or else, we get the following compilation error
@@ -113,15 +114,40 @@ class coalescedReductionTest : public ::testing::TestWithParam<coalescedReductio
   rmm::device_uvector<T> dots_act;
 };
 
-const std::vector<coalescedReductionInputs<float>> inputsf = {{0.000002f, 1024, 32, 1234ULL},
-                                                              {0.000002f, 1024, 64, 1234ULL},
-                                                              {0.000002f, 1024, 128, 1234ULL},
-                                                              {0.000002f, 1024, 256, 1234ULL}};
-
-const std::vector<coalescedReductionInputs<double>> inputsd = {{0.000000001, 1024, 32, 1234ULL},
-                                                               {0.000000001, 1024, 64, 1234ULL},
-                                                               {0.000000001, 1024, 128, 1234ULL},
-                                                               {0.000000001, 1024, 256, 1234ULL}};
+// Note: it's important to have a variety of rows/columns combinations to test all possible code
+// paths: thin (few cols or many rows), medium, thick (many cols, very few rows).
+
+const std::vector<coalescedReductionInputs<float>> inputsf = {{0.000002f, 50, 2, 1234ULL},
+                                                              {0.000002f, 50, 3, 1234ULL},
+                                                              {0.000002f, 50, 7, 1234ULL},
+                                                              {0.000002f, 50, 9, 1234ULL},
+                                                              {0.000002f, 50, 20, 1234ULL},
+                                                              {0.000002f, 50, 55, 1234ULL},
+                                                              {0.000002f, 50, 100, 1234ULL},
+                                                              {0.000002f, 50, 270, 1234ULL},
+                                                              {0.000002f, 10000, 3, 1234ULL},
+                                                              {0.000002f, 10000, 9, 1234ULL},
+                                                              {0.000002f, 10000, 20, 1234ULL},
+                                                              {0.000002f, 10000, 55, 1234ULL},
+                                                              {0.000002f, 10000, 100, 1234ULL},
+                                                              {0.000002f, 10000, 270, 1234ULL},
+                                                              {0.0001f, 10, 25000, 1234ULL}};
+
+const std::vector<coalescedReductionInputs<double>> inputsd = {{0.000000001, 50, 2, 1234ULL},
+                                                               {0.000000001, 50, 3, 1234ULL},
+                                                               {0.000000001, 50, 7, 1234ULL},
+                                                               {0.000000001, 50, 9, 1234ULL},
+                                                               {0.000000001, 50, 20, 1234ULL},
+                                                               {0.000000001, 50, 55, 1234ULL},
+                                                               {0.000000001, 50, 100, 1234ULL},
+                                                               {0.000000001, 50, 270, 1234ULL},
+                                                               {0.000000001, 10000, 3, 1234ULL},
+                                                               {0.000000001, 10000, 9, 1234ULL},
+                                                               {0.000000001, 10000, 20, 1234ULL},
+                                                               {0.000000001, 10000, 55, 1234ULL},
+                                                               {0.000000001, 10000, 100, 1234ULL},
+                                                               {0.000000001, 10000, 270, 1234ULL},
+                                                               {0.0000001, 10, 25000, 1234ULL}};
 
 typedef coalescedReductionTest<float> coalescedReductionTestF;
 TEST_P(coalescedReductionTestF, Result)

From 71a19a2495b74ef726d7d95a0c953e16b3c86d67 Mon Sep 17 00:00:00 2001
From: Mark Harris <783069+harrism@users.noreply.github.com>
Date: Wed, 24 Apr 2024 11:54:01 +1000
Subject: [PATCH 14/60] Convert device_memory_resource* to
 device_async_resource_ref (#2269)

Closes #2261

For reviewers:
Many of the changes are simple textual replacements of `rmm::mr::device_memory_resource *` with `rmm::device_async_resource_ref`. However, there are several places where RAFT used a default value of `nullptr` for `device_memory_resource*` parameters. This is incompatible with a `resource_ref`, which is a lightweight non-owning reference class, not a pointer. In most places, I was able to either remove the default parameter value, or use `rmm::mr::get_current_device_resource()`. In the case of ivf_pq, I removed the deprecated versions of `search` that took an `mr` parameter.

I removed the unused old src/util/memory_pool.cpp and its headers.

Authors:
  - Mark Harris (https://github.com/harrism)

Approvers:
  - Artem M. Chirkin (https://github.com/achirkin)
  - Dante Gama Dessavre (https://github.com/dantegd)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2269
---
 cpp/CMakeLists.txt                            |  1 -
 cpp/bench/ann/src/raft/raft_ann_bench_utils.h |  5 +-
 cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu  | 12 +--
 cpp/bench/ann/src/raft/raft_cagra_wrapper.h   |  4 +-
 .../ann/src/raft/raft_ivf_flat_wrapper.h      | 11 ++-
 cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h  |  3 -
 cpp/bench/prims/common/benchmark.hpp          |  1 +
 cpp/bench/prims/matrix/gather.cu              |  1 +
 cpp/bench/prims/neighbors/knn.cuh             | 15 +++-
 cpp/bench/prims/random/subsample.cu           |  1 +
 .../raft/cluster/detail/kmeans_balanced.cuh   | 44 +++++-----
 cpp/include/raft/cluster/kmeans_balanced.cuh  |  3 +-
 .../raft/core/device_container_policy.hpp     | 19 ++---
 cpp/include/raft/core/device_mdarray.hpp      |  4 +-
 cpp/include/raft/core/device_resources.hpp    |  1 +
 .../raft/distance/detail/masked_nn.cuh        |  3 +-
 .../raft/matrix/detail/select_k-ext.cuh       |  3 -
 .../raft/matrix/detail/select_radix.cuh       | 10 +--
 .../raft/matrix/detail/select_warpsort.cuh    |  5 +-
 .../neighbors/detail/cagra/cagra_build.cuh    |  4 +-
 .../raft/neighbors/detail/cagra/utils.hpp     |  5 +-
 .../neighbors/detail/ivf_flat_search-ext.cuh  |  8 +-
 .../neighbors/detail/ivf_flat_search-inl.cuh  |  9 +-
 .../raft/neighbors/detail/ivf_pq_build.cuh    | 13 +--
 .../raft/neighbors/detail/ivf_pq_search.cuh   |  5 +-
 .../raft/neighbors/detail/knn_brute_force.cuh |  1 -
 cpp/include/raft/neighbors/ivf_flat-ext.cuh   | 10 +--
 cpp/include/raft/neighbors/ivf_flat-inl.cuh   |  8 +-
 cpp/include/raft/neighbors/ivf_pq-ext.cuh     | 32 +------
 cpp/include/raft/neighbors/ivf_pq-inl.cuh     | 72 +---------------
 .../random/detail/multi_variable_gaussian.cuh | 18 ++--
 .../raft/random/multi_variable_gaussian.cuh   | 12 ++-
 .../sparse/matrix/detail/select_k-ext.cuh     |  3 -
 .../raft/spatial/knn/detail/ann_quantized.cuh | 11 ++-
 .../raft/spatial/knn/detail/ann_utils.cuh     |  5 +-
 cpp/include/raft/util/cudart_utils.hpp        |  1 -
 cpp/include/raft/util/memory_pool-ext.hpp     | 28 ------
 cpp/include/raft/util/memory_pool-inl.hpp     | 85 -------------------
 cpp/include/raft/util/memory_pool.hpp         | 23 -----
 .../neighbors/ivf_pq_search_test-ext.cuh      |  5 +-
 .../raft_internal/neighbors/naive_knn.cuh     |  2 -
 cpp/src/neighbors/detail/ivf_flat_search.cu   |  6 +-
 cpp/src/neighbors/ivf_flat_00_generate.py     | 14 +--
 .../ivf_flat_search_float_int64_t.cu          |  6 +-
 .../ivf_flat_search_int8_t_int64_t.cu         |  6 +-
 .../ivf_flat_search_uint8_t_int64_t.cu        |  6 +-
 .../neighbors/ivfpq_search_float_int64_t.cu   |  7 +-
 .../neighbors/ivfpq_search_half_int64_t.cu    |  5 +-
 .../neighbors/ivfpq_search_int8_t_int64_t.cu  |  7 +-
 .../neighbors/ivfpq_search_uint8_t_int64_t.cu |  7 +-
 cpp/test/CMakeLists.txt                       |  1 -
 cpp/test/core/device_resources_manager.cpp    |  8 +-
 cpp/test/ext_headers/00_generate.py           |  5 +-
 .../ext_headers/raft_util_memory_pool.cpp     | 27 ------
 cpp/test/matrix/select_k.cuh                  |  1 -
 .../ivf_pq_search_float_uint32_t.cu           |  3 +-
 cpp/test/neighbors/ann_utils.cuh              |  4 -
 cpp/test/random/multi_variable_gaussian.cu    |  5 +-
 cpp/test/util/device_atomics.cu               |  1 +
 59 files changed, 187 insertions(+), 438 deletions(-)
 delete mode 100644 cpp/include/raft/util/memory_pool-ext.hpp
 delete mode 100644 cpp/include/raft/util/memory_pool-inl.hpp
 delete mode 100644 cpp/include/raft/util/memory_pool.hpp
 delete mode 100644 cpp/test/ext_headers/raft_util_memory_pool.cpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 25475fc6f2..eaab637338 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -565,7 +565,6 @@ if(RAFT_COMPILE_LIBRARY)
     src/spatial/knn/detail/fused_l2_knn_int32_t_float.cu
     src/spatial/knn/detail/fused_l2_knn_int64_t_float.cu
     src/spatial/knn/detail/fused_l2_knn_uint32_t_float.cu
-    src/util/memory_pool.cpp
   )
   set_target_properties(
     raft_objs
diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h
index 40c1ecfa5e..72a2c0bb05 100644
--- a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h
+++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h
@@ -26,6 +26,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/failure_callback_resource_adaptor.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
@@ -130,8 +131,8 @@ class configured_raft_resources {
   {
   }
 
-  configured_raft_resources(configured_raft_resources&&)            = default;
-  configured_raft_resources& operator=(configured_raft_resources&&) = default;
+  configured_raft_resources(configured_raft_resources&&)            = delete;
+  configured_raft_resources& operator=(configured_raft_resources&&) = delete;
   ~configured_raft_resources()                                      = default;
   configured_raft_resources(const configured_raft_resources& res)
     : configured_raft_resources{res.shared_res_}
diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu
index 709b08db76..d9ef1d74a3 100644
--- a/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu
+++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib.cu
@@ -20,6 +20,7 @@
 
 #include <rmm/cuda_device.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #define JSON_DIAGNOSTICS 1
 #include <nlohmann/json.hpp>
@@ -89,10 +90,11 @@ int main(int argc, char** argv)
   // and is initially sized to half of free device memory.
   rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{
     &cuda_mr, rmm::percent_of_free_device_memory(50)};
-  rmm::mr::set_current_device_resource(
-    &pool_mr);  // Updates the current device resource pointer to `pool_mr`
-  rmm::mr::device_memory_resource* mr =
-    rmm::mr::get_current_device_resource();  // Points to `pool_mr`
-  return raft::bench::ann::run_main(argc, argv);
+  // Updates the current device resource pointer to `pool_mr`
+  auto old_mr = rmm::mr::set_current_device_resource(&pool_mr);
+  auto ret    = raft::bench::ann::run_main(argc, argv);
+  // Restores the current device resource pointer to its previous value
+  rmm::mr::set_current_device_resource(old_mr);
+  return ret;
 }
 #endif
diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
index 70fd22001e..46da8c52e6 100644
--- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
@@ -36,7 +36,7 @@
 #include <raft/util/cudart_utils.hpp>
 
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cassert>
 #include <fstream>
@@ -138,7 +138,7 @@ class RaftCagra : public ANN<T>, public AnnGPU {
   std::shared_ptr<raft::device_matrix<T, int64_t, row_major>> dataset_;
   std::shared_ptr<raft::device_matrix_view<const T, int64_t, row_major>> input_dataset_v_;
 
-  inline rmm::mr::device_memory_resource* get_mr(AllocatorType mem_type)
+  inline rmm::device_async_resource_ref get_mr(AllocatorType mem_type)
   {
     switch (mem_type) {
       case (AllocatorType::HostPinned): return &mr_pinned_;
diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h
index 7f2996d77a..48d2b9de80 100644
--- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h
@@ -134,7 +134,14 @@ void RaftIvfFlatGpu<T, IdxT>::search(
   const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const
 {
   static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t");
-  raft::neighbors::ivf_flat::search(
-    handle_, search_params_, *index_, queries, batch_size, k, (IdxT*)neighbors, distances);
+  raft::neighbors::ivf_flat::search(handle_,
+                                    search_params_,
+                                    *index_,
+                                    queries,
+                                    batch_size,
+                                    k,
+                                    (IdxT*)neighbors,
+                                    distances,
+                                    resource::get_workspace_resource(handle_));
 }
 }  // namespace raft::bench::ann
diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
index 5d8b682264..1d73bd2e51 100644
--- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
@@ -32,9 +32,6 @@
 #include <raft/neighbors/refine.cuh>
 #include <raft/util/cudart_utils.hpp>
 
-#include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
-
 #include <type_traits>
 
 namespace raft::bench::ann {
diff --git a/cpp/bench/prims/common/benchmark.hpp b/cpp/bench/prims/common/benchmark.hpp
index 4ecad6df3d..3ce43cc1e7 100644
--- a/cpp/bench/prims/common/benchmark.hpp
+++ b/cpp/bench/prims/common/benchmark.hpp
@@ -28,6 +28,7 @@
 #include <rmm/cuda_stream.hpp>
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_buffer.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
diff --git a/cpp/bench/prims/matrix/gather.cu b/cpp/bench/prims/matrix/gather.cu
index 078f9e6198..876e47525c 100644
--- a/cpp/bench/prims/matrix/gather.cu
+++ b/cpp/bench/prims/matrix/gather.cu
@@ -24,6 +24,7 @@
 #include <raft/util/itertools.hpp>
 
 #include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
 namespace raft::bench::matrix {
diff --git a/cpp/bench/prims/neighbors/knn.cuh b/cpp/bench/prims/neighbors/knn.cuh
index aea7168142..6499078623 100644
--- a/cpp/bench/prims/neighbors/knn.cuh
+++ b/cpp/bench/prims/neighbors/knn.cuh
@@ -27,10 +27,12 @@
 #include <raft/spatial/knn/knn.cuh>
 #include <raft/util/itertools.hpp>
 
+#include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/host/new_delete_resource.hpp>
 #include <rmm/mr/host/pinned_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/sequence.h>
 
@@ -101,7 +103,7 @@ struct device_resource {
     if (managed_) { delete res_; }
   }
 
-  [[nodiscard]] auto get() const -> rmm::mr::device_memory_resource* { return res_; }
+  [[nodiscard]] auto get() const -> rmm::device_async_resource_ref { return res_; }
 
  private:
   const bool managed_;
@@ -158,8 +160,15 @@ struct ivf_flat_knn {
               IdxT* out_idxs)
   {
     search_params.n_probes = 20;
-    raft::neighbors::ivf_flat::search(
-      handle, search_params, *index, search_items, ps.n_queries, ps.k, out_idxs, out_dists);
+    raft::neighbors::ivf_flat::search(handle,
+                                      search_params,
+                                      *index,
+                                      search_items,
+                                      ps.n_queries,
+                                      ps.k,
+                                      out_idxs,
+                                      out_dists,
+                                      resource::get_workspace_resource(handle));
   }
 };
 
diff --git a/cpp/bench/prims/random/subsample.cu b/cpp/bench/prims/random/subsample.cu
index 4c8ca2bf31..70a9c65e0d 100644
--- a/cpp/bench/prims/random/subsample.cu
+++ b/cpp/bench/prims/random/subsample.cu
@@ -27,6 +27,7 @@
 #include <raft/util/cudart_utils.hpp>
 
 #include <rmm/device_scalar.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
 #include <cub/cub.cuh>
diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
index 6d3f430e88..0a5a3ba5aa 100644
--- a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
+++ b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
@@ -43,15 +43,14 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
-#include <rmm/device_vector.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/gather.h>
 #include <thrust/transform.h>
 
 #include <limits>
+#include <optional>
 #include <tuple>
 #include <type_traits>
 
@@ -91,7 +90,7 @@ inline std::enable_if_t<std::is_floating_point_v<MathT>> predict_core(
   const MathT* dataset_norm,
   IdxT n_rows,
   LabelT* labels,
-  rmm::mr::device_memory_resource* mr)
+  rmm::device_async_resource_ref mr)
 {
   auto stream = resource::get_cuda_stream(handle);
   switch (params.metric) {
@@ -263,10 +262,9 @@ void calc_centers_and_sizes(const raft::resources& handle,
                             const LabelT* labels,
                             bool reset_counters,
                             MappingOpT mapping_op,
-                            rmm::mr::device_memory_resource* mr = nullptr)
+                            rmm::device_async_resource_ref mr)
 {
   auto stream = resource::get_cuda_stream(handle);
-  if (mr == nullptr) { mr = resource::get_workspace_resource(handle); }
 
   if (!reset_counters) {
     raft::linalg::matrixVectorOp(
@@ -322,12 +320,12 @@ void compute_norm(const raft::resources& handle,
                   IdxT dim,
                   IdxT n_rows,
                   MappingOpT mapping_op,
-                  rmm::mr::device_memory_resource* mr = nullptr)
+                  std::optional<rmm::device_async_resource_ref> mr = std::nullopt)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope("compute_norm");
   auto stream = resource::get_cuda_stream(handle);
-  if (mr == nullptr) { mr = resource::get_workspace_resource(handle); }
-  rmm::device_uvector<MathT> mapped_dataset(0, stream, mr);
+  rmm::device_uvector<MathT> mapped_dataset(
+    0, stream, mr.value_or(resource::get_workspace_resource(handle)));
 
   const MathT* dataset_ptr = nullptr;
 
@@ -338,7 +336,7 @@ void compute_norm(const raft::resources& handle,
 
     linalg::unaryOp(mapped_dataset.data(), dataset, n_rows * dim, mapping_op, stream);
 
-    dataset_ptr = (const MathT*)mapped_dataset.data();
+    dataset_ptr = static_cast<const MathT*>(mapped_dataset.data());
   }
 
   raft::linalg::rowNorm<MathT, IdxT>(
@@ -376,22 +374,22 @@ void predict(const raft::resources& handle,
              IdxT n_rows,
              LabelT* labels,
              MappingOpT mapping_op,
-             rmm::mr::device_memory_resource* mr = nullptr,
-             const MathT* dataset_norm           = nullptr)
+             std::optional<rmm::device_async_resource_ref> mr = std::nullopt,
+             const MathT* dataset_norm                        = nullptr)
 {
   auto stream = resource::get_cuda_stream(handle);
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "predict(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
-  if (mr == nullptr) { mr = resource::get_workspace_resource(handle); }
+  auto mem_res = mr.value_or(resource::get_workspace_resource(handle));
   auto [max_minibatch_size, _mem_per_row] =
     calc_minibatch_size<MathT>(n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
   rmm::device_uvector<MathT> cur_dataset(
-    std::is_same_v<T, MathT> ? 0 : max_minibatch_size * dim, stream, mr);
+    std::is_same_v<T, MathT> ? 0 : max_minibatch_size * dim, stream, mem_res);
   bool need_compute_norm =
     dataset_norm == nullptr && (params.metric == raft::distance::DistanceType::L2Expanded ||
                                 params.metric == raft::distance::DistanceType::L2SqrtExpanded);
   rmm::device_uvector<MathT> cur_dataset_norm(
-    need_compute_norm ? max_minibatch_size : 0, stream, mr);
+    need_compute_norm ? max_minibatch_size : 0, stream, mem_res);
   const MathT* dataset_norm_ptr = nullptr;
   auto cur_dataset_ptr          = cur_dataset.data();
   for (IdxT offset = 0; offset < n_rows; offset += max_minibatch_size) {
@@ -407,7 +405,7 @@ void predict(const raft::resources& handle,
     // Compute the norm now if it hasn't been pre-computed.
     if (need_compute_norm) {
       compute_norm(
-        handle, cur_dataset_norm.data(), cur_dataset_ptr, dim, minibatch_size, mapping_op, mr);
+        handle, cur_dataset_norm.data(), cur_dataset_ptr, dim, minibatch_size, mapping_op, mem_res);
       dataset_norm_ptr = cur_dataset_norm.data();
     } else if (dataset_norm != nullptr) {
       dataset_norm_ptr = dataset_norm + offset;
@@ -422,7 +420,7 @@ void predict(const raft::resources& handle,
                  dataset_norm_ptr,
                  minibatch_size,
                  labels + offset,
-                 mr);
+                 mem_res);
   }
 }
 
@@ -530,7 +528,7 @@ auto adjust_centers(MathT* centers,
                     MathT threshold,
                     MappingOpT mapping_op,
                     rmm::cuda_stream_view stream,
-                    rmm::mr::device_memory_resource* device_memory) -> bool
+                    rmm::device_async_resource_ref device_memory) -> bool
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "adjust_centers(%zu, %u)", static_cast<size_t>(n_rows), n_clusters);
@@ -628,7 +626,7 @@ void balancing_em_iters(const raft::resources& handle,
                         uint32_t balancing_pullback,
                         MathT balancing_threshold,
                         MappingOpT mapping_op,
-                        rmm::mr::device_memory_resource* device_memory)
+                        rmm::device_async_resource_ref device_memory)
 {
   auto stream                = resource::get_cuda_stream(handle);
   uint32_t balancing_counter = balancing_pullback;
@@ -711,7 +709,7 @@ void build_clusters(const raft::resources& handle,
                     LabelT* cluster_labels,
                     CounterT* cluster_sizes,
                     MappingOpT mapping_op,
-                    rmm::mr::device_memory_resource* device_memory,
+                    rmm::device_async_resource_ref device_memory,
                     const MathT* dataset_norm = nullptr)
 {
   auto stream = resource::get_cuda_stream(handle);
@@ -853,8 +851,8 @@ auto build_fine_clusters(const raft::resources& handle,
                          IdxT fine_clusters_nums_max,
                          MathT* cluster_centers,
                          MappingOpT mapping_op,
-                         rmm::mr::device_memory_resource* managed_memory,
-                         rmm::mr::device_memory_resource* device_memory) -> IdxT
+                         rmm::device_async_resource_ref managed_memory,
+                         rmm::device_async_resource_ref device_memory) -> IdxT
 {
   auto stream = resource::get_cuda_stream(handle);
   rmm::device_uvector<IdxT> mc_trainset_ids_buf(mesocluster_size_max, stream, managed_memory);
@@ -971,7 +969,7 @@ void build_hierarchical(const raft::resources& handle,
 
   // TODO: Remove the explicit managed memory- we shouldn't be creating this on the user's behalf.
   rmm::mr::managed_memory_resource managed_memory;
-  rmm::mr::device_memory_resource* device_memory = resource::get_workspace_resource(handle);
+  rmm::device_async_resource_ref device_memory = resource::get_workspace_resource(handle);
   auto [max_minibatch_size, mem_per_row] =
     calc_minibatch_size<MathT>(n_clusters, n_rows, dim, params.metric, std::is_same_v<T, MathT>);
 
diff --git a/cpp/include/raft/cluster/kmeans_balanced.cuh b/cpp/include/raft/cluster/kmeans_balanced.cuh
index 8cd7730814..a1a182608b 100644
--- a/cpp/include/raft/cluster/kmeans_balanced.cuh
+++ b/cpp/include/raft/cluster/kmeans_balanced.cuh
@@ -358,7 +358,8 @@ void calc_centers_and_sizes(const raft::resources& handle,
                                  X.extent(0),
                                  labels.data_handle(),
                                  reset_counters,
-                                 mapping_op);
+                                 mapping_op,
+                                 resource::get_workspace_resource(handle));
 }
 
 }  // namespace helpers
diff --git a/cpp/include/raft/core/device_container_policy.hpp b/cpp/include/raft/core/device_container_policy.hpp
index 8c6eff582b..18d8b77364 100644
--- a/cpp/include/raft/core/device_container_policy.hpp
+++ b/cpp/include/raft/core/device_container_policy.hpp
@@ -31,7 +31,8 @@
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <thrust/device_ptr.h>
 
@@ -117,7 +118,7 @@ class device_uvector {
    */
   explicit device_uvector(std::size_t size,
                           rmm::cuda_stream_view stream,
-                          rmm::mr::device_memory_resource* mr)
+                          rmm::device_async_resource_ref mr)
     : data_{size, stream, mr}
   {
   }
@@ -164,19 +165,11 @@ class device_uvector_policy {
  public:
   auto create(raft::resources const& res, size_t n) -> container_type
   {
-    if (mr_ == nullptr) {
-      // NB: not using the workspace resource by default!
-      //     The workspace resource is for short-lived temporary allocations.
-      return container_type(n, resource::get_cuda_stream(res));
-    } else {
-      return container_type(n, resource::get_cuda_stream(res), mr_);
-    }
+    return container_type(n, resource::get_cuda_stream(res), mr_);
   }
 
   constexpr device_uvector_policy() = default;
-  constexpr explicit device_uvector_policy(rmm::mr::device_memory_resource* mr) noexcept : mr_(mr)
-  {
-  }
+  explicit device_uvector_policy(rmm::device_async_resource_ref mr) noexcept : mr_(mr) {}
 
   [[nodiscard]] constexpr auto access(container_type& c, size_t n) const noexcept -> reference
   {
@@ -192,7 +185,7 @@ class device_uvector_policy {
   [[nodiscard]] auto make_accessor_policy() const noexcept { return const_accessor_policy{}; }
 
  private:
-  rmm::mr::device_memory_resource* mr_{nullptr};
+  rmm::device_async_resource_ref mr_{rmm::mr::get_current_device_resource()};
 };
 
 }  // namespace raft
diff --git a/cpp/include/raft/core/device_mdarray.hpp b/cpp/include/raft/core/device_mdarray.hpp
index 855642cd76..a34f6e2e02 100644
--- a/cpp/include/raft/core/device_mdarray.hpp
+++ b/cpp/include/raft/core/device_mdarray.hpp
@@ -21,6 +21,8 @@
 #include <raft/core/mdarray.hpp>
 #include <raft/core/resources.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #include <cstdint>
 
 namespace raft {
@@ -107,7 +109,7 @@ template <typename ElementType,
           typename LayoutPolicy = layout_c_contiguous,
           size_t... Extents>
 auto make_device_mdarray(raft::resources const& handle,
-                         rmm::mr::device_memory_resource* mr,
+                         rmm::device_async_resource_ref mr,
                          extents<IndexType, Extents...> exts)
 {
   using mdarray_t = device_mdarray<ElementType, decltype(exts), LayoutPolicy>;
diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp
index 366e387fdd..496c65d91f 100644
--- a/cpp/include/raft/core/device_resources.hpp
+++ b/cpp/include/raft/core/device_resources.hpp
@@ -37,6 +37,7 @@
 
 #include <rmm/cuda_stream_pool.hpp>
 #include <rmm/exec_policy.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
 
 #include <cuda_runtime.h>
 
diff --git a/cpp/include/raft/distance/detail/masked_nn.cuh b/cpp/include/raft/distance/detail/masked_nn.cuh
index 3e3699766f..951e030cbd 100644
--- a/cpp/include/raft/distance/detail/masked_nn.cuh
+++ b/cpp/include/raft/distance/detail/masked_nn.cuh
@@ -256,9 +256,8 @@ void masked_l2_nn_impl(raft::resources const& handle,
   static_assert(P::Mblk == 64, "masked_l2_nn_impl only supports a policy with 64 rows per block.");
 
   // Get stream and workspace memory resource
-  rmm::mr::device_memory_resource* ws_mr =
-    dynamic_cast<rmm::mr::device_memory_resource*>(resource::get_workspace_resource(handle));
   auto stream = resource::get_cuda_stream(handle);
+  auto ws_mr  = resource::get_workspace_resource(handle);
 
   // Acquire temporary buffers and initialize to zero:
   // 1) Adjacency matrix bitfield
diff --git a/cpp/include/raft/matrix/detail/select_k-ext.cuh b/cpp/include/raft/matrix/detail/select_k-ext.cuh
index 506cbffcb9..6db1a5acac 100644
--- a/cpp/include/raft/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/matrix/detail/select_k-ext.cuh
@@ -20,9 +20,6 @@
 #include <raft/matrix/select_k_types.hpp>
 #include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
 
-#include <rmm/cuda_stream_view.hpp>                  // rmm:cuda_stream_view
-#include <rmm/mr/device/device_memory_resource.hpp>  // rmm::mr::device_memory_resource
-
 #include <cuda_fp16.h>  // __half
 
 #include <cstdint>  // uint32_t
diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh
index 83d4845c31..9480c8e202 100644
--- a/cpp/include/raft/matrix/detail/select_radix.cuh
+++ b/cpp/include/raft/matrix/detail/select_radix.cuh
@@ -29,9 +29,9 @@
 #include <raft/util/pow2_utils.cuh>
 #include <raft/util/vectorized.cuh>
 
+#include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
-#include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_scan.cuh>
@@ -894,14 +894,12 @@ void radix_topk(const T* in,
                 unsigned grid_dim,
                 int sm_cnt,
                 rmm::cuda_stream_view stream,
-                rmm::mr::device_memory_resource* mr)
+                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
   // TODO: is it possible to relax this restriction?
   static_assert(calc_num_passes<T, BitsPerPass>() > 1);
   constexpr int num_buckets = calc_num_buckets<BitsPerPass>();
 
-  if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
-
   auto kernel = radix_kernel<T, IdxT, BitsPerPass, BlockSize, false, len_or_indptr>;
   const size_t max_chunk_size =
     calc_chunk_size<T, IdxT, BlockSize>(batch_size, len, sm_cnt, kernel, false);
@@ -1179,7 +1177,7 @@ void radix_topk_one_block(const T* in,
                           const IdxT* len_i,
                           int sm_cnt,
                           rmm::cuda_stream_view stream,
-                          rmm::mr::device_memory_resource* mr)
+                          rmm::device_async_resource_ref mr)
 {
   static_assert(calc_num_passes<T, BitsPerPass>() > 1);
 
diff --git a/cpp/include/raft/matrix/detail/select_warpsort.cuh b/cpp/include/raft/matrix/detail/select_warpsort.cuh
index 2cb32585d5..7da659291c 100644
--- a/cpp/include/raft/matrix/detail/select_warpsort.cuh
+++ b/cpp/include/raft/matrix/detail/select_warpsort.cuh
@@ -27,8 +27,9 @@
 #include <raft/util/integer_utils.hpp>
 #include <raft/util/pow2_utils.cuh>
 
+#include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <algorithm>
 #include <functional>
@@ -1032,7 +1033,7 @@ void select_k_(int num_of_block,
                IdxT* out_idx,
                bool select_min,
                rmm::cuda_stream_view stream,
-               rmm::mr::device_memory_resource* mr)
+               rmm::device_async_resource_ref mr)
 {
   rmm::device_uvector<T> tmp_val(num_of_block * k * batch_size, stream, mr);
   rmm::device_uvector<IdxT> tmp_idx(num_of_block * k * batch_size, stream, mr);
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
index d91e45257e..d63f865c39 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
@@ -34,6 +34,8 @@
 #include <raft/neighbors/refine.cuh>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 
+#include <rmm/resource_ref.hpp>
+
 #include <chrono>
 #include <cstdio>
 #include <vector>
@@ -124,7 +126,7 @@ void build_knn_graph(raft::resources const& res,
   bool first                    = true;
   const auto start_clock        = std::chrono::system_clock::now();
 
-  rmm::mr::device_memory_resource* device_memory = raft::resource::get_workspace_resource(res);
+  rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(res);
 
   raft::spatial::knn::detail::utils::batch_load_iterator<DataT> vec_batches(
     dataset.data_handle(),
diff --git a/cpp/include/raft/neighbors/detail/cagra/utils.hpp b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
index 265cbfdceb..ece95a7cb7 100644
--- a/cpp/include/raft/neighbors/detail/cagra/utils.hpp
+++ b/cpp/include/raft/neighbors/detail/cagra/utils.hpp
@@ -20,7 +20,7 @@
 #include <raft/core/host_mdarray.hpp>
 #include <raft/util/integer_utils.hpp>
 
-#include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda.h>
 #include <cuda_fp16.h>
@@ -261,9 +261,8 @@ template <typename T, typename data_accessor>
 void copy_with_padding(raft::resources const& res,
                        raft::device_matrix<T, int64_t, row_major>& dst,
                        mdspan<const T, matrix_extent<int64_t>, row_major, data_accessor> src,
-                       rmm::mr::device_memory_resource* mr = nullptr)
+                       rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
 {
-  if (!mr) { mr = rmm::mr::get_current_device_resource(); }
   size_t padded_dim = round_up_safe<size_t>(src.extent(1) * sizeof(T), 16) / sizeof(T);
 
   if ((dst.extent(0) != src.extent(0)) || (static_cast<size_t>(dst.extent(1)) != padded_dim)) {
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
index 350b82ede7..c14b0e810f 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-ext.cuh
@@ -20,6 +20,8 @@
 #include <raft/neighbors/sample_filter_types.hpp>  // none_ivf_sample_filter
 #include <raft/util/raft_explicit.hpp>             // RAFT_EXPLICIT
 
+#include <rmm/resource_ref.hpp>
+
 #include <cuda_fp16.h>
 
 #include <cstdint>  // uintX_t
@@ -37,8 +39,8 @@ void search(raft::resources const& handle,
             uint32_t k,
             IdxT* neighbors,
             float* distances,
-            rmm::mr::device_memory_resource* mr = nullptr,
-            IvfSampleFilterT sample_filter      = IvfSampleFilterT()) RAFT_EXPLICIT;
+            rmm::device_async_resource_ref mr,
+            IvfSampleFilterT sample_filter = IvfSampleFilterT()) RAFT_EXPLICIT;
 
 }  // namespace raft::neighbors::ivf_flat::detail
 
@@ -54,7 +56,7 @@ void search(raft::resources const& handle,
     uint32_t k,                                                                      \
     IdxT* neighbors,                                                                 \
     float* distances,                                                                \
-    rmm::mr::device_memory_resource* mr,                                             \
+    rmm::device_async_resource_ref mr,                                               \
     IvfSampleFilterT sample_filter)
 
 instantiate_raft_neighbors_ivf_flat_detail_search(
diff --git a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
index 441fb76b2f..388dd60f14 100644
--- a/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_flat_search-inl.cuh
@@ -30,7 +30,7 @@
 #include <raft/neighbors/sample_filter_types.hpp>               // none_ivf_sample_filter
 #include <raft/spatial/knn/detail/ann_utils.cuh>                // utils::mapping
 
-#include <rmm/mr/device/per_device_resource.hpp>  // rmm::device_memory_resource
+#include <rmm/resource_ref.hpp>
 
 namespace raft::neighbors::ivf_flat::detail {
 
@@ -48,7 +48,7 @@ void search_impl(raft::resources const& handle,
                  bool select_min,
                  IdxT* neighbors,
                  AccT* distances,
-                 rmm::mr::device_memory_resource* search_mr,
+                 rmm::device_async_resource_ref search_mr,
                  IvfSampleFilterT sample_filter)
 {
   auto stream = resource::get_cuda_stream(handle);
@@ -276,13 +276,12 @@ inline void search(raft::resources const& handle,
                    uint32_t k,
                    IdxT* neighbors,
                    float* distances,
-                   rmm::mr::device_memory_resource* mr = nullptr,
-                   IvfSampleFilterT sample_filter      = IvfSampleFilterT())
+                   rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource(),
+                   IvfSampleFilterT sample_filter    = IvfSampleFilterT())
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "ivf_flat::search(k = %u, n_queries = %u, dim = %zu)", k, n_queries, index.dim());
 
-  if (mr == nullptr) { mr = rmm::mr::get_current_device_resource(); }
   RAFT_EXPECTS(params.n_probes > 0,
                "n_probes (number of clusters to probe in the search) must be positive.");
   auto n_probes          = std::min<uint32_t>(params.n_probes, index.n_lists());
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
index 8e3f7dbaf3..24574642ef 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -49,6 +49,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/managed_memory_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda_fp16.h>
 #include <thrust/extrema.h>
@@ -171,7 +172,7 @@ void select_residuals(raft::resources const& handle,
                       const float* center,           // [dim]
                       const T* dataset,              // [.., dim]
                       const IdxT* row_ids,           // [n_rows]
-                      rmm::mr::device_memory_resource* device_memory
+                      rmm::device_async_resource_ref device_memory
 
 )
 {
@@ -225,7 +226,7 @@ void flat_compute_residuals(
   device_matrix_view<const float, uint32_t, row_major> centers,          // [n_lists, dim_ext]
   const T* dataset,                                                      // [n_rows, dim]
   std::variant<uint32_t, const uint32_t*> labels,                        // [n_rows]
-  rmm::mr::device_memory_resource* device_memory)
+  rmm::device_async_resource_ref device_memory)
 {
   auto stream  = resource::get_cuda_stream(handle);
   auto dim     = rotation_matrix.extent(1);
@@ -397,7 +398,7 @@ void train_per_subset(raft::resources const& handle,
                       const float* trainset,   // [n_rows, dim]
                       const uint32_t* labels,  // [n_rows]
                       uint32_t kmeans_n_iters,
-                      rmm::mr::device_memory_resource* managed_memory)
+                      rmm::device_async_resource_ref managed_memory)
 {
   auto stream        = resource::get_cuda_stream(handle);
   auto device_memory = resource::get_workspace_resource(handle);
@@ -475,7 +476,7 @@ void train_per_cluster(raft::resources const& handle,
                        const float* trainset,   // [n_rows, dim]
                        const uint32_t* labels,  // [n_rows]
                        uint32_t kmeans_n_iters,
-                       rmm::mr::device_memory_resource* managed_memory)
+                       rmm::device_async_resource_ref managed_memory)
 {
   auto stream        = resource::get_cuda_stream(handle);
   auto device_memory = resource::get_workspace_resource(handle);
@@ -1325,7 +1326,7 @@ void process_and_fill_codes(raft::resources const& handle,
                             std::variant<IdxT, const IdxT*> src_offset_or_indices,
                             const uint32_t* new_labels,
                             IdxT n_rows,
-                            rmm::mr::device_memory_resource* mr)
+                            rmm::device_async_resource_ref mr)
 {
   auto new_vectors_residual =
     make_device_mdarray<float>(handle, mr, make_extents<IdxT>(n_rows, index.rot_dim()));
@@ -1516,7 +1517,7 @@ void extend(raft::resources const& handle,
                   std::is_same_v<T, int8_t>,
                 "Unsupported data type");
 
-  rmm::mr::device_memory_resource* device_memory = raft::resource::get_workspace_resource(handle);
+  rmm::device_async_resource_ref device_memory = raft::resource::get_workspace_resource(handle);
 
   // The spec defines how the clusters look like
   auto spec = list_spec<uint32_t, IdxT>{
diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
index 4c5da38092..87e6d0a774 100644
--- a/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
+++ b/cpp/include/raft/neighbors/detail/ivf_pq_search.cuh
@@ -45,8 +45,7 @@
 #include <raft/util/pow2_utils.cuh>
 #include <raft/util/vectorized.cuh>
 
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cub/cub.cuh>
 #include <cuda_fp16.h>
@@ -76,7 +75,7 @@ void select_clusters(raft::resources const& handle,
                      raft::distance::DistanceType metric,
                      const T* queries,              // [n_queries, dim]
                      const float* cluster_centers,  // [n_lists, dim_ext]
-                     rmm::mr::device_memory_resource* mr)
+                     rmm::device_async_resource_ref mr)
 {
   common::nvtx::range<common::nvtx::domain::raft> fun_scope(
     "ivf_pq::search::select_clusters(n_probes = %u, n_queries = %u, n_lists = %u, dim = %u)",
diff --git a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
index adcb639301..daa2798b00 100644
--- a/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
+++ b/cpp/include/raft/neighbors/detail/knn_brute_force.cuh
@@ -38,7 +38,6 @@
 #include <raft/util/cudart_utils.hpp>
 
 #include <rmm/cuda_device.hpp>
-#include <rmm/cuda_stream_pool.hpp>
 #include <rmm/device_uvector.hpp>
 
 #include <thrust/iterator/transform_iterator.h>
diff --git a/cpp/include/raft/neighbors/ivf_flat-ext.cuh b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
index a1783dfcfd..12ab0dc3a6 100644
--- a/cpp/include/raft/neighbors/ivf_flat-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-ext.cuh
@@ -22,7 +22,7 @@
 #include <raft/neighbors/ivf_flat_types.hpp>  // raft::neighbors::ivf_flat::index
 #include <raft/util/raft_explicit.hpp>        // RAFT_EXPLICIT
 
-#include <rmm/mr/device/per_device_resource.hpp>  // rmm::mr::device_memory_resource
+#include <rmm/resource_ref.hpp>
 
 #include <cstdint>  // int64_t
 
@@ -109,8 +109,8 @@ void search_with_filtering(raft::resources const& handle,
                            uint32_t k,
                            IdxT* neighbors,
                            float* distances,
-                           rmm::mr::device_memory_resource* mr = nullptr,
-                           IvfSampleFilterT sample_filter      = IvfSampleFilterT()) RAFT_EXPLICIT;
+                           rmm::device_async_resource_ref mr,
+                           IvfSampleFilterT sample_filter = IvfSampleFilterT()) RAFT_EXPLICIT;
 
 template <typename T, typename IdxT>
 void search(raft::resources const& handle,
@@ -121,7 +121,7 @@ void search(raft::resources const& handle,
             uint32_t k,
             IdxT* neighbors,
             float* distances,
-            rmm::mr::device_memory_resource* mr = nullptr) RAFT_EXPLICIT;
+            rmm::device_async_resource_ref mr) RAFT_EXPLICIT;
 
 template <typename T, typename IdxT, typename IvfSampleFilterT>
 void search_with_filtering(raft::resources const& handle,
@@ -240,7 +240,7 @@ instantiate_raft_neighbors_ivf_flat_extend(uint8_t, int64_t);
     uint32_t k,                                                    \
     IdxT* neighbors,                                               \
     float* distances,                                              \
-    rmm::mr::device_memory_resource* mr);                          \
+    rmm::device_async_resource_ref mr);                            \
                                                                    \
   extern template void raft::neighbors::ivf_flat::search<T, IdxT>( \
     raft::resources const& handle,                                 \
diff --git a/cpp/include/raft/neighbors/ivf_flat-inl.cuh b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
index ed1d320795..ea7cff7060 100644
--- a/cpp/include/raft/neighbors/ivf_flat-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_flat-inl.cuh
@@ -24,7 +24,7 @@
 #include <raft/neighbors/ivf_flat_types.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
+#include <rmm/resource_ref.hpp>
 
 namespace raft::neighbors::ivf_flat {
 
@@ -462,8 +462,8 @@ void search_with_filtering(raft::resources const& handle,
                            uint32_t k,
                            IdxT* neighbors,
                            float* distances,
-                           rmm::mr::device_memory_resource* mr = nullptr,
-                           IvfSampleFilterT sample_filter      = IvfSampleFilterT())
+                           rmm::device_async_resource_ref mr,
+                           IvfSampleFilterT sample_filter = IvfSampleFilterT())
 {
   raft::neighbors::ivf_flat::detail::search(
     handle, params, index, queries, n_queries, k, neighbors, distances, mr, sample_filter);
@@ -520,7 +520,7 @@ void search(raft::resources const& handle,
             uint32_t k,
             IdxT* neighbors,
             float* distances,
-            rmm::mr::device_memory_resource* mr = nullptr)
+            rmm::device_async_resource_ref mr)
 {
   raft::neighbors::ivf_flat::detail::search(handle,
                                             params,
diff --git a/cpp/include/raft/neighbors/ivf_pq-ext.cuh b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
index 160a2753a5..620f4a244f 100644
--- a/cpp/include/raft/neighbors/ivf_pq-ext.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-ext.cuh
@@ -21,8 +21,6 @@
 #include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
 #include <raft/util/raft_explicit.hpp>      // RAFT_EXPLICIT
 
-#include <rmm/mr/device/per_device_resource.hpp>  // rmm::mr::device_memory_resource
-
 #include <cstdint>  // int64_t
 
 #ifdef RAFT_EXPLICIT_INSTANTIATE_ONLY
@@ -105,33 +103,6 @@ void search(raft::resources const& handle,
             IdxT* neighbors,
             float* distances) RAFT_EXPLICIT;
 
-template <typename T, typename IdxT, typename IvfSampleFilterT>
-[[deprecated(
-  "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void
-search_with_filtering(raft::resources const& handle,
-                      const raft::neighbors::ivf_pq::search_params& params,
-                      const index<IdxT>& idx,
-                      const T* queries,
-                      uint32_t n_queries,
-                      uint32_t k,
-                      IdxT* neighbors,
-                      float* distances,
-                      rmm::mr::device_memory_resource* mr,
-                      IvfSampleFilterT sample_filter = IvfSampleFilterT{}) RAFT_EXPLICIT;
-
-template <typename T, typename IdxT>
-[[deprecated(
-  "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void
-search(raft::resources const& handle,
-       const raft::neighbors::ivf_pq::search_params& params,
-       const index<IdxT>& idx,
-       const T* queries,
-       uint32_t n_queries,
-       uint32_t k,
-       IdxT* neighbors,
-       float* distances,
-       rmm::mr::device_memory_resource* mr) RAFT_EXPLICIT;
-
 }  // namespace raft::neighbors::ivf_pq
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
@@ -209,8 +180,7 @@ instantiate_raft_neighbors_ivf_pq_extend(uint8_t, int64_t);
     uint32_t n_queries,                                              \
     uint32_t k,                                                      \
     IdxT* neighbors,                                                 \
-    float* distances,                                                \
-    rmm::mr::device_memory_resource* mr);                            \
+    float* distances);                                               \
                                                                      \
   extern template void raft::neighbors::ivf_pq::search<T, IdxT>(     \
     raft::resources const& handle,                                   \
diff --git a/cpp/include/raft/neighbors/ivf_pq-inl.cuh b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
index a893153e1a..77c4bb8553 100644
--- a/cpp/include/raft/neighbors/ivf_pq-inl.cuh
+++ b/cpp/include/raft/neighbors/ivf_pq-inl.cuh
@@ -24,8 +24,6 @@
 #include <raft/neighbors/ivf_pq_serialize.cuh>
 #include <raft/neighbors/ivf_pq_types.hpp>
 
-#include <rmm/mr/device/device_memory_resource.hpp>
-
 #include <memory>  // shared_ptr
 
 namespace raft::neighbors::ivf_pq {
@@ -403,38 +401,6 @@ void search_with_filtering(raft::resources const& handle,
   detail::search(handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter);
 }
 
-/**
- * This function is deprecated and will be removed in a future.
- * Please drop the `mr` argument and use `raft::resource::set_workspace_resource` instead.
- */
-template <typename T, typename IdxT, typename IvfSampleFilterT>
-[[deprecated(
-  "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void
-search_with_filtering(raft::resources const& handle,
-                      const search_params& params,
-                      const index<IdxT>& idx,
-                      const T* queries,
-                      uint32_t n_queries,
-                      uint32_t k,
-                      IdxT* neighbors,
-                      float* distances,
-                      rmm::mr::device_memory_resource* mr,
-                      IvfSampleFilterT sample_filter = IvfSampleFilterT{})
-{
-  if (mr != nullptr) {
-    // Shallow copy of the resource with the automatic lifespan:
-    //                               change the workspace resource temporarily
-    raft::resources res_local(handle);
-    resource::set_workspace_resource(
-      res_local, std::shared_ptr<rmm::mr::device_memory_resource>{mr, void_op{}});
-    return search_with_filtering(
-      res_local, params, idx, queries, n_queries, k, neighbors, distances, sample_filter);
-  } else {
-    return search_with_filtering(
-      handle, params, idx, queries, n_queries, k, neighbors, distances, sample_filter);
-  }
-}
-
 /**
  * @brief Search ANN using the constructed index.
  *
@@ -446,16 +412,13 @@ search_with_filtering(raft::resources const& handle,
  * eliminate entirely allocations happening within `search`:
  * @code{.cpp}
  *   ...
- *   // Create a pooling memory resource with a pre-defined initial size.
- *   rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource> mr(
- *     rmm::mr::get_current_device_resource(), 1024 * 1024);
  *   // use default search parameters
  *   ivf_pq::search_params search_params;
  *   // Use the same allocator across multiple searches to reduce the number of
  *   // cuda memory allocations
- *   ivf_pq::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1, &mr);
- *   ivf_pq::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2, &mr);
- *   ivf_pq::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3, &mr);
+ *   ivf_pq::search(handle, search_params, index, queries1, N1, K, out_inds1, out_dists1);
+ *   ivf_pq::search(handle, search_params, index, queries2, N2, K, out_inds2, out_dists2);
+ *   ivf_pq::search(handle, search_params, index, queries3, N3, K, out_inds3, out_dists3);
  *   ...
  * @endcode
  * The exact size of the temporary buffer depends on multiple factors and is an implementation
@@ -496,33 +459,4 @@ void search(raft::resources const& handle,
                                raft::neighbors::filtering::none_ivf_sample_filter{});
 }
 
-/**
- * This function is deprecated and will be removed in a future.
- * Please drop the `mr` argument and use `raft::resource::set_workspace_resource` instead.
- */
-template <typename T, typename IdxT>
-[[deprecated(
-  "Drop the `mr` argument and use `raft::resource::set_workspace_resource` instead")]] void
-search(raft::resources const& handle,
-       const search_params& params,
-       const index<IdxT>& idx,
-       const T* queries,
-       uint32_t n_queries,
-       uint32_t k,
-       IdxT* neighbors,
-       float* distances,
-       rmm::mr::device_memory_resource* mr)
-{
-  return search_with_filtering(handle,
-                               params,
-                               idx,
-                               queries,
-                               n_queries,
-                               k,
-                               neighbors,
-                               distances,
-                               mr,
-                               raft::neighbors::filtering::none_ivf_sample_filter{});
-}
-
 }  // namespace raft::neighbors::ivf_pq
diff --git a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh
index e88cbbdeea..c33bb8c348 100644
--- a/cpp/include/raft/random/detail/multi_variable_gaussian.cuh
+++ b/cpp/include/raft/random/detail/multi_variable_gaussian.cuh
@@ -31,10 +31,10 @@
 #include <raft/util/cudart_utils.hpp>
 
 #include <rmm/device_uvector.hpp>
-
-#include <stdio.h>
+#include <rmm/resource_ref.hpp>
 
 #include <cmath>
+#include <cstdio>
 #include <memory>
 #include <optional>
 #include <type_traits>
@@ -278,7 +278,7 @@ class multi_variable_gaussian_setup_token;
 template <typename ValueType>
 multi_variable_gaussian_setup_token<ValueType> build_multi_variable_gaussian_token_impl(
   raft::resources const& handle,
-  rmm::mr::device_memory_resource& mem_resource,
+  rmm::device_async_resource_ref mem_resource,
   const int dim,
   const multi_variable_gaussian_decomposition_method method);
 
@@ -294,7 +294,7 @@ class multi_variable_gaussian_setup_token {
   template <typename T>
   friend multi_variable_gaussian_setup_token<T> build_multi_variable_gaussian_token_impl(
     raft::resources const& handle,
-    rmm::mr::device_memory_resource& mem_resource,
+    rmm::device_async_resource_ref mem_resource,
     const int dim,
     const multi_variable_gaussian_decomposition_method method);
 
@@ -321,7 +321,7 @@ class multi_variable_gaussian_setup_token {
   // Constructor, only for use by friend functions.
   // Hiding this will let us change the implementation in the future.
   multi_variable_gaussian_setup_token(raft::resources const& handle,
-                                      rmm::mr::device_memory_resource& mem_resource,
+                                      rmm::device_async_resource_ref mem_resource,
                                       const int dim,
                                       const multi_variable_gaussian_decomposition_method method)
     : impl_(std::make_unique<multi_variable_gaussian_impl<ValueType>>(
@@ -378,14 +378,14 @@ class multi_variable_gaussian_setup_token {
  private:
   std::unique_ptr<multi_variable_gaussian_impl<ValueType>> impl_;
   raft::resources const& handle_;
-  rmm::mr::device_memory_resource& mem_resource_;
+  rmm::device_async_resource_ref mem_resource_;
   int dim_ = 0;
 
   auto allocate_workspace() const
   {
     const auto num_elements = impl_->get_workspace_size();
     return rmm::device_uvector<ValueType>{
-      num_elements, resource::get_cuda_stream(handle_), &mem_resource_};
+      num_elements, resource::get_cuda_stream(handle_), mem_resource_};
   }
 
   int dim() const { return dim_; }
@@ -394,7 +394,7 @@ class multi_variable_gaussian_setup_token {
 template <typename ValueType>
 multi_variable_gaussian_setup_token<ValueType> build_multi_variable_gaussian_token_impl(
   raft::resources const& handle,
-  rmm::mr::device_memory_resource& mem_resource,
+  rmm::device_async_resource_ref mem_resource,
   const int dim,
   const multi_variable_gaussian_decomposition_method method)
 {
@@ -414,7 +414,7 @@ void compute_multi_variable_gaussian_impl(
 template <typename ValueType>
 void compute_multi_variable_gaussian_impl(
   raft::resources const& handle,
-  rmm::mr::device_memory_resource& mem_resource,
+  rmm::device_async_resource_ref mem_resource,
   std::optional<raft::device_vector_view<const ValueType, int>> x,
   raft::device_matrix_view<ValueType, int, raft::col_major> P,
   raft::device_matrix_view<ValueType, int, raft::col_major> X,
diff --git a/cpp/include/raft/random/multi_variable_gaussian.cuh b/cpp/include/raft/random/multi_variable_gaussian.cuh
index ab3f433422..4b37e1ff65 100644
--- a/cpp/include/raft/random/multi_variable_gaussian.cuh
+++ b/cpp/include/raft/random/multi_variable_gaussian.cuh
@@ -24,6 +24,8 @@
 #include <raft/core/resources.hpp>
 #include <raft/random/random_types.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 namespace raft::random {
 
 /**
@@ -33,7 +35,7 @@ namespace raft::random {
 
 template <typename ValueType>
 void multi_variable_gaussian(raft::resources const& handle,
-                             rmm::mr::device_memory_resource& mem_resource,
+                             rmm::device_async_resource_ref mem_resource,
                              std::optional<raft::device_vector_view<const ValueType, int>> x,
                              raft::device_matrix_view<ValueType, int, raft::col_major> P,
                              raft::device_matrix_view<ValueType, int, raft::col_major> X,
@@ -49,12 +51,8 @@ void multi_variable_gaussian(raft::resources const& handle,
                              raft::device_matrix_view<ValueType, int, raft::col_major> X,
                              const multi_variable_gaussian_decomposition_method method)
 {
-  rmm::mr::device_memory_resource* mem_resource_ptr = rmm::mr::get_current_device_resource();
-  RAFT_EXPECTS(mem_resource_ptr != nullptr,
-               "compute_multi_variable_gaussian: "
-               "rmm::mr::get_current_device_resource() returned null; "
-               "please report this bug to the RAPIDS RAFT developers.");
-  detail::compute_multi_variable_gaussian_impl(handle, *mem_resource_ptr, x, P, X, method);
+  detail::compute_multi_variable_gaussian_impl(
+    handle, rmm::mr::get_current_device_resource(), x, P, X, method);
 }
 
 /** @} */
diff --git a/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh b/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
index 08bdfa6f30..922356b040 100644
--- a/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
@@ -21,9 +21,6 @@
 #include <raft/matrix/select_k_types.hpp>
 #include <raft/util/raft_explicit.hpp>  // RAFT_EXPLICIT
 
-#include <rmm/cuda_stream_view.hpp>                  // rmm:cuda_stream_view
-#include <rmm/mr/device/device_memory_resource.hpp>  // rmm::mr::device_memory_resource
-
 #include <cuda_fp16.h>  // __half
 
 #include <cstdint>  // uint32_t
diff --git a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
index 041ab225f9..351bcd5531 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_quantized.cuh
@@ -108,8 +108,15 @@ void approx_knn_search(raft::resources const& handle,
   if (index->ivf_flat<T, int64_t>()) {
     ivf_flat::search_params params;
     params.n_probes = index->nprobe;
-    ivf_flat::search(
-      handle, params, *(index->ivf_flat<T, int64_t>()), query_array, n, k, indices, distances);
+    ivf_flat::search(handle,
+                     params,
+                     *(index->ivf_flat<T, int64_t>()),
+                     query_array,
+                     n,
+                     k,
+                     indices,
+                     distances,
+                     resource::get_workspace_resource(handle));
   } else if (index->ivf_pq) {
     neighbors::ivf_pq::search_params params;
     params.n_probes = index->nprobe;
diff --git a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
index d862e586e3..920249172f 100644
--- a/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ann_utils.cuh
@@ -25,6 +25,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_scalar.hpp>
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <cuda_fp16.hpp>
 
@@ -416,7 +417,7 @@ struct batch_load_iterator {
           size_type row_width,
           size_type batch_size,
           rmm::cuda_stream_view stream,
-          rmm::mr::device_memory_resource* mr)
+          rmm::device_async_resource_ref mr)
       : stream_(stream),
         buf_(0, stream, mr),
         source_(source),
@@ -502,7 +503,7 @@ struct batch_load_iterator {
                       size_type row_width,
                       size_type batch_size,
                       rmm::cuda_stream_view stream,
-                      rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+                      rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
     : cur_batch_(new batch(source, n_rows, row_width, batch_size, stream, mr)), cur_pos_(0)
   {
   }
diff --git a/cpp/include/raft/util/cudart_utils.hpp b/cpp/include/raft/util/cudart_utils.hpp
index e5ce15e8a3..2b334d1bbf 100644
--- a/cpp/include/raft/util/cudart_utils.hpp
+++ b/cpp/include/raft/util/cudart_utils.hpp
@@ -18,7 +18,6 @@
 
 #include <raft/core/error.hpp>
 #include <raft/util/cuda_rt_essentials.hpp>
-#include <raft/util/memory_pool.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
diff --git a/cpp/include/raft/util/memory_pool-ext.hpp b/cpp/include/raft/util/memory_pool-ext.hpp
deleted file mode 100644
index 030a9c681e..0000000000
--- a/cpp/include/raft/util/memory_pool-ext.hpp
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-#include <rmm/mr/device/device_memory_resource.hpp>  // rmm::mr::device_memory_resource
-
-#include <cstddef>  // size_t
-#include <memory>   // std::unique_ptr
-
-namespace raft {
-
-std::unique_ptr<rmm::mr::device_memory_resource> get_pool_memory_resource(
-  rmm::mr::device_memory_resource*& mr, size_t initial_size);
-
-}  // namespace raft
diff --git a/cpp/include/raft/util/memory_pool-inl.hpp b/cpp/include/raft/util/memory_pool-inl.hpp
deleted file mode 100644
index bd7e0186b3..0000000000
--- a/cpp/include/raft/util/memory_pool-inl.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include <raft/core/detail/macros.hpp>  // RAFT_INLINE_CONDITIONAL
-
-#include <rmm/aligned.hpp>
-#include <rmm/mr/device/managed_memory_resource.hpp>
-#include <rmm/mr/device/per_device_resource.hpp>
-#include <rmm/mr/device/pool_memory_resource.hpp>
-
-#include <cstddef>
-#include <memory>
-
-namespace raft {
-
-/**
- * @defgroup memory_pool Memory Pool
- * @{
- */
-/**
- * @brief Get a pointer to a pooled memory resource within the scope of the lifetime of the returned
- * unique pointer.
- *
- * This function is useful in the code where multiple repeated allocations/deallocations are
- * expected.
- * Use case example:
- * @code{.cpp}
- *   void my_func(..., size_t n, rmm::mr::device_memory_resource* mr = nullptr) {
- *     auto pool_guard = raft::get_pool_memory_resource(mr, 2 * n * sizeof(float));
- *     if (pool_guard){
- *       RAFT_LOG_INFO("Created a pool");
- *     } else {
- *       RAFT_LOG_INFO("Using the current default or explicitly passed device memory resource");
- *     }
- *     rmm::device_uvector<float> x(n, stream, mr);
- *     rmm::device_uvector<float> y(n, stream, mr);
- *     ...
- *   }
- * @endcode
- * Here, the new memory resource would be created within the function scope if the passed `mr` is
- * null and the default resource is not a pool. After the call, `mr` contains a valid memory
- * resource in any case.
- *
- * @param[inout] mr if not null do nothing; otherwise get the current device resource and wrap it
- * into a `pool_memory_resource` if necessary and return the pointer to the result.
- * @param initial_size if a new memory pool is created, this would be its initial size (rounded up
- * to 256 bytes).
- *
- * @return if a new memory pool is created, it returns a unique_ptr to it;
- *   this managed pointer controls the lifetime of the created memory resource.
- */
-RAFT_INLINE_CONDITIONAL std::unique_ptr<rmm::mr::device_memory_resource> get_pool_memory_resource(
-  rmm::mr::device_memory_resource*& mr, size_t initial_size)
-{
-  using pool_res_t = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
-  std::unique_ptr<pool_res_t> pool_res{nullptr};
-  if (mr) return pool_res;
-  mr = rmm::mr::get_current_device_resource();
-  if (!dynamic_cast<pool_res_t*>(mr) &&
-      !dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>*>(mr) &&
-      !dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::managed_memory_resource>*>(mr)) {
-    pool_res = std::make_unique<pool_res_t>(
-      mr, rmm::align_down(initial_size, rmm::CUDA_ALLOCATION_ALIGNMENT));
-    mr = pool_res.get();
-  }
-  return pool_res;
-}
-
-/** @} */
-}  // namespace raft
diff --git a/cpp/include/raft/util/memory_pool.hpp b/cpp/include/raft/util/memory_pool.hpp
deleted file mode 100644
index c9d25ecb1f..0000000000
--- a/cpp/include/raft/util/memory_pool.hpp
+++ /dev/null
@@ -1,23 +0,0 @@
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#pragma once
-
-#include "memory_pool-ext.hpp"
-
-#if !defined(RAFT_COMPILED)
-#include "memory_pool-inl.hpp"
-#endif  // RAFT_COMPILED
diff --git a/cpp/internal/raft_internal/neighbors/ivf_pq_search_test-ext.cuh b/cpp/internal/raft_internal/neighbors/ivf_pq_search_test-ext.cuh
index 7a65e2d2f8..1e6f4f9976 100644
--- a/cpp/internal/raft_internal/neighbors/ivf_pq_search_test-ext.cuh
+++ b/cpp/internal/raft_internal/neighbors/ivf_pq_search_test-ext.cuh
@@ -25,6 +25,8 @@
 
 #include <raft_internal/neighbors/ivf_pq_compute_similarity_filters_test-ext.cuh>
 
+#include <rmm/resource_ref.hpp>
+
 #include <cstdint>  // int64_t
 
 #define instantiate_raft_neighbors_ivf_pq_search(T, IdxT)            \
@@ -44,8 +46,7 @@
     uint32_t n_queries,                                              \
     uint32_t k,                                                      \
     IdxT* neighbors,                                                 \
-    float* distances,                                                \
-    rmm::mr::device_memory_resource* mr);                            \
+    float* distances);                                               \
                                                                      \
   extern template void raft::neighbors::ivf_pq::search<T, IdxT>(     \
     raft::resources const& handle,                                   \
diff --git a/cpp/internal/raft_internal/neighbors/naive_knn.cuh b/cpp/internal/raft_internal/neighbors/naive_knn.cuh
index 79206c7a43..c14a8e3e9f 100644
--- a/cpp/internal/raft_internal/neighbors/naive_knn.cuh
+++ b/cpp/internal/raft_internal/neighbors/naive_knn.cuh
@@ -23,9 +23,7 @@
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <raft/util/cuda_utils.cuh>
 
-#include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
 
 namespace raft::neighbors {
 
diff --git a/cpp/src/neighbors/detail/ivf_flat_search.cu b/cpp/src/neighbors/detail/ivf_flat_search.cu
index 9d39607750..336bea19b6 100644
--- a/cpp/src/neighbors/detail/ivf_flat_search.cu
+++ b/cpp/src/neighbors/detail/ivf_flat_search.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,8 @@
 #include <raft/neighbors/detail/ivf_flat_search-inl.cuh>
 #include <raft/neighbors/sample_filter_types.hpp>
 
+#include <rmm/resource_ref.hpp>
+
 #define instantiate_raft_neighbors_ivf_flat_detail_search(T, IdxT, IvfSampleFilterT)  \
   template void raft::neighbors::ivf_flat::detail::search<T, IdxT, IvfSampleFilterT>( \
     raft::resources const& handle,                                                    \
@@ -27,7 +29,7 @@
     uint32_t k,                                                                       \
     IdxT* neighbors,                                                                  \
     float* distances,                                                                 \
-    rmm::mr::device_memory_resource* mr,                                              \
+    rmm::device_async_resource_ref mr,                                                \
     IvfSampleFilterT sample_filter)
 
 instantiate_raft_neighbors_ivf_flat_detail_search(
diff --git a/cpp/src/neighbors/ivf_flat_00_generate.py b/cpp/src/neighbors/ivf_flat_00_generate.py
index d987a4e17d..7b55cad4de 100644
--- a/cpp/src/neighbors/ivf_flat_00_generate.py
+++ b/cpp/src/neighbors/ivf_flat_00_generate.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,7 +13,7 @@
 # limitations under the License.
 
 header = """/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -127,8 +127,8 @@
 
 search_macro = """
 #define instantiate_raft_neighbors_ivf_flat_search(T, IdxT)        \\
-  template void raft::neighbors::ivf_flat::search<T, IdxT>( \\
-    raft::resources const& handle,                          \\
+  template void raft::neighbors::ivf_flat::search<T, IdxT>(        \\
+    raft::resources const& handle,                                 \\
     const raft::neighbors::ivf_flat::search_params& params,        \\
     const raft::neighbors::ivf_flat::index<T, IdxT>& index,        \\
     const T* queries,                                              \\
@@ -136,10 +136,10 @@
     uint32_t k,                                                    \\
     IdxT* neighbors,                                               \\
     float* distances,                                              \\
-    rmm::mr::device_memory_resource* mr );                         \\
+    rmm::device_async_resource_ref mr);                            \\
                                                                    \\
-  template void raft::neighbors::ivf_flat::search<T, IdxT>( \\
-    raft::resources const& handle,                          \\
+  template void raft::neighbors::ivf_flat::search<T, IdxT>(        \\
+    raft::resources const& handle,                                 \\
     const raft::neighbors::ivf_flat::search_params& params,        \\
     const raft::neighbors::ivf_flat::index<T, IdxT>& index,        \\
     raft::device_matrix_view<const T, IdxT, row_major> queries,    \\
diff --git a/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu
index 03dcfee817..e5cfe14e3f 100644
--- a/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_search_float_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@
 
 #include <raft/neighbors/ivf_flat-inl.cuh>
 
+#include <rmm/resource_ref.hpp>
+
 #define instantiate_raft_neighbors_ivf_flat_search(T, IdxT)     \
   template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
     raft::resources const& handle,                              \
@@ -35,7 +37,7 @@
     uint32_t k,                                                 \
     IdxT* neighbors,                                            \
     float* distances,                                           \
-    rmm::mr::device_memory_resource* mr);                       \
+    rmm::device_async_resource_ref mr);                         \
                                                                 \
   template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
     raft::resources const& handle,                              \
diff --git a/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu
index 7646081183..35792a78a8 100644
--- a/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_search_int8_t_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@
 
 #include <raft/neighbors/ivf_flat-inl.cuh>
 
+#include <rmm/resource_ref.hpp>
+
 #define instantiate_raft_neighbors_ivf_flat_search(T, IdxT)     \
   template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
     raft::resources const& handle,                              \
@@ -35,7 +37,7 @@
     uint32_t k,                                                 \
     IdxT* neighbors,                                            \
     float* distances,                                           \
-    rmm::mr::device_memory_resource* mr);                       \
+    rmm::device_async_resource_ref mr);                         \
                                                                 \
   template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
     raft::resources const& handle,                              \
diff --git a/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu b/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
index 5d2effd385..663e52cb99 100644
--- a/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivf_flat_search_uint8_t_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@
 
 #include <raft/neighbors/ivf_flat-inl.cuh>
 
+#include <rmm/resource_ref.hpp>
+
 #define instantiate_raft_neighbors_ivf_flat_search(T, IdxT)     \
   template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
     raft::resources const& handle,                              \
@@ -35,7 +37,7 @@
     uint32_t k,                                                 \
     IdxT* neighbors,                                            \
     float* distances,                                           \
-    rmm::mr::device_memory_resource* mr);                       \
+    rmm::device_async_resource_ref mr);                         \
                                                                 \
   template void raft::neighbors::ivf_flat::search<T, IdxT>(     \
     raft::resources const& handle,                              \
diff --git a/cpp/src/neighbors/ivfpq_search_float_int64_t.cu b/cpp/src/neighbors/ivfpq_search_float_int64_t.cu
index e56c107735..2d15167099 100644
--- a/cpp/src/neighbors/ivfpq_search_float_int64_t.cu
+++ b/cpp/src/neighbors/ivfpq_search_float_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,8 @@
 #include <raft/neighbors/ivf_pq-inl.cuh>
 #include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
 
+#include <rmm/resource_ref.hpp>
+
 #define instantiate_raft_neighbors_ivf_pq_search(T, IdxT)            \
   template void raft::neighbors::ivf_pq::search<T, IdxT>(            \
     raft::resources const& handle,                                   \
@@ -34,8 +36,7 @@
     uint32_t n_queries,                                              \
     uint32_t k,                                                      \
     IdxT* neighbors,                                                 \
-    float* distances,                                                \
-    rmm::mr::device_memory_resource* mr)
+    float* distances)
 
 instantiate_raft_neighbors_ivf_pq_search(float, int64_t);
 
diff --git a/cpp/src/neighbors/ivfpq_search_half_int64_t.cu b/cpp/src/neighbors/ivfpq_search_half_int64_t.cu
index c9f2e6fdd5..c9a380e21f 100644
--- a/cpp/src/neighbors/ivfpq_search_half_int64_t.cu
+++ b/cpp/src/neighbors/ivfpq_search_half_int64_t.cu
@@ -17,6 +17,8 @@
 #include <raft/neighbors/ivf_pq-inl.cuh>
 #include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
 
+#include <rmm/resource_ref.hpp>
+
 #include <cuda_fp16.h>
 
 #define instantiate_raft_neighbors_ivf_pq_search(T, IdxT)            \
@@ -36,8 +38,7 @@
     uint32_t n_queries,                                              \
     uint32_t k,                                                      \
     IdxT* neighbors,                                                 \
-    float* distances,                                                \
-    rmm::mr::device_memory_resource* mr)
+    float* distances)
 
 instantiate_raft_neighbors_ivf_pq_search(half, int64_t);
 
diff --git a/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu
index 1efe4f7fb2..e85c98d8dd 100644
--- a/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivfpq_search_int8_t_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,8 @@
 #include <raft/neighbors/ivf_pq-inl.cuh>
 #include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
 
+#include <rmm/resource_ref.hpp>
+
 #define instantiate_raft_neighbors_ivf_pq_search(T, IdxT)            \
   template void raft::neighbors::ivf_pq::search<T, IdxT>(            \
     raft::resources const& handle,                                   \
@@ -34,8 +36,7 @@
     uint32_t n_queries,                                              \
     uint32_t k,                                                      \
     IdxT* neighbors,                                                 \
-    float* distances,                                                \
-    rmm::mr::device_memory_resource* mr)
+    float* distances)
 
 instantiate_raft_neighbors_ivf_pq_search(int8_t, int64_t);
 
diff --git a/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu b/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu
index e746391443..42653254e9 100644
--- a/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu
+++ b/cpp/src/neighbors/ivfpq_search_uint8_t_int64_t.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,8 @@
 #include <raft/neighbors/ivf_pq-inl.cuh>
 #include <raft/neighbors/ivf_pq_types.hpp>  // raft::neighbors::ivf_pq::index
 
+#include <rmm/resource_ref.hpp>
+
 #define instantiate_raft_neighbors_ivf_pq_search(T, IdxT)            \
   template void raft::neighbors::ivf_pq::search<T, IdxT>(            \
     raft::resources const& handle,                                   \
@@ -34,8 +36,7 @@
     uint32_t n_queries,                                              \
     uint32_t k,                                                      \
     IdxT* neighbors,                                                 \
-    float* distances,                                                \
-    rmm::mr::device_memory_resource* mr)
+    float* distances)
 
 instantiate_raft_neighbors_ivf_pq_search(uint8_t, int64_t);
 
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 4d17aacffd..752dffdc16 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -188,7 +188,6 @@ if(BUILD_TESTS)
     test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
     test/ext_headers/raft_distance_fused_l2_nn.cu
     test/ext_headers/raft_neighbors_ivf_pq.cu
-    test/ext_headers/raft_util_memory_pool.cpp
     test/ext_headers/raft_neighbors_ivf_flat.cu
     test/ext_headers/raft_core_logger.cpp
     test/ext_headers/raft_neighbors_refine.cu
diff --git a/cpp/test/core/device_resources_manager.cpp b/cpp/test/core/device_resources_manager.cpp
index b9b8996a09..c63d5896e5 100644
--- a/cpp/test/core/device_resources_manager.cpp
+++ b/cpp/test/core/device_resources_manager.cpp
@@ -115,16 +115,10 @@ TEST(DeviceResourcesManager, ObeysSetters)
 
     auto* mr = dynamic_cast<rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource>*>(
       rmm::mr::get_current_device_resource());
-    rmm::device_async_resource_ref workspace_mr =
-      dynamic_cast<rmm::mr::limiting_resource_adaptor<rmm::mr::device_memory_resource>*>(
-        res.get_workspace_resource())
-        ->get_upstream_resource();
+
     if (upstream_mrs[i % devices.size()] != nullptr) {
       // Expect that the current memory resource is a pool memory resource as requested
       EXPECT_NE(mr, nullptr);
-
-      // We cannot easily check the type of a resource_ref
-      (void)workspace_mr;
     }
 
     {
diff --git a/cpp/test/ext_headers/00_generate.py b/cpp/test/ext_headers/00_generate.py
index 682cadbe89..d9c766979b 100644
--- a/cpp/test/ext_headers/00_generate.py
+++ b/cpp/test/ext_headers/00_generate.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,7 +14,7 @@
 
 copyright_notice = """
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -49,7 +49,6 @@
     "raft/spatial/knn/detail/fused_l2_knn-ext.cuh",
     "raft/distance/fused_l2_nn-ext.cuh",
     "raft/neighbors/ivf_pq-ext.cuh",
-    "raft/util/memory_pool-ext.hpp",
     "raft/neighbors/ivf_flat-ext.cuh",
     "raft/core/logger-ext.hpp",
     "raft/neighbors/refine-ext.cuh",
diff --git a/cpp/test/ext_headers/raft_util_memory_pool.cpp b/cpp/test/ext_headers/raft_util_memory_pool.cpp
deleted file mode 100644
index 11a024b958..0000000000
--- a/cpp/test/ext_headers/raft_util_memory_pool.cpp
+++ /dev/null
@@ -1,27 +0,0 @@
-
-/*
- * Copyright (c) 2023, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
- * NOTE: this file is generated by 00_generate.py
- *
- * Make changes there and run in this directory:
- *
- * > python 00_generate.py
- *
- */
-
-#include <raft/util/memory_pool.hpp>
diff --git a/cpp/test/matrix/select_k.cuh b/cpp/test/matrix/select_k.cuh
index 7f9b7b3fc3..f22f4f5fa7 100644
--- a/cpp/test/matrix/select_k.cuh
+++ b/cpp/test/matrix/select_k.cuh
@@ -25,7 +25,6 @@
 #include <raft_internal/matrix/select_k.cuh>
 
 #include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
 
 #include <gtest/gtest.h>
 
diff --git a/cpp/test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu b/cpp/test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu
index 942d0fcc44..00baa59f58 100644
--- a/cpp/test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu
+++ b/cpp/test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu
@@ -37,8 +37,7 @@
     uint32_t n_queries,                                              \
     uint32_t k,                                                      \
     IdxT* neighbors,                                                 \
-    float* distances,                                                \
-    rmm::mr::device_memory_resource* mr)
+    float* distances)
 
 instantiate_raft_neighbors_ivf_pq_search(float, uint32_t);
 
diff --git a/cpp/test/neighbors/ann_utils.cuh b/cpp/test/neighbors/ann_utils.cuh
index 3e0bead665..2139e97428 100644
--- a/cpp/test/neighbors/ann_utils.cuh
+++ b/cpp/test/neighbors/ann_utils.cuh
@@ -28,10 +28,6 @@
 
 #include <raft_internal/neighbors/naive_knn.cuh>
 
-#include <rmm/cuda_stream_view.hpp>
-#include <rmm/device_uvector.hpp>
-#include <rmm/mr/device/device_memory_resource.hpp>
-
 #include <gtest/gtest.h>
 
 #include <iostream>
diff --git a/cpp/test/random/multi_variable_gaussian.cu b/cpp/test/random/multi_variable_gaussian.cu
index 62bad8e543..bed9515a53 100644
--- a/cpp/test/random/multi_variable_gaussian.cu
+++ b/cpp/test/random/multi_variable_gaussian.cu
@@ -25,6 +25,7 @@
 #include <raft/util/cudart_utils.hpp>
 
 #include <rmm/device_uvector.hpp>
+#include <rmm/resource_ref.hpp>
 
 #include <gtest/gtest.h>
 
@@ -287,10 +288,8 @@ class MVGMdspanTest : public ::testing::TestWithParam<MVGInputs<T>> {
     raft::device_matrix_view<T, int, raft::col_major> P_view(P_d.data(), dim, dim);
     raft::device_matrix_view<T, int, raft::col_major> X_view(X_d.data(), dim, nPoints);
 
-    rmm::mr::device_memory_resource* mem_resource_ptr = rmm::mr::get_current_device_resource();
-    ASSERT_TRUE(mem_resource_ptr != nullptr);
     raft::random::multi_variable_gaussian(
-      handle, *mem_resource_ptr, x_view, P_view, X_view, method);
+      handle, rmm::mr::get_current_device_resource(), x_view, P_view, X_view, method);
 
     // saving the mean of the randoms in Rand_mean
     //@todo can be swapped with a API that calculates mean
diff --git a/cpp/test/util/device_atomics.cu b/cpp/test/util/device_atomics.cu
index c5bb0ad3b6..086d1f4152 100644
--- a/cpp/test/util/device_atomics.cu
+++ b/cpp/test/util/device_atomics.cu
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include <raft/core/detail/macros.hpp>
 #include <raft/util/cudart_utils.hpp>
 #include <raft/util/device_atomics.cuh>
 

From f425f2be3c47c355772cf7d70c70ca63cf04273a Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Thu, 25 Apr 2024 12:38:27 -0700
Subject: [PATCH 15/60] add --rm and --name to devcontainer run args (#2275)

* Update the `cuda11.8-conda` devcontainer's base image
* Remove the devcontainer when the VSCode window closes
* Add a descriptive name to the running container:
  ```shell
  $ docker ps -a
  CONTAINER ID   IMAGE         ...  NAMES
  0dbb364fe544   vsc-raft-...  ...  rapids-raft-24.06-cuda12.2-conda

  $ docker rm -f rapids-raft-24.06-cuda12.2-conda
  ```

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/raft/pull/2275
---
 .devcontainer/cuda11.8-conda/devcontainer.json | 7 ++++++-
 .devcontainer/cuda11.8-pip/devcontainer.json   | 5 +++++
 .devcontainer/cuda12.2-conda/devcontainer.json | 5 +++++
 .devcontainer/cuda12.2-pip/devcontainer.json   | 5 +++++
 ci/release/update-version.sh                   | 1 +
 5 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/.devcontainer/cuda11.8-conda/devcontainer.json b/.devcontainer/cuda11.8-conda/devcontainer.json
index 3f84407d41..536537f07f 100644
--- a/.devcontainer/cuda11.8-conda/devcontainer.json
+++ b/.devcontainer/cuda11.8-conda/devcontainer.json
@@ -5,9 +5,14 @@
     "args": {
       "CUDA": "11.8",
       "PYTHON_PACKAGE_MANAGER": "conda",
-      "BASE": "rapidsai/devcontainers:24.06-cpp-llvm16-cuda11.8-mambaforge-ubuntu22.04"
+      "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-mambaforge-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-conda"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index c24cddd78e..4ab81f6572 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -8,6 +8,11 @@
       "BASE": "rapidsai/devcontainers:24.06-cpp-cuda11.8-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda11.8-pip"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
diff --git a/.devcontainer/cuda12.2-conda/devcontainer.json b/.devcontainer/cuda12.2-conda/devcontainer.json
index 1846d0eac3..948680eaf6 100644
--- a/.devcontainer/cuda12.2-conda/devcontainer.json
+++ b/.devcontainer/cuda12.2-conda/devcontainer.json
@@ -8,6 +8,11 @@
       "BASE": "rapidsai/devcontainers:24.06-cpp-mambaforge-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-conda"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/rapids-build-utils:24.6": {}
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json
index 291ee56e7f..e8453ed779 100644
--- a/.devcontainer/cuda12.2-pip/devcontainer.json
+++ b/.devcontainer/cuda12.2-pip/devcontainer.json
@@ -8,6 +8,11 @@
       "BASE": "rapidsai/devcontainers:24.06-cpp-cuda12.2-ubuntu22.04"
     }
   },
+  "runArgs": [
+    "--rm",
+    "--name",
+    "${localEnv:USER}-rapids-${localWorkspaceFolderBasename}-24.06-cuda12.2-pip"
+  ],
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 636f637d0c..46b992392c 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -86,4 +86,5 @@ find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r
     sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}"
     sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
     sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
+    sed_runner "s@rapids-\${localWorkspaceFolderBasename}-${CURRENT_SHORT_TAG}@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}"
 done

From ce404f686769c74aeb2340cbebd834816b86469c Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Fri, 26 Apr 2024 19:25:48 +0200
Subject: [PATCH 16/60] Fix build command for C++ compilation (#2270)

I noticed that `./build.sh clean` was executing some CMake code when it shouldn't.

Authors:
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2270
---
 build.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/build.sh b/build.sh
index 45c7d1380f..da5efa5183 100755
--- a/build.sh
+++ b/build.sh
@@ -405,7 +405,7 @@ fi
 
 ################################################################################
 # Configure for building all C++ targets
-if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench-prims || hasArg bench-ann || ((${COMPILE_LIBRARY} == ON )); then
+if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench-prims || hasArg bench-ann || [[ ${COMPILE_LIBRARY} == ON ]]; then
     if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then
         RAFT_CMAKE_CUDA_ARCHITECTURES="NATIVE"
         echo "Building for the architecture of the GPU in the system..."

From d4d92ce97f10e9302e3f5de852d1dd9edeec78ce Mon Sep 17 00:00:00 2001
From: Paul Taylor <178183+trxcllnt@users.noreply.github.com>
Date: Fri, 26 Apr 2024 10:48:05 -0700
Subject: [PATCH 17/60] Update pip devcontainers to UCX v1.15.0 (#2274)

Authors:
  - Paul Taylor (https://github.com/trxcllnt)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2274
---
 .devcontainer/cuda11.8-pip/devcontainer.json | 2 +-
 .devcontainer/cuda12.2-pip/devcontainer.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.devcontainer/cuda11.8-pip/devcontainer.json b/.devcontainer/cuda11.8-pip/devcontainer.json
index 4ab81f6572..92e7613a9b 100644
--- a/.devcontainer/cuda11.8-pip/devcontainer.json
+++ b/.devcontainer/cuda11.8-pip/devcontainer.json
@@ -16,7 +16,7 @@
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
-      "version": "1.14.1"
+      "version": "1.15.0"
     },
     "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": {
       "version": "11.8",
diff --git a/.devcontainer/cuda12.2-pip/devcontainer.json b/.devcontainer/cuda12.2-pip/devcontainer.json
index e8453ed779..cd287569d8 100644
--- a/.devcontainer/cuda12.2-pip/devcontainer.json
+++ b/.devcontainer/cuda12.2-pip/devcontainer.json
@@ -16,7 +16,7 @@
   "hostRequirements": {"gpu": "optional"},
   "features": {
     "ghcr.io/rapidsai/devcontainers/features/ucx:24.6": {
-      "version": "1.14.1"
+      "version": "1.15.0"
     },
     "ghcr.io/rapidsai/devcontainers/features/cuda:24.6": {
       "version": "12.2",

From e720de760741351e2807cb5922c968c935161a5e Mon Sep 17 00:00:00 2001
From: Tarang Jain <40517122+tarang-jain@users.noreply.github.com>
Date: Mon, 29 Apr 2024 22:25:56 -0700
Subject: [PATCH 18/60] InnerProduct Distance Metric for CAGRA search (#2260)

Add the `InnerProduct` distance metric to CAGRA search. Building the graph with InnerProduct is supported when IVF-PQ is used as the graph build algorithm; NNDescent currently supports only the L2Expanded metric.

Authors:
  - Tarang Jain (https://github.com/tarang-jain)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - tsuki (https://github.com/enp1s0)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2260
---
 cpp/include/raft/neighbors/cagra.cuh          |   8 +-
 .../neighbors/detail/cagra/cagra_build.cuh    |  22 ++-
 .../neighbors/detail/cagra/cagra_search.cuh   |  35 +++--
 .../detail/cagra/compute_distance.hpp         |  65 +++++++--
 .../detail/cagra/compute_distance_vpq.cuh     |   5 +-
 .../raft/neighbors/detail/cagra/factory.cuh   |  11 +-
 .../detail/cagra/search_multi_cta.cuh         |  10 +-
 .../cagra/search_multi_cta_kernel-ext.cuh     |   3 +
 .../cagra/search_multi_cta_kernel-inl.cuh     |  14 +-
 .../detail/cagra/search_multi_kernel.cuh      |  65 +++++++--
 .../neighbors/detail/cagra/search_plan.cuh    |  15 ++-
 .../detail/cagra/search_single_cta.cuh        |   7 +-
 .../cagra/search_single_cta_kernel-ext.cuh    |   3 +
 .../cagra/search_single_cta_kernel-inl.cuh    |  14 +-
 cpp/include/raft/neighbors/ivf_pq_types.hpp   |  30 +++++
 .../detail/cagra/search_multi_cta.cuh         |   1 +
 .../detail/cagra/search_single_cta.cuh        |   1 +
 cpp/test/neighbors/ann_cagra.cuh              | 125 ++++++++++++------
 .../ann_cagra/search_kernel_uint64_t.cuh      |   2 +
 .../pylibraft/neighbors/cagra/cagra.pyx       |   5 +-
 python/pylibraft/pylibraft/test/test_cagra.py |   6 +-
 21 files changed, 336 insertions(+), 111 deletions(-)

diff --git a/cpp/include/raft/neighbors/cagra.cuh b/cpp/include/raft/neighbors/cagra.cuh
index b7e362f704..5263ef73e7 100644
--- a/cpp/include/raft/neighbors/cagra.cuh
+++ b/cpp/include/raft/neighbors/cagra.cuh
@@ -24,6 +24,7 @@
 #include <raft/core/host_device_accessor.hpp>
 #include <raft/core/mdspan.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/cagra_types.hpp>
 #include <raft/neighbors/dataset.hpp>
 
@@ -48,13 +49,14 @@ namespace raft::neighbors::cagra {
  *
  * The following distance metrics are supported:
  * - L2Expanded
+ * - InnerProduct
  *
  * Usage example:
  * @code{.cpp}
  *   using namespace raft::neighbors;
- *   // use default index parameters
- *   ivf_pq::index_params build_params;
- *   ivf_pq::search_params search_params
+ *   // use default index parameters based on shape of the dataset
+ *   ivf_pq::index_params build_params = ivf_pq::index_params::from_dataset(dataset);
+ *   ivf_pq::search_params search_params;
  *   auto knn_graph      = raft::make_host_matrix<IdxT, IdxT>(dataset.extent(0), 128);
  *   // create knn graph
  *   cagra::build_knn_graph(res, dataset, knn_graph.view(), 2, build_params, search_params);
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
index d63f865c39..40dcf68e68 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_build.cuh
@@ -21,6 +21,7 @@
 
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
+#include <raft/core/error.hpp>
 #include <raft/core/host_device_accessor.hpp>
 #include <raft/core/host_mdarray.hpp>
 #include <raft/core/host_mdspan.hpp>
@@ -50,8 +51,9 @@ void build_knn_graph(raft::resources const& res,
                      std::optional<ivf_pq::index_params> build_params   = std::nullopt,
                      std::optional<ivf_pq::search_params> search_params = std::nullopt)
 {
-  RAFT_EXPECTS(!build_params || build_params->metric == distance::DistanceType::L2Expanded,
-               "Currently only L2Expanded metric is supported");
+  RAFT_EXPECTS(!build_params || build_params->metric == distance::DistanceType::L2Expanded ||
+                 build_params->metric == distance::DistanceType::InnerProduct,
+               "Currently only L2Expanded or InnerProduct metric are supported");
 
   uint32_t node_degree = knn_graph.extent(1);
   common::nvtx::range<common::nvtx::domain::raft> fun_scope("cagra::build_graph(%zu, %zu, %u)",
@@ -59,15 +61,7 @@ void build_knn_graph(raft::resources const& res,
                                                             size_t(dataset.extent(1)),
                                                             node_degree);
 
-  if (!build_params) {
-    build_params          = ivf_pq::index_params{};
-    build_params->n_lists = dataset.extent(0) < 4 * 2500 ? 4 : (uint32_t)(dataset.extent(0) / 2500);
-    build_params->pq_dim  = raft::Pow2<8>::roundUp(dataset.extent(1) / 2);
-    build_params->pq_bits = 8;
-    build_params->kmeans_trainset_fraction = dataset.extent(0) < 10000 ? 1 : 10;
-    build_params->kmeans_n_iters           = 25;
-    build_params->add_data_on_build        = true;
-  }
+  if (!build_params) { build_params = ivf_pq::index_params::from_dataset(dataset); }
 
   // Make model name
   const std::string model_name = [&]() {
@@ -324,8 +318,10 @@ index<T, IdxT> build(
 
   if (params.build_algo == graph_build_algo::IVF_PQ) {
     build_knn_graph(res, dataset, knn_graph->view(), refine_rate, pq_build_params, search_params);
-
   } else {
+    RAFT_EXPECTS(
+      params.metric == raft::distance::DistanceType::L2Expanded,
+      "L2Expanded is the only distance metrics supported for CAGRA build with nn_descent");
     // Use nn-descent to build CAGRA knn graph
     if (!nn_descent_params) {
       nn_descent_params                            = experimental::nn_descent::index_params();
@@ -348,6 +344,8 @@ index<T, IdxT> build(
   // Construct an index from dataset and optimized knn graph.
   if (construct_index_with_dataset) {
     if (params.compression.has_value()) {
+      RAFT_EXPECTS(params.metric == raft::distance::DistanceType::L2Expanded,
+                   "VPQ compression is only supported with L2Expanded distance mertric");
       index<T, IdxT> idx(res, params.metric);
       idx.update_graph(res, raft::make_const_mdspan(cagra_graph.view()));
       idx.update_dataset(
diff --git a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
index b9edbbfc4a..67fad2e46a 100644
--- a/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/cagra_search.cuh
@@ -26,6 +26,7 @@
 #include <raft/core/nvtx.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/cagra_types.hpp>
 #include <raft/neighbors/detail/ivf_common.cuh>
 #include <raft/neighbors/detail/ivf_pq_search.cuh>
@@ -87,7 +88,8 @@ void search_main_core(
   raft::device_matrix_view<const typename DatasetDescriptorT::DATA_T, int64_t, row_major> queries,
   raft::device_matrix_view<typename DatasetDescriptorT::INDEX_T, int64_t, row_major> neighbors,
   raft::device_matrix_view<typename DatasetDescriptorT::DISTANCE_T, int64_t, row_major> distances,
-  CagraSampleFilterT sample_filter = CagraSampleFilterT())
+  CagraSampleFilterT sample_filter    = CagraSampleFilterT(),
+  raft::distance::DistanceType metric = raft::distance::DistanceType::L2Expanded)
 {
   RAFT_LOG_DEBUG("# dataset size = %lu, dim = %lu\n",
                  static_cast<size_t>(dataset_desc.size),
@@ -112,7 +114,7 @@ void search_main_core(
   using CagraSampleFilterT_s = typename CagraSampleFilterT_Selector<CagraSampleFilterT>::type;
   std::unique_ptr<search_plan_impl<DatasetDescriptorT, CagraSampleFilterT_s>> plan =
     factory<DatasetDescriptorT, CagraSampleFilterT_s>::create(
-      res, params, dataset_desc.dim, graph.extent(1), topk);
+      res, params, dataset_desc.dim, graph.extent(1), topk, metric);
 
   plan->check(topk);
 
@@ -163,7 +165,8 @@ void launch_vpq_search_main_core(
   raft::device_matrix_view<const T, int64_t, row_major> queries,
   raft::device_matrix_view<InternalIdxT, int64_t, row_major> neighbors,
   raft::device_matrix_view<DistanceT, int64_t, row_major> distances,
-  CagraSampleFilterT sample_filter)
+  CagraSampleFilterT sample_filter,
+  const raft::distance::DistanceType metric)
 {
   RAFT_EXPECTS(vpq_dset->pq_bits() == 8, "Only pq_bits = 8 is supported for now");
   RAFT_EXPECTS(vpq_dset->pq_len() == 2 || vpq_dset->pq_len() == 4,
@@ -192,7 +195,7 @@ void launch_vpq_search_main_core(
                                   size_t(vpq_dset->n_rows()),
                                   vpq_dset->dim());
       search_main_core(
-        res, params, dataset_desc, graph, queries, neighbors, distances, sample_filter);
+        res, params, dataset_desc, graph, queries, neighbors, distances, sample_filter, metric);
     } else if (vpq_dset->pq_len() == 4) {
       using dataset_desc_t = cagra_q_dataset_descriptor_t<T,
                                                           DatasetT,
@@ -210,7 +213,7 @@ void launch_vpq_search_main_core(
                                   size_t(vpq_dset->n_rows()),
                                   vpq_dset->dim());
       search_main_core(
-        res, params, dataset_desc, graph, queries, neighbors, distances, sample_filter);
+        res, params, dataset_desc, graph, queries, neighbors, distances, sample_filter, metric);
     } else {
       RAFT_FAIL("Subspace dimension must be 2 or 4");
     }
@@ -268,9 +271,15 @@ void search_main(raft::resources const& res,
                                       strided_dset->n_rows(),
                                       strided_dset->dim(),
                                       strided_dset->stride());
-
-    search_main_core<dataset_desc_t, CagraSampleFilterT>(
-      res, params, dataset_desc, graph_internal, queries, neighbors, distances, sample_filter);
+    search_main_core<dataset_desc_t, CagraSampleFilterT>(res,
+                                                         params,
+                                                         dataset_desc,
+                                                         graph_internal,
+                                                         queries,
+                                                         neighbors,
+                                                         distances,
+                                                         sample_filter,
+                                                         index.metric());
   } else if (auto* vpq_dset = dynamic_cast<const vpq_dataset<float, ds_idx_type>*>(&index.data());
              vpq_dset != nullptr) {
     // Search using a compressed dataset
@@ -278,7 +287,15 @@ void search_main(raft::resources const& res,
   } else if (auto* vpq_dset = dynamic_cast<const vpq_dataset<half, ds_idx_type>*>(&index.data());
              vpq_dset != nullptr) {
     launch_vpq_search_main_core<T, half, ds_idx_type, InternalIdxT, DistanceT, CagraSampleFilterT>(
-      res, vpq_dset, params, graph_internal, queries, neighbors, distances, sample_filter);
+      res,
+      vpq_dset,
+      params,
+      graph_internal,
+      queries,
+      neighbors,
+      distances,
+      sample_filter,
+      index.metric());
   } else if (auto* empty_dset = dynamic_cast<const empty_dataset<ds_idx_type>*>(&index.data());
              empty_dset != nullptr) {
     // Forgot to add a dataset.
diff --git a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp
index 49e14be73d..80ee7a36f1 100644
--- a/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp
+++ b/cpp/include/raft/neighbors/detail/cagra/compute_distance.hpp
@@ -19,6 +19,8 @@
 #include "hashmap.hpp"
 #include "utils.hpp"
 
+#include <raft/core/operators.hpp>
+#include <raft/distance/distance_types.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <raft/util/vectorized.cuh>
 
@@ -54,6 +56,7 @@ _RAFT_DEVICE void compute_distance_to_random_nodes(
   const uint32_t num_seeds,
   INDEX_T* const visited_hash_ptr,
   const uint32_t hash_bitlen,
+  const raft::distance::DistanceType metric,
   const uint32_t block_id   = 0,
   const uint32_t num_blocks = 1)
 {
@@ -78,8 +81,22 @@ _RAFT_DEVICE void compute_distance_to_random_nodes(
         }
       }
 
-      const auto norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM, TEAM_SIZE>(
-        query_buffer, seed_index, valid_i);
+      DISTANCE_T norm2;
+      switch (metric) {
+        case raft::distance::L2Expanded:
+          norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
+                                                           TEAM_SIZE,
+                                                           raft::distance::L2Expanded>(
+            query_buffer, seed_index, valid_i);
+          break;
+        case raft::distance::InnerProduct:
+          norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
+                                                           TEAM_SIZE,
+                                                           raft::distance::InnerProduct>(
+            query_buffer, seed_index, valid_i);
+          break;
+        default: break;
+      }
 
       if (valid_i && (norm2 < best_norm2_team_local)) {
         best_norm2_team_local = norm2;
@@ -121,7 +138,8 @@ _RAFT_DEVICE void compute_distance_to_child_nodes(
   const std::uint32_t hash_bitlen,
   const INDEX_T* const parent_indices,
   const INDEX_T* const internal_topk_list,
-  const std::uint32_t search_width)
+  const std::uint32_t search_width,
+  const raft::distance::DistanceType metric)
 {
   constexpr INDEX_T index_msb_1_mask = utils::gen_index_msb_1_mask<INDEX_T>::value;
   const INDEX_T invalid_index        = utils::get_max_value<INDEX_T>();
@@ -153,8 +171,22 @@ _RAFT_DEVICE void compute_distance_to_child_nodes(
     INDEX_T child_id   = invalid_index;
     if (valid_i) { child_id = result_child_indices_ptr[i]; }
 
-    const auto norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM, TEAM_SIZE>(
-      query_buffer, child_id, child_id != invalid_index);
+    DISTANCE_T norm2;
+    switch (metric) {
+      case raft::distance::L2Expanded:
+        norm2 =
+          dataset_desc
+            .template compute_similarity<DATASET_BLOCK_DIM, TEAM_SIZE, raft::distance::L2Expanded>(
+              query_buffer, child_id, child_id != invalid_index);
+        break;
+      case raft::distance::InnerProduct:
+        norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
+                                                         TEAM_SIZE,
+                                                         raft::distance::InnerProduct>(
+          query_buffer, child_id, child_id != invalid_index);
+        break;
+      default: break;
+    }
 
     // Store the distance
     const unsigned lane_id = threadIdx.x % TEAM_SIZE;
@@ -220,7 +252,22 @@ struct standard_dataset_descriptor_t
     }
   }
 
-  template <uint32_t DATASET_BLOCK_DIM, uint32_t TEAM_SIZE>
+  template <typename T, raft::distance::DistanceType METRIC>
+  std::enable_if_t<METRIC == raft::distance::DistanceType::L2Expanded, T> __device__
+  dist_op(T a, T b) const  // overload selected only when METRIC == L2Expanded
+  {
+    T diff = a - b;
+    return diff * diff;  // squared difference; compute_similarity adds this into norm2
+  }
+
+  template <typename T, raft::distance::DistanceType METRIC>
+  std::enable_if_t<METRIC == raft::distance::DistanceType::InnerProduct, T> __device__
+  dist_op(T a, T b) const  // overload selected only when METRIC == InnerProduct
+  {
+    return -a * b;  // negated product: a larger a*b gives a smaller accumulated norm2
+  }
+
+  template <uint32_t DATASET_BLOCK_DIM, uint32_t TEAM_SIZE, raft::distance::DistanceType METRIC>
   __device__ DISTANCE_T compute_similarity(const QUERY_T* const query_ptr,
                                            const INDEX_T dataset_i,
                                            const bool valid) const
@@ -252,9 +299,9 @@ struct standard_dataset_descriptor_t
             // because:
             // - Above the last element (dataset_dim-1), the query array is filled with zeros.
             // - The data buffer has to be also padded with zeros.
-            DISTANCE_T diff = query_ptr[device::swizzling(kv)];
-            diff -= spatial::knn::detail::utils::mapping<float>{}(dl_buff[e].val.data[v]);
-            norm2 += diff * diff;
+            DISTANCE_T d = query_ptr[device::swizzling(kv)];
+            norm2 += dist_op<DISTANCE_T, METRIC>(
+              d, spatial::knn::detail::utils::mapping<float>{}(dl_buff[e].val.data[v]));
           }
         }
       }
diff --git a/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh b/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
index e73d24bfb6..c922a0d7f4 100644
--- a/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/compute_distance_vpq.cuh
@@ -18,6 +18,7 @@
 
 #include "compute_distance.hpp"
 
+#include <raft/distance/distance_types.hpp>
 #include <raft/util/integer_utils.hpp>
 
 namespace raft::neighbors::cagra::detail {
@@ -112,7 +113,7 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<half, DIS
     }
   }
 
-  template <uint32_t DATASET_BLOCK_DIM, uint32_t TEAM_SIZE>
+  template <uint32_t DATASET_BLOCK_DIM, uint32_t TEAM_SIZE, raft::distance::DistanceType METRIC>
   __device__ DISTANCE_T compute_similarity(const QUERY_T* const query_ptr,
                                            const INDEX_T node_id,
                                            const bool valid) const
@@ -227,4 +228,4 @@ struct cagra_q_dataset_descriptor_t : public dataset_descriptor_base_t<half, DIS
   }
 };
 
-}  // namespace raft::neighbors::cagra::detail
+}  // namespace raft::neighbors::cagra::detail
\ No newline at end of file
diff --git a/cpp/include/raft/neighbors/detail/cagra/factory.cuh b/cpp/include/raft/neighbors/detail/cagra/factory.cuh
index 4944b57c46..6d7fc6c966 100644
--- a/cpp/include/raft/neighbors/detail/cagra/factory.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/factory.cuh
@@ -41,9 +41,10 @@ class factory {
     search_params const& params,
     int64_t dim,
     int64_t graph_degree,
-    uint32_t topk)
+    uint32_t topk,
+    const raft::distance::DistanceType metric)
   {
-    search_plan_impl_base plan(params, dim, graph_degree, topk);
+    search_plan_impl_base plan(params, dim, graph_degree, topk, metric);
     switch (plan.dataset_block_dim) {
       case 128:
         switch (plan.team_size) {
@@ -77,17 +78,17 @@ class factory {
       return std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>(
         new single_cta_search::
           search<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, CagraSampleFilterT>(
-            res, plan, plan.dim, plan.graph_degree, plan.topk));
+            res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric));
     } else if (plan.algo == search_algo::MULTI_CTA) {
       return std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>(
         new multi_cta_search::
           search<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, CagraSampleFilterT>(
-            res, plan, plan.dim, plan.graph_degree, plan.topk));
+            res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric));
     } else {
       return std::unique_ptr<search_plan_impl<DATASET_DESCRIPTOR_T, CagraSampleFilterT>>(
         new multi_kernel_search::
           search<TEAM_SIZE, DATASET_BLOCK_DIM, DATASET_DESCRIPTOR_T, CagraSampleFilterT>(
-            res, plan, plan.dim, plan.graph_degree, plan.topk));
+            res, plan, plan.dim, plan.graph_degree, plan.topk, plan.metric));
     }
   }
 };
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh
index 8192b1ae51..4b979bcae8 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta.cuh
@@ -24,11 +24,14 @@
 #include "topk_for_cagra/topk_core.cuh"  // TODO replace with raft topk if possible
 #include "utils.hpp"
 
+#include <raft/core/detail/macros.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
+#include <raft/linalg/map.cuh>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <raft/util/cuda_rt_essentials.hpp>
 #include <raft/util/cudart_utils.hpp>  // RAFT_CUDA_TRY_NOT_THROW is used TODO(tfeher): consider moving this to cuda_rt_essentials.hpp
@@ -96,8 +99,10 @@ struct search : public search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
          search_params params,
          int64_t dim,
          int64_t graph_degree,
-         uint32_t topk)
-    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(res, params, dim, graph_degree, topk),
+         uint32_t topk,
+         raft::distance::DistanceType metric)
+    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
+        res, params, dim, graph_degree, topk, metric),
       intermediate_indices(0, resource::get_cuda_stream(res)),
       intermediate_distances(0, resource::get_cuda_stream(res)),
       topk_workspace(0, resource::get_cuda_stream(res))
@@ -235,6 +240,7 @@ struct search : public search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
       min_iterations,
       max_iterations,
       sample_filter,
+      this->metric,
       stream);
     RAFT_CUDA_TRY(cudaPeekAtLastError());
 
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
index 50f9e69593..35f4f0e1c9 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-ext.cuh
@@ -54,6 +54,7 @@ void select_and_run(
   size_t min_iterations,
   size_t max_iterations,
   SAMPLE_FILTER_T sample_filter,
+  raft::distance::DistanceType metric,
   cudaStream_t stream) RAFT_EXPLICIT;
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
 
@@ -88,6 +89,7 @@ void select_and_run(
     size_t min_iterations,                                                                      \
     size_t max_iterations,                                                                      \
     SAMPLE_FILTER_T sample_filter,                                                              \
+    raft::distance::DistanceType metric,                                                        \
     cudaStream_t stream);
 
 instantiate_kernel_selection(
@@ -172,6 +174,7 @@ instantiate_kernel_selection(
     size_t min_iterations,                                                                      \
     size_t max_iterations,                                                                      \
     SAMPLE_FILTER_T sample_filter,                                                              \
+    raft::distance::DistanceType metric,                                                        \
     cudaStream_t stream);
 
 instantiate_q_kernel_selection(
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
index 48c22d9d14..cfbb1e100c 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_cta_kernel-inl.cuh
@@ -28,6 +28,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/sample_filter_types.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <raft/util/cuda_rt_essentials.hpp>
@@ -149,7 +150,8 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   const uint32_t min_iteration,
   const uint32_t max_iteration,
   uint32_t* const num_executed_iterations, /* stats */
-  SAMPLE_FILTER_T sample_filter)
+  SAMPLE_FILTER_T sample_filter,
+  const raft::distance::DistanceType metric)
 {
   using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
@@ -227,6 +229,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   const INDEX_T* const local_seed_ptr = seed_ptr ? seed_ptr + (num_seeds * query_id) : nullptr;
   uint32_t block_id                   = cta_id + (num_cta_per_query * query_id);
   uint32_t num_blocks                 = num_cta_per_query * num_queries;
+
   device::compute_distance_to_random_nodes<TEAM_SIZE, DATASET_BLOCK_DIM>(result_indices_buffer,
                                                                          result_distances_buffer,
                                                                          query_buffer,
@@ -238,6 +241,7 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
                                                                          num_seeds,
                                                                          local_visited_hashmap_ptr,
                                                                          hash_bitlen,
+                                                                         metric,
                                                                          block_id,
                                                                          num_blocks);
   __syncthreads();
@@ -282,7 +286,8 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
       hash_bitlen,
       parent_indices_buffer,
       result_indices_buffer,
-      search_width);
+      search_width,
+      metric);
     _CLK_REC(clk_compute_distance);
     __syncthreads();
 
@@ -459,6 +464,7 @@ void select_and_run(
   size_t min_iterations,
   size_t max_iterations,
   SAMPLE_FILTER_T sample_filter,
+  raft::distance::DistanceType metric,
   cudaStream_t stream)
 {
   auto kernel =
@@ -484,6 +490,7 @@ void select_and_run(
                  num_cta_per_query,
                  num_queries,
                  smem_size);
+
   kernel<<<grid_dims, block_dims, smem_size, stream>>>(topk_indices_ptr,
                                                        topk_distances_ptr,
                                                        dataset_desc,
@@ -501,7 +508,8 @@ void select_and_run(
                                                        min_iterations,
                                                        max_iterations,
                                                        num_executed_iterations,
-                                                       sample_filter);
+                                                       sample_filter,
+                                                       metric);
 }
 
 }  // namespace multi_cta_search
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh
index 10788da432..31c4bc5dca 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_multi_kernel.cuh
@@ -27,6 +27,7 @@
 #include <raft/core/logger.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
 #include <raft/matrix/select_k.cuh>
 #include <raft/neighbors/sample_filter_types.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
@@ -100,7 +101,8 @@ RAFT_KERNEL random_pickup_kernel(
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, ldr]
   const std::uint32_t ldr,                                                // (*) ldr >= num_pickup
   typename DATASET_DESCRIPTOR_T::INDEX_T* const visited_hashmap_ptr,  // [num_queries, 1 << bitlen]
-  const std::uint32_t hash_bitlen)
+  const std::uint32_t hash_bitlen,
+  const raft::distance::DistanceType metric)
 {
   using DATA_T     = typename DATASET_DESCRIPTOR_T::DATA_T;
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
@@ -137,8 +139,22 @@ RAFT_KERNEL random_pickup_kernel(
         device::xorshift64((global_team_index ^ rand_xor_mask) * (i + 1)) % dataset_desc.size;
     }
 
-    const auto norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM, TEAM_SIZE>(
-      query_buffer, seed_index, true);
+    DISTANCE_T norm2;
+    switch (metric) {
+      case distance::DistanceType::L2Expanded:
+        norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
+                                                         TEAM_SIZE,
+                                                         distance::DistanceType::L2Expanded>(
+          query_buffer, seed_index, true);
+        break;
+      case distance::DistanceType::InnerProduct:
+        norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
+                                                         TEAM_SIZE,
+                                                         distance::DistanceType::InnerProduct>(
+          query_buffer, seed_index, true);
+        break;
+      default: break;
+    }
 
     if (norm2 < best_norm2_team_local) {
       best_norm2_team_local = norm2;
@@ -175,6 +191,7 @@ void random_pickup(
   const std::size_t ldr,                                                  // (*) ldr >= num_pickup
   typename DATASET_DESCRIPTOR_T::INDEX_T* const visited_hashmap_ptr,  // [num_queries, 1 << bitlen]
   const std::uint32_t hash_bitlen,
+  const raft::distance::DistanceType metric,
   cudaStream_t const cuda_stream = 0)
 {
   const auto block_size                = 256u;
@@ -198,7 +215,8 @@ void random_pickup(
                                                         result_distances_ptr,
                                                         ldr,
                                                         visited_hashmap_ptr,
-                                                        hash_bitlen);
+                                                        hash_bitlen,
+                                                        metric);
 }
 
 template <class INDEX_T>
@@ -325,7 +343,8 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel(
   typename DATASET_DESCRIPTOR_T::INDEX_T* const result_indices_ptr,       // [num_queries, ldd]
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, ldd]
   const std::uint32_t ldd,  // (*) ldd >= search_width * graph_degree
-  SAMPLE_FILTER_T sample_filter)
+  SAMPLE_FILTER_T sample_filter,
+  const raft::distance::DistanceType metric)
 {
   using INDEX_T    = typename DATASET_DESCRIPTOR_T::INDEX_T;
   using DISTANCE_T = typename DATASET_DESCRIPTOR_T::DISTANCE_T;
@@ -371,8 +390,22 @@ RAFT_KERNEL compute_distance_to_child_nodes_kernel(
   const auto compute_distance_flag = hashmap::insert<TEAM_SIZE, INDEX_T>(
     visited_hashmap_ptr + (ldb * blockIdx.y), hash_bitlen, child_id);
 
-  const auto norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM, TEAM_SIZE>(
-    query_buffer, child_id, compute_distance_flag);
+  DISTANCE_T norm2;
+  switch (metric) {
+    case raft::distance::DistanceType::L2Expanded:
+      norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
+                                                       TEAM_SIZE,
+                                                       raft::distance::DistanceType::L2Expanded>(
+        query_buffer, child_id, compute_distance_flag);
+      break;
+    case raft::distance::DistanceType::InnerProduct:
+      norm2 = dataset_desc.template compute_similarity<DATASET_BLOCK_DIM,
+                                                       TEAM_SIZE,
+                                                       raft::distance::DistanceType::InnerProduct>(
+        query_buffer, child_id, compute_distance_flag);
+      break;
+    default: break;
+  }
 
   if (compute_distance_flag) {
     if (threadIdx.x % TEAM_SIZE == 0) {
@@ -421,6 +454,7 @@ void compute_distance_to_child_nodes(
   typename DATASET_DESCRIPTOR_T::DISTANCE_T* const result_distances_ptr,  // [num_queries, ldd]
   const std::uint32_t ldd,  // (*) ldd >= search_width * graph_degree
   SAMPLE_FILTER_T sample_filter,
+  const raft::distance::DistanceType metric,
   cudaStream_t cuda_stream = 0)
 {
   const auto block_size = 128;
@@ -452,7 +486,8 @@ void compute_distance_to_child_nodes(
                                                         result_indices_ptr,
                                                         result_distances_ptr,
                                                         ldd,
-                                                        sample_filter);
+                                                        sample_filter,
+                                                        metric);
 }
 
 template <class INDEX_T>
@@ -660,8 +695,10 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
          search_params params,
          int64_t dim,
          int64_t graph_degree,
-         uint32_t topk)
-    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(res, params, dim, graph_degree, topk),
+         uint32_t topk,
+         raft::distance::DistanceType metric)
+    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
+        res, params, dim, graph_degree, topk, metric),
       result_indices(0, resource::get_cuda_stream(res)),
       result_distances(0, resource::get_cuda_stream(res)),
       parent_node_list(0, resource::get_cuda_stream(res)),
@@ -835,6 +872,7 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
                                                 result_buffer_allocation_size,
                                                 hashmap.data(),
                                                 hash_bitlen,
+                                                this->metric,
                                                 stream);
 
     unsigned iter = 0;
@@ -904,6 +942,7 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
         result_distances.data() + itopk_size,
         result_buffer_allocation_size,
         sample_filter,
+        this->metric,
         stream);
 
       iter++;
@@ -1020,8 +1059,10 @@ struct search<TEAM_SIZE,
          search_params params,
          int64_t dim,
          int64_t graph_degree,
-         uint32_t topk)
-    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(res, params, dim, graph_degree, topk)
+         uint32_t topk,
+         raft::distance::DistanceType metric)
+    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
+        res, params, dim, graph_degree, topk, metric)
   {
     THROW("The multi-kernel mode does not support VPQ");
   }
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
index be5ac0554f..b35d96e9f5 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_plan.cuh
@@ -25,6 +25,7 @@
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/cagra_types.hpp>
 #include <raft/util/pow2_utils.cuh>
 
@@ -35,8 +36,13 @@ struct search_plan_impl_base : public search_params {
   int64_t dim;
   int64_t graph_degree;
   uint32_t topk;
-  search_plan_impl_base(search_params params, int64_t dim, int64_t graph_degree, uint32_t topk)
-    : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk)
+  raft::distance::DistanceType metric;
+  search_plan_impl_base(search_params params,
+                        int64_t dim,
+                        int64_t graph_degree,
+                        uint32_t topk,
+                        raft::distance::DistanceType metric)
+    : search_params(params), dim(dim), graph_degree(graph_degree), topk(topk), metric(metric)
   {
     set_dataset_block_and_team_size(dim);
     if (algo == search_algo::AUTO) {
@@ -97,8 +103,9 @@ struct search_plan_impl : public search_plan_impl_base {
                    search_params params,
                    int64_t dim,
                    int64_t graph_degree,
-                   uint32_t topk)
-    : search_plan_impl_base(params, dim, graph_degree, topk),
+                   uint32_t topk,
+                   raft::distance::DistanceType metric)
+    : search_plan_impl_base(params, dim, graph_degree, topk, metric),
       hashmap(0, resource::get_cuda_stream(res)),
       num_executed_iterations(0, resource::get_cuda_stream(res)),
       dev_seed(0, resource::get_cuda_stream(res)),
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh
index 4430b929fb..0771652787 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta.cuh
@@ -94,8 +94,10 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
          search_params params,
          int64_t dim,
          int64_t graph_degree,
-         uint32_t topk)
-    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(res, params, dim, graph_degree, topk)
+         uint32_t topk,
+         raft::distance::DistanceType metric)
+    : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T>(
+        res, params, dim, graph_degree, topk, metric)
   {
     set_params(res);
   }
@@ -244,6 +246,7 @@ struct search : search_plan_impl<DATASET_DESCRIPTOR_T, SAMPLE_FILTER_T> {
       min_iterations,
       max_iterations,
       sample_filter,
+      this->metric,
       stream);
   }
 };
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
index a836334667..510219ab5d 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-ext.cuh
@@ -54,6 +54,7 @@ void select_and_run(  // raft::resources const& res,
   size_t min_iterations,
   size_t max_iterations,
   SAMPLE_FILTER_T sample_filter,
+  raft::distance::DistanceType metric,
   cudaStream_t stream) RAFT_EXPLICIT;
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
@@ -90,6 +91,7 @@ void select_and_run(  // raft::resources const& res,
     size_t min_iterations,                                                                      \
     size_t max_iterations,                                                                      \
     SAMPLE_FILTER_T sample_filter,                                                              \
+    raft::distance::DistanceType metric,                                                        \
     cudaStream_t stream);
 
 instantiate_single_cta_select_and_run(
@@ -175,6 +177,7 @@ instantiate_single_cta_select_and_run(
     size_t min_iterations,                                                                      \
     size_t max_iterations,                                                                      \
     SAMPLE_FILTER_T sample_filter,                                                              \
+    raft::distance::DistanceType metric,                                                        \
     cudaStream_t stream);
 
 instantiate_q_single_cta_select_and_run(
diff --git a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
index a697f9512c..e8104bd6f6 100644
--- a/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
+++ b/cpp/include/raft/neighbors/detail/cagra/search_single_cta_kernel-inl.cuh
@@ -29,6 +29,7 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/device_properties.hpp>
 #include <raft/core/resources.hpp>
+#include <raft/distance/distance_types.hpp>
 #include <raft/neighbors/sample_filter_types.hpp>
 #include <raft/spatial/knn/detail/ann_utils.cuh>
 #include <raft/util/cuda_rt_essentials.hpp>
@@ -485,7 +486,8 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
   const std::uint32_t hash_bitlen,
   const std::uint32_t small_hash_bitlen,
   const std::uint32_t small_hash_reset_interval,
-  SAMPLE_FILTER_T sample_filter)
+  SAMPLE_FILTER_T sample_filter,
+  raft::distance::DistanceType metric)
 {
   using LOAD_T = device::LOAD_128BIT_T;
 
@@ -581,7 +583,8 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
                                                                          local_seed_ptr,
                                                                          num_seeds,
                                                                          local_visited_hashmap_ptr,
-                                                                         hash_bitlen);
+                                                                         hash_bitlen,
+                                                                         metric);
   __syncthreads();
   _CLK_REC(clk_compute_1st_distance);
 
@@ -718,7 +721,8 @@ __launch_bounds__(1024, 1) RAFT_KERNEL search_kernel(
       hash_bitlen,
       parent_list_buffer,
       result_indices_buffer,
-      search_width);
+      search_width,
+      metric);
     __syncthreads();
     _CLK_REC(clk_compute_distance);
 
@@ -930,6 +934,7 @@ void select_and_run(
   size_t min_iterations,
   size_t max_iterations,
   SAMPLE_FILTER_T sample_filter,
+  raft::distance::DistanceType metric,
   cudaStream_t stream)
 {
   auto kernel =
@@ -962,7 +967,8 @@ void select_and_run(
                                                          hash_bitlen,
                                                          small_hash_bitlen,
                                                          small_hash_reset_interval,
-                                                         sample_filter);
+                                                         sample_filter,
+                                                         metric);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 }  // namespace single_cta_search
diff --git a/cpp/include/raft/neighbors/ivf_pq_types.hpp b/cpp/include/raft/neighbors/ivf_pq_types.hpp
index 81e2886b18..3ee350c6fb 100644
--- a/cpp/include/raft/neighbors/ivf_pq_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_pq_types.hpp
@@ -104,6 +104,36 @@ struct index_params : ann::index_params {
    * flag to `true` if you prefer to use as little GPU memory for the database as possible.
    */
   bool conservative_memory_allocation = false;
+
+  /**
+   * Creates index_params based on shape of the input dataset.
+   * Usage example:
+   * @code{.cpp}
+   *   using namespace raft::neighbors;
+   *   raft::resources res;
+   *   // create index_params for a [N, D] dataset with InnerProduct as the distance metric
+   *   auto dataset = raft::make_device_matrix<float, int64_t>(res, N, D);
+   *   ivf_pq::index_params index_params =
+   *     ivf_pq::index_params::from_dataset(dataset.view(), raft::distance::InnerProduct);
+   *   // modify/update index_params as needed
+   *   index_params.add_data_on_build = true;
+   * @endcode
+   */
+  template <typename DataT, typename Accessor>
+  static index_params from_dataset(
+    mdspan<const DataT, matrix_extent<int64_t>, row_major, Accessor> dataset,
+    raft::distance::DistanceType metric = raft::distance::L2Expanded)
+  {
+    index_params params;
+    params.n_lists =
+      dataset.extent(0) < 4 * 2500 ? 4 : static_cast<uint32_t>(std::sqrt(dataset.extent(0)));
+    params.pq_dim =
+      round_up_safe(static_cast<uint32_t>(dataset.extent(1) / 4), static_cast<uint32_t>(8));
+    params.pq_bits                  = 8;
+    params.kmeans_trainset_fraction = dataset.extent(0) < 10000 ? 1 : 0.1;
+    params.metric                   = metric;
+    return params;
+  }
 };
 
 struct search_params : ann::search_params {
diff --git a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
index 179bf8f20f..542fdaad1f 100644
--- a/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_multi_cta.cuh
@@ -44,6 +44,7 @@ namespace raft::neighbors::cagra::detail::multi_cta_search {
     size_t min_iterations,                                                                        \
     size_t max_iterations,                                                                        \
     SAMPLE_FILTER_T sample_filter,                                                                \
+    raft::distance::DistanceType metric,                                                          \
     cudaStream_t stream);
 
 #define COMMA ,
diff --git a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
index 7fb705a2d2..855b104670 100644
--- a/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
+++ b/cpp/src/neighbors/detail/cagra/search_single_cta.cuh
@@ -45,6 +45,7 @@ namespace raft::neighbors::cagra::detail::single_cta_search {
     size_t min_iterations,                                                                        \
     size_t max_iterations,                                                                        \
     SAMPLE_FILTER_T sample_filter,                                                                \
+    raft::distance::DistanceType metric,                                                          \
     cudaStream_t stream);
 
 #define COMMA ,
diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh
index 7278f71a24..715a94403f 100644
--- a/cpp/test/neighbors/ann_cagra.cuh
+++ b/cpp/test/neighbors/ann_cagra.cuh
@@ -28,6 +28,7 @@
 #include <raft/linalg/add.cuh>
 #include <raft/neighbors/cagra.cuh>
 #include <raft/neighbors/cagra_serialize.cuh>
+#include <raft/neighbors/ivf_pq_types.hpp>
 #include <raft/neighbors/sample_filter.cuh>
 #include <raft/random/rng.cuh>
 #include <raft/util/itertools.hpp>
@@ -85,25 +86,49 @@ void RandomSuffle(raft::host_matrix_view<IdxT, int64_t> index)
 
 template <typename DistanceT, typename DatatT, typename IdxT>
 testing::AssertionResult CheckOrder(raft::host_matrix_view<IdxT, int64_t> index_test,
-                                    raft::host_matrix_view<DatatT, int64_t> dataset)
+                                    raft::host_matrix_view<DatatT, int64_t> dataset,
+                                    raft::distance::DistanceType metric)
 {
   for (IdxT i = 0; i < index_test.extent(0); i++) {
     const DatatT* const base_vec = dataset.data_handle() + i * dataset.extent(1);
     const IdxT* const index_row  = index_test.data_handle() + i * index_test.extent(1);
-    DistanceT prev_distance      = 0;
+    DistanceT prev_distance      = metric == raft::distance::DistanceType::L2Expanded
+                                     ? 0
+                                     : std::numeric_limits<DistanceT>::max();
     for (unsigned j = 0; j < index_test.extent(1) - 1; j++) {
       const DatatT* const target_vec = dataset.data_handle() + index_row[j] * dataset.extent(1);
       DistanceT distance             = 0;
-      for (unsigned l = 0; l < dataset.extent(1); l++) {
-        const auto diff =
-          static_cast<DistanceT>(target_vec[l]) - static_cast<DistanceT>(base_vec[l]);
-        distance += diff * diff;
-      }
-      if (prev_distance > distance) {
-        return testing::AssertionFailure()
-               << "Wrong index order (row = " << i << ", neighbor_id = " << j
-               << "). (distance[neighbor_id-1] = " << prev_distance
-               << "should be larger than distance[neighbor_id] = " << distance << ")";
+      switch (metric) {
+        case raft::distance::DistanceType::L2Expanded:
+          for (unsigned l = 0; l < dataset.extent(1); l++) {
+            const auto diff =
+              static_cast<DistanceT>(target_vec[l]) - static_cast<DistanceT>(base_vec[l]);
+            distance += diff * diff;
+          }
+          if (prev_distance > distance) {
+            return testing::AssertionFailure()
+                   << "Wrong index order (row = " << i << ", neighbor_id = " << j
+                   << "). (distance[neighbor_id-1] = " << prev_distance
+                   << "should be lesser than distance[neighbor_id] = " << distance << ")";
+          }
+          break;
+        case raft::distance::DistanceType::InnerProduct:
+          for (unsigned l = 0; l < dataset.extent(1); l++) {
+            const auto prod =
+              static_cast<DistanceT>(target_vec[l]) * static_cast<DistanceT>(base_vec[l]);
+            distance += prod;
+          }
+          if (prev_distance < distance) {
+            return testing::AssertionFailure()
+                   << "Wrong index order (row = " << i << ", neighbor_id = " << j
+                   << "). (distance[neighbor_id-1] = " << prev_distance
+                   << "should be greater than distance[neighbor_id] = " << distance << ")";
+          }
+          break;
+        default:
+          return testing::AssertionFailure()
+                 << "Distance metric " << metric
+                 << " not supported. Only L2Expanded and InnerProduct are supported";
       }
       prev_distance = distance;
     }
@@ -221,6 +246,11 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
  protected:
   void testCagra()
   {
+    // TODO (tarang-jain): remove when NN Descent index building supports InnerProduct. Reference
+    // issue: https://github.com/rapidsai/raft/issues/2276
+    if (ps.metric == distance::InnerProduct && ps.build_algo == graph_build_algo::NN_DESCENT)
+      GTEST_SKIP();
+
     size_t queries_size = ps.n_queries * ps.k;
     std::vector<IdxT> indices_Cagra(queries_size);
     std::vector<IdxT> indices_naive(queries_size);
@@ -301,6 +331,7 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
       //   print_vector("T", distances_naive.data() + i * ps.k, ps.k, std::cout);
       //   print_vector("C", distances_Cagra.data() + i * ps.k, ps.k, std::cout);
       // }
+
       double min_recall = ps.min_recall;
       EXPECT_TRUE(eval_neighbours(indices_naive,
                                   indices_Cagra,
@@ -368,6 +399,9 @@ class AnnCagraSortTest : public ::testing::TestWithParam<AnnCagraInputs> {
  protected:
   void testCagraSort()
   {
+    if (ps.metric == distance::InnerProduct && ps.build_algo == graph_build_algo::NN_DESCENT)
+      GTEST_SKIP();
+
     {
       // Step 1: Build a sorted KNN graph by CAGRA knn build
       auto database_view = raft::make_device_matrix_view<const DataT, int64_t>(
@@ -383,10 +417,13 @@ class AnnCagraSortTest : public ::testing::TestWithParam<AnnCagraInputs> {
         raft::make_host_matrix<IdxT, int64_t>(ps.n_rows, index_params.intermediate_graph_degree);
 
       if (ps.build_algo == graph_build_algo::IVF_PQ) {
+        auto build_params = ivf_pq::index_params::from_dataset(database_view, ps.metric);
         if (ps.host_dataset) {
-          cagra::build_knn_graph<DataT, IdxT>(handle_, database_host_view, knn_graph.view());
+          cagra::build_knn_graph<DataT, IdxT>(
+            handle_, database_host_view, knn_graph.view(), 2, build_params);
         } else {
-          cagra::build_knn_graph<DataT, IdxT>(handle_, database_view, knn_graph.view());
+          cagra::build_knn_graph<DataT, IdxT>(
+            handle_, database_view, knn_graph.view(), 2, build_params);
         }
       } else {
         auto nn_descent_idx_params                      = experimental::nn_descent::index_params{};
@@ -403,14 +440,16 @@ class AnnCagraSortTest : public ::testing::TestWithParam<AnnCagraInputs> {
       }
 
       handle_.sync_stream();
-      ASSERT_TRUE(CheckOrder<DistanceT>(knn_graph.view(), database_host.view()));
+      ASSERT_TRUE(CheckOrder<DistanceT>(knn_graph.view(), database_host.view(), ps.metric));
 
-      RandomSuffle(knn_graph.view());
+      if (ps.metric != raft::distance::DistanceType::InnerProduct) {
+        RandomSuffle(knn_graph.view());
 
-      cagra::sort_knn_graph(handle_, database_view, knn_graph.view());
-      handle_.sync_stream();
+        cagra::sort_knn_graph(handle_, database_view, knn_graph.view());
+        handle_.sync_stream();
 
-      ASSERT_TRUE(CheckOrder<DistanceT>(knn_graph.view(), database_host.view()));
+        ASSERT_TRUE(CheckOrder<DistanceT>(knn_graph.view(), database_host.view(), ps.metric));
+      }
     }
   }
 
@@ -453,6 +492,9 @@ class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
  protected:
   void testCagraFilter()
   {
+    if (ps.metric == distance::InnerProduct && ps.build_algo == graph_build_algo::NN_DESCENT)
+      GTEST_SKIP();
+
     size_t queries_size = ps.n_queries * ps.k;
     std::vector<IdxT> indices_Cagra(queries_size);
     std::vector<IdxT> indices_naive(queries_size);
@@ -575,6 +617,9 @@ class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
 
   void testCagraRemoved()
   {
+    if (ps.metric == distance::InnerProduct && ps.build_algo == graph_build_algo::NN_DESCENT)
+      GTEST_SKIP();
+
     size_t queries_size = ps.n_queries * ps.k;
     std::vector<IdxT> indices_Cagra(queries_size);
     std::vector<IdxT> indices_naive(queries_size);
@@ -741,7 +786,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {0},
     {256},
     {1},
-    {raft::distance::DistanceType::L2Expanded},
+    {raft::distance::DistanceType::L2Expanded, raft::distance::DistanceType::InnerProduct},
     {false},
     {true},
     {0.995});
@@ -757,7 +802,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {0},
     {256},
     {1},
-    {raft::distance::DistanceType::L2Expanded},
+    {raft::distance::DistanceType::L2Expanded, raft::distance::DistanceType::InnerProduct},
     {false},
     {true},
     {99. / 100}
@@ -776,7 +821,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {0},
     {64},
     {1},
-    {raft::distance::DistanceType::L2Expanded},
+    {raft::distance::DistanceType::L2Expanded, raft::distance::DistanceType::InnerProduct},
     {false},
     {true},
     {0.995});
@@ -792,7 +837,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {0, 4, 8, 16, 32},  // team_size
     {64},
     {1},
-    {raft::distance::DistanceType::L2Expanded},
+    {raft::distance::DistanceType::L2Expanded, raft::distance::DistanceType::InnerProduct},
     {false},
     {false},
     {0.995});
@@ -809,7 +854,7 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {0},  // team_size
     {32, 64, 128, 256, 512, 768},
     {1},
-    {raft::distance::DistanceType::L2Expanded},
+    {raft::distance::DistanceType::L2Expanded, raft::distance::DistanceType::InnerProduct},
     {false},
     {true},
     {0.995});
@@ -826,27 +871,27 @@ inline std::vector<AnnCagraInputs> generate_inputs()
     {0},  // team_size
     {64},
     {1},
-    {raft::distance::DistanceType::L2Expanded},
+    {raft::distance::DistanceType::L2Expanded, raft::distance::DistanceType::InnerProduct},
     {false, true},
     {false},
     {0.995});
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
 
-  inputs2 =
-    raft::util::itertools::product<AnnCagraInputs>({100},
-                                                   {20000},
-                                                   {32},
-                                                   {2048},  // k
-                                                   {graph_build_algo::NN_DESCENT},
-                                                   {search_algo::AUTO},
-                                                   {10},
-                                                   {0},
-                                                   {4096},  // itopk_size
-                                                   {1},
-                                                   {raft::distance::DistanceType::L2Expanded},
-                                                   {false},
-                                                   {false},
-                                                   {0.995});
+  inputs2 = raft::util::itertools::product<AnnCagraInputs>(
+    {100},
+    {20000},
+    {32},
+    {2048},  // k
+    {graph_build_algo::NN_DESCENT},
+    {search_algo::AUTO},
+    {10},
+    {0},
+    {4096},  // itopk_size
+    {1},
+    {raft::distance::DistanceType::L2Expanded, raft::distance::DistanceType::InnerProduct},
+    {false},
+    {false},
+    {0.995});
   inputs.insert(inputs.end(), inputs2.begin(), inputs2.end());
 
   return inputs;
diff --git a/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh b/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh
index 5cca6d561a..412e71bff1 100644
--- a/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh
+++ b/cpp/test/neighbors/ann_cagra/search_kernel_uint64_t.cuh
@@ -51,6 +51,7 @@ namespace multi_cta_search {
     size_t min_iterations,                                                                        \
     size_t max_iterations,                                                                        \
     SAMPLE_FILTER_T sample_filter,                                                                \
+    raft::distance::DistanceType metric,                                                          \
     cudaStream_t stream);
 
 instantiate_kernel_selection(standard_dataset_descriptor_t,
@@ -118,6 +119,7 @@ namespace single_cta_search {
     size_t min_iterations,                                                                        \
     size_t max_iterations,                                                                        \
     SAMPLE_FILTER_T sample_filter,                                                                \
+    raft::distance::DistanceType metric,                                                          \
     cudaStream_t stream);
 
 instantiate_single_cta_select_and_run(standard_dataset_descriptor_t,
diff --git a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
index df31d2560b..0e488a51ca 100644
--- a/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
+++ b/python/pylibraft/pylibraft/neighbors/cagra/cagra.pyx
@@ -97,9 +97,11 @@ cdef class IndexParams:
     Parameters
     ----------
     metric : string denoting the metric type, default="sqeuclidean"
-        Valid values for metric: ["sqeuclidean"], where
+        Valid values for metric: ["sqeuclidean", "inner_product"], where
             - sqeuclidean is the euclidean distance without the square root
               operation, i.e.: distance(a,b) = \\sum_i (a_i - b_i)^2
+            - inner_product is the dot product between two vectors i.e.:
+              distance(a, b) = \\sum_i (a_i * b_i)
     intermediate_graph_degree : int, default = 128
 
     graph_degree : int, default = 64
@@ -355,6 +357,7 @@ def build(IndexParams index_params, dataset, handle=None):
 
     The following distance metrics are supported:
         - L2
+        - inner_product
 
     Parameters
     ----------
diff --git a/python/pylibraft/pylibraft/test/test_cagra.py b/python/pylibraft/pylibraft/test/test_cagra.py
index be53b33da3..ef8e54917a 100644
--- a/python/pylibraft/pylibraft/test/test_cagra.py
+++ b/python/pylibraft/pylibraft/test/test_cagra.py
@@ -29,7 +29,7 @@ def run_cagra_build_search_test(
     n_queries=100,
     k=10,
     dtype=np.float32,
-    metric="euclidean",
+    metric="sqeuclidean",
     intermediate_graph_degree=128,
     graph_degree=64,
     build_algo="ivf_pq",
@@ -143,7 +143,7 @@ def test_cagra_dataset_dtype_host_device(
             "graph_degree": 32,
             "add_data_on_build": True,
             "k": 1,
-            "metric": "euclidean",
+            "metric": "sqeuclidean",
             "build_algo": "ivf_pq",
         },
         {
@@ -159,7 +159,7 @@ def test_cagra_dataset_dtype_host_device(
             "graph_degree": 32,
             "add_data_on_build": True,
             "k": 10,
-            "metric": "inner_product",
+            "metric": "sqeuclidean",
             "build_algo": "nn_descent",
         },
     ],

From 4478d02d5ed2e34d6ce3ebd85b34310e49834453 Mon Sep 17 00:00:00 2001
From: Yinzuo Jiang <jiangyinzuo@foxmail.com>
Date: Wed, 1 May 2024 23:35:49 +0800
Subject: [PATCH 19/60] Fix comments in
 cpp/include/raft/neighbors/cagra_serialize.cuh (#2283)

Comments in `cpp/include/raft/neighbors/cagra_serialize.cuh` are outdated.

Authors:
  - Yinzuo Jiang (https://github.com/jiangyinzuo)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2283
---
 .../raft/neighbors/cagra_serialize.cuh        | 32 +++++++++----------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/cpp/include/raft/neighbors/cagra_serialize.cuh b/cpp/include/raft/neighbors/cagra_serialize.cuh
index 83830c7457..eae2269662 100644
--- a/cpp/include/raft/neighbors/cagra_serialize.cuh
+++ b/cpp/include/raft/neighbors/cagra_serialize.cuh
@@ -32,14 +32,14 @@ namespace raft::neighbors::cagra {
  *
  * @code{.cpp}
  * #include <raft/core/resources.hpp>
- * #include <raft/neighbors/cagra_serialize.hpp>
+ * #include <raft/neighbors/cagra_serialize.cuh>
  *
  * raft::resources handle;
  *
  * // create an output stream
  * std::ostream os(std::cout.rdbuf());
- * // create an index with `auto index = raft::cagra::build(...);`
- * raft::cagra::serialize(handle, os, index);
+ * // create an index with `auto index = raft::neighbors::cagra::build(...);`
+ * raft::neighbors::cagra::serialize(handle, os, index);
  * @endcode
  *
  * @tparam T data element type
@@ -67,14 +67,14 @@ void serialize(raft::resources const& handle,
  *
  * @code{.cpp}
  * #include <raft/core/resources.hpp>
- * #include <raft/neighbors/cagra_serialize.hpp>
+ * #include <raft/neighbors/cagra_serialize.cuh>
  *
  * raft::resources handle;
  *
  * // create a string with a filepath
  * std::string filename("/path/to/index");
- * // create an index with `auto index = raft::cagra::build(...);`
- * raft::cagra::serialize(handle, filename, index);
+ * // create an index with `auto index = raft::neighbors::cagra::build(...);`
+ * raft::neighbors::cagra::serialize(handle, filename, index);
  * @endcode
  *
  * @tparam T data element type
@@ -102,14 +102,14 @@ void serialize(raft::resources const& handle,
  *
  * @code{.cpp}
  * #include <raft/core/resources.hpp>
- * #include <raft/neighbors/cagra_serialize.hpp>
+ * #include <raft/neighbors/cagra_serialize.cuh>
  *
  * raft::resources handle;
  *
  * // create an output stream
  * std::ostream os(std::cout.rdbuf());
- * // create an index with `auto index = raft::cagra::build(...);`
- * raft::cagra::serialize_to_hnswlib(handle, os, index);
+ * // create an index with `auto index = raft::neighbors::cagra::build(...);`
+ * raft::neighbors::cagra::serialize_to_hnswlib(handle, os, index);
  * @endcode
  *
  * @tparam T data element type
@@ -135,14 +135,14 @@ void serialize_to_hnswlib(raft::resources const& handle,
  *
  * @code{.cpp}
  * #include <raft/core/resources.hpp>
- * #include <raft/neighbors/cagra_serialize.hpp>
+ * #include <raft/neighbors/cagra_serialize.cuh>
  *
  * raft::resources handle;
  *
  * // create a string with a filepath
  * std::string filename("/path/to/index");
- * // create an index with `auto index = raft::cagra::build(...);`
- * raft::cagra::serialize_to_hnswlib(handle, filename, index);
+ * // create an index with `auto index = raft::neighbors::cagra::build(...);`
+ * raft::neighbors::cagra::serialize_to_hnswlib(handle, filename, index);
  * @endcode
  *
  * @tparam T data element type
@@ -168,7 +168,7 @@ void serialize_to_hnswlib(raft::resources const& handle,
  *
  * @code{.cpp}
  * #include <raft/core/resources.hpp>
- * #include <raft/neighbors/cagra_serialize.hpp>
+ * #include <raft/neighbors/cagra_serialize.cuh>
  *
  * raft::resources handle;
  *
@@ -176,7 +176,7 @@ void serialize_to_hnswlib(raft::resources const& handle,
  * std::istream is(std::cin.rdbuf());
  * using T    = float; // data element type
  * using IdxT = int; // type of the index
- * auto index = raft::cagra::deserialize<T, IdxT>(handle, is);
+ * auto index = raft::neighbors::cagra::deserialize<T, IdxT>(handle, is);
  * @endcode
  *
  * @tparam T data element type
@@ -200,7 +200,7 @@ index<T, IdxT> deserialize(raft::resources const& handle, std::istream& is)
  *
  * @code{.cpp}
  * #include <raft/core/resources.hpp>
- * #include <raft/neighbors/cagra_serialize.hpp>
+ * #include <raft/neighbors/cagra_serialize.cuh>
  *
  * raft::resources handle;
  *
@@ -208,7 +208,7 @@ index<T, IdxT> deserialize(raft::resources const& handle, std::istream& is)
  * std::string filename("/path/to/index");
  * using T    = float; // data element type
  * using IdxT = int; // type of the index
- * auto index = raft::cagra::deserialize<T, IdxT>(handle, filename);
+ * auto index = raft::neighbors::cagra::deserialize<T, IdxT>(handle, filename);
  * @endcode
  *
  * @tparam T data element type

From 11c1781977406a46e05e356fee1f3ed9cf228cd4 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Thu, 2 May 2024 10:36:45 -0400
Subject: [PATCH 20/60] Use dynamic version for raft-ann-bench (#2285)

Contributes to rapidsai/build-planning#15.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/raft/pull/2285
---
 python/raft-ann-bench/pyproject.toml          |  5 +++-
 .../raft-ann-bench/src/raft-ann-bench/VERSION |  1 +
 .../src/raft-ann-bench/__init__.py            |  4 ++-
 .../src/raft-ann-bench/_version.py            | 25 +++++++++++++++++++
 4 files changed, 33 insertions(+), 2 deletions(-)
 create mode 120000 python/raft-ann-bench/src/raft-ann-bench/VERSION
 create mode 100644 python/raft-ann-bench/src/raft-ann-bench/_version.py

diff --git a/python/raft-ann-bench/pyproject.toml b/python/raft-ann-bench/pyproject.toml
index ba336d841c..1348343db3 100644
--- a/python/raft-ann-bench/pyproject.toml
+++ b/python/raft-ann-bench/pyproject.toml
@@ -9,7 +9,7 @@ requires = [
 
 [project]
 name = "raft-ann-bench"
-version = "24.06.00"
+dynamic = ["version"]
 description = "RAFT ANN benchmarks"
 authors = [
     { name = "NVIDIA Corporation" },
@@ -57,3 +57,6 @@ skip = [
     "build",
     "dist",
 ]
+
+[tool.setuptools.dynamic]
+version = { attr = "raft-ann-bench.__version__" }
diff --git a/python/raft-ann-bench/src/raft-ann-bench/VERSION b/python/raft-ann-bench/src/raft-ann-bench/VERSION
new file mode 120000
index 0000000000..a4e948506b
--- /dev/null
+++ b/python/raft-ann-bench/src/raft-ann-bench/VERSION
@@ -0,0 +1 @@
+../../../../VERSION
\ No newline at end of file
diff --git a/python/raft-ann-bench/src/raft-ann-bench/__init__.py b/python/raft-ann-bench/src/raft-ann-bench/__init__.py
index 8f2cc34855..80a3b3f284 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/__init__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,3 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
+
+from ._version import __git_commit__, __version__
diff --git a/python/raft-ann-bench/src/raft-ann-bench/_version.py b/python/raft-ann-bench/src/raft-ann-bench/_version.py
new file mode 100644
index 0000000000..6dbb8e81b0
--- /dev/null
+++ b/python/raft-ann-bench/src/raft-ann-bench/_version.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import importlib.resources
+
+__version__ = (
+    importlib.resources.files("raft-ann-bench")
+    .joinpath("VERSION")
+    .read_text()
+    .strip()
+)
+__git_commit__ = ""

From 3406569eb56fc5a0dafe58885d3fb5f5dcd3fe0d Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Thu, 2 May 2024 09:50:13 -0500
Subject: [PATCH 21/60] Make 'librmm' a 'host' dependency for conda packages
 (#2284)

Contributes to https://github.com/rapidsai/build-planning/issues/54.

The `libraft-headers` and `libraft-headers-only` conda packages are bundling `rmm`'s headers. I believe that's because the `librmm` conda package isn't available in the `libraft*` conda build environment, and as a result it's getting `rmm` via CPM (thanks to `rapids-cmake`).

As a result, this project and any that depend on it are seeing warnings like the following in conda builds where `conda`'s `path_conflict` setting is set to `warn` or `prevent` (like #2245):

```text
This transaction has incompatible packages due to a shared path.
  packages: rapidsai-nightly/linux-64::librmm-24.04.00a38-cuda11_240326_ga98931b9_38, rapidsai-nightly/linux-64::libraft-headers-only-24.04.00a93-cuda11_240326_g9637b3c2_93
  path: 'include/rmm/mr/device/arena_memory_resource.hpp'
```

To fix that, this proposes the following changes:

* make `librmm` a `host` dependency of the following conda packages: `libraft-headers-only`, `libraft-headers`

### Benefits of this change

* slightly smaller `libraft-headers` and `libraft-headers-only` conda packages
* reduced risk of runtime and installation issues caused by file clobbering

## Notes for reviewers

### History of changes to the `librmm` dependency for `libraft-headers`:

* both `run` and `host`: #508
* both `run` and `host`, but ignoring its `run_exports`: #1264
* just `run`, but ignoring its `run_exports`: #2102

In particular, #2102 referred to the `host` dependency on `librmm` as "extraneous" but from a packaging perspective, I don't think it is. `librmm` being in `host` means it'll be present in the build environment, which means its headers will be *found* instead of *downloaded*, and therefore not packaged into the `libraft*` conda packages.

### How I tested this

Built all the `raft` conda packages locally from `branch-24.06` and confirmed that they contain `rmm` headers. Then again from this branch and confirmed they were gone.

```shell
docker run \
    --rm \
    --env-file "${PWD}/aws-creds.env" \
    -v $(pwd):/opt/work \
    -w /opt/work \
    -it rapidsai/ci-conda:cuda12.2.2-ubuntu22.04-py3.10-amd64 \
    bash

CI="true" \
  ci/build_cpp.sh

# On 'branch-24.06', this showed the rmm headers being packaged.
# On this branch, they're omitted.
tar --bzip2 -tf \
  /tmp/conda-bld-output/linux-64/libraft-headers-only-24.06.00a50-cuda12_240430_g1e0e2283_50.tar.bz2 \
| grep 'include/rmm' \
| wc -l
```

Also checked the CI logs from `conda-cpp-build` jobs here. On other recent PRs, I see CPM downloading `rmm` ...

```text
-- CPM: Adding package rmm@24.06 (branch-24.06)
```

... and all the `rmm` headers getting installed as part of the `libraft-headers` package

```text
-- Installing: /opt/conda/conda-bld/_h_env_placehold_placehold_..._placeho/include/rmm/cuda_stream.hpp
```

([build link](https://github.com/rapidsai/raft/actions/runs/8904352932))

Here, I see `librmm` coming through via the conda package requirements ...

```text
The following NEW packages will be INSTALLED:
    ...
    librmm:                      24.06.00a17-cuda12_240430_g26fa9ecb_17 rapidsai-nightly
```

... and being used instead of downloads via CPM ...

```text
-- CPM: Using local package rmm@24.06.0
```

... and no installation of the `rmm` headers as part of building any `libraft` packages.

([build link](https://github.com/rapidsai/raft/actions/runs/8910675575/job/24470450187?pr=2284))

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/raft/pull/2284
---
 conda/recipes/libraft/meta.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml
index c4fd0aa0b6..aa2d905d14 100644
--- a/conda/recipes/libraft/meta.yaml
+++ b/conda/recipes/libraft/meta.yaml
@@ -64,6 +64,7 @@ outputs:
         {% if cuda_major != "11" %}
         - cuda-cudart-dev
         {% endif %}
+        - librmm ={{ minor_version }}
       run:
         - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
         {% if cuda_major == "11" %}
@@ -93,6 +94,7 @@ outputs:
     requirements:
       host:
         - cuda-version ={{ cuda_version }}
+        - librmm ={{ minor_version }}
       run:
         - {{ pin_subpackage('libraft-headers-only', exact=True) }}
         - librmm ={{ minor_version }}

From da3b9a9c442396a43a70efa725ca7f489605d632 Mon Sep 17 00:00:00 2001
From: James Lamb <jlamb@nvidia.com>
Date: Fri, 3 May 2024 14:58:48 -0500
Subject: [PATCH 22/60] define 'ucx' pytest marker (#2281)

Looking through logs on https://github.com/rapidsai/raft/pull/2279, I noticed this warning:

```text
test/test_comms.py:267
  /__w/raft/raft/python/raft-dask/raft_dask/test/test_comms.py:267: PytestUnknownMarkWarning: Unknown pytest.mark.ucx - is this a typo?  You can register custom marks to avoid this warning - for details, see https://docs.pytest.org/en/stable/how-to/mark.html
    @pytest.mark.ucx
```

([build link](https://github.com/rapidsai/raft/actions/runs/8850602382/job/24314273393?pr=2279#step:7:780))

This resolves it.

### How I tested this

Looked for all such cases like this:

```shell
git grep -E 'mark.*ucx'
```

Hopefully we'll see this log disappear from the `conda-python-tests` CI logs on this PR.

Authors:
  - James Lamb (https://github.com/jameslamb)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Dante Gama Dessavre (https://github.com/dantegd)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/raft/pull/2281
---
 python/raft-dask/pytest.ini | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/raft-dask/pytest.ini b/python/raft-dask/pytest.ini
index 8904172272..5559bb08c8 100644
--- a/python/raft-dask/pytest.ini
+++ b/python/raft-dask/pytest.ini
@@ -6,3 +6,4 @@ markers =
   mg: marks a test as multi-GPU
   memleak: marks a test as a memory leak test
   nccl: marks a test as using NCCL
+  ucx: marks a test as using ucx-py

From fd64c24fc39e800ed29e09571f75e51667bae2dc Mon Sep 17 00:00:00 2001
From: Philip Hyunsu Cho <phcho@nvidia.com>
Date: Mon, 6 May 2024 10:40:44 -0700
Subject: [PATCH 23/60] Migrate to `{{ stdlib("c") }}` (#2278)

The `sysroot*` syntax is getting phased out (conda-forge/conda-forge.github.io#2102).
The recommendation is to move to `{{ stdlib("c") }}`.

Ref https://github.com/rapidsai/build-planning/issues/39

Authors:
  - Philip Hyunsu Cho (https://github.com/hcho3)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - https://github.com/jakirkham
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/raft/pull/2278
---
 conda/recipes/libraft/conda_build_config.yaml          |  5 ++++-
 conda/recipes/libraft/meta.yaml                        | 10 +++++-----
 conda/recipes/pylibraft/conda_build_config.yaml        |  5 ++++-
 conda/recipes/pylibraft/meta.yaml                      |  2 +-
 .../recipes/raft-ann-bench-cpu/conda_build_config.yaml |  5 ++++-
 conda/recipes/raft-ann-bench-cpu/meta.yaml             |  4 ++--
 conda/recipes/raft-ann-bench/conda_build_config.yaml   |  5 ++++-
 conda/recipes/raft-ann-bench/meta.yaml                 |  2 +-
 conda/recipes/raft-dask/conda_build_config.yaml        |  5 ++++-
 conda/recipes/raft-dask/meta.yaml                      |  2 +-
 10 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml
index 385cd831fc..bb9c715e3a 100644
--- a/conda/recipes/libraft/conda_build_config.yaml
+++ b/conda/recipes/libraft/conda_build_config.yaml
@@ -10,7 +10,10 @@ cuda_compiler:
 cuda11_compiler:
   - nvcc
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
   - "2.17"
 
 cmake_version:
diff --git a/conda/recipes/libraft/meta.yaml b/conda/recipes/libraft/meta.yaml
index aa2d905d14..a075308500 100644
--- a/conda/recipes/libraft/meta.yaml
+++ b/conda/recipes/libraft/meta.yaml
@@ -58,7 +58,7 @@ outputs:
         - cuda-version ={{ cuda_version }}
         - cmake {{ cmake_version }}
         - ninja
-        - sysroot_{{ target_platform }} {{ sysroot_version }}
+        - {{ stdlib("c") }}
       host:
         - cuda-version ={{ cuda_version }}
         {% if cuda_major != "11" %}
@@ -152,7 +152,7 @@ outputs:
         - cuda-version ={{ cuda_version }}
         - cmake {{ cmake_version }}
         - ninja
-        - sysroot_{{ target_platform }} {{ sysroot_version }}
+        - {{ stdlib("c") }}
       host:
         - {{ pin_subpackage('libraft-headers', exact=True) }}
         - cuda-version ={{ cuda_version }}
@@ -214,7 +214,7 @@ outputs:
         - cuda-version ={{ cuda_version }}
         - cmake {{ cmake_version }}
         - ninja
-        - sysroot_{{ target_platform }} {{ sysroot_version }}
+        - {{ stdlib("c") }}
       host:
         - {{ pin_subpackage('libraft-headers', exact=True) }}
         - cuda-version ={{ cuda_version }}
@@ -280,7 +280,7 @@ outputs:
         - cuda-version ={{ cuda_version }}
         - cmake {{ cmake_version }}
         - ninja
-        - sysroot_{{ target_platform }} {{ sysroot_version }}
+        - {{ stdlib("c") }}
       host:
         # We must include both libraft and libraft-static to prevent the test
         # builds from packaging those libraries. However, tests only depend on
@@ -349,7 +349,7 @@ outputs:
         - cuda-version ={{ cuda_version }}
         - cmake {{ cmake_version }}
         - ninja
-        - sysroot_{{ target_platform }} {{ sysroot_version }}
+        - {{ stdlib("c") }}
       host:
         - {{ pin_subpackage('libraft', exact=True) }}
         - {{ pin_subpackage('libraft-headers', exact=True) }}
diff --git a/conda/recipes/pylibraft/conda_build_config.yaml b/conda/recipes/pylibraft/conda_build_config.yaml
index e28b98da7f..e3ca633eb9 100644
--- a/conda/recipes/pylibraft/conda_build_config.yaml
+++ b/conda/recipes/pylibraft/conda_build_config.yaml
@@ -10,7 +10,10 @@ cuda_compiler:
 cuda11_compiler:
   - nvcc
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
   - "2.17"
 
 cmake_version:
diff --git a/conda/recipes/pylibraft/meta.yaml b/conda/recipes/pylibraft/meta.yaml
index e524a68f9e..cbeaec3b55 100644
--- a/conda/recipes/pylibraft/meta.yaml
+++ b/conda/recipes/pylibraft/meta.yaml
@@ -39,7 +39,7 @@ requirements:
     - cuda-version ={{ cuda_version }}
     - cmake {{ cmake_version }}
     - ninja
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
+    - {{ stdlib("c") }}
   host:
     {% if cuda_major == "11" %}
     - cuda-python >=11.7.1,<12.0a0
diff --git a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml
index 93a5532962..4de3b98f48 100644
--- a/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml
+++ b/conda/recipes/raft-ann-bench-cpu/conda_build_config.yaml
@@ -4,7 +4,10 @@ c_compiler_version:
 cxx_compiler_version:
   - 11
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
   - "2.17"
 
 cmake_version:
diff --git a/conda/recipes/raft-ann-bench-cpu/meta.yaml b/conda/recipes/raft-ann-bench-cpu/meta.yaml
index fce85d5ffc..d0748fdb16 100644
--- a/conda/recipes/raft-ann-bench-cpu/meta.yaml
+++ b/conda/recipes/raft-ann-bench-cpu/meta.yaml
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 # Usage:
 #   conda build . -c conda-forge -c nvidia -c rapidsai
@@ -42,7 +42,7 @@ requirements:
     - {{ compiler('cxx') }}
     - cmake {{ cmake_version }}
     - ninja
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
+    - {{ stdlib("c") }}
 
   host:
     - glog {{ glog_version }}
diff --git a/conda/recipes/raft-ann-bench/conda_build_config.yaml b/conda/recipes/raft-ann-bench/conda_build_config.yaml
index 6aa6f3d47d..cf025a06a4 100644
--- a/conda/recipes/raft-ann-bench/conda_build_config.yaml
+++ b/conda/recipes/raft-ann-bench/conda_build_config.yaml
@@ -10,7 +10,10 @@ cuda_compiler:
 cuda11_compiler:
   - nvcc
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
   - "2.17"
 
 cmake_version:
diff --git a/conda/recipes/raft-ann-bench/meta.yaml b/conda/recipes/raft-ann-bench/meta.yaml
index ec24501475..8a6a3d033d 100644
--- a/conda/recipes/raft-ann-bench/meta.yaml
+++ b/conda/recipes/raft-ann-bench/meta.yaml
@@ -57,7 +57,7 @@ requirements:
     - cuda-version ={{ cuda_version }}
     - cmake {{ cmake_version }}
     - ninja
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
+    - {{ stdlib("c") }}
 
   host:
     - python
diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml
index 7db48fb684..345cef49a1 100644
--- a/conda/recipes/raft-dask/conda_build_config.yaml
+++ b/conda/recipes/raft-dask/conda_build_config.yaml
@@ -10,7 +10,10 @@ cuda_compiler:
 cuda11_compiler:
   - nvcc
 
-sysroot_version:
+c_stdlib:
+  - sysroot
+
+c_stdlib_version:
   - "2.17"
 
 ucx_version:
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index 6910905d07..7c2fb257b1 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -39,7 +39,7 @@ requirements:
     - cuda-version ={{ cuda_version }}
     - cmake {{ cmake_version }}
     - ninja
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
+    - {{ stdlib("c") }}
   host:
     {% if cuda_major == "11" %}
     - cuda-python >=11.7.1,<12.0a0

From 19842a28a323a79d9aded4b4369e5eb889678258 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Tue, 7 May 2024 04:22:02 +0200
Subject: [PATCH 24/60] Add UCXX support (#1983)

Add support for [UCXX](https://github.com/rapidsai/ucxx). It is our intention to soon switch from UCX-Py to UCXX and archive the former.

This PR adds support for UCXX on the C++ backend but retains the original UCX implementation (based on the UCP layer) for now. Moving to UCXX will simplify RAFT code a bit, given that the UCXX implementation requires fewer lines of boilerplate code.

On the Python front, raft-dask tests are added for both UCX-Py (for which there previously weren't any) and UCXX. The UCX-Py tests continue to use the UCX (UCP layer) implementation, whereas the UCXX tests use the UCXX C++ implementation.

When the switch is complete, we can remove all previous UCX/UCX-Py code from the RAFT codebase. If for some reason using UCX (UCP layer) is preferred on the C++ backend instead of the UCXX C++ implementation, this is possible, but UCX-Py code will be archived and dropped in favor of the UCXX Python backend.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)
  - Akira Naruse (https://github.com/anaruse)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Jake Awe (https://github.com/AyodeAwe)
  - Divye Gala (https://github.com/divyegala)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/raft/pull/1983
---
 ci/build_wheel.sh                             |  10 +-
 ci/release/update-version.sh                  |   9 +-
 ci/test_python.sh                             |  18 ++
 ci/test_wheel_raft_dask.sh                    |  12 +-
 .../all_cuda-118_arch-aarch64.yaml            |   2 +
 .../all_cuda-118_arch-x86_64.yaml             |   2 +
 .../all_cuda-122_arch-aarch64.yaml            |   2 +
 .../all_cuda-122_arch-x86_64.yaml             |   2 +
 .../bench_ann_cuda-118_arch-aarch64.yaml      |   1 +
 .../bench_ann_cuda-118_arch-x86_64.yaml       |   1 +
 .../bench_ann_cuda-120_arch-aarch64.yaml      |   1 +
 .../bench_ann_cuda-120_arch-x86_64.yaml       |   1 +
 .../recipes/raft-dask/conda_build_config.yaml |   6 +-
 conda/recipes/raft-dask/meta.yaml             |   7 +-
 cpp/CMakeLists.txt                            |  15 +-
 cpp/include/raft/comms/detail/std_comms.hpp   | 274 +++++++++++++-----
 cpp/include/raft/comms/detail/ucp_helper.hpp  |  27 +-
 cpp/include/raft/comms/std_comms.hpp          |  54 +++-
 dependencies.yaml                             |  82 +++++-
 python/raft-dask/CMakeLists.txt               |   9 +-
 .../raft-dask/cmake/thirdparty/get_ucxx.cmake |  55 ++++
 python/raft-dask/pyproject.toml               |   3 +
 python/raft-dask/pytest.ini                   |   3 +-
 python/raft-dask/raft_dask/__init__.py        |  12 +-
 python/raft-dask/raft_dask/common/comms.py    |  19 +-
 .../raft_dask/common/comms_utils.pyx          |  11 +-
 python/raft-dask/raft_dask/common/ucx.py      |  32 +-
 python/raft-dask/raft_dask/test/conftest.py   |  64 +++-
 python/raft-dask/raft_dask/test/test_comms.py | 158 +++++++++-
 29 files changed, 735 insertions(+), 157 deletions(-)
 create mode 100644 python/raft-dask/cmake/thirdparty/get_ucxx.cmake

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 5d06e46303..e3e7ce9c89 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -7,6 +7,10 @@ package_name=$1
 package_dir=$2
 underscore_package_name=$(echo "${package_name}" | tr "-" "_")
 
+# Clear out system ucx files to ensure that we're getting ucx from the wheel.
+rm -rf /usr/lib64/ucx
+rm -rf /usr/lib64/libuc*
+
 source rapids-configure-sccache
 source rapids-date-string
 
@@ -38,9 +42,11 @@ fi
 
 if [[ ${package_name} == "raft-dask" ]]; then
     sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/libucx(.*)\"/libucx${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
     sed -r -i "s/ucx-py==(.*)\"/ucx-py${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
     sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}
     sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/distributed-ucxx==(.*)\"/distributed-ucxx${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
 else
     sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
 fi
@@ -56,6 +62,6 @@ cd "${package_dir}"
 python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
 
 mkdir -p final_dist
-python -m auditwheel repair -w final_dist dist/*
+python -m auditwheel repair -w final_dist --exclude "libucp.so.0" dist/*
 
 RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 46b992392c..ef9b3e4b83 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -37,6 +37,8 @@ function sed_runner() {
 }
 
 sed_runner "s/set(RAPIDS_VERSION .*)/set(RAPIDS_VERSION \"${NEXT_SHORT_TAG}\")/g" cpp/template/cmake/thirdparty/fetch_rapids.cmake
+sed_runner 's/'"find_and_configure_ucxx(VERSION .*"'/'"find_and_configure_ucxx(VERSION  ${NEXT_UCX_PY_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake
+sed_runner 's/'"branch-.*"'/'"branch-${NEXT_UCX_PY_SHORT_TAG_PEP440}"'/g' python/raft-dask/cmake/thirdparty/get_ucxx.cmake
 
 # Centralized version file update
 echo "${NEXT_FULL_TAG}" > VERSION
@@ -50,7 +52,7 @@ DEPENDENCIES=(
   rmm-cu11
   rmm-cu12
   rapids-dask-dependency
-  # ucx-py is handled separately below
+  # ucx-py and ucxx are handled separately below
 )
 for FILE in dependencies.yaml conda/environments/*.yaml; do
   for DEP in "${DEPENDENCIES[@]}"; do
@@ -59,6 +61,10 @@ for FILE in dependencies.yaml conda/environments/*.yaml; do
   sed_runner "/-.* ucx-py==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
   sed_runner "/-.* ucx-py-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
   sed_runner "/-.* ucx-py-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
+  sed_runner "/-.* libucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
+  sed_runner "/-.* distributed-ucxx==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
+  sed_runner "/-.* distributed-ucxx-cu11==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
+  sed_runner "/-.* distributed-ucxx-cu12==/ s/==.*/==${NEXT_UCX_PY_SHORT_TAG_PEP440}\.*/g" ${FILE};
 done
 for FILE in python/*/pyproject.toml; do
   for DEP in "${DEPENDENCIES[@]}"; do
@@ -68,6 +74,7 @@ for FILE in python/*/pyproject.toml; do
 done
 
 sed_runner "/^ucx_py_version:$/ {n;s/.*/  - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml
+sed_runner "/^ucxx_version:$/ {n;s/.*/  - \"${NEXT_UCX_PY_VERSION}\"/}" conda/recipes/raft-dask/conda_build_config.yaml
 
 for FILE in .github/workflows/*.yaml; do
   sed_runner "/shared-workflows/ s/@.*/@branch-${NEXT_SHORT_TAG}/g" "${FILE}"
diff --git a/ci/test_python.sh b/ci/test_python.sh
index f5b188ca0b..59da1f0bc4 100755
--- a/ci/test_python.sh
+++ b/ci/test_python.sh
@@ -59,5 +59,23 @@ rapids-logger "pytest raft-dask"
   --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/raft-dask-coverage.xml" \
   --cov-report=term
 
+rapids-logger "pytest raft-dask (ucx-py only)"
+./ci/run_raft_dask_pytests.sh \
+  --junitxml="${RAPIDS_TESTS_DIR}/junit-raft-dask-ucx.xml" \
+  --cov-config=../.coveragerc \
+  --cov=raft_dask \
+  --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/raft-dask-ucx-coverage.xml" \
+  --cov-report=term \
+  --run_ucx
+
+rapids-logger "pytest raft-dask (ucxx only)"
+./ci/run_raft_dask_pytests.sh \
+  --junitxml="${RAPIDS_TESTS_DIR}/junit-raft-dask-ucxx.xml" \
+  --cov-config=../.coveragerc \
+  --cov=raft_dask \
+  --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/raft-dask-ucxx-coverage.xml" \
+  --cov-report=term \
+  --run_ucxx
+
 rapids-logger "Test script exiting with value: $EXITCODE"
 exit ${EXITCODE}
diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh
index 76bb62e859..fe2d44f2b3 100755
--- a/ci/test_wheel_raft_dask.sh
+++ b/ci/test_wheel_raft_dask.sh
@@ -11,7 +11,13 @@ RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels
 RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep
 python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl
 
-# echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install $(echo ./dist/raft_dask*.whl)[test]
+python -m pip install "raft_dask-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links dist/
 
-python -m pytest ./python/raft-dask/raft_dask/test
+# rapids-logger "pytest raft-dask"
+# python -m pytest ./python/raft-dask/raft_dask/test
+
+# rapids-logger "pytest raft-dask (ucx-py only)"
+# python -m pytest ./python/raft-dask/raft_dask/test --run_ucx
+
+rapids-logger "pytest raft-dask (ucxx only)"
+python -m pytest ./python/raft-dask/raft_dask/test --run_ucxx
diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 189f8268df..7453df2593 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -21,6 +21,7 @@ dependencies:
 - cxx-compiler
 - cython>=3.0.0
 - dask-cuda==24.6.*
+- distributed-ucxx==0.38.*
 - doxygen>=1.8.20
 - gcc_linux-aarch64=11.*
 - graphviz
@@ -34,6 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- libucxx==0.38.*
 - nccl>=2.9.9
 - ninja
 - numba>=0.57
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index e604705112..b983eb0388 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -21,6 +21,7 @@ dependencies:
 - cxx-compiler
 - cython>=3.0.0
 - dask-cuda==24.6.*
+- distributed-ucxx==0.38.*
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - graphviz
@@ -34,6 +35,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- libucxx==0.38.*
 - nccl>=2.9.9
 - ninja
 - numba>=0.57
diff --git a/conda/environments/all_cuda-122_arch-aarch64.yaml b/conda/environments/all_cuda-122_arch-aarch64.yaml
index 49c53b4cfe..7dacfc2d2b 100644
--- a/conda/environments/all_cuda-122_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-122_arch-aarch64.yaml
@@ -22,6 +22,7 @@ dependencies:
 - cxx-compiler
 - cython>=3.0.0
 - dask-cuda==24.6.*
+- distributed-ucxx==0.38.*
 - doxygen>=1.8.20
 - gcc_linux-aarch64=11.*
 - graphviz
@@ -31,6 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- libucxx==0.38.*
 - nccl>=2.9.9
 - ninja
 - numba>=0.57
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 6f782175dd..1c16d2ea93 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -22,6 +22,7 @@ dependencies:
 - cxx-compiler
 - cython>=3.0.0
 - dask-cuda==24.6.*
+- distributed-ucxx==0.38.*
 - doxygen>=1.8.20
 - gcc_linux-64=11.*
 - graphviz
@@ -31,6 +32,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- libucxx==0.38.*
 - nccl>=2.9.9
 - ninja
 - numba>=0.57
diff --git a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
index b5f662ebc1..7315f82c13 100644
--- a/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-aarch64.yaml
@@ -30,6 +30,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- libucxx==0.38.*
 - matplotlib
 - nccl>=2.9.9
 - ninja
diff --git a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
index 6c56cb688c..ff973acc0c 100644
--- a/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-118_arch-x86_64.yaml
@@ -30,6 +30,7 @@ dependencies:
 - libcusolver=11.4.1.48
 - libcusparse-dev=11.7.5.86
 - libcusparse=11.7.5.86
+- libucxx==0.38.*
 - matplotlib
 - nccl>=2.9.9
 - ninja
diff --git a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
index 7f3107e5d6..056550fc07 100644
--- a/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
+++ b/conda/environments/bench_ann_cuda-120_arch-aarch64.yaml
@@ -27,6 +27,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- libucxx==0.38.*
 - matplotlib
 - nccl>=2.9.9
 - ninja
diff --git a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
index 62739354a5..41a48f4a12 100644
--- a/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/bench_ann_cuda-120_arch-x86_64.yaml
@@ -27,6 +27,7 @@ dependencies:
 - libcurand-dev
 - libcusolver-dev
 - libcusparse-dev
+- libucxx==0.38.*
 - matplotlib
 - nccl>=2.9.9
 - ninja
diff --git a/conda/recipes/raft-dask/conda_build_config.yaml b/conda/recipes/raft-dask/conda_build_config.yaml
index 345cef49a1..b157e41753 100644
--- a/conda/recipes/raft-dask/conda_build_config.yaml
+++ b/conda/recipes/raft-dask/conda_build_config.yaml
@@ -16,11 +16,11 @@ c_stdlib:
 c_stdlib_version:
   - "2.17"
 
-ucx_version:
-  - ">=1.15.0,<1.16.0"
-
 ucx_py_version:
   - "0.38.*"
 
+ucxx_version:
+  - "0.38.*"
+
 cmake_version:
   - ">=3.26.4"
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index 7c2fb257b1..50042780b4 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -56,9 +56,9 @@ requirements:
     - rmm ={{ minor_version }}
     - scikit-build-core >=0.7.0
     - setuptools
-    - ucx {{ ucx_version }}
-    - ucx-proc=*=gpu
     - ucx-py {{ ucx_py_version }}
+    - libucxx {{ ucxx_version }}
+    - ucxx {{ ucxx_version }}
   run:
     {% if cuda_major == "11" %}
     - cudatoolkit
@@ -73,9 +73,8 @@ requirements:
     - pylibraft {{ version }}
     - python x.x
     - rmm ={{ minor_version }}
-    - ucx {{ ucx_version }}
-    - ucx-proc=*=gpu
     - ucx-py {{ ucx_py_version }}
+    - ucxx {{ ucxx_version }}
 
 tests:
   requirements:
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index eaab637338..259d9fe428 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -650,12 +650,21 @@ rapids_find_generate_module(
   INSTALL_EXPORT_SET raft-distributed-exports
 )
 
-rapids_export_package(BUILD ucx raft-distributed-exports)
-rapids_export_package(INSTALL ucx raft-distributed-exports)
+rapids_export_package(
+  BUILD ucxx raft-distributed-exports COMPONENTS ucxx python GLOBAL_TARGETS ucxx::ucxx ucxx::python
+)
+rapids_export_package(
+  INSTALL ucxx raft-distributed-exports COMPONENTS ucxx python GLOBAL_TARGETS ucxx::ucxx
+                                                                              ucxx::python
+)
 rapids_export_package(BUILD NCCL raft-distributed-exports)
 rapids_export_package(INSTALL NCCL raft-distributed-exports)
 
-target_link_libraries(raft_distributed INTERFACE ucx::ucp NCCL::NCCL)
+# ucx is a requirement for raft_distributed, but its config is not safe to be found multiple times,
+# so rather than exporting a package dependency on it above we rely on consumers to find it
+# themselves. Once https://github.com/rapidsai/ucxx/issues/173 is resolved we can export it above
+# again.
+target_link_libraries(raft_distributed INTERFACE ucx::ucp ucxx::ucxx NCCL::NCCL)
 
 # ##################################################################################################
 # * install targets-----------------------------------------------------------
diff --git a/cpp/include/raft/comms/detail/std_comms.hpp b/cpp/include/raft/comms/detail/std_comms.hpp
index 6e7ff7106f..cb1accc95e 100644
--- a/cpp/include/raft/comms/detail/std_comms.hpp
+++ b/cpp/include/raft/comms/detail/std_comms.hpp
@@ -34,6 +34,7 @@
 #include <time.h>
 #include <ucp/api/ucp.h>
 #include <ucp/api/ucp_def.h>
+#include <ucxx/api.h>
 
 #include <algorithm>
 #include <chrono>
@@ -49,6 +50,17 @@ namespace raft {
 namespace comms {
 namespace detail {
 
+using ucp_endpoint_array_t  = std::shared_ptr<ucp_ep_h*>;
+using ucxx_endpoint_array_t = std::shared_ptr<ucxx::Endpoint**>;
+using ucp_worker_t          = ucp_worker_h;
+using ucxx_worker_t         = ucxx::Worker*;
+
+struct ucx_objects_t {
+ public:
+  std::variant<ucp_endpoint_array_t, ucxx_endpoint_array_t> endpoints;
+  std::variant<ucp_worker_t, ucxx_worker_t> worker;
+};
+
 class std_comms : public comms_iface {
  public:
   std_comms() = delete;
@@ -64,8 +76,7 @@ class std_comms : public comms_iface {
    * @param subcomms_ucp use ucp for subcommunicators
    */
   std_comms(ncclComm_t nccl_comm,
-            ucp_worker_h ucp_worker,
-            std::shared_ptr<ucp_ep_h*> eps,
+            ucx_objects_t ucx_objects,
             int num_ranks,
             int rank,
             rmm::cuda_stream_view stream,
@@ -76,9 +87,8 @@ class std_comms : public comms_iface {
       num_ranks_(num_ranks),
       rank_(rank),
       subcomms_ucp_(subcomms_ucp),
+      ucx_objects_(ucx_objects),
       own_nccl_comm_(false),
-      ucp_worker_(ucp_worker),
-      ucp_eps_(eps),
       next_request_id_(0)
   {
     initialize();
@@ -205,96 +215,209 @@ class std_comms : public comms_iface {
 
   void isend(const void* buf, size_t size, int dest, int tag, request_t* request) const
   {
-    ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator.");
+    if (std::holds_alternative<ucxx_worker_t>(ucx_objects_.worker)) {
+      get_request_id(request);
 
-    get_request_id(request);
-    ucp_ep_h ep_ptr = (*ucp_eps_)[dest];
+      ucxx::Endpoint* ep_ptr = (*std::get<ucxx_endpoint_array_t>(ucx_objects_.endpoints))[dest];
 
-    ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request));
+      ucp_tag_t ucp_tag = build_message_tag(get_rank(), tag);
+      auto ucxx_req     = ep_ptr->tagSend(const_cast<void*>(buf), size, ucxx::Tag(ucp_tag));
 
-    this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, get_rank());
+      requests_in_flight_.insert(std::make_pair(*request, ucxx_req));
+    } else {
+      ASSERT(std::get<ucp_worker_t>(ucx_objects_.worker) != nullptr,
+             "ERROR: UCX comms not initialized on communicator.");
 
-    requests_in_flight_.insert(std::make_pair(*request, ucp_req));
-  }
+      get_request_id(request);
+      ucp_ep_h ep_ptr = (*std::get<ucp_endpoint_array_t>(ucx_objects_.endpoints))[dest];
 
-  void irecv(void* buf, size_t size, int source, int tag, request_t* request) const
-  {
-    ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator.");
+      ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request));
 
-    get_request_id(request);
+      this->ucp_handler_.ucp_isend(ucp_req, ep_ptr, buf, size, tag, default_tag_mask, get_rank());
 
-    ucp_ep_h ep_ptr = (*ucp_eps_)[source];
-
-    ucp_tag_t tag_mask = default_tag_mask;
-
-    ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request));
-    ucp_handler_.ucp_irecv(ucp_req, ucp_worker_, ep_ptr, buf, size, tag, tag_mask, source);
-
-    requests_in_flight_.insert(std::make_pair(*request, ucp_req));
+      requests_in_flight_.insert(std::make_pair(*request, ucp_req));
+    }
   }
 
-  void waitall(int count, request_t array_of_requests[]) const
+  void irecv(void* buf, size_t size, int source, int tag, request_t* request) const
   {
-    ASSERT(ucp_worker_ != nullptr, "ERROR: UCX comms not initialized on communicator.");
+    if (std::holds_alternative<ucxx_worker_t>(ucx_objects_.worker)) {
+      get_request_id(request);
 
-    std::vector<ucp_request*> requests;
-    requests.reserve(count);
+      ucxx::Endpoint* ep_ptr = (*std::get<ucxx_endpoint_array_t>(ucx_objects_.endpoints))[source];
 
-    time_t start = time(NULL);
+      ucp_tag_t ucp_tag = build_message_tag(get_rank(), tag);
+      auto ucxx_req =
+        ep_ptr->tagRecv(buf, size, ucxx::Tag(ucp_tag), ucxx::TagMask(default_tag_mask));
 
-    for (int i = 0; i < count; ++i) {
-      auto req_it = requests_in_flight_.find(array_of_requests[i]);
-      ASSERT(requests_in_flight_.end() != req_it,
-             "ERROR: waitall on invalid request: %d",
-             array_of_requests[i]);
-      requests.push_back(req_it->second);
-      free_requests_.insert(req_it->first);
-      requests_in_flight_.erase(req_it);
-    }
-
-    while (requests.size() > 0) {
-      time_t now = time(NULL);
+      requests_in_flight_.insert(std::make_pair(*request, ucxx_req));
+    } else {
+      ASSERT(std::get<ucp_worker_t>(ucx_objects_.worker) != nullptr,
+             "ERROR: UCX comms not initialized on communicator.");
 
-      // Timeout if we have not gotten progress or completed any requests
-      // in 10 or more seconds.
-      ASSERT(now - start < 10, "Timed out waiting for requests.");
+      get_request_id(request);
 
-      for (std::vector<ucp_request*>::iterator it = requests.begin(); it != requests.end();) {
-        bool restart = false;  // resets the timeout when any progress was made
+      ucp_ep_h ep_ptr = (*std::get<ucp_endpoint_array_t>(ucx_objects_.endpoints))[source];
 
-        // Causes UCP to progress through the send/recv message queue
-        while (ucp_worker_progress(ucp_worker_) != 0) {
-          restart = true;
-        }
+      ucp_tag_t tag_mask = default_tag_mask;
 
-        auto req = *it;
+      ucp_request* ucp_req = (ucp_request*)malloc(sizeof(ucp_request));
+      ucp_handler_.ucp_irecv(ucp_req,
+                             std::get<ucp_worker_t>(ucx_objects_.worker),
+                             ep_ptr,
+                             buf,
+                             size,
+                             tag,
+                             tag_mask,
+                             source);
 
-        // If the message needs release, we know it will be sent/received
-        // asynchronously, so we will need to track and verify its state
-        if (req->needs_release) {
-          ASSERT(UCS_PTR_IS_PTR(req->req), "UCX Request Error. Request is not valid UCX pointer");
-          ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", UCS_PTR_STATUS(req->req));
-          ASSERT(req->req->completed == 1 || req->req->completed == 0,
-                 "request->completed not a valid value: %d\n",
-                 req->req->completed);
-        }
+      requests_in_flight_.insert(std::make_pair(*request, ucp_req));
+    }
+  }
 
-        // If a message was sent synchronously (eg. completed before
-        // `isend`/`irecv` completed) or an asynchronous message
-        // is complete, we can go ahead and clean it up.
-        if (!req->needs_release || req->req->completed == 1) {
-          restart = true;
+  /**
+   * Blocks until every request listed in `array_of_requests` has completed.
+   *
+   * Each id is moved from `requests_in_flight_` to `free_requests_`, then the
+   * worker is progressed until all requests are done. The timeout `ASSERT`
+   * fires once 10 seconds pass without any progress or completed request.
+   *
+   * @param count number of entries in `array_of_requests`
+   * @param array_of_requests ids of the requests to wait for
+   */
+  void waitall(int count, request_t array_of_requests[]) const
+  {
+    if (std::holds_alternative<ucxx_worker_t>(ucx_objects_.worker)) {
+      ucxx_worker_t worker = std::get<ucxx_worker_t>(ucx_objects_.worker);
+
+      std::vector<std::shared_ptr<ucxx::Request>> requests;
+      requests.reserve(count);
+
+      time_t start = time(NULL);
+
+      for (int i = 0; i < count; ++i) {
+        auto req_it = requests_in_flight_.find(array_of_requests[i]);
+        ASSERT(requests_in_flight_.end() != req_it,
+               "ERROR: waitall on invalid request: %d",
+               array_of_requests[i]);
+        requests.push_back(std::get<std::shared_ptr<ucxx::Request>>(req_it->second));
+        free_requests_.insert(req_it->first);
+        requests_in_flight_.erase(req_it);
+      }
 
-          // perform cleanup
-          ucp_handler_.free_ucp_request(req);
+      while (requests.size() > 0) {
+        time_t now = time(NULL);
+
+        // Timeout if we have not gotten progress or completed any requests
+        // in 10 or more seconds.
+        ASSERT(now - start < 10, "Timed out waiting for requests.");
+
+        for (std::vector<std::shared_ptr<ucxx::Request>>::iterator it = requests.begin();
+             it != requests.end();) {
+          bool restart = false;  // resets the timeout when any progress was made
+
+          if (worker->isProgressThreadRunning()) {
+            // Wait for a UCXX progress thread roundtrip
+            ucxx::utils::CallbackNotifier callbackNotifierPre{};
+            worker->registerGenericPre([&callbackNotifierPre]() { callbackNotifierPre.set(); });
+            callbackNotifierPre.wait();
+
+            ucxx::utils::CallbackNotifier callbackNotifierPost{};
+            worker->registerGenericPost([&callbackNotifierPost]() { callbackNotifierPost.set(); });
+            callbackNotifierPost.wait();
+          } else {
+            // Causes UCXX to progress through the send/recv message queue
+            while (worker->progress()) {
+              restart = true;
+            }
+          }
+
+          auto req = *it;
+
+          // If a message was sent synchronously (eg. completed before
+          // `isend`/`irecv` completed) or an asynchronous message
+          // is complete, we can go ahead and clean it up.
+          if (req->isCompleted()) {
+            restart = true;
+
+            auto status = req->getStatus();
+            ASSERT(status == UCS_OK,
+                   "UCX Request Error: %d (%s)\n",
+                   status,
+                   ucs_status_string(status));
+
+            // remove from pending requests
+            it = requests.erase(it);
+          } else {
+            ++it;
+          }
+          // if any progress was made, reset the timeout start time
+          if (restart) { start = time(NULL); }
+        }
+      }
+    } else {
+      ucp_worker_t worker = std::get<ucp_worker_t>(ucx_objects_.worker);
+      ASSERT(worker != nullptr, "ERROR: UCX comms not initialized on communicator.");
+
+      std::vector<ucp_request*> requests;
+      requests.reserve(count);
+
+      time_t start = time(NULL);
+
+      for (int i = 0; i < count; ++i) {
+        auto req_it = requests_in_flight_.find(array_of_requests[i]);
+        ASSERT(requests_in_flight_.end() != req_it,
+               "ERROR: waitall on invalid request: %d",
+               array_of_requests[i]);
+        requests.push_back(std::get<ucp_request*>(req_it->second));
+        free_requests_.insert(req_it->first);
+        requests_in_flight_.erase(req_it);
+      }
 
-          // remove from pending requests
-          it = requests.erase(it);
-        } else {
-          ++it;
+      while (requests.size() > 0) {
+        time_t now = time(NULL);
+
+        // Timeout if we have not gotten progress or completed any requests
+        // in 10 or more seconds.
+        ASSERT(now - start < 10, "Timed out waiting for requests.");
+
+        for (std::vector<ucp_request*>::iterator it = requests.begin(); it != requests.end();) {
+          bool restart = false;  // resets the timeout when any progress was made
+
+          // Causes UCP to progress through the send/recv message queue
+          while (ucp_worker_progress(worker) != 0) {
+            restart = true;
+          }
+
+          auto req = *it;
+
+          // If the message needs release, we know it will be sent/received
+          // asynchronously, so we will need to track and verify its state
+          if (req->needs_release) {
+            ASSERT(UCS_PTR_IS_PTR(req->req), "UCX Request Error. Request is not valid UCX pointer");
+            ASSERT(!UCS_PTR_IS_ERR(req->req), "UCX Request Error: %d\n", UCS_PTR_STATUS(req->req));
+            ASSERT(req->req->completed == 1 || req->req->completed == 0,
+                   "request->completed not a valid value: %d\n",
+                   req->req->completed);
+          }
+
+          // If a message was sent synchronously (eg. completed before
+          // `isend`/`irecv` completed) or an asynchronous message
+          // is complete, we can go ahead and clean it up.
+          if (!req->needs_release || req->req->completed == 1) {
+            restart = true;
+
+            // perform cleanup
+            ucp_handler_.free_ucp_request(req);
+
+            // remove from pending requests
+            it = requests.erase(it);
+          } else {
+            ++it;
+          }
+          // if any progress was made, reset the timeout start time
+          if (restart) { start = time(NULL); }
         }
-        // if any progress was made, reset the timeout start time
-        if (restart) { start = time(NULL); }
       }
     }
   }
@@ -524,10 +647,11 @@ class std_comms : public comms_iface {
   bool own_nccl_comm_;
 
   comms_ucp_handler ucp_handler_;
-  ucp_worker_h ucp_worker_;
-  std::shared_ptr<ucp_ep_h*> ucp_eps_;
+  ucx_objects_t ucx_objects_;
   mutable request_t next_request_id_;
-  mutable std::unordered_map<request_t, struct ucp_request*> requests_in_flight_;
+  mutable std::unordered_map<request_t,
+                             std::variant<struct ucp_request*, std::shared_ptr<ucxx::Request>>>
+    requests_in_flight_;
   mutable std::unordered_set<request_t> free_requests_;
 };
 }  // namespace detail
diff --git a/cpp/include/raft/comms/detail/ucp_helper.hpp b/cpp/include/raft/comms/detail/ucp_helper.hpp
index 5896248c1d..65e1957e54 100644
--- a/cpp/include/raft/comms/detail/ucp_helper.hpp
+++ b/cpp/include/raft/comms/detail/ucp_helper.hpp
@@ -46,9 +46,7 @@ struct ucx_context {
 class ucp_request {
  public:
   struct ucx_context* req;
-  bool needs_release   = true;
-  int other_rank       = -1;
-  bool is_send_request = false;
+  bool needs_release = true;
 };
 
 // by default, match the whole tag
@@ -72,17 +70,16 @@ static void recv_callback(void* request, ucs_status_t status, ucp_tag_recv_info_
   context->completed          = 1;
 }
 
+inline ucp_tag_t build_message_tag(int rank, int tag)
+{
+  // Defined in a header, hence `inline`. Rank stays in the lower bits to aid debugging.
+  return ((uint32_t)tag << 31) | (uint32_t)rank;
+}
+
 /**
  * Helper class for interacting with ucp.
  */
 class comms_ucp_handler {
- private:
-  ucp_tag_t build_message_tag(int rank, int tag) const
-  {
-    // keeping the rank in the lower bits enables debugging.
-    return ((uint32_t)tag << 31) | (uint32_t)rank;
-  }
-
  public:
   /**
    * @brief Frees any memory underlying the given ucp request object
@@ -132,9 +129,7 @@ class comms_ucp_handler {
       req->needs_release = false;
     }
 
-    req->other_rank      = rank;
-    req->is_send_request = true;
-    req->req             = ucp_req;
+    req->req = ucp_req;
   }
 
   /**
@@ -156,10 +151,8 @@ class comms_ucp_handler {
 
     struct ucx_context* ucp_req = (struct ucx_context*)recv_result;
 
-    req->req             = ucp_req;
-    req->needs_release   = true;
-    req->is_send_request = false;
-    req->other_rank      = sender_rank;
+    req->req           = ucp_req;
+    req->needs_release = true;
 
     ASSERT(!UCS_PTR_IS_ERR(recv_result),
            "unable to receive UCX data message (%d)\n",
diff --git a/cpp/include/raft/comms/std_comms.hpp b/cpp/include/raft/comms/std_comms.hpp
index c81b19c9ba..667c8be285 100644
--- a/cpp/include/raft/comms/std_comms.hpp
+++ b/cpp/include/raft/comms/std_comms.hpp
@@ -24,6 +24,7 @@
 
 #include <nccl.h>
 #include <ucp/api/ucp.h>
+#include <ucxx/api.h>
 
 #include <iostream>
 
@@ -81,6 +82,8 @@ void build_comms_nccl_only(resources* handle, ncclComm_t nccl_comm, int num_rank
  *
  * @param handle raft::resources for injecting the comms
  * @param nccl_comm initialized NCCL communicator to use for collectives
+ * @param is_ucxx whether `ucp_worker` and `eps` objects are UCXX (true) or
+ *                pure UCX (false).
  * @param ucp_worker of local process
  *        Note: This is purposefully left as void* so that the ucp_worker_h
  *        doesn't need to be exposed through the cython layer
@@ -112,30 +115,55 @@ void build_comms_nccl_only(resources* handle, ncclComm_t nccl_comm, int num_rank
  * comm.sync_stream(resource::get_cuda_stream(handle));
  * @endcode
  */
-void build_comms_nccl_ucx(
-  resources* handle, ncclComm_t nccl_comm, void* ucp_worker, void* eps, int num_ranks, int rank)
+void build_comms_nccl_ucx(resources* handle,
+                          ncclComm_t nccl_comm,
+                          bool is_ucxx,
+                          void* ucp_worker,
+                          void* eps,
+                          int num_ranks,
+                          int rank)
 {
-  auto eps_sp = std::make_shared<ucp_ep_h*>(new ucp_ep_h[num_ranks]);
+  detail::ucx_objects_t ucx_objects;
+  if (is_ucxx) {
+    ucx_objects.endpoints = std::make_shared<ucxx::Endpoint**>(new ucxx::Endpoint*[num_ranks]);
+    ucx_objects.worker    = static_cast<ucxx::Worker*>(ucp_worker);
+  } else {
+    ucx_objects.endpoints = std::make_shared<ucp_ep_h*>(new ucp_ep_h[num_ranks]);
+    ucx_objects.worker    = static_cast<ucp_worker_h>(ucp_worker);
+  }
 
   auto size_t_ep_arr = reinterpret_cast<size_t*>(eps);
 
   for (int i = 0; i < num_ranks; i++) {
-    size_t ptr    = size_t_ep_arr[i];
-    auto ucp_ep_v = reinterpret_cast<ucp_ep_h*>(*eps_sp);
-
-    if (ptr != 0) {
-      auto eps_ptr = reinterpret_cast<ucp_ep_h>(size_t_ep_arr[i]);
-      ucp_ep_v[i]  = eps_ptr;
+    size_t ptr = size_t_ep_arr[i];
+
+    if (is_ucxx) {
+      auto ucp_ep_v = reinterpret_cast<ucxx::Endpoint**>(
+        *std::get<detail::ucxx_endpoint_array_t>(ucx_objects.endpoints));
+
+      if (ptr != 0) {
+        auto eps_ptr = reinterpret_cast<ucxx::Endpoint*>(size_t_ep_arr[i]);
+        ucp_ep_v[i]  = eps_ptr;
+      } else {
+        ucp_ep_v[i] = nullptr;
+      }
     } else {
-      ucp_ep_v[i] = nullptr;
+      auto ucp_ep_v =
+        reinterpret_cast<ucp_ep_h*>(*std::get<detail::ucp_endpoint_array_t>(ucx_objects.endpoints));
+
+      if (ptr != 0) {
+        auto eps_ptr = reinterpret_cast<ucp_ep_h>(size_t_ep_arr[i]);
+        ucp_ep_v[i]  = eps_ptr;
+      } else {
+        ucp_ep_v[i] = nullptr;
+      }
     }
   }
 
   cudaStream_t stream = resource::get_cuda_stream(*handle);
 
-  auto communicator =
-    std::make_shared<comms_t>(std::unique_ptr<comms_iface>(new raft::comms::std_comms(
-      nccl_comm, (ucp_worker_h)ucp_worker, eps_sp, num_ranks, rank, stream)));
+  auto communicator = std::make_shared<comms_t>(std::unique_ptr<comms_iface>(
+    new raft::comms::std_comms(nccl_comm, ucx_objects, num_ranks, rank, stream)));
   resource::set_comms(*handle, communicator);
 }
 
diff --git a/dependencies.yaml b/dependencies.yaml
index a83cd003d6..a336aa1577 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -10,6 +10,8 @@ files:
       - build_pylibraft
       - cuda
       - cuda_version
+      - depends_on_cupy
+      - depends_on_distributed_ucxx
       - develop
       - checks
       - build_wheels
@@ -19,7 +21,6 @@ files:
       - run_pylibraft
       - test_python_common
       - test_pylibraft
-      - cupy
   bench_ann:
     output: conda
     matrix:
@@ -44,7 +45,8 @@ files:
       - py_version
       - test_python_common
       - test_pylibraft
-      - cupy
+      - depends_on_cupy
+      - depends_on_distributed_ucxx
   checks:
     output: none
     includes:
@@ -54,7 +56,7 @@ files:
     output: none
     includes:
       - cuda_version
-      - cupy
+      - depends_on_cupy
       - docs
       - py_version
       - test_pylibraft
@@ -82,7 +84,7 @@ files:
     includes:
       - test_python_common
       - test_pylibraft
-      - cupy
+      - depends_on_cupy
   py_build_raft_dask:
     output: pyproject
     pyproject_dir: python/raft-dask
@@ -90,6 +92,7 @@ files:
       table: build-system
     includes:
       - build
+      - depends_on_ucx_build
   py_run_raft_dask:
     output: pyproject
     pyproject_dir: python/raft-dask
@@ -105,6 +108,8 @@ files:
       key: test
     includes:
       - test_python_common
+      - depends_on_distributed_ucxx
+      - depends_on_ucx_run
   py_build_raft_ann_bench:
     output: pyproject
     pyproject_dir: python/raft-ann-bench
@@ -138,6 +143,7 @@ dependencies:
           - c-compiler
           - cxx-compiler
           - nccl>=2.9.9
+          - libucxx==0.38.*
           - scikit-build-core>=0.7.0
       - output_types: [requirements, pyproject]
         packages:
@@ -337,7 +343,7 @@ dependencies:
               - *libcusparse_dev114
               - *libcusparse114
 
-  cupy:
+  depends_on_cupy:
     common:
       - output_types: conda
         packages:
@@ -477,3 +483,69 @@ dependencies:
         packages:
           - scikit-learn
           - scipy
+  depends_on_distributed_ucxx:
+    common:
+      - output_types: conda
+        packages:
+          # UCXX is not currently a hard dependency, so it is only installed for tests;
+          # this will change in the future.
+          - &distributed_ucxx_conda distributed-ucxx==0.38.*
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.*"}
+            packages:
+              - distributed-ucxx-cu12==0.38.*
+          - matrix: {cuda: "11.*"}
+            packages:
+              - distributed-ucxx-cu11==0.38.*
+          - {matrix: null, packages: [*distributed_ucxx_conda]}
+  depends_on_ucx_build:
+    common:
+      - output_types: conda
+        packages:
+          - &ucx_conda_build ucx==1.15.0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.*"}
+            packages:
+              - libucx-cu12==1.15.0
+          - matrix: {cuda: "11.*"}
+            packages:
+              - libucx-cu11==1.15.0
+          - matrix: null
+            packages:
+              - libucx==1.15.0
+  depends_on_ucx_run:
+    common:
+      - output_types: conda
+        packages:
+          - &ucx_conda_run ucx>=1.15.0
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.*"}
+            packages:
+              - libucx-cu12>=1.15.0
+          - matrix: {cuda: "11.*"}
+            packages:
+              - libucx-cu11>=1.15.0
+          - matrix: null
+            packages:
+              - libucx>=1.15.0
diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt
index 58e5ae8104..2c629f3b73 100644
--- a/python/raft-dask/CMakeLists.txt
+++ b/python/raft-dask/CMakeLists.txt
@@ -15,6 +15,7 @@
 cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
 
 include(../../rapids_config.cmake)
+include(rapids-cpm)
 include(rapids-cuda)
 rapids_cuda_init_architectures(raft-dask-python)
 
@@ -28,6 +29,11 @@ option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulti
        OFF
 )
 
+rapids_cpm_init()
+# Once https://github.com/rapidsai/ucxx/issues/173 is resolved we can remove this.
+find_package(ucx REQUIRED)
+include(cmake/thirdparty/get_ucxx.cmake)
+
 # If the user requested it we attempt to find RAFT.
 if(FIND_RAFT_CPP)
   find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS distributed)
@@ -36,8 +42,6 @@ else()
 endif()
 
 if(NOT raft_FOUND)
-  find_package(ucx REQUIRED)
-
   # raft-dask doesn't actually use raft libraries, it just needs the headers, so we can turn off all
   # library compilation and we don't need to install anything here.
   set(BUILD_TESTS OFF)
@@ -47,6 +51,7 @@ if(NOT raft_FOUND)
   set(RAFT_COMPILE_DIST_LIBRARY OFF)
   set(RAFT_COMPILE_NN_LIBRARY OFF)
   set(CUDA_STATIC_RUNTIME ON)
+  set(RAFT_DASK_UCXX_STATIC ON)
 
   add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL)
   list(APPEND CMAKE_MODULE_PATH ${CMAKE_BINARY_DIR}/cmake/find_modules)
diff --git a/python/raft-dask/cmake/thirdparty/get_ucxx.cmake b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake
new file mode 100644
index 0000000000..8e340eec73
--- /dev/null
+++ b/python/raft-dask/cmake/thirdparty/get_ucxx.cmake
@@ -0,0 +1,55 @@
+#=============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#=============================================================================
+
+function(find_and_configure_ucxx)
+    set(oneValueArgs VERSION FORK PINNED_TAG EXCLUDE_FROM_ALL)
+    set(options UCXX_STATIC)
+    cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+            "${multiValueArgs}" ${ARGN} )
+
+    set(BUILD_UCXX_SHARED ON)
+    if(PKG_UCXX_STATIC)
+      set(BUILD_UCXX_SHARED OFF)
+    endif()
+
+    rapids_cpm_find(ucxx ${PKG_VERSION}
+            GLOBAL_TARGETS         ucxx::ucxx ucxx::python
+            BUILD_EXPORT_SET       raft-distributed-exports
+            INSTALL_EXPORT_SET     raft-distributed-exports
+            CPM_ARGS
+            GIT_REPOSITORY         https://github.com/${PKG_FORK}/ucxx.git
+            GIT_TAG                ${PKG_PINNED_TAG}
+            SOURCE_SUBDIR          cpp
+            EXCLUDE_FROM_ALL       ${PKG_EXCLUDE_FROM_ALL}
+            OPTIONS
+              "BUILD_TESTS OFF"
+              "BUILD_BENCH OFF"
+              "UCXX_ENABLE_PYTHON ON"
+              "UCXX_ENABLE_RMM ON"
+              "BUILD_SHARED_LIBS ${BUILD_UCXX_SHARED}"
+        )
+
+endfunction()
+
+# Change pinned tag here to test a commit in CI
+# To use a different UCXX locally, set the CMake variable
+# CPM_ucxx_SOURCE=/path/to/local/ucxx
+find_and_configure_ucxx(VERSION  0.38
+        FORK             rapidsai
+        PINNED_TAG       branch-0.38
+        EXCLUDE_FROM_ALL YES
+        UCXX_STATIC      ${RAFT_DASK_UCXX_STATIC}
+    )
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 815f6b277c..0181bef4ce 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -18,6 +18,7 @@ build-backend = "scikit_build_core.build"
 requires = [
     "cmake>=3.26.4",
     "cython>=3.0.0",
+    "libucx==1.15.0",
     "ninja",
     "scikit-build-core[pyproject]>=0.7.0",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
@@ -51,6 +52,8 @@ classifiers = [
 
 [project.optional-dependencies]
 test = [
+    "distributed-ucxx==0.38.*",
+    "libucx>=1.15.0",
     "pytest-cov",
     "pytest==7.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/raft-dask/pytest.ini b/python/raft-dask/pytest.ini
index 5559bb08c8..fcb18fe412 100644
--- a/python/raft-dask/pytest.ini
+++ b/python/raft-dask/pytest.ini
@@ -6,4 +6,5 @@ markers =
   mg: marks a test as multi-GPU
   memleak: marks a test as a memory leak test
   nccl: marks a test as using NCCL
-  ucx: marks a test as using ucx-py
+  ucx: marks a test as using UCX-Py
+  ucxx: marks a test as using UCXX
diff --git a/python/raft-dask/raft_dask/__init__.py b/python/raft-dask/raft_dask/__init__.py
index fbbaee4118..19a037ae75 100644
--- a/python/raft-dask/raft_dask/__init__.py
+++ b/python/raft-dask/raft_dask/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,3 +14,13 @@
 #
 
 from raft_dask._version import __git_commit__, __version__
+
+# If libucx was installed as a wheel, we must request it to load the library symbols.
+# Otherwise, we assume that the library was installed in a system path that ld can find.
+try:
+    import libucx
+except ModuleNotFoundError:
+    pass
+else:
+    libucx.load_library()
+    del libucx
diff --git a/python/raft-dask/raft_dask/common/comms.py b/python/raft-dask/raft_dask/common/comms.py
index b2f7d1fb74..c67170342f 100644
--- a/python/raft-dask/raft_dask/common/comms.py
+++ b/python/raft-dask/raft_dask/common/comms.py
@@ -327,11 +327,15 @@ def get_ucx(dask_worker=None):
                   (Note: if called by client.run(), this is supplied by Dask
                    and not the client)
     """
+    protocol = (
+        "ucxx" if dask_worker._protocol.split("://")[0] == "ucxx" else "ucx"
+    )
+
     raft_comm_state = get_raft_comm_state(
         sessionId="ucp", state_object=dask_worker
     )
     if "ucx" not in raft_comm_state:
-        raft_comm_state["ucx"] = UCX.get()
+        raft_comm_state["ucx"] = UCX.get(protocol=protocol)
 
     return raft_comm_state["ucx"]
 
@@ -535,7 +539,9 @@ def _func_build_handle_p2p(
     if verbose:
         dask_worker.log_event(topic="info", msg="Building p2p handle.")
 
-    ucp_worker = get_ucx(dask_worker).get_worker()
+    ucx = get_ucx(dask_worker)
+    is_ucxx = ucx._protocol == "ucxx"
+    ucx_worker = ucx.get_worker()
     raft_comm_state = get_raft_comm_state(
         sessionId=sessionId, state_object=dask_worker
     )
@@ -550,7 +556,14 @@ def _func_build_handle_p2p(
         dask_worker.log_event(topic="info", msg="Injecting comms on handle.")
 
     inject_comms_on_handle(
-        handle, nccl_comm, ucp_worker, eps, nWorkers, workerId, verbose
+        handle,
+        nccl_comm,
+        is_ucxx,
+        ucx_worker,
+        eps,
+        nWorkers,
+        workerId,
+        verbose,
     )
 
     if verbose:
diff --git a/python/raft-dask/raft_dask/common/comms_utils.pyx b/python/raft-dask/raft_dask/common/comms_utils.pyx
index 768ba0e422..2d4d2cc83b 100644
--- a/python/raft-dask/raft_dask/common/comms_utils.pyx
+++ b/python/raft-dask/raft_dask/common/comms_utils.pyx
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -41,6 +41,7 @@ cdef extern from "raft/comms/std_comms.hpp" namespace "raft::comms":
 
     void build_comms_nccl_ucx(device_resources *handle,
                               ncclComm_t comm,
+                              bint is_ucxx,
                               void *ucp_worker,
                               void *eps,
                               int size,
@@ -285,7 +286,7 @@ def inject_comms_on_handle_coll_only(handle, nccl_inst, size, rank, verbose):
                           rank)
 
 
-def inject_comms_on_handle(handle, nccl_inst, ucp_worker, eps, size,
+def inject_comms_on_handle(handle, nccl_inst, is_ucxx, ucp_worker, eps, size,
                            rank, verbose):
     """
     Given a handle and initialized comms, creates a comms_t instance
@@ -308,7 +309,10 @@ def inject_comms_on_handle(handle, nccl_inst, ucp_worker, eps, size,
 
     for i in range(len(eps)):
         if eps[i] is not None:
-            ep_st = <uintptr_t>eps[i].get_ucp_endpoint()
+            if is_ucxx:
+                ep_st = <uintptr_t>eps[i].ucxx_endpoint
+            else:
+                ep_st = <uintptr_t>eps[i].get_ucp_endpoint()
             ucp_eps[i] = <size_t>ep_st
         else:
             ucp_eps[i] = 0
@@ -323,6 +327,7 @@ def inject_comms_on_handle(handle, nccl_inst, ucp_worker, eps, size,
 
     build_comms_nccl_ucx(handle_,
                          deref(nccl_comm_),
+                         is_ucxx,
                          <void*>ucp_worker_st,
                          <void*>ucp_eps,
                          size,
diff --git a/python/raft-dask/raft_dask/common/ucx.py b/python/raft-dask/raft_dask/common/ucx.py
index eb246853f4..423e6f4692 100644
--- a/python/raft-dask/raft_dask/common/ucx.py
+++ b/python/raft-dask/raft_dask/common/ucx.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2022, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,8 +13,6 @@
 # limitations under the License.
 #
 
-import ucp
-
 
 async def _connection_func(ep):
     UCX.get().add_server_endpoint(ep)
@@ -29,10 +27,20 @@ class UCX:
 
     __instance = None
 
-    def __init__(self, listener_callback):
+    def __init__(self, listener_callback, protocol):
 
         self.listener_callback = listener_callback
 
+        self._protocol = protocol
+        if self._protocol == "ucxx":
+            import ucxx
+
+            self.ucx_api = ucxx
+        else:
+            import ucp
+
+            self.ucx_api = ucp
+
         self._create_listener()
         self._endpoints = {}
         self._server_endpoints = []
@@ -42,22 +50,28 @@ def __init__(self, listener_callback):
         UCX.__instance = self
 
     @staticmethod
-    def get(listener_callback=_connection_func):
+    def get(listener_callback=_connection_func, protocol="ucx"):
         if UCX.__instance is None:
-            UCX(listener_callback)
+            UCX(listener_callback, protocol)
         return UCX.__instance
 
+    def get_protocol(self):
+        return self._protocol
+
     def get_worker(self):
-        return ucp.get_ucp_worker()
+        if self._protocol == "ucxx":
+            return self.ucx_api.get_ucxx_worker()
+        else:
+            return self.ucx_api.get_ucp_worker()
 
     def _create_listener(self):
-        self._listener = ucp.create_listener(self.listener_callback)
+        self._listener = self.ucx_api.create_listener(self.listener_callback)
 
     def listener_port(self):
         return self._listener.port
 
     async def _create_endpoint(self, ip, port):
-        ep = await ucp.create_endpoint(ip, port)
+        ep = await self.ucx_api.create_endpoint(ip, port)
         self._endpoints[(ip, port)] = ep
         return ep
 
diff --git a/python/raft-dask/raft_dask/test/conftest.py b/python/raft-dask/raft_dask/test/conftest.py
index d1baa684d4..a60e4d995f 100644
--- a/python/raft-dask/raft_dask/test/conftest.py
+++ b/python/raft-dask/raft_dask/test/conftest.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 
 import os
 
@@ -34,6 +34,21 @@ def ucx_cluster():
         cluster.close()
 
 
+@pytest.fixture(scope="session")
+def ucxx_cluster():
+    pytest.importorskip("distributed_ucxx")
+
+    scheduler_file = os.environ.get("SCHEDULER_FILE")
+    if scheduler_file:
+        yield scheduler_file
+    else:
+        cluster = LocalCUDACluster(
+            protocol="ucxx",
+        )
+        yield cluster
+        cluster.close()
+
+
 @pytest.fixture(scope="session")
 def client(cluster):
     client = create_client(cluster)
@@ -48,6 +63,13 @@ def ucx_client(ucx_cluster):
     client.close()
 
 
+@pytest.fixture()
+def ucxx_client(ucxx_cluster):
+    client = create_client(ucxx_cluster)
+    yield client
+    client.close()
+
+
 def create_client(cluster):
     """
     Create a Dask distributed client for a specified cluster.
@@ -69,3 +91,43 @@ def create_client(cluster):
         return Client(cluster)
     else:
         return Client(scheduler_file=cluster)
+
+
+def pytest_addoption(parser):
+    group = parser.getgroup("Dask RAFT Custom Options")
+
+    group.addoption(
+        "--run_ucx", action="store_true", help="run _only_ UCX-Py tests"
+    )
+
+    group.addoption(
+        "--run_ucxx", action="store_true", help="run _only_ UCXX tests"
+    )
+
+
+def pytest_collection_modifyitems(config, items):
+    if config.getoption("--run_ucx"):
+        skip_others = pytest.mark.skip(
+            reason="only runs when --run_ucx is not specified"
+        )
+        for item in items:
+            if "ucx" not in item.keywords:
+                item.add_marker(skip_others)
+    else:
+        skip_ucx = pytest.mark.skip(reason="requires --run_ucx to run")
+        for item in items:
+            if "ucx" in item.keywords:
+                item.add_marker(skip_ucx)
+
+    if config.getoption("--run_ucxx"):
+        skip_others = pytest.mark.skip(
+            reason="only runs when --run_ucxx is not specified"
+        )
+        for item in items:
+            if "ucxx" not in item.keywords:
+                item.add_marker(skip_others)
+    else:
+        skip_ucxx = pytest.mark.skip(reason="requires --run_ucxx to run")
+        for item in items:
+            if "ucxx" in item.keywords:
+                item.add_marker(skip_ucxx)
diff --git a/python/raft-dask/raft_dask/test/test_comms.py b/python/raft-dask/raft_dask/test/test_comms.py
index b62d7185b2..109dd12b5e 100644
--- a/python/raft-dask/raft_dask/test/test_comms.py
+++ b/python/raft-dask/raft_dask/test/test_comms.py
@@ -66,6 +66,10 @@ def create_client(cluster):
         return Client(scheduler_file=cluster)
 
 
+def _get_client(dask_client, request):
+    return request.getfixturevalue(dask_client)
+
+
 def test_comms_init_no_p2p(cluster):
     client = create_client(cluster)
     try:
@@ -179,8 +183,7 @@ def _has_handle(sessionId):
     functions = [None]
 
 
-@pytest.mark.parametrize("root_location", ["client", "worker", "scheduler"])
-def test_nccl_root_placement(client, root_location):
+def _test_nccl_root_placement(client, root_location):
 
     cb = None
     try:
@@ -214,10 +217,31 @@ def test_nccl_root_placement(client, root_location):
             cb.destroy()
 
 
-@pytest.mark.parametrize("func", functions)
 @pytest.mark.parametrize("root_location", ["client", "worker", "scheduler"])
 @pytest.mark.nccl
-def test_collectives(client, func, root_location):
+def test_nccl_root_placement(root_location, request):
+    _test_nccl_root_placement(_get_client("client", request), root_location)
+
+
+@pytest.mark.parametrize("root_location", ["client", "worker", "scheduler"])
+@pytest.mark.nccl
+@pytest.mark.ucx
+def test_nccl_root_placement_ucx(root_location, request):
+    _test_nccl_root_placement(
+        _get_client("ucx_client", request), root_location
+    )
+
+
+@pytest.mark.parametrize("root_location", ["client", "worker", "scheduler"])
+@pytest.mark.nccl
+@pytest.mark.ucxx
+def test_nccl_root_placement_ucxx(root_location, request):
+    _test_nccl_root_placement(
+        _get_client("ucxx_client", request), root_location
+    )
+
+
+def _test_collectives(client, func, root_location):
 
     try:
         cb = Comms(
@@ -246,8 +270,30 @@ def test_collectives(client, func, root_location):
             cb.destroy()
 
 
+@pytest.mark.parametrize("func", functions)
+@pytest.mark.parametrize("root_location", ["client", "worker", "scheduler"])
 @pytest.mark.nccl
-def test_comm_split(client):
+def test_collectives(func, root_location, request):
+    _test_collectives(_get_client("client", request), func, root_location)
+
+
+@pytest.mark.parametrize("func", functions)
+@pytest.mark.parametrize("root_location", ["client", "worker", "scheduler"])
+@pytest.mark.nccl
+@pytest.mark.ucx
+def test_collectives_ucx(func, root_location, request):
+    _test_collectives(_get_client("ucx_client", request), func, root_location)
+
+
+@pytest.mark.parametrize("func", functions)
+@pytest.mark.parametrize("root_location", ["client", "worker", "scheduler"])
+@pytest.mark.nccl
+@pytest.mark.ucxx
+def test_collectives_ucxx(func, root_location, request):
+    _test_collectives(_get_client("ucxx_client", request), func, root_location)
+
+
+def _test_comm_split(client):
 
     cb = Comms(comms_p2p=True, verbose=True)
     cb.init()
@@ -264,9 +310,24 @@ def test_comm_split(client):
     assert all([x.result() for x in dfs])
 
 
+@pytest.mark.nccl
+def test_comm_split(request):
+    _test_comm_split(_get_client("client", request))
+
+
+@pytest.mark.nccl
 @pytest.mark.ucx
-@pytest.mark.parametrize("n_trials", [1, 5])
-def test_send_recv(n_trials, client):
+def test_comm_split_ucx(request):
+    _test_comm_split(_get_client("ucx_client", request))
+
+
+@pytest.mark.nccl
+@pytest.mark.ucxx
+def test_comm_split_ucxx(request):
+    _test_comm_split(_get_client("ucxx_client", request))
+
+
+def _test_send_recv_protocol(n_trials, client):
 
     cb = Comms(comms_p2p=True, verbose=True)
     cb.init()
@@ -287,9 +348,24 @@ def test_send_recv(n_trials, client):
     assert list(map(lambda x: x.result(), dfs))
 
 
-@pytest.mark.nccl
 @pytest.mark.parametrize("n_trials", [1, 5])
-def test_device_send_or_recv(n_trials, client):
+def test_send_recv_protocol(n_trials, request):
+    _test_send_recv_protocol(n_trials, _get_client("client", request))
+
+
+@pytest.mark.parametrize("n_trials", [1, 5])
+@pytest.mark.ucx
+def test_send_recv_protocol_ucx(n_trials, request):
+    _test_send_recv_protocol(n_trials, _get_client("ucx_client", request))
+
+
+@pytest.mark.parametrize("n_trials", [1, 5])
+@pytest.mark.ucxx
+def test_send_recv_protocol_ucxx(n_trials, request):
+    _test_send_recv_protocol(n_trials, _get_client("ucxx_client", request))
+
+
+def _test_device_send_or_recv(n_trials, client):
 
     cb = Comms(comms_p2p=True, verbose=True)
     cb.init()
@@ -310,9 +386,27 @@ def test_device_send_or_recv(n_trials, client):
     assert list(map(lambda x: x.result(), dfs))
 
 
+@pytest.mark.parametrize("n_trials", [1, 5])
+@pytest.mark.nccl
+def test_device_send_or_recv(n_trials, request):
+    _test_device_send_or_recv(n_trials, _get_client("client", request))
+
+
+@pytest.mark.parametrize("n_trials", [1, 5])
 @pytest.mark.nccl
+@pytest.mark.ucx
+def test_device_send_or_recv_ucx(n_trials, request):
+    _test_device_send_or_recv(n_trials, _get_client("ucx_client", request))
+
+
 @pytest.mark.parametrize("n_trials", [1, 5])
-def test_device_sendrecv(n_trials, client):
+@pytest.mark.nccl
+@pytest.mark.ucxx
+def test_device_send_or_recv_ucxx(n_trials, request):
+    _test_device_send_or_recv(n_trials, _get_client("ucxx_client", request))
+
+
+def _test_device_sendrecv(n_trials, client):
 
     cb = Comms(comms_p2p=True, verbose=True)
     cb.init()
@@ -333,9 +427,27 @@ def test_device_sendrecv(n_trials, client):
     assert list(map(lambda x: x.result(), dfs))
 
 
+@pytest.mark.parametrize("n_trials", [1, 5])
+@pytest.mark.nccl
+def test_device_sendrecv(n_trials, request):
+    _test_device_sendrecv(n_trials, _get_client("client", request))
+
+
+@pytest.mark.parametrize("n_trials", [1, 5])
 @pytest.mark.nccl
+@pytest.mark.ucx
+def test_device_sendrecv_ucx(n_trials, request):
+    _test_device_sendrecv(n_trials, _get_client("ucx_client", request))
+
+
 @pytest.mark.parametrize("n_trials", [1, 5])
-def test_device_multicast_sendrecv(n_trials, client):
+@pytest.mark.nccl
+@pytest.mark.ucxx
+def test_device_sendrecv_ucxx(n_trials, request):
+    _test_device_sendrecv(n_trials, _get_client("ucxx_client", request))
+
+
+def _test_device_multicast_sendrecv(n_trials, client):
 
     cb = Comms(comms_p2p=True, verbose=True)
     cb.init()
@@ -356,6 +468,30 @@ def test_device_multicast_sendrecv(n_trials, client):
     assert list(map(lambda x: x.result(), dfs))
 
 
+@pytest.mark.parametrize("n_trials", [1, 5])
+@pytest.mark.nccl
+def test_device_multicast_sendrecv(n_trials, request):
+    _test_device_multicast_sendrecv(n_trials, _get_client("client", request))
+
+
+@pytest.mark.parametrize("n_trials", [1, 5])
+@pytest.mark.nccl
+@pytest.mark.ucx
+def test_device_multicast_sendrecv_ucx(n_trials, request):
+    _test_device_multicast_sendrecv(
+        n_trials, _get_client("ucx_client", request)
+    )
+
+
+@pytest.mark.parametrize("n_trials", [1, 5])
+@pytest.mark.nccl
+@pytest.mark.ucxx
+def test_device_multicast_sendrecv_ucxx(n_trials, request):
+    _test_device_multicast_sendrecv(
+        n_trials, _get_client("ucxx_client", request)
+    )
+
+
 @pytest.mark.nccl
 @pytest.mark.parametrize(
     "subset", [slice(-1, None), slice(1), slice(None, None, -2)]

From ef28628013a02d6979000a94f46263135182c3ee Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 6 May 2024 21:10:33 -0700
Subject: [PATCH 25/60] Only use functions in the limited API (#2282)

This PR removes usage of the only method in raft's Cython that is not part of the Python limited API. Contributes to https://github.com/rapidsai/build-planning/issues/42

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/raft/pull/2282
---
 python/pylibraft/pylibraft/common/mdspan.pyx  | 34 ++++++++-----------
 .../pylibraft/neighbors/cpp/hnsw.pxd          | 14 +-------
 2 files changed, 15 insertions(+), 33 deletions(-)

diff --git a/python/pylibraft/pylibraft/common/mdspan.pyx b/python/pylibraft/pylibraft/common/mdspan.pyx
index c1a9188585..434eb59752 100644
--- a/python/pylibraft/pylibraft/common/mdspan.pyx
+++ b/python/pylibraft/pylibraft/common/mdspan.pyx
@@ -22,6 +22,7 @@ import io
 
 import numpy as np
 
+from cpython.buffer cimport PyBUF_FULL_RO, PyBuffer_Release, PyObject_GetBuffer
 from cpython.object cimport PyObject
 from cython.operator cimport dereference as deref
 from libc.stddef cimport size_t
@@ -47,10 +48,6 @@ from pylibraft.common.optional cimport make_optional, optional
 from pylibraft.common import DeviceResources
 
 
-cdef extern from "Python.h":
-    Py_buffer* PyMemoryView_GET_BUFFER(PyObject* mview)
-
-
 def run_roundtrip_test_for_mdspan(X, fortran_order=False):
     if not isinstance(X, np.ndarray) or len(X.shape) != 2:
         raise ValueError("Please call this function with a NumPy array with"
@@ -59,6 +56,9 @@ def run_roundtrip_test_for_mdspan(X, fortran_order=False):
     cdef device_resources * handle_ = \
         <device_resources *> <size_t> handle.getHandle()
     cdef ostringstream oss
+    cdef Py_buffer buf
+    PyObject_GetBuffer(X, &buf, PyBUF_FULL_RO)
+    cdef uintptr_t buf_ptr = <uintptr_t>buf.buf
     if X.dtype == np.float32:
         if fortran_order:
             serialize_mdspan[float, matrix_extent[size_t], col_major](
@@ -67,8 +67,7 @@ def run_roundtrip_test_for_mdspan(X, fortran_order=False):
                 <const host_mdspan[float, matrix_extent[size_t],
                                    col_major] &>
                 make_host_matrix_view[float, size_t, col_major](
-                    <float *><uintptr_t>PyMemoryView_GET_BUFFER(
-                        <PyObject *> X.data).buf,
+                    <float *>buf_ptr,
                     X.shape[0], X.shape[1]))
         else:
             serialize_mdspan[float, matrix_extent[size_t], row_major](
@@ -77,8 +76,7 @@ def run_roundtrip_test_for_mdspan(X, fortran_order=False):
                 <const host_mdspan[float, matrix_extent[size_t],
                                    row_major]&>
                 make_host_matrix_view[float, size_t, row_major](
-                    <float *><uintptr_t>PyMemoryView_GET_BUFFER(
-                        <PyObject *> X.data).buf,
+                    <float *>buf_ptr,
                     X.shape[0], X.shape[1]))
     elif X.dtype == np.float64:
         if fortran_order:
@@ -88,8 +86,7 @@ def run_roundtrip_test_for_mdspan(X, fortran_order=False):
                 <const host_mdspan[double, matrix_extent[size_t],
                                    col_major]&>
                 make_host_matrix_view[double, size_t, col_major](
-                    <double *><uintptr_t>PyMemoryView_GET_BUFFER(
-                        <PyObject *> X.data).buf,
+                    <double *>buf_ptr,
                     X.shape[0], X.shape[1]))
         else:
             serialize_mdspan[double, matrix_extent[size_t], row_major](
@@ -98,8 +95,7 @@ def run_roundtrip_test_for_mdspan(X, fortran_order=False):
                 <const host_mdspan[double, matrix_extent[size_t],
                                    row_major]&>
                 make_host_matrix_view[double, size_t, row_major](
-                    <double *><uintptr_t>PyMemoryView_GET_BUFFER(
-                        <PyObject *> X.data).buf,
+                    <double *>buf_ptr,
                     X.shape[0], X.shape[1]))
     elif X.dtype == np.int32:
         if fortran_order:
@@ -109,8 +105,7 @@ def run_roundtrip_test_for_mdspan(X, fortran_order=False):
                 <const host_mdspan[int32_t, matrix_extent[size_t],
                                    col_major]&>
                 make_host_matrix_view[int32_t, size_t, col_major](
-                    <int32_t *><uintptr_t>PyMemoryView_GET_BUFFER(
-                        <PyObject *> X.data).buf,
+                    <int32_t *>buf_ptr,
                     X.shape[0], X.shape[1]))
         else:
             serialize_mdspan[int32_t, matrix_extent[size_t], row_major](
@@ -119,8 +114,7 @@ def run_roundtrip_test_for_mdspan(X, fortran_order=False):
                 <const host_mdspan[int32_t, matrix_extent[size_t],
                                    row_major]&>
                 make_host_matrix_view[int32_t, size_t, row_major](
-                    <int32_t *><uintptr_t>PyMemoryView_GET_BUFFER(
-                        <PyObject *> X.data).buf,
+                    <int32_t *>buf_ptr,
                     X.shape[0], X.shape[1]))
     elif X.dtype == np.uint32:
         if fortran_order:
@@ -130,8 +124,7 @@ def run_roundtrip_test_for_mdspan(X, fortran_order=False):
                 <const host_mdspan[uint32_t, matrix_extent[size_t],
                                    col_major]&>
                 make_host_matrix_view[uint32_t, size_t, col_major](
-                    <uint32_t *><uintptr_t>PyMemoryView_GET_BUFFER(
-                        <PyObject *> X.data).buf,
+                    <uint32_t *>buf_ptr,
                     X.shape[0], X.shape[1]))
         else:
             serialize_mdspan[uint32_t, matrix_extent[size_t], row_major](
@@ -140,11 +133,12 @@ def run_roundtrip_test_for_mdspan(X, fortran_order=False):
                 <const host_mdspan[uint32_t, matrix_extent[size_t],
                                    row_major]&>
                 make_host_matrix_view[uint32_t, size_t, row_major](
-                    <uint32_t *><uintptr_t>PyMemoryView_GET_BUFFER(
-                        <PyObject *> X.data).buf,
+                    <uint32_t *>buf_ptr,
                     X.shape[0], X.shape[1]))
     else:
+        PyBuffer_Release(&buf)
         raise NotImplementedError()
+    PyBuffer_Release(&buf)
     f = io.BytesIO(oss.str())
     X2 = np.load(f)
     assert np.all(X.shape == X2.shape)
diff --git a/python/pylibraft/pylibraft/neighbors/cpp/hnsw.pxd b/python/pylibraft/pylibraft/neighbors/cpp/hnsw.pxd
index 75c0c14aad..7b2cf59c81 100644
--- a/python/pylibraft/pylibraft/neighbors/cpp/hnsw.pxd
+++ b/python/pylibraft/pylibraft/neighbors/cpp/hnsw.pxd
@@ -75,19 +75,7 @@ cdef extern from "raft_runtime/neighbors/hnsw.hpp" \
         host_matrix_view[uint64_t, int64_t, row_major] neighbors,
         host_matrix_view[float, int64_t, row_major] distances) except +
 
-    cdef unique_ptr[index[float]] deserialize_file[float](
-        const device_resources& handle,
-        const string& filename,
-        int dim,
-        DistanceType metric) except +
-
-    cdef unique_ptr[index[int8_t]] deserialize_file[int8_t](
-        const device_resources& handle,
-        const string& filename,
-        int dim,
-        DistanceType metric) except +
-
-    cdef unique_ptr[index[uint8_t]] deserialize_file[uint8_t](
+    cdef unique_ptr[index[T]] deserialize_file[T](
         const device_resources& handle,
         const string& filename,
         int dim,

From 97e38eb4469b928cfc865b5901cdee5451881590 Mon Sep 17 00:00:00 2001
From: tsuki <12711693+enp1s0@users.noreply.github.com>
Date: Wed, 8 May 2024 07:42:14 +0900
Subject: [PATCH 26/60] Normalize dataset vectors in the CAGRA InnerProduct
 tests (#2287)

This PR updates the CAGRA tests to normalize the dataset and query vectors when the metric is InnerProduct. Without normalization, dataset vectors with a large L2 norm tend to be included in the search results of all queries. This means that only a part of the graph nodes may be traversed in the search process, leaving the test coverage incomplete.

Authors:
  - tsuki (https://github.com/enp1s0)

Approvers:
  - Tarang Jain (https://github.com/tarang-jain)
  - Tamas Bela Feher (https://github.com/tfeher)

URL: https://github.com/rapidsai/raft/pull/2287
---
 cpp/test/neighbors/ann_cagra.cuh | 87 ++++++++++++++++++++++++--------
 1 file changed, 67 insertions(+), 20 deletions(-)

diff --git a/cpp/test/neighbors/ann_cagra.cuh b/cpp/test/neighbors/ann_cagra.cuh
index 715a94403f..cc787d3e57 100644
--- a/cpp/test/neighbors/ann_cagra.cuh
+++ b/cpp/test/neighbors/ann_cagra.cuh
@@ -26,11 +26,13 @@
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/distance/distance_types.hpp>
 #include <raft/linalg/add.cuh>
+#include <raft/linalg/normalize.cuh>
 #include <raft/neighbors/cagra.cuh>
 #include <raft/neighbors/cagra_serialize.cuh>
 #include <raft/neighbors/ivf_pq_types.hpp>
 #include <raft/neighbors/sample_filter.cuh>
 #include <raft/random/rng.cuh>
+#include <raft/util/cuda_utils.cuh>
 #include <raft/util/itertools.hpp>
 
 #include <raft_internal/neighbors/naive_knn.cuh>
@@ -200,6 +202,67 @@ void GenerateRoundingErrorFreeDataset(
   GenerateRoundingErrorFreeDataset_kernel<T>
     <<<grid_size, block_size, 0, cuda_stream>>>(ptr, size, resolution);
 }
+
+template <class DataT>
+void InitDataset(const raft::resources& handle,
+                 DataT* const datatset_ptr,
+                 std::uint32_t size,
+                 std::uint32_t dim,
+                 raft::distance::DistanceType metric,
+                 raft::random::RngState& r)
+{
+  if constexpr (std::is_same_v<DataT, float> || std::is_same_v<DataT, half>) {
+    GenerateRoundingErrorFreeDataset(handle, datatset_ptr, size, dim, r, true);
+
+    if (metric == raft::distance::InnerProduct) {
+      auto dataset_view = raft::make_device_matrix_view(datatset_ptr, size, dim);
+      raft::linalg::row_normalize(
+        handle, raft::make_const_mdspan(dataset_view), dataset_view, raft::linalg::L2Norm);
+    }
+  } else if constexpr (std::is_same_v<DataT, std::uint8_t> || std::is_same_v<DataT, std::int8_t>) {
+    if constexpr (std::is_same_v<DataT, std::int8_t>) {
+      raft::random::uniformInt(handle, r, datatset_ptr, size * dim, DataT(-10), DataT(10));
+    } else {
+      raft::random::uniformInt(handle, r, datatset_ptr, size * dim, DataT(1), DataT(20));
+    }
+
+    if (metric == raft::distance::InnerProduct) {
+      // TODO (enp1s0): Change this once row_normalize supports (u)int8 matrices.
+      // https://github.com/rapidsai/raft/issues/2291
+
+      using ComputeT    = float;
+      auto dataset_view = raft::make_device_matrix_view(datatset_ptr, size, dim);
+      auto dev_row_norm = raft::make_device_vector<ComputeT>(handle, size);
+      const auto normalized_norm =
+        (std::is_same_v<DataT, std::uint8_t> ? 40 : 20) * std::sqrt(static_cast<ComputeT>(dim));
+
+      raft::linalg::reduce(dev_row_norm.data_handle(),
+                           datatset_ptr,
+                           dim,
+                           size,
+                           0.f,
+                           true,
+                           true,
+                           resource::get_cuda_stream(handle),
+                           false,
+                           raft::sq_op(),
+                           raft::add_op(),
+                           raft::sqrt_op());
+      raft::linalg::matrix_vector_op(
+        handle,
+        raft::make_const_mdspan(dataset_view),
+        raft::make_const_mdspan(dev_row_norm.view()),
+        dataset_view,
+        raft::linalg::Apply::ALONG_COLUMNS,
+        [normalized_norm] __device__(DataT elm, ComputeT norm) {
+          const ComputeT v           = elm / norm * normalized_norm;
+          const ComputeT max_v_range = std::numeric_limits<DataT>::max();
+          const ComputeT min_v_range = std::numeric_limits<DataT>::min();
+          return static_cast<DataT>(std::min(max_v_range, std::max(min_v_range, v)));
+        });
+    }
+  }
+}
 }  // namespace
 
 struct AnnCagraInputs {
@@ -360,16 +423,8 @@ class AnnCagraTest : public ::testing::TestWithParam<AnnCagraInputs> {
     database.resize(((size_t)ps.n_rows) * ps.dim, stream_);
     search_queries.resize(ps.n_queries * ps.dim, stream_);
     raft::random::RngState r(1234ULL);
-    if constexpr (std::is_same_v<DataT, float> || std::is_same_v<DataT, half>) {
-      GenerateRoundingErrorFreeDataset(handle_, database.data(), ps.n_rows, ps.dim, r, true);
-      GenerateRoundingErrorFreeDataset(
-        handle_, search_queries.data(), ps.n_queries, ps.dim, r, true);
-    } else {
-      raft::random::uniformInt(
-        handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20));
-      raft::random::uniformInt(
-        handle_, r, search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20));
-    }
+    InitDataset(handle_, database.data(), ps.n_rows, ps.dim, ps.metric, r);
+    InitDataset(handle_, search_queries.data(), ps.n_queries, ps.dim, ps.metric, r);
     resource::sync_stream(handle_);
   }
 
@@ -744,16 +799,8 @@ class AnnCagraFilterTest : public ::testing::TestWithParam<AnnCagraInputs> {
     database.resize(((size_t)ps.n_rows) * ps.dim, stream_);
     search_queries.resize(ps.n_queries * ps.dim, stream_);
     raft::random::RngState r(1234ULL);
-    if constexpr (std::is_same_v<DataT, float> || std::is_same_v<DataT, half>) {
-      GenerateRoundingErrorFreeDataset(handle_, database.data(), ps.n_rows, ps.dim, r, true);
-      GenerateRoundingErrorFreeDataset(
-        handle_, search_queries.data(), ps.n_queries, ps.dim, r, true);
-    } else {
-      raft::random::uniformInt(
-        handle_, r, database.data(), ps.n_rows * ps.dim, DataT(1), DataT(20));
-      raft::random::uniformInt(
-        handle_, r, search_queries.data(), ps.n_queries * ps.dim, DataT(1), DataT(20));
-    }
+    InitDataset(handle_, database.data(), ps.n_rows, ps.dim, ps.metric, r);
+    InitDataset(handle_, search_queries.data(), ps.n_queries, ps.dim, ps.metric, r);
     resource::sync_stream(handle_);
   }
 

From b760453221c6eb5b6986ceb2153bb0a0284697e2 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Tue, 7 May 2024 22:08:55 -0700
Subject: [PATCH 27/60] Build C++ wheel (#2264)

This PR changes wheel building in raft to create a separate C++ wheel that is then found by the Python wheel. The C++ wheel is now a hard dependency of the Python wheel. This allows Python packaging to more closely mirror the structure of our conda packaging, and the way we would normally wish to package these in any other package manager. It also reduces package sizes by enabling better sharing of artifacts between different Python packages that rely on the same C++ components from other packages.

Contributes to rapidsai/build-planning#33

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/raft/pull/2264
---
 .github/workflows/build.yaml                  | 30 ++++---
 .github/workflows/pr.yaml                     | 27 +++---
 build.sh                                      | 12 +--
 ci/build_wheel.sh                             | 67 ---------------
 ci/build_wheel_cpp.sh                         | 46 ++++++++++
 ci/build_wheel_pylibraft.sh                   |  9 --
 ci/build_wheel_python.sh                      | 84 +++++++++++++++++++
 ci/build_wheel_raft_dask.sh                   |  9 --
 ci/test_wheel_pylibraft.sh                    | 11 +--
 ci/test_wheel_raft_dask.sh                    | 18 ++--
 cpp/CMakeLists.txt                            | 69 +++++++++------
 dependencies.yaml                             | 67 ++++++++++++++-
 python/libraft/CMakeLists.txt                 | 45 ++++++++++
 python/libraft/LICENSE                        |  1 +
 python/libraft/README.md                      |  1 +
 python/libraft/libraft/VERSION                |  1 +
 python/libraft/libraft/__init__.py            | 17 ++++
 python/libraft/libraft/_version.py            | 25 ++++++
 python/libraft/libraft/load.py                | 48 +++++++++++
 python/libraft/pyproject.toml                 | 65 ++++++++++++++
 python/pylibraft/CMakeLists.txt               | 39 +--------
 python/pylibraft/pylibraft/__init__.py        | 12 ++-
 .../pylibraft/cluster/CMakeLists.txt          |  4 +-
 .../pylibraft/pylibraft/common/CMakeLists.txt |  4 +-
 .../pylibraft/distance/CMakeLists.txt         |  2 +-
 .../pylibraft/pylibraft/matrix/CMakeLists.txt |  4 +-
 .../pylibraft/neighbors/CMakeLists.txt        |  2 +-
 .../pylibraft/neighbors/cagra/CMakeLists.txt  |  4 +-
 .../neighbors/ivf_flat/CMakeLists.txt         |  4 +-
 .../pylibraft/neighbors/ivf_pq/CMakeLists.txt |  4 +-
 .../pylibraft/pylibraft/random/CMakeLists.txt |  4 +-
 python/pylibraft/pyproject.toml               |  2 +
 python/raft-dask/CMakeLists.txt               | 28 +------
 python/raft-dask/pyproject.toml               |  2 +
 python/raft-dask/raft_dask/__init__.py        | 13 ++-
 .../raft-dask/raft_dask/common/CMakeLists.txt |  4 +-
 .../raft_dask/include_test/CMakeLists.txt     |  4 +-
 37 files changed, 537 insertions(+), 251 deletions(-)
 delete mode 100755 ci/build_wheel.sh
 create mode 100755 ci/build_wheel_cpp.sh
 delete mode 100755 ci/build_wheel_pylibraft.sh
 create mode 100755 ci/build_wheel_python.sh
 delete mode 100755 ci/build_wheel_raft_dask.sh
 create mode 100644 python/libraft/CMakeLists.txt
 create mode 120000 python/libraft/LICENSE
 create mode 120000 python/libraft/README.md
 create mode 120000 python/libraft/libraft/VERSION
 create mode 100644 python/libraft/libraft/__init__.py
 create mode 100644 python/libraft/libraft/_version.py
 create mode 100644 python/libraft/libraft/load.py
 create mode 100644 python/libraft/pyproject.toml

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index c8837afba7..d99d5d28e5 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -67,37 +67,40 @@ jobs:
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
       sha: ${{ inputs.sha }}
-  wheel-build-pylibraft:
+  wheel-build-cpp:
+    needs: checks
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
-      script: ci/build_wheel_pylibraft.sh
-  wheel-publish-pylibraft:
-    needs: wheel-build-pylibraft
+      script: ci/build_wheel_cpp.sh
+  wheel-build-python:
+    needs: wheel-build-cpp
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
-      package-name: pylibraft
-  wheel-build-raft-dask:
-    needs: wheel-publish-pylibraft
+      script: ci/build_wheel_python.sh
+  wheel-publish-cpp:
+    needs: wheel-build-cpp
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
-      script: ci/build_wheel_raft_dask.sh
-  wheel-publish-raft-dask:
-    needs: wheel-build-raft-dask
+      package-name: raft
+      package-type: cpp
+  wheel-publish-python:
+    needs: wheel-build-python
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
     with:
@@ -105,4 +108,5 @@ jobs:
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
-      package-name: raft_dask
+      package-name: raft
+      package-type: python
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index c2d9556859..5d0368e3f7 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -19,9 +19,9 @@ jobs:
       - conda-python-build
       - conda-python-tests
       - docs-build
-      - wheel-build-pylibraft
+      - wheel-build-cpp
+      - wheel-build-python
       - wheel-tests-pylibraft
-      - wheel-build-raft-dask
       - wheel-tests-raft-dask
       - devcontainer
     secrets: inherit
@@ -74,29 +74,30 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
-  wheel-build-pylibraft:
+  wheel-build-cpp:
     needs: checks
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
+      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
       build_type: pull-request
-      script: ci/build_wheel_pylibraft.sh
-  wheel-tests-pylibraft:
-    needs: wheel-build-pylibraft
+      script: ci/build_wheel_cpp.sh
+  wheel-build-python:
+    needs: wheel-build-cpp
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
       build_type: pull-request
-      script: ci/test_wheel_pylibraft.sh
-  wheel-build-raft-dask:
-    needs: wheel-tests-pylibraft
+      script: ci/build_wheel_python.sh
+  wheel-tests-pylibraft:
+    needs: wheel-build-python
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
       build_type: pull-request
-      script: "ci/build_wheel_raft_dask.sh"
+      script: ci/test_wheel_pylibraft.sh
   wheel-tests-raft-dask:
-    needs: wheel-build-raft-dask
+    needs: wheel-build-python
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
diff --git a/build.sh b/build.sh
index da5efa5183..ee16bb4b1b 100755
--- a/build.sh
+++ b/build.sh
@@ -381,14 +381,6 @@ if [[ ${CMAKE_TARGET} == "" ]]; then
     CMAKE_TARGET="all"
 fi
 
-# Append `-DFIND_RAFT_CPP=ON` to EXTRA_CMAKE_ARGS unless a user specified the option.
-SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}"
-if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then
-    SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON"
-fi
-# Replace spaces with semicolons in SKBUILD_EXTRA_CMAKE_ARGS
-SKBUILD_EXTRA_CMAKE_ARGS=$(echo ${SKBUILD_EXTRA_CMAKE_ARGS} | sed 's/ /;/g')
-
 # If clean given, run it prior to any other steps
 if (( ${CLEAN} == 1 )); then
     # If the dirs to clean are mounted dirs in a container, the
@@ -495,13 +487,13 @@ fi
 
 # Build and (optionally) install the pylibraft Python package
 if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then
-    SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \
+    CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" \
         python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/pylibraft
 fi
 
 # Build and (optionally) install the raft-dask Python package
 if (( ${NUMARGS} == 0 )) || hasArg raft-dask; then
-    SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \
+    CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" \
         python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/raft-dask
 fi
 
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
deleted file mode 100755
index e3e7ce9c89..0000000000
--- a/ci/build_wheel.sh
+++ /dev/null
@@ -1,67 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-set -euo pipefail
-
-package_name=$1
-package_dir=$2
-underscore_package_name=$(echo "${package_name}" | tr "-" "_")
-
-# Clear out system ucx files to ensure that we're getting ucx from the wheel.
-rm -rf /usr/lib64/ucx
-rm -rf /usr/lib64/libuc*
-
-source rapids-configure-sccache
-source rapids-date-string
-
-version=$(rapids-generate-version)
-git_commit=$(git rev-parse HEAD)
-
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-
-# This is the version of the suffix with a preceding hyphen. It's used
-# everywhere except in the final wheel name.
-PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
-
-# Patch project metadata files to include the CUDA version suffix and version override.
-pyproject_file="${package_dir}/pyproject.toml"
-version_file="${package_dir}/${underscore_package_name}/_version.py"
-
-sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
-echo "${version}" > VERSION
-sed -i "/^__git_commit__ / s/= .*/= \"${git_commit}\"/g" ${version_file}
-
-# For nightlies we want to ensure that we're pulling in alphas as well. The
-# easiest way to do so is to augment the spec with a constraint containing a
-# min alpha version that doesn't affect the version bounds but does allow usage
-# of alpha versions for that dependency without --pre
-alpha_spec=''
-if ! rapids-is-release-build; then
-    alpha_spec=',>=0.0.0a0'
-fi
-
-if [[ ${package_name} == "raft-dask" ]]; then
-    sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
-    sed -r -i "s/libucx(.*)\"/libucx${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
-    sed -r -i "s/ucx-py==(.*)\"/ucx-py${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
-    sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}
-    sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
-    sed -r -i "s/distributed-ucxx==(.*)\"/distributed-ucxx${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
-else
-    sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
-fi
-
-if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then
-    sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file}
-    sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file}
-fi
-
-cd "${package_dir}"
-
-# Hardcode the output dir
-python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
-
-mkdir -p final_dist
-python -m auditwheel repair -w final_dist --exclude "libucp.so.0" dist/*
-
-RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh
new file mode 100755
index 0000000000..0e0417dc35
--- /dev/null
+++ b/ci/build_wheel_cpp.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+# Copyright (c) 2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_name="libraft"
+package_dir="python/libraft"
+
+source rapids-configure-sccache
+source rapids-date-string
+
+version=$(rapids-generate-version)
+git_commit=$(git rev-parse HEAD)
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+
+# Hyphen-prefixed form of the suffix; used everywhere except in the final wheel name.
+PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
+
+# Patch project metadata files to include the CUDA version suffix and version override.
+pyproject_file="${package_dir}/pyproject.toml"
+version_file="${package_dir}/${package_name}/_version.py"
+
+sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
+echo "${version}" > VERSION
+sed -i "/^__git_commit__ / s/= .*/= \"${git_commit}\"/g" ${version_file}
+
+# For nightlies we want to ensure that we're pulling in alphas as well. The
+# easiest way to do so is to augment the spec with a constraint containing a
+# min alpha version that doesn't affect the version bounds but does allow usage
+# of alpha versions for that dependency without --pre
+alpha_spec=''
+if ! rapids-is-release-build; then
+    alpha_spec=',>=0.0.0a0'
+fi
+
+sed -r -i "s/librmm(.*)\"/librmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
+
+cd "${package_dir}"
+
+python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
+
+mkdir -p final_dist
+python -m auditwheel repair -w final_dist dist/*
+
+RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp final_dist
diff --git a/ci/build_wheel_pylibraft.sh b/ci/build_wheel_pylibraft.sh
deleted file mode 100755
index ec30a28b92..0000000000
--- a/ci/build_wheel_pylibraft.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
-
-set -euo pipefail
-
-# Set up skbuild options. Enable sccache in skbuild config options
-export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF"
-
-ci/build_wheel.sh pylibraft python/pylibraft
diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh
new file mode 100755
index 0000000000..ae15d3734b
--- /dev/null
+++ b/ci/build_wheel_python.sh
@@ -0,0 +1,84 @@
+#!/bin/bash
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# Clear out system ucx files to ensure that we're getting ucx from the wheel
+# when building raft-dask.
+rm -rf /usr/lib64/ucx
+rm -rf /usr/lib64/libuc*
+
+source rapids-configure-sccache
+source rapids-date-string
+
+version=$(rapids-generate-version)
+git_commit=$(git rev-parse HEAD)
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+
+# This is the version of the suffix with a preceding hyphen. It's used
+# everywhere except in the final wheel name.
+PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
+
+echo "${version}" > VERSION
+
+# For nightlies we want to ensure that we're pulling in alphas as well. The
+# easiest way to do so is to augment the spec with a constraint containing a
+# min alpha version that doesn't affect the version bounds but does allow usage
+# of alpha versions for that dependency without --pre
+alpha_spec=''
+if ! rapids-is-release-build; then
+    alpha_spec=',>=0.0.0a0'
+fi
+
+CPP_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libraft_dist)
+PYTHON_WHEELHOUSE="${PWD}/dist/"
+PYTHON_AUDITED_WHEELHOUSE="${PWD}/final_dist/"
+WHEELHOUSES=("${PYTHON_WHEELHOUSE}" "${CPP_WHEELHOUSE}")
+mkdir -p "${PYTHON_AUDITED_WHEELHOUSE}"
+
+FIND_LINKS=""
+# Iterate over the array
+for wheelhouse in "${WHEELHOUSES[@]}"; do
+    FIND_LINKS+="--find-links ${wheelhouse} "
+done
+
+# Patch python/<package>/ metadata for this CUDA build, then build its wheel into PYTHON_WHEELHOUSE.
+build_wheel () {
+    local package_name="${1}"
+    local underscore_package_name="${package_name//-/_}"
+
+    local package_dir="python/${package_name}"
+    local pyproject_file="${package_dir}/pyproject.toml"
+    local version_file="${package_dir}/${underscore_package_name}/_version.py"
+
+    sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" "${pyproject_file}"
+    sed -i "/^__git_commit__ / s/= .*/= \"${git_commit}\"/g" "${version_file}"
+
+    sed -r -i "s/libucx(.*)\"/libucx${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" "${pyproject_file}"
+
+    for dep in rmm libraft pylibraft ucx-py distributed-ucxx; do
+        sed -r -i "s/${dep}==(.*)\"/${dep}${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" "${pyproject_file}"
+    done
+
+    # dask-cuda & rapids-dask-dependency don't get a suffix, but they do get an alpha spec.
+    for dep in dask-cuda rapids-dask-dependency; do
+        sed -r -i "s/${dep}==(.*)\"/${dep}==\1${alpha_spec}\"/g" "${pyproject_file}"
+    done
+
+    if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then
+        sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" "${pyproject_file}"
+        sed -i "s/cupy-cuda11x/cupy-cuda12x/g" "${pyproject_file}"
+    fi
+
+    pushd "${package_dir}"
+    # FIND_LINKS is deliberately unquoted so it word-splits into separate --find-links arguments.
+    python -m pip wheel . -w "${PYTHON_WHEELHOUSE}" -vvv --no-deps --disable-pip-version-check ${FIND_LINKS}
+    popd
+}
+
+build_wheel pylibraft
+build_wheel raft-dask
+
+python -m auditwheel repair -w "${PYTHON_AUDITED_WHEELHOUSE}" --exclude libraft.so --exclude "libucp.so.0" "${PYTHON_WHEELHOUSE}"/*
+RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python "${PYTHON_AUDITED_WHEELHOUSE}"
diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh
deleted file mode 100755
index 5ae12303d0..0000000000
--- a/ci/build_wheel_raft_dask.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
-
-set -euo pipefail
-
-# Set up skbuild options. Enable sccache in skbuild config options
-export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF"
-
-ci/build_wheel.sh raft-dask python/raft-dask
diff --git a/ci/test_wheel_pylibraft.sh b/ci/test_wheel_pylibraft.sh
index b38f5a690b..230889ae82 100755
--- a/ci/test_wheel_pylibraft.sh
+++ b/ci/test_wheel_pylibraft.sh
@@ -3,11 +3,12 @@
 
 set -euo pipefail
 
-mkdir -p ./dist
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
 
-# echo to expand wildcard before adding `[extra]` requires for pip
-python -m pip install $(echo ./dist/pylibraft*.whl)[test]
+WHEELHOUSE="${PWD}/dist/"
+RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp "${WHEELHOUSE}"
+RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python "${WHEELHOUSE}"
 
-python -m pytest ./python/pylibraft/pylibraft/test
+python -m pip install "pylibraft-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links "${WHEELHOUSE}"
+
+python -m pytest python/pylibraft/pylibraft/test
diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh
index fe2d44f2b3..b2ec9a0c8b 100755
--- a/ci/test_wheel_raft_dask.sh
+++ b/ci/test_wheel_raft_dask.sh
@@ -3,21 +3,21 @@
 
 set -euo pipefail
 
-mkdir -p ./dist
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
 
-# Download the pylibraft built in the previous step
-RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep
-python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl
+WHEELHOUSE="${PWD}/dist/"
+RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp "${WHEELHOUSE}"
+RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python "${WHEELHOUSE}"
 
-python -m pip install "raft_dask-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links dist/
+python -m pip install "raft-dask-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links "${WHEELHOUSE}"
+
+test_dir="python/raft-dask/raft_dask/test"
 
 # rapids-logger "pytest raft-dask"
-# python -m pytest ./python/raft-dask/raft_dask/test
+# python -m pytest ${test_dir}
 
 # rapids-logger "pytest raft-dask (ucx-py only)"
-# python -m pytest ./python/raft-dask/raft_dask/test --run_ucx
+# python -m pytest ${test_dir} --run_ucx
 
 rapids-logger "pytest raft-dask (ucxx only)"
-python -m pytest ./python/raft-dask/raft_dask/test --run_ucxx
+python -m pytest ${test_dir} --run_ucxx
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 259d9fe428..88ccc27be0 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -75,9 +75,12 @@ if((BUILD_TESTS
 )
   set(RAFT_COMPILE_LIBRARY_DEFAULT ON)
 endif()
-option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations"
+option(RAFT_COMPILE_LIBRARY "Enable building raft library instantiations"
        ${RAFT_COMPILE_LIBRARY_DEFAULT}
 )
+option(RAFT_COMPILE_DYNAMIC_ONLY "Only build the shared library and skip the
+static library. Has no effect if RAFT_COMPILE_LIBRARY is OFF" OFF
+)
 
 if(BUILD_CPU_ONLY)
   set(BUILD_SHARED_LIBS OFF)
@@ -582,17 +585,23 @@ if(RAFT_COMPILE_LIBRARY)
   )
 
   add_library(raft_lib SHARED $<TARGET_OBJECTS:raft_objs>)
-  add_library(raft_lib_static STATIC $<TARGET_OBJECTS:raft_objs>)
+
+  set(raft_lib_targets raft_lib)
+  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+    add_library(raft_lib_static STATIC $<TARGET_OBJECTS:raft_objs>)
+    list(APPEND raft_lib_targets raft_lib_static)
+  endif()
 
   set_target_properties(
-    raft_lib raft_lib_static
+    ${raft_lib_targets}
     PROPERTIES OUTPUT_NAME raft
                BUILD_RPATH "\$ORIGIN"
                INSTALL_RPATH "\$ORIGIN"
                INTERFACE_POSITION_INDEPENDENT_CODE ON
   )
 
-  foreach(target raft_lib raft_lib_static raft_objs)
+  list(APPEND raft_lib_targets raft_objs)
+  foreach(target IN LISTS raft_lib_targets)
     target_link_libraries(
       ${target}
       PUBLIC raft::raft
@@ -617,20 +626,22 @@ target_link_libraries(raft_compiled INTERFACE raft::raft $<TARGET_NAME_IF_EXISTS
 # ##################################################################################################
 # * raft_compiled_static----------------------------------------------------------------------------
 
-add_library(raft_compiled_static INTERFACE)
+if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+  add_library(raft_compiled_static INTERFACE)
 
-if(TARGET raft_compiled_static AND (NOT TARGET raft::compiled_static))
-  add_library(raft::compiled_static ALIAS raft_compiled_static)
-endif()
-set_target_properties(raft_compiled_static PROPERTIES EXPORT_NAME compiled_static)
+  if(TARGET raft_compiled_static AND (NOT TARGET raft::compiled_static))
+    add_library(raft::compiled_static ALIAS raft_compiled_static)
+  endif()
+  set_target_properties(raft_compiled_static PROPERTIES EXPORT_NAME compiled_static)
 
-if(TARGET raft_lib_static AND (NOT TARGET raft::raft_lib_static))
-  add_library(raft::raft_lib_static ALIAS raft_lib_static)
-endif()
+  if(TARGET raft_lib_static AND (NOT TARGET raft::raft_lib_static))
+    add_library(raft::raft_lib_static ALIAS raft_lib_static)
+  endif()
 
-target_link_libraries(
-  raft_compiled_static INTERFACE raft::raft $<TARGET_NAME_IF_EXISTS:raft::raft_lib_static>
-)
+  target_link_libraries(
+    raft_compiled_static INTERFACE raft::raft $<TARGET_NAME_IF_EXISTS:raft::raft_lib_static>
+  )
+endif()
 
 # ##################################################################################################
 # * raft_distributed -------------------------------------------------------------------------------
@@ -679,8 +690,12 @@ install(
   EXPORT raft-exports
 )
 
+set(raft_compiled_install_targets raft_compiled)
+if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+  list(APPEND raft_compiled_install_targets raft_compiled_static)
+endif()
 install(
-  TARGETS raft_compiled raft_compiled_static
+  TARGETS ${raft_compiled_install_targets}
   DESTINATION ${lib_dir}
   COMPONENT raft
   EXPORT raft-compiled-exports
@@ -693,12 +708,14 @@ if(TARGET raft_lib)
     COMPONENT compiled
     EXPORT raft-compiled-lib-exports
   )
-  install(
-    TARGETS raft_lib_static
-    DESTINATION ${lib_dir}
-    COMPONENT compiled-static
-    EXPORT raft-compiled-static-lib-exports
-  )
+  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+    install(
+      TARGETS raft_lib_static
+      DESTINATION ${lib_dir}
+      COMPONENT compiled-static
+      EXPORT raft-compiled-static-lib-exports
+    )
+  endif()
   install(
     DIRECTORY include/raft_runtime
     DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
@@ -769,8 +786,12 @@ endif()
 set(raft_components compiled distributed)
 set(raft_export_sets raft-compiled-exports raft-distributed-exports)
 if(TARGET raft_lib)
-  list(APPEND raft_components compiled compiled-static)
-  list(APPEND raft_export_sets raft-compiled-lib-exports raft-compiled-static-lib-exports)
+  list(APPEND raft_components compiled)
+  list(APPEND raft_export_sets raft-compiled-lib-exports)
+  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
+    list(APPEND raft_components compiled-static)
+    list(APPEND raft_export_sets raft-compiled-static-lib-exports)
+  endif()
 endif()
 
 string(
diff --git a/dependencies.yaml b/dependencies.yaml
index a336aa1577..685d0a8fac 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -8,6 +8,7 @@ files:
     includes:
       - build
       - build_pylibraft
+      - cython_build
       - cuda
       - cuda_version
       - depends_on_cupy
@@ -28,6 +29,7 @@ files:
       arch: [x86_64, aarch64]
     includes:
       - build
+      - cython_build
       - cuda
       - cuda_version
       - develop
@@ -60,6 +62,25 @@ files:
       - docs
       - py_version
       - test_pylibraft
+  py_build_libraft:
+    output: pyproject
+    pyproject_dir: python/libraft
+    extras:
+      table: build-system
+    includes:
+      - build
+      - librmm
+  py_run_libraft:
+    output: pyproject
+    pyproject_dir: python/libraft
+    extras:
+      table: project
+    includes:
+      # This is really a build requirement for anything using libraft to build
+      # against, but is not required when _running_ with libraft. There isn't a
+      # great way to express that without separating libraft into libraft and
+      # libraft-dev packages, though.
+      - librmm
   py_build_pylibraft:
     output: pyproject
     pyproject_dir: python/pylibraft
@@ -68,6 +89,7 @@ files:
     includes:
       - build
       - build_pylibraft
+      - cython_build
   py_run_pylibraft:
     output: pyproject
     pyproject_dir: python/pylibraft
@@ -92,6 +114,7 @@ files:
       table: build-system
     includes:
       - build
+      - cython_build
       - depends_on_ucx_build
   py_run_raft_dask:
     output: pyproject
@@ -136,7 +159,6 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - &cmake_ver cmake>=3.26.4
-          - cython>=3.0.0
           - ninja
       - output_types: [conda]
         packages:
@@ -181,7 +203,33 @@ dependencies:
             packages: [nvcc_linux-64=11.2]
           - matrix: {cuda: "11.2", arch: aarch64}
             packages: [nvcc_linux-aarch64=11.2]
-
+  cython_build:
+    common:
+      - output_types: [conda, requirements, pyproject]
+        packages:
+          - cython>=3.0.0
+      - output_types: [requirements, pyproject]
+        packages:
+          - libraft==24.6.*
+  librmm:
+    common:
+      - output_types: requirements
+        packages:
+          # pip recognizes the index as a global option for the requirements.txt file
+          - --extra-index-url=https://pypi.nvidia.com
+          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
+    specific:
+      - output_types: [requirements, pyproject]
+        matrices:
+          - matrix: {cuda: "12.*"}
+            packages:
+              - librmm-cu12==24.6.*
+          - matrix: {cuda: "11.*"}
+            packages:
+              - librmm-cu11==24.6.*
+          - matrix: null
+            packages:
+              - librmm==24.6.*
   build_pylibraft:
     common:
       - output_types: [conda]
@@ -431,10 +479,15 @@ dependencies:
           - matrix: {cuda: "12.*"}
             packages:
               - *rmm_cu12
+              - libraft-cu12==24.6.*
           - matrix: {cuda: "11.*"}
             packages:
               - *rmm_cu11
-          - {matrix: null, packages: [*rmm_conda]}
+              - libraft-cu11==24.6.*
+          - matrix: null
+            packages:
+              - *rmm_conda
+              - libraft==24.6.*
   run_raft_dask:
     common:
       - output_types: [conda, pyproject]
@@ -466,11 +519,17 @@ dependencies:
             packages:
               - &pylibraft_cu12 pylibraft-cu12==24.6.*
               - &ucx_py_cu12 ucx-py-cu12==0.38.*
+              - libraft-cu12==24.6.*
           - matrix: {cuda: "11.*"}
             packages:
               - &pylibraft_cu11 pylibraft-cu11==24.6.*
               - &ucx_py_cu11 ucx-py-cu11==0.38.*
-          - {matrix: null, packages: [*pylibraft_conda, *ucx_py_conda]}
+              - libraft-cu11==24.6.*
+          - matrix: null
+            packages: 
+              - *pylibraft_conda
+              - *ucx_py_conda
+              - libraft==24.6.*
   test_python_common:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt
new file mode 100644
index 0000000000..cc3b48d033
--- /dev/null
+++ b/python/libraft/CMakeLists.txt
@@ -0,0 +1,45 @@
+# =============================================================================
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
+
+include(../../rapids_config.cmake)
+
+include(rapids-cuda)
+rapids_cuda_init_architectures(libraft-python)
+
+project(
+  libraft-python
+  VERSION "${RAPIDS_VERSION}"
+  LANGUAGES CXX CUDA
+)
+
+# Check if raft is already available. If so, it is the user's responsibility to ensure that the
+# CMake package is also available at build time of the Python raft package.
+find_package(raft "${RAPIDS_VERSION}")
+
+if(raft_FOUND)
+  return()
+endif()
+
+unset(raft_FOUND)
+
+set(BUILD_TESTS OFF)
+set(BUILD_PRIMS_BENCH OFF)
+set(BUILD_ANN_BENCH OFF)
+set(RAFT_COMPILE_LIBRARY ON)
+set(RAFT_COMPILE_DYNAMIC_ONLY ON)
+set(CUDA_STATIC_RUNTIME ON)
+
+add_subdirectory(../../cpp raft-cpp)
diff --git a/python/libraft/LICENSE b/python/libraft/LICENSE
new file mode 120000
index 0000000000..30cff7403d
--- /dev/null
+++ b/python/libraft/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/python/libraft/README.md b/python/libraft/README.md
new file mode 120000
index 0000000000..fe84005413
--- /dev/null
+++ b/python/libraft/README.md
@@ -0,0 +1 @@
+../../README.md
\ No newline at end of file
diff --git a/python/libraft/libraft/VERSION b/python/libraft/libraft/VERSION
new file mode 120000
index 0000000000..d62dc733ef
--- /dev/null
+++ b/python/libraft/libraft/VERSION
@@ -0,0 +1 @@
+../../../VERSION
\ No newline at end of file
diff --git a/python/libraft/libraft/__init__.py b/python/libraft/libraft/__init__.py
new file mode 100644
index 0000000000..2ba8e06d56
--- /dev/null
+++ b/python/libraft/libraft/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+from libraft._version import __git_commit__, __version__
+from libraft.load import load_library
diff --git a/python/libraft/libraft/_version.py b/python/libraft/libraft/_version.py
new file mode 100644
index 0000000000..3e3792a85c
--- /dev/null
+++ b/python/libraft/libraft/_version.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+import importlib.resources
+
+__version__ = (
+    importlib.resources.files("libraft")
+    .joinpath("VERSION")
+    .read_text()
+    .strip()
+)
+__git_commit__ = ""
diff --git a/python/libraft/libraft/load.py b/python/libraft/libraft/load.py
new file mode 100644
index 0000000000..fb7bf35274
--- /dev/null
+++ b/python/libraft/libraft/load.py
@@ -0,0 +1,48 @@
+# Copyright (c) 2024, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+import ctypes
+import os
+
+
+def load_library():
+    # Dynamically load libraft.so. Prefer a system library if one is present to
+    # avoid clobbering symbols that other packages might expect; otherwise use
+    # the copy bundled in the wheel. Returns None if neither can be found.
+    libraft_lib = None
+    try:
+        libraft_lib = ctypes.CDLL("libraft.so", ctypes.RTLD_GLOBAL)
+    except OSError:
+        # No libraft.so could be loaded by bare name, so fall back to the copy
+        # shipped inside this package (under lib/ or lib64/). If neither of
+        # these directories contains the library, libraft_lib stays None; we
+        # then assume the CMake build of the libraft Python package was a
+        # no-op because the C++ library is installed somewhere else. Note that
+        # this lookup won't work for real editable installs of the libraft
+        # package: scikit-build-core has limited support for
+        # importlib.resources, so there isn't a clean way to support them yet.
+        for lib_dir in ("lib", "lib64"):
+            if os.path.isfile(
+                lib := os.path.join(
+                    os.path.dirname(__file__), lib_dir, "libraft.so"
+                )
+            ):
+                libraft_lib = ctypes.CDLL(lib, ctypes.RTLD_GLOBAL)
+                break
+
+    # The caller almost never needs to do anything with this library, but no
+    # harm in offering the option since this object at least provides a handle
+    # to inspect where libraft was loaded from.
+    return libraft_lib
diff --git a/python/libraft/pyproject.toml b/python/libraft/pyproject.toml
new file mode 100644
index 0000000000..c5a76e363a
--- /dev/null
+++ b/python/libraft/pyproject.toml
@@ -0,0 +1,65 @@
+# Copyright (c) 2022, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[build-system]
+
+requires = [
+    "cmake>=3.26.4",
+    "librmm==24.6.*",
+    "ninja",
+    "scikit-build-core[pyproject]>=0.7.0",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+build-backend = "scikit_build_core.build"
+
+[project]
+name = "libraft"
+dynamic = ["version"]
+description = "RAFT: Reusable Algorithms Functions and other Tools"
+readme = { file = "README.md", content-type = "text/markdown" }
+authors = [
+    { name = "NVIDIA Corporation" },
+]
+license = { text = "Apache 2.0" }
+requires-python = ">=3.9"
+classifiers = [
+    "Intended Audience :: Developers",
+    "Programming Language :: C++",
+    "Environment :: GPU :: NVIDIA CUDA",
+]
+dependencies = [
+    "librmm==24.6.*",
+] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
+
+[project.urls]
+Homepage = "https://github.com/rapidsai/raft"
+Documentation = "https://docs.rapids.ai/api/raft/stable/"
+
+[project.entry-points."cmake.prefix"]
+libraft = "libraft"
+
+[tool.scikit-build]
+build-dir = "build/{wheel_tag}"
+cmake.build-type = "Release"
+cmake.minimum-version = "3.26.4"
+ninja.make-fallback = true
+sdist.exclude = ["*tests*"]
+sdist.reproducible = true
+wheel.packages = ["libraft"]
+wheel.install-dir = "libraft"
+wheel.py-api = "py3"
+
+[tool.scikit-build.metadata.version]
+provider = "scikit_build_core.metadata.regex"
+input = "libraft/VERSION"
+regex = "(?P<value>.*)"
diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt
index 7a2d77041d..bba5549f62 100644
--- a/python/pylibraft/CMakeLists.txt
+++ b/python/pylibraft/CMakeLists.txt
@@ -27,42 +27,9 @@ project(
   LANGUAGES CXX CUDA
 )
 
-option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files"
-       ON
-)
-
-# If the user requested it we attempt to find RAFT.
-if(FIND_RAFT_CPP)
-  find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS compiled)
-  if(NOT TARGET raft::raft_lib)
-    message(
-      FATAL_ERROR
-        "Building against a preexisting libraft library requires the compiled libraft to have been built!"
-    )
-
-  endif()
-else()
-  set(raft_FOUND OFF)
-endif()
+find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS compiled)
 
 include(rapids-cython-core)
-
-if(NOT raft_FOUND)
-  set(BUILD_TESTS OFF)
-  set(BUILD_PRIMS_BENCH OFF)
-  set(BUILD_ANN_BENCH OFF)
-  set(RAFT_COMPILE_LIBRARY ON)
-  set(CUDA_STATIC_RUNTIME ON)
-
-  add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL)
-
-  # When building the C++ libraries from source we must copy libraft.so alongside the
-  # pairwise_distance and random Cython libraries TODO: when we have a single 'compiled' raft
-  # library, we shouldn't need this
-  set(cython_lib_dir pylibraft)
-  install(TARGETS raft_lib DESTINATION ${cython_lib_dir})
-endif()
-
 rapids_cython_init()
 
 add_subdirectory(pylibraft/common)
@@ -71,7 +38,3 @@ add_subdirectory(pylibraft/matrix)
 add_subdirectory(pylibraft/neighbors)
 add_subdirectory(pylibraft/random)
 add_subdirectory(pylibraft/cluster)
-
-if(DEFINED cython_lib_dir)
-  rapids_cython_add_rpath_entries(TARGET raft PATHS "${cython_lib_dir}")
-endif()
diff --git a/python/pylibraft/pylibraft/__init__.py b/python/pylibraft/pylibraft/__init__.py
index 3b67a5f951..8aac8f93da 100644
--- a/python/pylibraft/pylibraft/__init__.py
+++ b/python/pylibraft/pylibraft/__init__.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -14,3 +14,13 @@
 #
 
 from pylibraft._version import __git_commit__, __version__
+
+# If libraft was installed as a wheel, we must request it to load the library symbols.
+# Otherwise, we assume that the library was installed in a system path that ld can find.
+try:
+    import libraft
+except ModuleNotFoundError:
+    pass
+else:
+    libraft.load_library()
+    del libraft
diff --git a/python/pylibraft/pylibraft/cluster/CMakeLists.txt b/python/pylibraft/pylibraft/cluster/CMakeLists.txt
index 7d6e05d918..06a639436a 100644
--- a/python/pylibraft/pylibraft/cluster/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/cluster/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -20,5 +20,5 @@ set(linked_libraries raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX cluster_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX cluster_
 )
diff --git a/python/pylibraft/pylibraft/common/CMakeLists.txt b/python/pylibraft/pylibraft/common/CMakeLists.txt
index 6ce1dfe347..d1c1acb3aa 100644
--- a/python/pylibraft/pylibraft/common/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/common/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX common_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX common_
 )
diff --git a/python/pylibraft/pylibraft/distance/CMakeLists.txt b/python/pylibraft/pylibraft/distance/CMakeLists.txt
index 2530e07a98..ffcef45c32 100644
--- a/python/pylibraft/pylibraft/distance/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/distance/CMakeLists.txt
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX distance_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX distance_
 )
diff --git a/python/pylibraft/pylibraft/matrix/CMakeLists.txt b/python/pylibraft/pylibraft/matrix/CMakeLists.txt
index ffba10dea9..07d35325a5 100644
--- a/python/pylibraft/pylibraft/matrix/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/matrix/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX matrix_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX matrix_
 )
diff --git a/python/pylibraft/pylibraft/neighbors/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/CMakeLists.txt
index 069038a0e8..2a954183d3 100644
--- a/python/pylibraft/pylibraft/neighbors/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/neighbors/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX neighbors_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_
 )
 
 add_subdirectory(cagra)
diff --git a/python/pylibraft/pylibraft/neighbors/cagra/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/cagra/CMakeLists.txt
index 441bb0b311..2df03c7b0b 100644
--- a/python/pylibraft/pylibraft/neighbors/cagra/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/neighbors/cagra/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX neighbors_cagra_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_cagra_
 )
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_flat/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/ivf_flat/CMakeLists.txt
index 8f395faec9..f50051ba23 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_flat/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/neighbors/ivf_flat/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX neighbors_ivfflat_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_ivfflat_
 )
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt
index e3d721a6ea..e57798fcc6 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX neighbors_ivfpq_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_ivfpq_
 )
diff --git a/python/pylibraft/pylibraft/random/CMakeLists.txt b/python/pylibraft/pylibraft/random/CMakeLists.txt
index fcc5ee6311..7d61855111 100644
--- a/python/pylibraft/pylibraft/random/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/random/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -23,5 +23,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX random_
+  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX random_
 )
diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index 3e8ca0b6d3..bf98d9b9c8 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -18,6 +18,7 @@ requires = [
     "cmake>=3.26.4",
     "cuda-python>=11.7.1,<12.0a0",
     "cython>=3.0.0",
+    "libraft==24.6.*",
     "ninja",
     "rmm==24.6.*",
     "scikit-build-core[pyproject]>=0.7.0",
@@ -36,6 +37,7 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "cuda-python>=11.7.1,<12.0a0",
+    "libraft==24.6.*",
     "numpy>=1.23,<2.0a0",
     "rmm==24.6.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt
index 2c629f3b73..3218ea4d32 100644
--- a/python/raft-dask/CMakeLists.txt
+++ b/python/raft-dask/CMakeLists.txt
@@ -25,38 +25,12 @@ project(
   LANGUAGES CXX CUDA
 )
 
-option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files"
-       OFF
-)
-
 rapids_cpm_init()
 # Once https://github.com/rapidsai/ucxx/issues/173 is resolved we can remove this.
 find_package(ucx REQUIRED)
 include(cmake/thirdparty/get_ucxx.cmake)
 
-# If the user requested it we attempt to find RAFT.
-if(FIND_RAFT_CPP)
-  find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS distributed)
-else()
-  set(raft_FOUND OFF)
-endif()
-
-if(NOT raft_FOUND)
-  # raft-dask doesn't actually use raft libraries, it just needs the headers, so we can turn off all
-  # library compilation and we don't need to install anything here.
-  set(BUILD_TESTS OFF)
-  set(BUILD_ANN_BENCH OFF)
-  set(BUILD_PRIMS_BENCH OFF)
-  set(RAFT_COMPILE_LIBRARIES OFF)
-  set(RAFT_COMPILE_DIST_LIBRARY OFF)
-  set(RAFT_COMPILE_NN_LIBRARY OFF)
-  set(CUDA_STATIC_RUNTIME ON)
-  set(RAFT_DASK_UCXX_STATIC ON)
-
-  add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL)
-  list(APPEND CMAKE_MODULE_PATH ${CMAKE_BINARY_DIR}/cmake/find_modules)
-  find_package(NCCL REQUIRED)
-endif()
+find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS distributed)
 
 include(rapids-cython-core)
 rapids_cython_init()
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 0181bef4ce..53b1c51e83 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -18,6 +18,7 @@ build-backend = "scikit_build_core.build"
 requires = [
     "cmake>=3.26.4",
     "cython>=3.0.0",
+    "libraft==24.6.*",
     "libucx==1.15.0",
     "ninja",
     "scikit-build-core[pyproject]>=0.7.0",
@@ -36,6 +37,7 @@ requires-python = ">=3.9"
 dependencies = [
     "dask-cuda==24.6.*",
     "joblib>=0.11",
+    "libraft==24.6.*",
     "numba>=0.57",
     "numpy>=1.23,<2.0a0",
     "pylibraft==24.6.*",
diff --git a/python/raft-dask/raft_dask/__init__.py b/python/raft-dask/raft_dask/__init__.py
index 19a037ae75..13b9a96154 100644
--- a/python/raft-dask/raft_dask/__init__.py
+++ b/python/raft-dask/raft_dask/__init__.py
@@ -15,8 +15,17 @@
 
 from raft_dask._version import __git_commit__, __version__
 
-# If libucx was installed as a wheel, we must request it to load the library symbols.
-# Otherwise, we assume that the library was installed in a system path that ld can find.
+# If libraft or libucx was installed as a wheel, we must request that those packages
+# load the library symbols. Otherwise, we assume that the libraries were installed in a
+# system path that ld can find.
+try:
+    import libraft
+except ModuleNotFoundError:
+    pass
+else:
+    libraft.load_library()
+    del libraft
+
 try:
     import libucx
 except ModuleNotFoundError:
diff --git a/python/raft-dask/raft_dask/common/CMakeLists.txt b/python/raft-dask/raft_dask/common/CMakeLists.txt
index 3798b5ac4b..49dee15d8f 100644
--- a/python/raft-dask/raft_dask/common/CMakeLists.txt
+++ b/python/raft-dask/raft_dask/common/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -15,6 +15,6 @@
 set(cython_sources comms_utils.pyx nccl.pyx)
 set(linked_libraries raft::raft raft::distributed)
 rapids_cython_create_modules(
-  SOURCE_FILES "${cython_sources}" ASSOCIATED_TARGETS raft LINKED_LIBRARIES "${linked_libraries}"
+  SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}"
                                                                             CXX
 )
diff --git a/python/raft-dask/raft_dask/include_test/CMakeLists.txt b/python/raft-dask/raft_dask/include_test/CMakeLists.txt
index e588ce1d1e..8475bcaa93 100644
--- a/python/raft-dask/raft_dask/include_test/CMakeLists.txt
+++ b/python/raft-dask/raft_dask/include_test/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -15,6 +15,6 @@
 set(cython_sources raft_include_test.pyx)
 set(linked_libraries raft::raft)
 rapids_cython_create_modules(
-  SOURCE_FILES "${cython_sources}" ASSOCIATED_TARGETS raft LINKED_LIBRARIES "${linked_libraries}"
+  SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}"
                                                                             CXX
 )

From 6b731b6df0218f75fefcf288880f39cbd934d58d Mon Sep 17 00:00:00 2001
From: Divye Gala <divyegala@gmail.com>
Date: Wed, 8 May 2024 06:51:29 -0400
Subject: [PATCH 28/60] `libucx` should be run dependency of `raft-dask`
 (#2296)

As figured out in https://github.com/rapidsai/cuml/pull/5697, `libucx` should not be just a testing dependency of `raft-dask`; it should be a run dependency.

cc @vyasr

Authors:
  - Divye Gala (https://github.com/divyegala)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Dante Gama Dessavre (https://github.com/dantegd)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/raft/pull/2296
---
 dependencies.yaml               | 2 +-
 python/raft-dask/pyproject.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/dependencies.yaml b/dependencies.yaml
index 685d0a8fac..3daa400444 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -123,6 +123,7 @@ files:
       table: project
     includes:
       - run_raft_dask
+      - depends_on_ucx_run
   py_test_raft_dask:
     output: pyproject
     pyproject_dir: python/raft-dask
@@ -132,7 +133,6 @@ files:
     includes:
       - test_python_common
       - depends_on_distributed_ucxx
-      - depends_on_ucx_run
   py_build_raft_ann_bench:
     output: pyproject
     pyproject_dir: python/raft-ann-bench
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 53b1c51e83..37691cc01b 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -38,6 +38,7 @@ dependencies = [
     "dask-cuda==24.6.*",
     "joblib>=0.11",
     "libraft==24.6.*",
+    "libucx>=1.15.0",
     "numba>=0.57",
     "numpy>=1.23,<2.0a0",
     "pylibraft==24.6.*",
@@ -55,7 +56,6 @@ classifiers = [
 [project.optional-dependencies]
 test = [
     "distributed-ucxx==0.38.*",
-    "libucx>=1.15.0",
     "pytest-cov",
     "pytest==7.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

From 4c1bf03d082ab83d8ae8e515b6d4f1f09bff9d4b Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Wed, 8 May 2024 01:52:59 -1000
Subject: [PATCH 29/60] Enable warnings as errors for Python tests (#2288)

As part of https://github.com/rapidsai/build-planning/issues/26, warnings in Python tests will be converted to test failures


`ignore:Unknown pytest.mark.ucx:PytestUnknownMarkWarning` could be removed once https://github.com/rapidsai/raft/pull/2281 is merged. cc @jameslamb

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Dante Gama Dessavre (https://github.com/dantegd)

URL: https://github.com/rapidsai/raft/pull/2288
---
 python/pylibraft/pyproject.toml | 5 +++++
 python/raft-dask/pytest.ini     | 2 ++
 2 files changed, 7 insertions(+)

diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index bf98d9b9c8..8feb600f11 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -118,3 +118,8 @@ wheel.packages = ["pylibraft"]
 provider = "scikit_build_core.metadata.regex"
 input = "pylibraft/VERSION"
 regex = "(?P<value>.*)"
+
+[tool.pytest.ini_options]
+filterwarnings = [
+    "error",
+]
diff --git a/python/raft-dask/pytest.ini b/python/raft-dask/pytest.ini
index fcb18fe412..2467e2089a 100644
--- a/python/raft-dask/pytest.ini
+++ b/python/raft-dask/pytest.ini
@@ -1,4 +1,6 @@
 [pytest]
+filterwarnings =
+    error
 markers =
   unit: marks unit tests
   quality: marks quality tests

From d9051f6a2c5bccbc112cfaaad50e608f503e3628 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 8 May 2024 11:45:44 -0700
Subject: [PATCH 30/60] Remove nonexistent job from workflow (#2298)

---
 .github/workflows/build.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index d99d5d28e5..9944dd5198 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -68,7 +68,6 @@ jobs:
       run_script: "ci/build_docs.sh"
       sha: ${{ inputs.sha }}
   wheel-build-cpp:
-    needs: checks
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:

From 4590c14be87a8c1ff294abba49a5f82ff877d484 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 8 May 2024 15:16:20 -0700
Subject: [PATCH 31/60] Set the import mode for dask tests (#2142)

This PR fixes the underlying issue that led us to pin pytest in #2137. For now, this PR leaves pytest pinned, since there are other issues with pytest 8 and plugin interactions that have broken other RAPIDS libraries, so I figured this was safer and we could relax the pin when the rest of RAPIDS does. However, if preferred, I'm happy to undo the pinning in this PR too.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/raft/pull/2142
---
 ci/run_raft_dask_pytests.sh | 2 +-
 ci/test_wheel_raft_dask.sh  | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/ci/run_raft_dask_pytests.sh b/ci/run_raft_dask_pytests.sh
index 46cd211d2e..07d0b5baa0 100755
--- a/ci/run_raft_dask_pytests.sh
+++ b/ci/run_raft_dask_pytests.sh
@@ -6,4 +6,4 @@ set -euo pipefail
 # Support invoking run_raft_dask_pytests.sh outside the script directory
 cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../python/raft-dask/raft_dask
 
-pytest --cache-clear "$@" test
+pytest --cache-clear --import-mode=append "$@" test
diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh
index b2ec9a0c8b..bc3af96688 100755
--- a/ci/test_wheel_raft_dask.sh
+++ b/ci/test_wheel_raft_dask.sh
@@ -14,10 +14,10 @@ python -m pip install "raft-dask-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find
 test_dir="python/raft-dask/raft_dask/test"
 
 # rapids-logger "pytest raft-dask"
-# python -m pytest ${test_dir}
+# python -m pytest --import-mode=append ${test_dir}
 
 # rapids-logger "pytest raft-dask (ucx-py only)"
-# python -m pytest ${test_dir} --run_ucx
+# python -m pytest --import-mode=append ${test_dir} --run_ucx
 
 rapids-logger "pytest raft-dask (ucxx only)"
-python -m pytest ${test_dir} --run_ucxx
+python -m pytest --import-mode=append ${test_dir} --run_ucxx

From f222be06d224da91fc30bab2acf8f5571f89a69e Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Thu, 9 May 2024 02:40:07 +0200
Subject: [PATCH 32/60] [FEA] Split Bitset code (#2295)

Splitting the Bitset code into a `.hpp` and a `.cuh` header allows the declaration of `ann::search_with_filters` with a `bitset_filter` to be made without using CUDA code in headers.
This is useful for cuVS, where headers such as `ivf_pq.hpp` need to declare `search_with_filter` and stay CUDA-free.

Authors:
  - Micka (https://github.com/lowener)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2295
---
 cpp/include/raft/core/bitset.cuh | 526 ++++++++++---------------------
 cpp/include/raft/core/bitset.hpp | 275 ++++++++++++++++
 2 files changed, 437 insertions(+), 364 deletions(-)
 create mode 100644 cpp/include/raft/core/bitset.hpp

diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh
index 53fd586ed2..cdfbe0b8dd 100644
--- a/cpp/include/raft/core/bitset.cuh
+++ b/cpp/include/raft/core/bitset.cuh
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include <raft/core/bitset.hpp>
 #include <raft/core/detail/mdspan_util.cuh>  // native_popc
 #include <raft/core/device_container_policy.hpp>
 #include <raft/core/device_mdarray.hpp>
@@ -28,372 +29,169 @@
 #include <thrust/for_each.h>
 
 namespace raft::core {
-/**
- * @defgroup bitset Bitset
- * @{
- */
-/**
- * @brief View of a RAFT Bitset.
- *
- * This lightweight structure stores a pointer to a bitset in device memory with it's length.
- * It provides a test() device function to check if a given index is set in the bitset.
- *
- * @tparam bitset_t Underlying type of the bitset array. Default is uint32_t.
- * @tparam index_t Indexing type used. Default is uint32_t.
- */
-template <typename bitset_t = uint32_t, typename index_t = uint32_t>
-struct bitset_view {
-  static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8;
-
-  _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr, index_t bitset_len)
-    : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len}
-  {
-  }
-  /**
-   * @brief Create a bitset view from a device vector view of the bitset.
-   *
-   * @param bitset_span Device vector view of the bitset
-   * @param bitset_len Number of bits in the bitset
-   */
-  _RAFT_HOST_DEVICE bitset_view(raft::device_vector_view<bitset_t, index_t> bitset_span,
-                                index_t bitset_len)
-    : bitset_ptr_{bitset_span.data_handle()}, bitset_len_{bitset_len}
-  {
-  }
-  /**
-   * @brief Device function to test if a given index is set in the bitset.
-   *
-   * @param sample_index Single index to test
-   * @return bool True if index has not been unset in the bitset
-   */
-  inline _RAFT_DEVICE auto test(const index_t sample_index) const -> bool
-  {
-    const bitset_t bit_element = bitset_ptr_[sample_index / bitset_element_size];
-    const index_t bit_index    = sample_index % bitset_element_size;
-    const bool is_bit_set      = (bit_element & (bitset_t{1} << bit_index)) != 0;
-    return is_bit_set;
-  }
-  /**
-   * @brief Device function to test if a given index is set in the bitset.
-   *
-   * @param sample_index Single index to test
-   * @return bool True if index has not been unset in the bitset
-   */
-  inline _RAFT_DEVICE auto operator[](const index_t sample_index) const -> bool
-  {
-    return test(sample_index);
-  }
-  /**
-   * @brief Device function to set a given index to set_value in the bitset.
-   *
-   * @param sample_index index to set
-   * @param set_value Value to set the bit to (true or false)
-   */
-  inline _RAFT_DEVICE void set(const index_t sample_index, bool set_value) const
-  {
-    const index_t bit_element = sample_index / bitset_element_size;
-    const index_t bit_index   = sample_index % bitset_element_size;
-    const bitset_t bitmask    = bitset_t{1} << bit_index;
-    if (set_value) {
-      atomicOr(bitset_ptr_ + bit_element, bitmask);
-    } else {
-      const bitset_t bitmask2 = ~bitmask;
-      atomicAnd(bitset_ptr_ + bit_element, bitmask2);
-    }
-  }
-
-  /**
-   * @brief Get the device pointer to the bitset.
-   */
-  inline _RAFT_HOST_DEVICE auto data() -> bitset_t* { return bitset_ptr_; }
-  inline _RAFT_HOST_DEVICE auto data() const -> const bitset_t* { return bitset_ptr_; }
-  /**
-   * @brief Get the number of bits of the bitset representation.
-   */
-  inline _RAFT_HOST_DEVICE auto size() const -> index_t { return bitset_len_; }
-
-  /**
-   * @brief Get the number of elements used by the bitset representation.
-   */
-  inline _RAFT_HOST_DEVICE auto n_elements() const -> index_t
-  {
-    return raft::ceildiv(bitset_len_, bitset_element_size);
-  }
-
-  inline auto to_mdspan() -> raft::device_vector_view<bitset_t, index_t>
-  {
-    return raft::make_device_vector_view<bitset_t, index_t>(bitset_ptr_, n_elements());
-  }
-  inline auto to_mdspan() const -> raft::device_vector_view<const bitset_t, index_t>
-  {
-    return raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
-  }
-
- private:
-  bitset_t* bitset_ptr_;
-  index_t bitset_len_;
-};
-
-/**
- * @brief RAFT Bitset.
- *
- * This structure encapsulates a bitset in device memory. It provides a view() method to get a
- * device-usable lightweight view of the bitset.
- * Each index is represented by a single bit in the bitset. The total number of bytes used is
- * ceil(bitset_len / 8).
- * @tparam bitset_t Underlying type of the bitset array. Default is uint32_t.
- * @tparam index_t Indexing type used. Default is uint32_t.
- */
-template <typename bitset_t = uint32_t, typename index_t = uint32_t>
-struct bitset {
-  static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8;
-
-  /**
-   * @brief Construct a new bitset object with a list of indices to unset.
-   *
-   * @param res RAFT resources
-   * @param mask_index List of indices to unset in the bitset
-   * @param bitset_len Length of the bitset
-   * @param default_value Default value to set the bits to. Default is true.
-   */
-  bitset(const raft::resources& res,
-         raft::device_vector_view<const index_t, index_t> mask_index,
-         index_t bitset_len,
-         bool default_value = true)
-    : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)),
-              raft::resource::get_cuda_stream(res)},
-      bitset_len_{bitset_len}
-  {
-    reset(res, default_value);
-    set(res, mask_index, !default_value);
-  }
 
-  /**
-   * @brief Construct a new bitset object
-   *
-   * @param res RAFT resources
-   * @param bitset_len Length of the bitset
-   * @param default_value Default value to set the bits to. Default is true.
-   */
-  bitset(const raft::resources& res, index_t bitset_len, bool default_value = true)
-    : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)),
-              resource::get_cuda_stream(res)},
-      bitset_len_{bitset_len}
-  {
-    reset(res, default_value);
-  }
-  // Disable copy constructor
-  bitset(const bitset&)            = delete;
-  bitset(bitset&&)                 = default;
-  bitset& operator=(const bitset&) = delete;
-  bitset& operator=(bitset&&)      = default;
-
-  /**
-   * @brief Create a device-usable view of the bitset.
-   *
-   * @return bitset_view<bitset_t, index_t>
-   */
-  inline auto view() -> raft::core::bitset_view<bitset_t, index_t>
-  {
-    return bitset_view<bitset_t, index_t>(to_mdspan(), bitset_len_);
-  }
-  [[nodiscard]] inline auto view() const -> raft::core::bitset_view<const bitset_t, index_t>
-  {
-    return bitset_view<const bitset_t, index_t>(to_mdspan(), bitset_len_);
-  }
-
-  /**
-   * @brief Get the device pointer to the bitset.
-   */
-  inline auto data() -> bitset_t* { return bitset_.data(); }
-  inline auto data() const -> const bitset_t* { return bitset_.data(); }
-  /**
-   * @brief Get the number of bits of the bitset representation.
-   */
-  inline auto size() const -> index_t { return bitset_len_; }
-
-  /**
-   * @brief Get the number of elements used by the bitset representation.
-   */
-  inline auto n_elements() const -> index_t
-  {
-    return raft::ceildiv(bitset_len_, bitset_element_size);
-  }
-
-  /** @brief Get an mdspan view of the current bitset */
-  inline auto to_mdspan() -> raft::device_vector_view<bitset_t, index_t>
-  {
-    return raft::make_device_vector_view<bitset_t, index_t>(bitset_.data(), n_elements());
-  }
-  [[nodiscard]] inline auto to_mdspan() const -> raft::device_vector_view<const bitset_t, index_t>
-  {
-    return raft::make_device_vector_view<const bitset_t, index_t>(bitset_.data(), n_elements());
-  }
-
-  /** @brief Resize the bitset. If the requested size is larger, new memory is allocated and set to
-   * the default value.
-   * @param res RAFT resources
-   * @param new_bitset_len new size of the bitset
-   * @param default_value default value to initialize the new bits to
-   */
-  void resize(const raft::resources& res, index_t new_bitset_len, bool default_value = true)
-  {
-    auto old_size = raft::ceildiv(bitset_len_, bitset_element_size);
-    auto new_size = raft::ceildiv(new_bitset_len, bitset_element_size);
-    bitset_.resize(new_size);
-    bitset_len_ = new_bitset_len;
-    if (old_size < new_size) {
-      // If the new size is larger, set the new bits to the default value
-
-      thrust::fill_n(resource::get_thrust_policy(res),
-                     bitset_.data() + old_size,
-                     new_size - old_size,
-                     default_value ? ~bitset_t{0} : bitset_t{0});
-    }
-  }
-
-  /**
-   * @brief Test a list of indices in a bitset.
-   *
-   * @tparam output_t Output type of the test. Default is bool.
-   * @param res RAFT resources
-   * @param queries List of indices to test
-   * @param output List of outputs
-   */
-  template <typename output_t = bool>
-  void test(const raft::resources& res,
-            raft::device_vector_view<const index_t, index_t> queries,
-            raft::device_vector_view<output_t, index_t> output) const
-  {
-    RAFT_EXPECTS(output.extent(0) == queries.extent(0), "Output and queries must be same size");
-    auto bitset_view = view();
-    raft::linalg::map(
-      res,
-      output,
-      [bitset_view] __device__(index_t query) { return output_t(bitset_view.test(query)); },
-      queries);
-  }
-  /**
-   * @brief Set a list of indices in a bitset to set_value.
-   *
-   * @param res RAFT resources
-   * @param mask_index indices to remove from the bitset
-   * @param set_value Value to set the bits to (true or false)
-   */
-  void set(const raft::resources& res,
-           raft::device_vector_view<const index_t, index_t> mask_index,
-           bool set_value = false)
-  {
-    auto this_bitset_view = view();
-    thrust::for_each_n(resource::get_thrust_policy(res),
-                       mask_index.data_handle(),
-                       mask_index.extent(0),
-                       [this_bitset_view, set_value] __device__(const index_t sample_index) {
-                         this_bitset_view.set(sample_index, set_value);
-                       });
-  }
-  /**
-   * @brief Flip all the bits in a bitset.
-   * @param res RAFT resources
-   */
-  void flip(const raft::resources& res)
-  {
-    auto bitset_span = this->to_mdspan();
-    raft::linalg::map(
-      res,
-      bitset_span,
-      [] __device__(bitset_t element) { return bitset_t(~element); },
-      raft::make_const_mdspan(bitset_span));
-  }
-  /**
-   * @brief Reset the bits in a bitset.
-   *
-   * @param res RAFT resources
-   * @param default_value Value to set the bits to (true or false)
-   */
-  void reset(const raft::resources& res, bool default_value = true)
-  {
-    thrust::fill_n(resource::get_thrust_policy(res),
-                   bitset_.data(),
-                   n_elements(),
+template <typename bitset_t, typename index_t>
+_RAFT_HOST_DEVICE inline bool bitset_view<bitset_t, index_t>::test(const index_t sample_index) const
+{
+  const bitset_t bit_element = bitset_ptr_[sample_index / bitset_element_size];
+  const index_t bit_index    = sample_index % bitset_element_size;
+  const bool is_bit_set      = (bit_element & (bitset_t{1} << bit_index)) != 0;
+  return is_bit_set;
+}
+
+template <typename bitset_t, typename index_t>
+_RAFT_HOST_DEVICE bool bitset_view<bitset_t, index_t>::operator[](const index_t sample_index) const
+{
+  return test(sample_index);
+}
+
+template <typename bitset_t, typename index_t>
+_RAFT_HOST_DEVICE void bitset_view<bitset_t, index_t>::set(const index_t sample_index,
+                                                           bool set_value) const
+{
+  const index_t bit_element = sample_index / bitset_element_size;
+  const index_t bit_index   = sample_index % bitset_element_size;
+  const bitset_t bitmask    = bitset_t{1} << bit_index;
+  if (set_value) {
+    atomicOr(bitset_ptr_ + bit_element, bitmask);
+  } else {
+    const bitset_t bitmask2 = ~bitmask;
+    atomicAnd(bitset_ptr_ + bit_element, bitmask2);
+  }
+}
+
+template <typename bitset_t, typename index_t>
+bitset<bitset_t, index_t>::bitset(const raft::resources& res,
+                                  raft::device_vector_view<const index_t, index_t> mask_index,
+                                  index_t bitset_len,
+                                  bool default_value)
+  : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)),
+            raft::resource::get_cuda_stream(res)},
+    bitset_len_{bitset_len}
+{
+  reset(res, default_value);
+  set(res, mask_index, !default_value);
+}
+
+template <typename bitset_t, typename index_t>
+bitset<bitset_t, index_t>::bitset(const raft::resources& res,
+                                  index_t bitset_len,
+                                  bool default_value)
+  : bitset_{std::size_t(raft::ceildiv(bitset_len, bitset_element_size)),
+            raft::resource::get_cuda_stream(res)},
+    bitset_len_{bitset_len}
+{
+  reset(res, default_value);
+}
+
+template <typename bitset_t, typename index_t>
+index_t bitset<bitset_t, index_t>::n_elements() const
+{
+  return raft::ceildiv(bitset_len_, bitset_element_size);
+}
+
+template <typename bitset_t, typename index_t>
+void bitset<bitset_t, index_t>::resize(const raft::resources& res,
+                                       index_t new_bitset_len,
+                                       bool default_value)
+{
+  auto old_size = raft::ceildiv(bitset_len_, bitset_element_size);
+  auto new_size = raft::ceildiv(new_bitset_len, bitset_element_size);
+  bitset_.resize(new_size);
+  bitset_len_ = new_bitset_len;
+  if (old_size < new_size) {
+    // If the new size is larger, set the new bits to the default value
+    thrust::fill_n(raft::resource::get_thrust_policy(res),
+                   bitset_.data() + old_size,
+                   new_size - old_size,
                    default_value ? ~bitset_t{0} : bitset_t{0});
   }
-  /**
-   * @brief Returns the number of bits set to true in count_gpu_scalar.
-   *
-   * @param[in] res RAFT resources
-   * @param[out] count_gpu_scalar Device scalar to store the count
-   */
-  void count(const raft::resources& res, raft::device_scalar_view<index_t> count_gpu_scalar)
-  {
-    auto n_elements_ = n_elements();
-    auto count_gpu =
-      raft::make_device_vector_view<index_t, index_t>(count_gpu_scalar.data_handle(), 1);
-    auto bitset_matrix_view = raft::make_device_matrix_view<const bitset_t, index_t, col_major>(
-      bitset_.data(), n_elements_, 1);
-
-    bitset_t n_last_element = (bitset_len_ % bitset_element_size);
-    bitset_t last_element_mask =
-      n_last_element ? (bitset_t)((bitset_t{1} << n_last_element) - bitset_t{1}) : ~bitset_t{0};
-    raft::linalg::coalesced_reduction(
-      res,
-      bitset_matrix_view,
-      count_gpu,
-      index_t{0},
-      false,
-      [last_element_mask, n_elements_] __device__(bitset_t element, index_t index) {
-        index_t result = 0;
-        if constexpr (bitset_element_size == 64) {
-          if (index == n_elements_ - 1)
-            result = index_t(raft::detail::popc(element & last_element_mask));
-          else
-            result = index_t(raft::detail::popc(element));
-        } else {  // Needed because popc is not overloaded for 16 and 8 bit elements
-          if (index == n_elements_ - 1)
-            result = index_t(raft::detail::popc(uint32_t{element} & last_element_mask));
-          else
-            result = index_t(raft::detail::popc(uint32_t{element}));
-        }
-
-        return result;
-      });
-  }
-  /**
-   * @brief Returns the number of bits set to true.
-   *
-   * @param res RAFT resources
-   * @return index_t Number of bits set to true
-   */
-  auto count(const raft::resources& res) -> index_t
-  {
-    auto count_gpu_scalar = raft::make_device_scalar<index_t>(res, 0.0);
-    count(res, count_gpu_scalar.view());
-    index_t count_cpu = 0;
-    raft::update_host(
-      &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res));
-    resource::sync_stream(res);
-    return count_cpu;
-  }
-  /**
-   * @brief Checks if any of the bits are set to true in the bitset.
-   * @param res RAFT resources
-   */
-  bool any(const raft::resources& res) { return count(res) > 0; }
-  /**
-   * @brief Checks if all of the bits are set to true in the bitset.
-   * @param res RAFT resources
-   */
-  bool all(const raft::resources& res) { return count(res) == bitset_len_; }
-  /**
-   * @brief Checks if none of the bits are set to true in the bitset.
-   * @param res RAFT resources
-   */
-  bool none(const raft::resources& res) { return count(res) == 0; }
-
- private:
-  raft::device_uvector<bitset_t> bitset_;
-  index_t bitset_len_;
-};
+}
+
+template <typename bitset_t, typename index_t>
+template <typename output_t>
+void bitset<bitset_t, index_t>::test(const raft::resources& res,
+                                     raft::device_vector_view<const index_t, index_t> queries,
+                                     raft::device_vector_view<output_t, index_t> output) const
+{
+  RAFT_EXPECTS(output.extent(0) == queries.extent(0), "Output and queries must be same size");
+  auto bitset_view = view();
+  raft::linalg::map(
+    res,
+    output,
+    [bitset_view] __device__(index_t query) { return bitset_view.test(query); },
+    queries);
+}
+
+template <typename bitset_t, typename index_t>
+void bitset<bitset_t, index_t>::set(const raft::resources& res,
+                                    raft::device_vector_view<const index_t, index_t> mask_index,
+                                    bool set_value)
+{
+  auto this_bitset_view = view();
+  thrust::for_each_n(raft::resource::get_thrust_policy(res),
+                     mask_index.data_handle(),
+                     mask_index.extent(0),
+                     [this_bitset_view, set_value] __device__(const index_t sample_index) {
+                       this_bitset_view.set(sample_index, set_value);
+                     });
+}
+
+template <typename bitset_t, typename index_t>
+void bitset<bitset_t, index_t>::flip(const raft::resources& res)
+{
+  auto bitset_span = this->to_mdspan();
+  raft::linalg::map(
+    res,
+    bitset_span,
+    [] __device__(bitset_t element) { return bitset_t(~element); },
+    raft::make_const_mdspan(bitset_span));
+}
+
+template <typename bitset_t, typename index_t>
+void bitset<bitset_t, index_t>::reset(const raft::resources& res, bool default_value)
+{
+  thrust::fill_n(raft::resource::get_thrust_policy(res),
+                 bitset_.data(),
+                 n_elements(),
+                 default_value ? ~bitset_t{0} : bitset_t{0});
+}
+
+template <typename bitset_t, typename index_t>
+void bitset<bitset_t, index_t>::count(const raft::resources& res,
+                                      raft::device_scalar_view<index_t> count_gpu_scalar)
+{
+  auto n_elements_ = n_elements();
+  auto count_gpu =
+    raft::make_device_vector_view<index_t, index_t>(count_gpu_scalar.data_handle(), 1);
+  auto bitset_matrix_view = raft::make_device_matrix_view<const bitset_t, index_t, raft::col_major>(
+    bitset_.data(), n_elements_, 1);
+
+  bitset_t n_last_element = (bitset_len_ % bitset_element_size);
+  bitset_t last_element_mask =
+    n_last_element ? (bitset_t)((bitset_t{1} << n_last_element) - bitset_t{1}) : ~bitset_t{0};
+  raft::linalg::coalesced_reduction(
+    res,
+    bitset_matrix_view,
+    count_gpu,
+    index_t{0},
+    false,
+    [last_element_mask, n_elements_] __device__(bitset_t element, index_t index) {
+      index_t result = 0;
+      if constexpr (bitset_element_size == 64) {
+        if (index == n_elements_ - 1)
+          result = index_t(raft::detail::popc(element & last_element_mask));
+        else
+          result = index_t(raft::detail::popc(element));
+      } else {  // Needed because popc is not overloaded for 16 and 8 bit elements
+        if (index == n_elements_ - 1)
+          result = index_t(raft::detail::popc(uint32_t{element} & last_element_mask));
+        else
+          result = index_t(raft::detail::popc(uint32_t{element}));
+      }
+
+      return result;
+    });
+}
 
-/** @} */
 }  // end namespace raft::core
diff --git a/cpp/include/raft/core/bitset.hpp b/cpp/include/raft/core/bitset.hpp
new file mode 100644
index 0000000000..0df12f25e6
--- /dev/null
+++ b/cpp/include/raft/core/bitset.hpp
@@ -0,0 +1,275 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/device_container_policy.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/resource/thrust_policy.hpp>
+#include <raft/core/resources.hpp>
+
+namespace raft::core {
+/**
+ * @defgroup bitset Bitset
+ * @{
+ */
+/**
+ * @brief View of a RAFT Bitset.
+ *
+ * This lightweight structure stores a pointer to a bitset in device memory with its length.
+ * It provides a test() device function to check if a given index is set in the bitset.
+ *
+ * @tparam bitset_t Underlying type of the bitset array. Default is uint32_t.
+ * @tparam index_t Indexing type used. Default is uint32_t.
+ */
+template <typename bitset_t = uint32_t, typename index_t = uint32_t>
+struct bitset_view {
+  static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8;
+
+  _RAFT_HOST_DEVICE bitset_view(bitset_t* bitset_ptr, index_t bitset_len)
+    : bitset_ptr_{bitset_ptr}, bitset_len_{bitset_len}
+  {
+  }
+  /**
+   * @brief Create a bitset view from a device vector view of the bitset.
+   *
+   * @param bitset_span Device vector view of the bitset
+   * @param bitset_len Number of bits in the bitset
+   */
+  _RAFT_HOST_DEVICE bitset_view(raft::device_vector_view<bitset_t, index_t> bitset_span,
+                                index_t bitset_len)
+    : bitset_ptr_{bitset_span.data_handle()}, bitset_len_{bitset_len}
+  {
+  }
+  /**
+   * @brief Device function to test if a given index is set in the bitset.
+   *
+   * @param sample_index Single index to test
+   * @return bool True if index has not been unset in the bitset
+   */
+  inline _RAFT_HOST_DEVICE auto test(const index_t sample_index) const -> bool;
+  /**
+   * @brief Device function to test if a given index is set in the bitset.
+   *
+   * @param sample_index Single index to test
+   * @return bool True if index has not been unset in the bitset
+   */
+  inline _RAFT_HOST_DEVICE auto operator[](const index_t sample_index) const -> bool;
+  /**
+   * @brief Device function to set a given index to set_value in the bitset.
+   *
+   * @param sample_index index to set
+   * @param set_value Value to set the bit to (true or false)
+   */
+  inline _RAFT_HOST_DEVICE void set(const index_t sample_index, bool set_value) const;
+
+  /**
+   * @brief Get the device pointer to the bitset.
+   */
+  inline _RAFT_HOST_DEVICE auto data() -> bitset_t* { return bitset_ptr_; }
+  inline _RAFT_HOST_DEVICE auto data() const -> const bitset_t* { return bitset_ptr_; }
+  /**
+   * @brief Get the number of bits of the bitset representation.
+   */
+  inline _RAFT_HOST_DEVICE auto size() const -> index_t { return bitset_len_; }
+
+  /**
+   * @brief Get the number of elements used by the bitset representation.
+   */
+  inline _RAFT_HOST_DEVICE auto n_elements() const -> index_t;
+
+  inline auto to_mdspan() -> raft::device_vector_view<bitset_t, index_t>
+  {
+    return raft::make_device_vector_view<bitset_t, index_t>(bitset_ptr_, n_elements());
+  }
+  inline auto to_mdspan() const -> raft::device_vector_view<const bitset_t, index_t>
+  {
+    return raft::make_device_vector_view<const bitset_t, index_t>(bitset_ptr_, n_elements());
+  }
+
+ private:
+  bitset_t* bitset_ptr_;
+  index_t bitset_len_;
+};
+
+/**
+ * @brief RAFT Bitset.
+ *
+ * This structure encapsulates a bitset in device memory. It provides a view() method to get a
+ * device-usable lightweight view of the bitset.
+ * Each index is represented by a single bit in the bitset. The storage holds
+ * ceil(bitset_len / bitset_element_size) elements of type bitset_t.
+ * @tparam bitset_t Underlying type of the bitset array. Default is uint32_t.
+ * @tparam index_t Indexing type used. Default is uint32_t.
+ */
+template <typename bitset_t = uint32_t, typename index_t = uint32_t>
+struct bitset {
+  static constexpr index_t bitset_element_size = sizeof(bitset_t) * 8;
+
+  /**
+   * @brief Construct a new bitset object with a list of indices set to !default_value.
+   *
+   * @param res RAFT resources
+   * @param mask_index List of indices whose bits are set to !default_value
+   * @param bitset_len Length of the bitset
+   * @param default_value Default value to set the bits to. Default is true.
+   */
+  bitset(const raft::resources& res,
+         raft::device_vector_view<const index_t, index_t> mask_index,
+         index_t bitset_len,
+         bool default_value = true);
+
+  /**
+   * @brief Construct a new bitset object
+   *
+   * @param res RAFT resources
+   * @param bitset_len Length of the bitset
+   * @param default_value Default value to set the bits to. Default is true.
+   */
+  bitset(const raft::resources& res, index_t bitset_len, bool default_value = true);
+  // Disable copy constructor
+  bitset(const bitset&)            = delete;
+  bitset(bitset&&)                 = default;
+  bitset& operator=(const bitset&) = delete;
+  bitset& operator=(bitset&&)      = default;
+
+  /**
+   * @brief Create a device-usable view of the bitset.
+   *
+   * @return bitset_view<bitset_t, index_t>
+   */
+  inline auto view() -> raft::core::bitset_view<bitset_t, index_t>
+  {
+    return bitset_view<bitset_t, index_t>(to_mdspan(), bitset_len_);
+  }
+  [[nodiscard]] inline auto view() const -> raft::core::bitset_view<const bitset_t, index_t>
+  {
+    return bitset_view<const bitset_t, index_t>(to_mdspan(), bitset_len_);
+  }
+
+  /**
+   * @brief Get the device pointer to the bitset.
+   */
+  inline auto data() -> bitset_t* { return bitset_.data(); }
+  inline auto data() const -> const bitset_t* { return bitset_.data(); }
+  /**
+   * @brief Get the number of bits of the bitset representation.
+   */
+  inline auto size() const -> index_t { return bitset_len_; }
+
+  /**
+   * @brief Get the number of elements used by the bitset representation.
+   */
+  inline auto n_elements() const -> index_t;
+
+  /** @brief Get an mdspan view of the current bitset */
+  inline auto to_mdspan() -> raft::device_vector_view<bitset_t, index_t>
+  {
+    return raft::make_device_vector_view<bitset_t, index_t>(bitset_.data(), n_elements());
+  }
+  [[nodiscard]] inline auto to_mdspan() const -> raft::device_vector_view<const bitset_t, index_t>
+  {
+    return raft::make_device_vector_view<const bitset_t, index_t>(bitset_.data(), n_elements());
+  }
+
+  /** @brief Resize the bitset. If more storage elements are needed, only the newly added elements
+   * are filled with the default value.
+   * @param res RAFT resources
+   * @param new_bitset_len new size of the bitset
+   * @param default_value default value to initialize the new bits to
+   */
+  void resize(const raft::resources& res, index_t new_bitset_len, bool default_value = true);
+
+  /**
+   * @brief Test a list of indices in a bitset.
+   *
+   * @tparam output_t Output type of the test. Default is bool.
+   * @param res RAFT resources
+   * @param queries List of indices to test
+   * @param output List of outputs
+   */
+  template <typename output_t = bool>
+  void test(const raft::resources& res,
+            raft::device_vector_view<const index_t, index_t> queries,
+            raft::device_vector_view<output_t, index_t> output) const;
+  /**
+   * @brief Set a list of indices in a bitset to set_value.
+   *
+   * @param res RAFT resources
+   * @param mask_index indices whose bits are set to set_value
+   * @param set_value Value to set the bits to (true or false)
+   */
+  void set(const raft::resources& res,
+           raft::device_vector_view<const index_t, index_t> mask_index,
+           bool set_value = false);
+  /**
+   * @brief Flip all the bits in a bitset.
+   * @param res RAFT resources
+   */
+  void flip(const raft::resources& res);
+  /**
+   * @brief Reset the bits in a bitset.
+   *
+   * @param res RAFT resources
+   * @param default_value Value to set the bits to (true or false)
+   */
+  void reset(const raft::resources& res, bool default_value = true);
+  /**
+   * @brief Stores the number of bits set to true in count_gpu_scalar.
+   *
+   * @param[in] res RAFT resources
+   * @param[out] count_gpu_scalar Device scalar to store the count
+   */
+  void count(const raft::resources& res, raft::device_scalar_view<index_t> count_gpu_scalar);
+  /**
+   * @brief Returns the number of bits set to true.
+   *
+   * @param res RAFT resources
+   * @return index_t Number of bits set to true
+   */
+  auto count(const raft::resources& res) -> index_t
+  {
+    auto count_gpu_scalar = raft::make_device_scalar<index_t>(res, 0.0);
+    count(res, count_gpu_scalar.view());
+    index_t count_cpu = 0;
+    raft::update_host(
+      &count_cpu, count_gpu_scalar.data_handle(), 1, resource::get_cuda_stream(res));
+    resource::sync_stream(res);
+    return count_cpu;
+  }
+  /**
+   * @brief Checks if any of the bits are set to true in the bitset.
+   * @param res RAFT resources
+   */
+  bool any(const raft::resources& res) { return count(res) > 0; }
+  /**
+   * @brief Checks if all of the bits are set to true in the bitset.
+   * @param res RAFT resources
+   */
+  bool all(const raft::resources& res) { return count(res) == bitset_len_; }
+  /**
+   * @brief Checks if none of the bits are set to true in the bitset.
+   * @param res RAFT resources
+   */
+  bool none(const raft::resources& res) { return count(res) == 0; }
+
+ private:
+  raft::device_uvector<bitset_t> bitset_;
+  index_t bitset_len_;
+};
+
+/** @} */
+}  // end namespace raft::core

From e6fe56743f06ed55d9a1ca5814c61de85be2b452 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Thu, 9 May 2024 08:18:36 -0400
Subject: [PATCH 33/60] Add VERSION to raft-ann-bench package (#2299)

https://github.com/rapidsai/raft/pull/2285 added dynamic versioning to `pyproject.toml` for `raft-ann-bench`, but did not properly copy in the `VERSION` file. Copy it in with `tool.setuptools.package-data`.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Divye Gala (https://github.com/divyegala)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/raft/pull/2299
---
 python/raft-ann-bench/pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/raft-ann-bench/pyproject.toml b/python/raft-ann-bench/pyproject.toml
index 1348343db3..9bb7ae0468 100644
--- a/python/raft-ann-bench/pyproject.toml
+++ b/python/raft-ann-bench/pyproject.toml
@@ -35,7 +35,7 @@ Homepage = "https://github.com/rapidsai/raft"
 where = ["src"]
 
 [tool.setuptools.package-data]
-"*" = ["*.*"]
+"*" = ["*.*", "VERSION"]
 
 [tool.isort]
 line_length = 79

From d11e78a68d06cae0f9ae9659f3d97f9451b5e60e Mon Sep 17 00:00:00 2001
From: Micka <mide@nvidia.com>
Date: Thu, 9 May 2024 14:57:25 +0200
Subject: [PATCH 34/60] Update nvtx3 link in cmake (#2246)

- Link to `nvtx3` instead of deprecated `nvToolsExt`
- Use the `LAZY` CMake install message to avoid flooding the console with the names of all the headers when they are left untouched.

Authors:
  - Micka (https://github.com/lowener)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/raft/pull/2246
---
 cpp/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 88ccc27be0..f7d1c9b119 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -278,7 +278,7 @@ else()
       "\" OFF)"
       [=[
 
-target_link_libraries(raft::raft INTERFACE $<$<BOOL:${RAFT_NVTX}>:CUDA::nvToolsExt>)
+target_link_libraries(raft::raft INTERFACE $<$<BOOL:${RAFT_NVTX}>:CUDA::nvtx3>)
 target_compile_definitions(raft::raft INTERFACE $<$<BOOL:${RAFT_NVTX}>:NVTX_ENABLED>)
 
   ]=]

From ea54c792b2b415b238cecf238f418aabb37836a2 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 10 May 2024 09:07:31 -0700
Subject: [PATCH 35/60] Revert "Build C++ wheel (#2264)" (#2305)

This reverts commit b760453221c6eb5b6986ceb2153bb0a0284697e2.


The various issues around symbol visibility caused by raft currently supporting a mix of compiled library and header-only usage make this wheel unsafe to use in various environments at the moment. We will revisit this work once all of the compiled components of raft are fully split out into cuvs.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/raft/pull/2305
---
 .github/workflows/build.yaml                  | 28 +++----
 .github/workflows/pr.yaml                     | 27 +++---
 build.sh                                      | 12 ++-
 ci/build_wheel.sh                             | 67 +++++++++++++++
 ci/build_wheel_cpp.sh                         | 46 ----------
 ci/build_wheel_pylibraft.sh                   |  9 ++
 ci/build_wheel_python.sh                      | 84 -------------------
 ci/build_wheel_raft_dask.sh                   |  9 ++
 ci/test_wheel_pylibraft.sh                    | 11 ++-
 ci/test_wheel_raft_dask.sh                    | 10 ++-
 cpp/CMakeLists.txt                            | 69 ++++++---------
 dependencies.yaml                             | 67 +--------------
 python/libraft/CMakeLists.txt                 | 45 ----------
 python/libraft/LICENSE                        |  1 -
 python/libraft/README.md                      |  1 -
 python/libraft/libraft/VERSION                |  1 -
 python/libraft/libraft/__init__.py            | 17 ----
 python/libraft/libraft/_version.py            | 25 ------
 python/libraft/libraft/load.py                | 48 -----------
 python/libraft/pyproject.toml                 | 65 --------------
 python/pylibraft/CMakeLists.txt               | 39 ++++++++-
 python/pylibraft/pylibraft/__init__.py        | 10 ---
 .../pylibraft/cluster/CMakeLists.txt          |  2 +-
 .../pylibraft/pylibraft/common/CMakeLists.txt |  2 +-
 .../pylibraft/distance/CMakeLists.txt         |  2 +-
 .../pylibraft/pylibraft/matrix/CMakeLists.txt |  2 +-
 .../pylibraft/neighbors/CMakeLists.txt        |  2 +-
 .../pylibraft/neighbors/cagra/CMakeLists.txt  |  2 +-
 .../neighbors/ivf_flat/CMakeLists.txt         |  2 +-
 .../pylibraft/neighbors/ivf_pq/CMakeLists.txt |  2 +-
 .../pylibraft/pylibraft/random/CMakeLists.txt |  2 +-
 python/pylibraft/pyproject.toml               |  2 -
 python/raft-dask/CMakeLists.txt               | 28 ++++++-
 python/raft-dask/pyproject.toml               |  2 -
 python/raft-dask/raft_dask/__init__.py        | 13 +--
 .../raft-dask/raft_dask/common/CMakeLists.txt |  2 +-
 .../raft_dask/include_test/CMakeLists.txt     |  2 +-
 37 files changed, 237 insertions(+), 521 deletions(-)
 create mode 100755 ci/build_wheel.sh
 delete mode 100755 ci/build_wheel_cpp.sh
 create mode 100755 ci/build_wheel_pylibraft.sh
 delete mode 100755 ci/build_wheel_python.sh
 create mode 100755 ci/build_wheel_raft_dask.sh
 delete mode 100644 python/libraft/CMakeLists.txt
 delete mode 120000 python/libraft/LICENSE
 delete mode 120000 python/libraft/README.md
 delete mode 120000 python/libraft/libraft/VERSION
 delete mode 100644 python/libraft/libraft/__init__.py
 delete mode 100644 python/libraft/libraft/_version.py
 delete mode 100644 python/libraft/libraft/load.py
 delete mode 100644 python/libraft/pyproject.toml

diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml
index 9944dd5198..e013d4f1c5 100644
--- a/.github/workflows/build.yaml
+++ b/.github/workflows/build.yaml
@@ -67,39 +67,36 @@ jobs:
       node_type: "gpu-v100-latest-1"
       run_script: "ci/build_docs.sh"
       sha: ${{ inputs.sha }}
-  wheel-build-cpp:
+  wheel-build-pylibraft:
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
-      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
-      script: ci/build_wheel_cpp.sh
-  wheel-build-python:
-    needs: wheel-build-cpp
+      script: ci/build_wheel_pylibraft.sh
+  wheel-publish-pylibraft:
+    needs: wheel-build-pylibraft
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
-      script: ci/build_wheel_python.sh
-  wheel-publish-cpp:
-    needs: wheel-build-cpp
+      package-name: pylibraft
+  wheel-build-raft-dask:
     secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
       build_type: ${{ inputs.build_type || 'branch' }}
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
-      package-name: raft
-      package-type: cpp
-  wheel-publish-python:
-    needs: wheel-build-python
+      script: ci/build_wheel_raft_dask.sh
+  wheel-publish-raft-dask:
+    needs: wheel-build-raft-dask
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-publish.yaml@branch-24.06
     with:
@@ -107,5 +104,4 @@ jobs:
       branch: ${{ inputs.branch }}
       sha: ${{ inputs.sha }}
       date: ${{ inputs.date }}
-      package-name: raft
-      package-type: python
+      package-name: raft_dask
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
index 5d0368e3f7..c2d9556859 100644
--- a/.github/workflows/pr.yaml
+++ b/.github/workflows/pr.yaml
@@ -19,9 +19,9 @@ jobs:
       - conda-python-build
       - conda-python-tests
       - docs-build
-      - wheel-build-cpp
-      - wheel-build-python
+      - wheel-build-pylibraft
       - wheel-tests-pylibraft
+      - wheel-build-raft-dask
       - wheel-tests-raft-dask
       - devcontainer
     secrets: inherit
@@ -74,30 +74,29 @@ jobs:
       arch: "amd64"
       container_image: "rapidsai/ci-conda:latest"
       run_script: "ci/build_docs.sh"
-  wheel-build-cpp:
+  wheel-build-pylibraft:
     needs: checks
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
     with:
-      matrix_filter: group_by([.ARCH, (.CUDA_VER|split(".")|map(tonumber)|.[0])]) | map(max_by(.PY_VER|split(".")|map(tonumber)))
       build_type: pull-request
-      script: ci/build_wheel_cpp.sh
-  wheel-build-python:
-    needs: wheel-build-cpp
-    secrets: inherit
-    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
-    with:
-      build_type: pull-request
-      script: ci/build_wheel_python.sh
+      script: ci/build_wheel_pylibraft.sh
   wheel-tests-pylibraft:
-    needs: wheel-build-python
+    needs: wheel-build-pylibraft
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
       build_type: pull-request
       script: ci/test_wheel_pylibraft.sh
+  wheel-build-raft-dask:
+    needs: wheel-tests-pylibraft
+    secrets: inherit
+    uses: rapidsai/shared-workflows/.github/workflows/wheels-build.yaml@branch-24.06
+    with:
+      build_type: pull-request
+      script: "ci/build_wheel_raft_dask.sh"
   wheel-tests-raft-dask:
-    needs: wheel-build-python
+    needs: wheel-build-raft-dask
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.06
     with:
diff --git a/build.sh b/build.sh
index ee16bb4b1b..da5efa5183 100755
--- a/build.sh
+++ b/build.sh
@@ -381,6 +381,14 @@ if [[ ${CMAKE_TARGET} == "" ]]; then
     CMAKE_TARGET="all"
 fi
 
+# Copy EXTRA_CMAKE_ARGS and append `-DFIND_RAFT_CPP=ON` unless the user already set that option.
+SKBUILD_EXTRA_CMAKE_ARGS="${EXTRA_CMAKE_ARGS}"
+if [[ "${EXTRA_CMAKE_ARGS}" != *"DFIND_RAFT_CPP"* ]]; then
+    SKBUILD_EXTRA_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS} -DFIND_RAFT_CPP=ON"
+fi
+# Replace spaces with semicolons in SKBUILD_EXTRA_CMAKE_ARGS
+SKBUILD_EXTRA_CMAKE_ARGS=$(echo ${SKBUILD_EXTRA_CMAKE_ARGS} | sed 's/ /;/g')
+
 # If clean given, run it prior to any other steps
 if (( ${CLEAN} == 1 )); then
     # If the dirs to clean are mounted dirs in a container, the
@@ -487,13 +495,13 @@ fi
 
 # Build and (optionally) install the pylibraft Python package
 if (( ${NUMARGS} == 0 )) || hasArg pylibraft; then
-    CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" \
+    SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \
         python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/pylibraft
 fi
 
 # Build and (optionally) install the raft-dask Python package
 if (( ${NUMARGS} == 0 )) || hasArg raft-dask; then
-    CMAKE_ARGS="${EXTRA_CMAKE_ARGS}" \
+    SKBUILD_CMAKE_ARGS="${SKBUILD_EXTRA_CMAKE_ARGS}" \
         python -m pip install --no-build-isolation --no-deps ${REPODIR}/python/raft-dask
 fi
 
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
new file mode 100755
index 0000000000..e3e7ce9c89
--- /dev/null
+++ b/ci/build_wheel.sh
@@ -0,0 +1,67 @@
+#!/bin/bash
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+package_name=$1
+package_dir=$2
+underscore_package_name=$(echo "${package_name}" | tr "-" "_")
+
+# Clear out system ucx files to ensure that we're getting ucx from the wheel.
+rm -rf /usr/lib64/ucx
+rm -rf /usr/lib64/libuc*
+
+source rapids-configure-sccache
+source rapids-date-string
+
+version=$(rapids-generate-version)
+git_commit=$(git rev-parse HEAD)
+
+RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+
+# This is the version of the suffix with a preceding hyphen. It's used
+# everywhere except in the final wheel name.
+PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
+
+# Patch project metadata files to include the CUDA version suffix and version override.
+pyproject_file="${package_dir}/pyproject.toml"
+version_file="${package_dir}/${underscore_package_name}/_version.py"
+
+sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
+echo "${version}" > VERSION
+sed -i "/^__git_commit__ / s/= .*/= \"${git_commit}\"/g" ${version_file}
+
+# For nightlies we want to ensure that we're pulling in alphas as well. The
+# easiest way to do so is to augment the spec with a constraint containing a
+# min alpha version that doesn't affect the version bounds but does allow usage
+# of alpha versions for that dependency without --pre
+alpha_spec=''
+if ! rapids-is-release-build; then
+    alpha_spec=',>=0.0.0a0'
+fi
+
+if [[ ${package_name} == "raft-dask" ]]; then
+    sed -r -i "s/pylibraft==(.*)\"/pylibraft${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/libucx(.*)\"/libucx${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/ucx-py==(.*)\"/ucx-py${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/distributed-ucxx==(.*)\"/distributed-ucxx${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
+else
+    sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
+fi
+
+if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then
+    sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file}
+    sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file}
+fi
+
+cd "${package_dir}"
+
+# Hardcode the output dir
+python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
+
+mkdir -p final_dist
+python -m auditwheel repair -w final_dist --exclude "libucp.so.0" dist/*
+
+RAPIDS_PY_WHEEL_NAME="${underscore_package_name}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
diff --git a/ci/build_wheel_cpp.sh b/ci/build_wheel_cpp.sh
deleted file mode 100755
index 0e0417dc35..0000000000
--- a/ci/build_wheel_cpp.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-set -euo pipefail
-
-package_name="libraft"
-package_dir="python/libraft"
-
-source rapids-configure-sccache
-source rapids-date-string
-
-version=$(rapids-generate-version)
-git_commit=$(git rev-parse HEAD)
-
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-
-# everywhere except in the final wheel name.
-PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
-
-# Patch project metadata files to include the CUDA version suffix and version override.
-pyproject_file="${package_dir}/pyproject.toml"
-version_file="${package_dir}/${package_name}/_version.py"
-
-sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
-echo "${version}" > VERSION
-sed -i "/^__git_commit__ / s/= .*/= \"${git_commit}\"/g" ${version_file}
-
-# For nightlies we want to ensure that we're pulling in alphas as well. The
-# easiest way to do so is to augment the spec with a constraint containing a
-# min alpha version that doesn't affect the version bounds but does allow usage
-# of alpha versions for that dependency without --pre
-alpha_spec=''
-if ! rapids-is-release-build; then
-    alpha_spec=',>=0.0.0a0'
-fi
-
-sed -r -i "s/librmm(.*)\"/librmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
-
-cd "${package_dir}"
-
-python -m pip wheel . -w dist -vvv --no-deps --disable-pip-version-check
-
-mkdir -p final_dist
-python -m auditwheel repair -w final_dist dist/*
-
-RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 cpp final_dist
diff --git a/ci/build_wheel_pylibraft.sh b/ci/build_wheel_pylibraft.sh
new file mode 100755
index 0000000000..895c311f46
--- /dev/null
+++ b/ci/build_wheel_pylibraft.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# CMake arguments forwarded to scikit-build-core: turn off DETECT_CONDA_ENV and FIND_RAFT_CPP
+export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF"
+
+ci/build_wheel.sh pylibraft python/pylibraft
diff --git a/ci/build_wheel_python.sh b/ci/build_wheel_python.sh
deleted file mode 100755
index ae15d3734b..0000000000
--- a/ci/build_wheel_python.sh
+++ /dev/null
@@ -1,84 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2023-2024, NVIDIA CORPORATION.
-
-set -euo pipefail
-
-# Clear out system ucx files to ensure that we're getting ucx from the wheel
-# when building raft-dask.
-rm -rf /usr/lib64/ucx
-rm -rf /usr/lib64/libuc*
-
-source rapids-configure-sccache
-source rapids-date-string
-
-version=$(rapids-generate-version)
-git_commit=$(git rev-parse HEAD)
-
-RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
-
-# This is the version of the suffix with a preceding hyphen. It's used
-# everywhere except in the final wheel name.
-PACKAGE_CUDA_SUFFIX="-${RAPIDS_PY_CUDA_SUFFIX}"
-
-echo "${version}" > VERSION
-
-# For nightlies we want to ensure that we're pulling in alphas as well. The
-# easiest way to do so is to augment the spec with a constraint containing a
-# min alpha version that doesn't affect the version bounds but does allow usage
-# of alpha versions for that dependency without --pre
-alpha_spec=''
-if ! rapids-is-release-build; then
-    alpha_spec=',>=0.0.0a0'
-fi
-
-CPP_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp /tmp/libraft_dist)
-PYTHON_WHEELHOUSE="${PWD}/dist/"
-PYTHON_AUDITED_WHEELHOUSE="${PWD}/final_dist/"
-WHEELHOUSES=("${PYTHON_WHEELHOUSE}" "${CPP_WHEELHOUSE}")
-mkdir -p "${PYTHON_AUDITED_WHEELHOUSE}"
-
-FIND_LINKS=""
-# Iterate over the array
-for wheelhouse in "${WHEELHOUSES[@]}"; do
-    FIND_LINKS+="--find-links ${wheelhouse} "
-done
-              
-
-build_wheel () {
-    local package_name="${1}"
-    local underscore_package_name=$(echo "${package_name}" | tr "-" "_")
-
-    local package_dir="python/${package_name}"
-    local pyproject_file="${package_dir}/pyproject.toml"
-    local version_file="${package_dir}/${underscore_package_name}/_version.py"
-
-    sed -i "s/name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
-    sed -i "/^__git_commit__ / s/= .*/= \"${git_commit}\"/g" ${version_file}
-
-    sed -r -i "s/libucx(.*)\"/libucx${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
-
-    for dep in rmm libraft pylibraft ucx-py distributed-ucxx; do
-        sed -r -i "s/${dep}==(.*)\"/${dep}${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
-    done
-
-    # dask-cuda & rapids-dask-dependency don't get a suffix, but they do get an alpha spec.
-    for dep in dask-cuda rapids-dask-dependency; do
-        sed -r -i "s/${dep}==(.*)\"/${dep}==\1${alpha_spec}\"/g" ${pyproject_file}
-    done
-
-    if [[ $PACKAGE_CUDA_SUFFIX == "-cu12" ]]; then
-        sed -i "s/cuda-python[<=>\.,0-9a]*/cuda-python>=12.0,<13.0a0/g" ${pyproject_file}
-        sed -i "s/cupy-cuda11x/cupy-cuda12x/g" ${pyproject_file}
-    fi
-
-    pushd "${package_dir}"
-
-    python -m pip wheel . -w "${PYTHON_WHEELHOUSE}" -vvv --no-deps --disable-pip-version-check ${FIND_LINKS}
-    popd
-}
-
-build_wheel pylibraft
-build_wheel raft-dask
-
-python -m auditwheel repair -w "${PYTHON_AUDITED_WHEELHOUSE}" --exclude libraft.so --exclude "libucp.so.0" "${PYTHON_WHEELHOUSE}"/*
-RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 python "${PYTHON_AUDITED_WHEELHOUSE}"
diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh
new file mode 100755
index 0000000000..feba2d7a5b
--- /dev/null
+++ b/ci/build_wheel_raft_dask.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
+
+set -euo pipefail
+
+# CMake arguments forwarded to scikit-build-core: turn off DETECT_CONDA_ENV and FIND_RAFT_CPP
+export SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DFIND_RAFT_CPP=OFF"
+
+ci/build_wheel.sh raft-dask python/raft-dask
diff --git a/ci/test_wheel_pylibraft.sh b/ci/test_wheel_pylibraft.sh
index 230889ae82..b38f5a690b 100755
--- a/ci/test_wheel_pylibraft.sh
+++ b/ci/test_wheel_pylibraft.sh
@@ -3,12 +3,11 @@
 
 set -euo pipefail
 
+mkdir -p ./dist
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
 
-WHEELHOUSE="${PWD}/dist/"
-RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp "${WHEELHOUSE}"
-RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python "${WHEELHOUSE}"
+# Use echo to expand the wildcard to the wheel path before appending the `[test]` extra for pip
+python -m pip install $(echo ./dist/pylibraft*.whl)[test]
 
-python -m pip install "pylibraft-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links "${WHEELHOUSE}"
-
-python -m pytest python/pylibraft/pylibraft/test
+python -m pytest ./python/pylibraft/pylibraft/test
diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh
index bc3af96688..b8ca54f6e8 100755
--- a/ci/test_wheel_raft_dask.sh
+++ b/ci/test_wheel_raft_dask.sh
@@ -3,13 +3,15 @@
 
 set -euo pipefail
 
+mkdir -p ./dist
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
+RAPIDS_PY_WHEEL_NAME="raft_dask_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./dist
 
-WHEELHOUSE="${PWD}/dist/"
-RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp "${WHEELHOUSE}"
-RAPIDS_PY_WHEEL_NAME="raft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 python "${WHEELHOUSE}"
+# Download the pylibraft built in the previous step
+RAPIDS_PY_WHEEL_NAME="pylibraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-pylibraft-dep
+python -m pip install --no-deps ./local-pylibraft-dep/pylibraft*.whl
 
-python -m pip install "raft-dask-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links "${WHEELHOUSE}"
+python -m pip install "raft_dask-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find-links dist/
 
 test_dir="python/raft-dask/raft_dask/test"
 
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index f7d1c9b119..7270c5a12b 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -75,12 +75,9 @@ if((BUILD_TESTS
 )
   set(RAFT_COMPILE_LIBRARY_DEFAULT ON)
 endif()
-option(RAFT_COMPILE_LIBRARY "Enable building raft library instantiations"
+option(RAFT_COMPILE_LIBRARY "Enable building raft shared library instantiations"
        ${RAFT_COMPILE_LIBRARY_DEFAULT}
 )
-option(RAFT_COMPILE_DYNAMIC_ONLY "Only build the static library and skip the
-static library. Has no effect if RAFT_COMPILE_LIBRARY is OFF" OFF
-)
 
 if(BUILD_CPU_ONLY)
   set(BUILD_SHARED_LIBS OFF)
@@ -585,23 +582,17 @@ if(RAFT_COMPILE_LIBRARY)
   )
 
   add_library(raft_lib SHARED $<TARGET_OBJECTS:raft_objs>)
-
-  set(raft_lib_targets raft_lib)
-  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
-    add_library(raft_lib_static STATIC $<TARGET_OBJECTS:raft_objs>)
-    list(APPEND raft_lib_targets raft_lib_static)
-  endif()
+  add_library(raft_lib_static STATIC $<TARGET_OBJECTS:raft_objs>)
 
   set_target_properties(
-    ${raft_lib_targets}
+    raft_lib raft_lib_static
     PROPERTIES OUTPUT_NAME raft
                BUILD_RPATH "\$ORIGIN"
                INSTALL_RPATH "\$ORIGIN"
                INTERFACE_POSITION_INDEPENDENT_CODE ON
   )
 
-  list(APPEND raft_lib_targets raft_objs)
-  foreach(target IN LISTS raft_lib_targets)
+  foreach(target raft_lib raft_lib_static raft_objs)
     target_link_libraries(
       ${target}
       PUBLIC raft::raft
@@ -626,23 +617,21 @@ target_link_libraries(raft_compiled INTERFACE raft::raft $<TARGET_NAME_IF_EXISTS
 # ##################################################################################################
 # * raft_compiled_static----------------------------------------------------------------------------
 
-if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
-  add_library(raft_compiled_static INTERFACE)
-
-  if(TARGET raft_compiled_static AND (NOT TARGET raft::compiled_static))
-    add_library(raft::compiled_static ALIAS raft_compiled_static)
-  endif()
-  set_target_properties(raft_compiled_static PROPERTIES EXPORT_NAME compiled_static)
+add_library(raft_compiled_static INTERFACE)
 
-  if(TARGET raft_lib_static AND (NOT TARGET raft::raft_lib_static))
-    add_library(raft::raft_lib_static ALIAS raft_lib_static)
-  endif()
+if(TARGET raft_compiled_static AND (NOT TARGET raft::compiled_static))
+  add_library(raft::compiled_static ALIAS raft_compiled_static)
+endif()
+set_target_properties(raft_compiled_static PROPERTIES EXPORT_NAME compiled_static)
 
-  target_link_libraries(
-    raft_compiled_static INTERFACE raft::raft $<TARGET_NAME_IF_EXISTS:raft::raft_lib_static>
-  )
+if(TARGET raft_lib_static AND (NOT TARGET raft::raft_lib_static))
+  add_library(raft::raft_lib_static ALIAS raft_lib_static)
 endif()
 
+target_link_libraries(
+  raft_compiled_static INTERFACE raft::raft $<TARGET_NAME_IF_EXISTS:raft::raft_lib_static>
+)
+
 # ##################################################################################################
 # * raft_distributed -------------------------------------------------------------------------------
 add_library(raft_distributed INTERFACE)
@@ -690,12 +679,8 @@ install(
   EXPORT raft-exports
 )
 
-set(raft_compiled_install_targets raft_compiled)
-if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
-  list(APPEND raft_compiled_install_targets raft_compiled_static)
-endif()
 install(
-  TARGETS ${raft_compiled_install_targets}
+  TARGETS raft_compiled raft_compiled_static
   DESTINATION ${lib_dir}
   COMPONENT raft
   EXPORT raft-compiled-exports
@@ -708,14 +693,12 @@ if(TARGET raft_lib)
     COMPONENT compiled
     EXPORT raft-compiled-lib-exports
   )
-  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
-    install(
-      TARGETS raft_lib_static
-      DESTINATION ${lib_dir}
-      COMPONENT compiled-static
-      EXPORT raft-compiled-static-lib-exports
-    )
-  endif()
+  install(
+    TARGETS raft_lib_static
+    DESTINATION ${lib_dir}
+    COMPONENT compiled-static
+    EXPORT raft-compiled-static-lib-exports
+  )
   install(
     DIRECTORY include/raft_runtime
     DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}
@@ -786,12 +769,8 @@ endif()
 set(raft_components compiled distributed)
 set(raft_export_sets raft-compiled-exports raft-distributed-exports)
 if(TARGET raft_lib)
-  list(APPEND raft_components compiled)
-  list(APPEND raft_export_sets raft-compiled-lib-exports)
-  if(NOT RAFT_COMPILE_DYNAMIC_ONLY)
-    list(APPEND raft_components compiled-static)
-    list(APPEND raft_export_sets raft-compiled-static-lib-exports)
-  endif()
+  list(APPEND raft_components compiled compiled-static)
+  list(APPEND raft_export_sets raft-compiled-lib-exports raft-compiled-static-lib-exports)
 endif()
 
 string(
diff --git a/dependencies.yaml b/dependencies.yaml
index 3daa400444..6ed26a3b14 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -8,7 +8,6 @@ files:
     includes:
       - build
       - build_pylibraft
-      - cython_build
       - cuda
       - cuda_version
       - depends_on_cupy
@@ -29,7 +28,6 @@ files:
       arch: [x86_64, aarch64]
     includes:
       - build
-      - cython_build
       - cuda
       - cuda_version
       - develop
@@ -62,25 +60,6 @@ files:
       - docs
       - py_version
       - test_pylibraft
-  py_build_libraft:
-    output: pyproject
-    pyproject_dir: python/libraft
-    extras:
-      table: build-system
-    includes:
-      - build
-      - librmm
-  py_run_libraft:
-    output: pyproject
-    pyproject_dir: python/libraft
-    extras:
-      table: project
-    includes:
-      # This is really a build requirement for anything using libraft to build
-      # against, but is not required when _running_ with libraft. There isn't a
-      # great way to express that without separating libraft into libraft and
-      # libraft-dev packages, though.
-      - librmm
   py_build_pylibraft:
     output: pyproject
     pyproject_dir: python/pylibraft
@@ -89,7 +68,6 @@ files:
     includes:
       - build
       - build_pylibraft
-      - cython_build
   py_run_pylibraft:
     output: pyproject
     pyproject_dir: python/pylibraft
@@ -114,7 +92,6 @@ files:
       table: build-system
     includes:
       - build
-      - cython_build
       - depends_on_ucx_build
   py_run_raft_dask:
     output: pyproject
@@ -159,6 +136,7 @@ dependencies:
       - output_types: [conda, requirements, pyproject]
         packages:
           - &cmake_ver cmake>=3.26.4
+          - cython>=3.0.0
           - ninja
       - output_types: [conda]
         packages:
@@ -203,33 +181,7 @@ dependencies:
             packages: [nvcc_linux-64=11.2]
           - matrix: {cuda: "11.2", arch: aarch64}
             packages: [nvcc_linux-aarch64=11.2]
-  cython_build:
-    common:
-      - output_types: [conda, requirements, pyproject]
-        packages:
-          - cython>=3.0.0
-      - output_types: [requirements, pyproject]
-        packages:
-          - libraft==24.6.*
-  librmm:
-    common:
-      - output_types: requirements
-        packages:
-          # pip recognizes the index as a global option for the requirements.txt file
-          - --extra-index-url=https://pypi.nvidia.com
-          - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple
-    specific:
-      - output_types: [requirements, pyproject]
-        matrices:
-          - matrix: {cuda: "12.*"}
-            packages:
-              - librmm-cu12==24.6.*
-          - matrix: {cuda: "11.*"}
-            packages:
-              - librmm-cu11==24.6.*
-          - matrix: null
-            packages:
-              - librmm==24.6.*
+
   build_pylibraft:
     common:
       - output_types: [conda]
@@ -479,15 +431,10 @@ dependencies:
           - matrix: {cuda: "12.*"}
             packages:
               - *rmm_cu12
-              - libraft-cu12==24.6.*
           - matrix: {cuda: "11.*"}
             packages:
               - *rmm_cu11
-              - libraft-cu11==24.6.*
-          - matrix: null
-            packages:
-              - *rmm_conda
-              - libraft==24.6.*
+          - {matrix: null, packages: [*rmm_conda]}
   run_raft_dask:
     common:
       - output_types: [conda, pyproject]
@@ -519,17 +466,11 @@ dependencies:
             packages:
               - &pylibraft_cu12 pylibraft-cu12==24.6.*
               - &ucx_py_cu12 ucx-py-cu12==0.38.*
-              - libraft-cu12==24.6.*
           - matrix: {cuda: "11.*"}
             packages:
               - &pylibraft_cu11 pylibraft-cu11==24.6.*
               - &ucx_py_cu11 ucx-py-cu11==0.38.*
-              - libraft-cu11==24.6.*
-          - matrix: null
-            packages: 
-              - *pylibraft_conda
-              - *ucx_py_conda
-              - libraft==24.6.*
+          - {matrix: null, packages: [*pylibraft_conda, *ucx_py_conda]}
   test_python_common:
     common:
       - output_types: [conda, requirements, pyproject]
diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt
deleted file mode 100644
index cc3b48d033..0000000000
--- a/python/libraft/CMakeLists.txt
+++ /dev/null
@@ -1,45 +0,0 @@
-# =============================================================================
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
-# in compliance with the License. You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software distributed under the License
-# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
-# or implied. See the License for the specific language governing permissions and limitations under
-# the License.
-# =============================================================================
-
-cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
-
-include(../../rapids_config.cmake)
-
-include(rapids-cuda)
-rapids_cuda_init_architectures(libraft-python)
-
-project(
-  libraft-python
-  VERSION "${RAPIDS_VERSION}"
-  LANGUAGES CXX CUDA
-)
-
-# Check if raft is already available. If so, it is the user's responsibility to ensure that the
-# CMake package is also available at build time of the Python raft package.
-find_package(raft "${RAPIDS_VERSION}")
-
-if(raft_FOUND)
-  return()
-endif()
-
-unset(raft_FOUND)
-
-set(BUILD_TESTS OFF)
-set(BUILD_PRIMS_BENCH OFF)
-set(BUILD_ANN_BENCH OFF)
-set(RAFT_COMPILE_LIBRARY ON)
-set(RAFT_COMPILE_DYNAMIC_ONLY ON)
-set(CUDA_STATIC_RUNTIME ON)
-
-add_subdirectory(../../cpp raft-cpp)
diff --git a/python/libraft/LICENSE b/python/libraft/LICENSE
deleted file mode 120000
index 30cff7403d..0000000000
--- a/python/libraft/LICENSE
+++ /dev/null
@@ -1 +0,0 @@
-../../LICENSE
\ No newline at end of file
diff --git a/python/libraft/README.md b/python/libraft/README.md
deleted file mode 120000
index fe84005413..0000000000
--- a/python/libraft/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../../README.md
\ No newline at end of file
diff --git a/python/libraft/libraft/VERSION b/python/libraft/libraft/VERSION
deleted file mode 120000
index d62dc733ef..0000000000
--- a/python/libraft/libraft/VERSION
+++ /dev/null
@@ -1 +0,0 @@
-../../../VERSION
\ No newline at end of file
diff --git a/python/libraft/libraft/__init__.py b/python/libraft/libraft/__init__.py
deleted file mode 100644
index 2ba8e06d56..0000000000
--- a/python/libraft/libraft/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-from libraft._version import __git_commit__, __version__
-from libraft.load import load_library
diff --git a/python/libraft/libraft/_version.py b/python/libraft/libraft/_version.py
deleted file mode 100644
index 3e3792a85c..0000000000
--- a/python/libraft/libraft/_version.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-
-import importlib.resources
-
-__version__ = (
-    importlib.resources.files("libraft")
-    .joinpath("VERSION")
-    .read_text()
-    .strip()
-)
-__git_commit__ = ""
diff --git a/python/libraft/libraft/load.py b/python/libraft/libraft/load.py
deleted file mode 100644
index fb7bf35274..0000000000
--- a/python/libraft/libraft/load.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-
-import ctypes
-import os
-
-
-def load_library():
-    # Dynamically load libraft.so. Prefer a system library if one is present to
-    # avoid clobbering symbols that other packages might expect, but if no
-    # other library is present use the one in the wheel.
-    libraft_lib = None
-    try:
-        libraft_lib = ctypes.CDLL("libraft.so", ctypes.RTLD_GLOBAL)
-    except OSError:
-        # If neither of these directories contain the library, we assume we are
-        # in an environment where the C++ library is already installed
-        # somewhere else and the CMake build of the libraft Python
-        # package was a no-op. Note that this approach won't work for
-        # real editable installs of the libraft package, but that's not a use
-        # case I think we need to support. scikit-build-core has limited
-        # support for importlib.resources so there isn't a clean way to support
-        # that case yet.
-        for lib_dir in ("lib", "lib64"):
-            if os.path.isfile(
-                lib := os.path.join(
-                    os.path.dirname(__file__), lib_dir, "libraft.so"
-                )
-            ):
-                libraft_lib = ctypes.CDLL(lib, ctypes.RTLD_GLOBAL)
-                break
-
-    # The caller almost never needs to do anything with this library, but no
-    # harm in offering the option since this object at least provides a handle
-    # to inspect where libraft was loaded from.
-    return libraft_lib
diff --git a/python/libraft/pyproject.toml b/python/libraft/pyproject.toml
deleted file mode 100644
index c5a76e363a..0000000000
--- a/python/libraft/pyproject.toml
+++ /dev/null
@@ -1,65 +0,0 @@
-# Copyright (c) 2022, NVIDIA CORPORATION.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-[build-system]
-
-requires = [
-    "cmake>=3.26.4",
-    "librmm==24.6.*",
-    "ninja",
-    "scikit-build-core[pyproject]>=0.7.0",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-build-backend = "scikit_build_core.build"
-
-[project]
-name = "libraft"
-dynamic = ["version"]
-description = "RAFT: Reusable Algorithms Functions and other Tools"
-readme = { file = "README.md", content-type = "text/markdown" }
-authors = [
-    { name = "NVIDIA Corporation" },
-]
-license = { text = "Apache 2.0" }
-requires-python = ">=3.9"
-classifiers = [
-    "Intended Audience :: Developers",
-    "Programming Language :: C++",
-    "Environment :: GPU :: NVIDIA CUDA",
-]
-dependencies = [
-    "librmm==24.6.*",
-] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
-
-[project.urls]
-Homepage = "https://github.com/rapidsai/raft"
-Documentation = "https://docs.rapids.ai/api/raft/stable/"
-
-[project.entry-points."cmake.prefix"]
-libraft = "libraft"
-
-[tool.scikit-build]
-build-dir = "build/{wheel_tag}"
-cmake.build-type = "Release"
-cmake.minimum-version = "3.26.4"
-ninja.make-fallback = true
-sdist.exclude = ["*tests*"]
-sdist.reproducible = true
-wheel.packages = ["libraft"]
-wheel.install-dir = "libraft"
-wheel.py-api = "py3"
-
-[tool.scikit-build.metadata.version]
-provider = "scikit_build_core.metadata.regex"
-input = "libraft/VERSION"
-regex = "(?P<value>.*)"
diff --git a/python/pylibraft/CMakeLists.txt b/python/pylibraft/CMakeLists.txt
index bba5549f62..7a2d77041d 100644
--- a/python/pylibraft/CMakeLists.txt
+++ b/python/pylibraft/CMakeLists.txt
@@ -27,9 +27,42 @@ project(
   LANGUAGES CXX CUDA
 )
 
-find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS compiled)
+option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files"
+       ON
+)
+
+# If the user requested it we attempt to find RAFT.
+if(FIND_RAFT_CPP)
+  find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS compiled)
+  if(NOT TARGET raft::raft_lib)
+    message(
+      FATAL_ERROR
+        "Building against a preexisting libraft library requires the compiled libraft to have been built!"
+    )
+
+  endif()
+else()
+  set(raft_FOUND OFF)
+endif()
 
 include(rapids-cython-core)
+
+if(NOT raft_FOUND)
+  set(BUILD_TESTS OFF)
+  set(BUILD_PRIMS_BENCH OFF)
+  set(BUILD_ANN_BENCH OFF)
+  set(RAFT_COMPILE_LIBRARY ON)
+  set(CUDA_STATIC_RUNTIME ON)
+
+  add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL)
+
+  # When building the C++ libraries from source we must copy libraft.so alongside the
+  # pairwise_distance and random Cython libraries.
+  # TODO: when we have a single 'compiled' raft library, we shouldn't need this.
+  set(cython_lib_dir pylibraft)
+  install(TARGETS raft_lib DESTINATION ${cython_lib_dir})
+endif()
+
 rapids_cython_init()
 
 add_subdirectory(pylibraft/common)
@@ -38,3 +71,7 @@ add_subdirectory(pylibraft/matrix)
 add_subdirectory(pylibraft/neighbors)
 add_subdirectory(pylibraft/random)
 add_subdirectory(pylibraft/cluster)
+
+if(DEFINED cython_lib_dir)
+  rapids_cython_add_rpath_entries(TARGET raft PATHS "${cython_lib_dir}")
+endif()
diff --git a/python/pylibraft/pylibraft/__init__.py b/python/pylibraft/pylibraft/__init__.py
index 8aac8f93da..b0869501f3 100644
--- a/python/pylibraft/pylibraft/__init__.py
+++ b/python/pylibraft/pylibraft/__init__.py
@@ -14,13 +14,3 @@
 #
 
 from pylibraft._version import __git_commit__, __version__
-
-# If libraft was installed as a wheel, we must request it to load the library symbols.
-# Otherwise, we assume that the library was installed in a system path that ld can find.
-try:
-    import libraft
-except ModuleNotFoundError:
-    pass
-else:
-    libraft.load_library()
-    del libraft
diff --git a/python/pylibraft/pylibraft/cluster/CMakeLists.txt b/python/pylibraft/pylibraft/cluster/CMakeLists.txt
index 06a639436a..562cff5098 100644
--- a/python/pylibraft/pylibraft/cluster/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/cluster/CMakeLists.txt
@@ -20,5 +20,5 @@ set(linked_libraries raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX cluster_
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX cluster_
 )
diff --git a/python/pylibraft/pylibraft/common/CMakeLists.txt b/python/pylibraft/pylibraft/common/CMakeLists.txt
index d1c1acb3aa..53279bfaf7 100644
--- a/python/pylibraft/pylibraft/common/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/common/CMakeLists.txt
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX common_
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX common_
 )
diff --git a/python/pylibraft/pylibraft/distance/CMakeLists.txt b/python/pylibraft/pylibraft/distance/CMakeLists.txt
index ffcef45c32..2530e07a98 100644
--- a/python/pylibraft/pylibraft/distance/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/distance/CMakeLists.txt
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX distance_
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX distance_
 )
diff --git a/python/pylibraft/pylibraft/matrix/CMakeLists.txt b/python/pylibraft/pylibraft/matrix/CMakeLists.txt
index 07d35325a5..5b7803db00 100644
--- a/python/pylibraft/pylibraft/matrix/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/matrix/CMakeLists.txt
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX matrix_
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX matrix_
 )
diff --git a/python/pylibraft/pylibraft/neighbors/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/CMakeLists.txt
index 2a954183d3..069038a0e8 100644
--- a/python/pylibraft/pylibraft/neighbors/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/neighbors/CMakeLists.txt
@@ -20,7 +20,7 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX neighbors_
 )
 
 add_subdirectory(cagra)
diff --git a/python/pylibraft/pylibraft/neighbors/cagra/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/cagra/CMakeLists.txt
index 2df03c7b0b..0939d7c5b3 100644
--- a/python/pylibraft/pylibraft/neighbors/cagra/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/neighbors/cagra/CMakeLists.txt
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_cagra_
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX neighbors_cagra_
 )
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_flat/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/ivf_flat/CMakeLists.txt
index f50051ba23..37c57c45db 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_flat/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/neighbors/ivf_flat/CMakeLists.txt
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_ivfflat_
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX neighbors_ivfflat_
 )
diff --git a/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt b/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt
index e57798fcc6..af431adb16 100644
--- a/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/neighbors/ivf_pq/CMakeLists.txt
@@ -20,5 +20,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX neighbors_ivfpq_
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX neighbors_ivfpq_
 )
diff --git a/python/pylibraft/pylibraft/random/CMakeLists.txt b/python/pylibraft/pylibraft/random/CMakeLists.txt
index 7d61855111..10ff776471 100644
--- a/python/pylibraft/pylibraft/random/CMakeLists.txt
+++ b/python/pylibraft/pylibraft/random/CMakeLists.txt
@@ -23,5 +23,5 @@ set(linked_libraries raft::raft raft::compiled)
 rapids_cython_create_modules(
   CXX
   SOURCE_FILES "${cython_sources}"
-  LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX random_
+  LINKED_LIBRARIES "${linked_libraries}" ASSOCIATED_TARGETS raft MODULE_PREFIX random_
 )
diff --git a/python/pylibraft/pyproject.toml b/python/pylibraft/pyproject.toml
index 8feb600f11..df1001538e 100644
--- a/python/pylibraft/pyproject.toml
+++ b/python/pylibraft/pyproject.toml
@@ -18,7 +18,6 @@ requires = [
     "cmake>=3.26.4",
     "cuda-python>=11.7.1,<12.0a0",
     "cython>=3.0.0",
-    "libraft==24.6.*",
     "ninja",
     "rmm==24.6.*",
     "scikit-build-core[pyproject]>=0.7.0",
@@ -37,7 +36,6 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "cuda-python>=11.7.1,<12.0a0",
-    "libraft==24.6.*",
     "numpy>=1.23,<2.0a0",
     "rmm==24.6.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/raft-dask/CMakeLists.txt b/python/raft-dask/CMakeLists.txt
index 3218ea4d32..2c629f3b73 100644
--- a/python/raft-dask/CMakeLists.txt
+++ b/python/raft-dask/CMakeLists.txt
@@ -25,12 +25,38 @@ project(
   LANGUAGES CXX CUDA
 )
 
+option(FIND_RAFT_CPP "Search for existing RAFT C++ installations before defaulting to local files"
+       OFF
+)
+
 rapids_cpm_init()
 # Once https://github.com/rapidsai/ucxx/issues/173 is resolved we can remove this.
 find_package(ucx REQUIRED)
 include(cmake/thirdparty/get_ucxx.cmake)
 
-find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS distributed)
+# If the user requested it we attempt to find RAFT.
+if(FIND_RAFT_CPP)
+  find_package(raft "${RAPIDS_VERSION}" REQUIRED COMPONENTS distributed)
+else()
+  set(raft_FOUND OFF)
+endif()
+
+if(NOT raft_FOUND)
+  # raft-dask doesn't actually use raft libraries, it just needs the headers, so we can turn off all
+  # library compilation and we don't need to install anything here.
+  set(BUILD_TESTS OFF)
+  set(BUILD_ANN_BENCH OFF)
+  set(BUILD_PRIMS_BENCH OFF)
+  set(RAFT_COMPILE_LIBRARIES OFF)
+  set(RAFT_COMPILE_DIST_LIBRARY OFF)
+  set(RAFT_COMPILE_NN_LIBRARY OFF)
+  set(CUDA_STATIC_RUNTIME ON)
+  set(RAFT_DASK_UCXX_STATIC ON)
+
+  add_subdirectory(../../cpp raft-cpp EXCLUDE_FROM_ALL)
+  list(APPEND CMAKE_MODULE_PATH ${CMAKE_BINARY_DIR}/cmake/find_modules)
+  find_package(NCCL REQUIRED)
+endif()
 
 include(rapids-cython-core)
 rapids_cython_init()
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 37691cc01b..4718d51a35 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -18,7 +18,6 @@ build-backend = "scikit_build_core.build"
 requires = [
     "cmake>=3.26.4",
     "cython>=3.0.0",
-    "libraft==24.6.*",
     "libucx==1.15.0",
     "ninja",
     "scikit-build-core[pyproject]>=0.7.0",
@@ -37,7 +36,6 @@ requires-python = ">=3.9"
 dependencies = [
     "dask-cuda==24.6.*",
     "joblib>=0.11",
-    "libraft==24.6.*",
     "libucx>=1.15.0",
     "numba>=0.57",
     "numpy>=1.23,<2.0a0",
diff --git a/python/raft-dask/raft_dask/__init__.py b/python/raft-dask/raft_dask/__init__.py
index 13b9a96154..19a037ae75 100644
--- a/python/raft-dask/raft_dask/__init__.py
+++ b/python/raft-dask/raft_dask/__init__.py
@@ -15,17 +15,8 @@
 
 from raft_dask._version import __git_commit__, __version__
 
-# If libraft or libucx was installed as a wheel, we must request that those packages
-# load the library symbols. Otherwise, we assume that the libraries were installed in a
-# system path that ld can find.
-try:
-    import libraft
-except ModuleNotFoundError:
-    pass
-else:
-    libraft.load_library()
-    del libraft
-
+# If libucx was installed as a wheel, we must request it to load the library symbols.
+# Otherwise, we assume that the library was installed in a system path that ld can find.
 try:
     import libucx
 except ModuleNotFoundError:
diff --git a/python/raft-dask/raft_dask/common/CMakeLists.txt b/python/raft-dask/raft_dask/common/CMakeLists.txt
index 49dee15d8f..65d5f06577 100644
--- a/python/raft-dask/raft_dask/common/CMakeLists.txt
+++ b/python/raft-dask/raft_dask/common/CMakeLists.txt
@@ -15,6 +15,6 @@
 set(cython_sources comms_utils.pyx nccl.pyx)
 set(linked_libraries raft::raft raft::distributed)
 rapids_cython_create_modules(
-  SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}"
+  SOURCE_FILES "${cython_sources}" ASSOCIATED_TARGETS raft LINKED_LIBRARIES "${linked_libraries}"
                                                                             CXX
 )
diff --git a/python/raft-dask/raft_dask/include_test/CMakeLists.txt b/python/raft-dask/raft_dask/include_test/CMakeLists.txt
index 8475bcaa93..2ff1cd9150 100644
--- a/python/raft-dask/raft_dask/include_test/CMakeLists.txt
+++ b/python/raft-dask/raft_dask/include_test/CMakeLists.txt
@@ -15,6 +15,6 @@
 set(cython_sources raft_include_test.pyx)
 set(linked_libraries raft::raft)
 rapids_cython_create_modules(
-  SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}"
+  SOURCE_FILES "${cython_sources}" ASSOCIATED_TARGETS raft LINKED_LIBRARIES "${linked_libraries}"
                                                                             CXX
 )

From 840cdcb18161d4204f143a5348a482587011757f Mon Sep 17 00:00:00 2001
From: Ray Douglass <3107146+raydouglass@users.noreply.github.com>
Date: Fri, 10 May 2024 12:29:57 -0400
Subject: [PATCH 36/60] Overhaul ops-codeowners (#2303)

---
 .github/CODEOWNERS | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index fc4fcd458b..d1cc52592c 100755
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -11,11 +11,14 @@ python/setup.py    @rapidsai/raft-cmake-codeowners
 build.sh           @rapidsai/raft-cmake-codeowners
 **/build.sh        @rapidsai/raft-cmake-codeowners
 
-#build/ops code owners
-.github/           @rapidsai/ops-codeowners
-ci/                @rapidsai/ops-codeowners
-conda/             @rapidsai/ops-codeowners
-**/Dockerfile      @rapidsai/ops-codeowners
-**/.dockerignore   @rapidsai/ops-codeowners
-docker/            @rapidsai/ops-codeowners
-dependencies.yaml  @rapidsai/ops-codeowners
+#CI code owners
+/.github/                @rapidsai/ci-codeowners
+/ci/                     @rapidsai/ci-codeowners
+/.pre-commit-config.yaml @rapidsai/ci-codeowners
+
+#packaging code owners
+/.devcontainer/    @rapidsai/packaging-codeowners
+/conda/            @rapidsai/packaging-codeowners
+/dependencies.yaml @rapidsai/packaging-codeowners
+/build.sh          @rapidsai/packaging-codeowners
+pyproject.toml     @rapidsai/packaging-codeowners

From 5fa4f3ca1306f0024f7aec780016e09458b2bcbb Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 10 May 2024 11:13:15 -0700
Subject: [PATCH 37/60] Revert "Add `compile-library` by default on pylibraft
 build" (#2300)

Partial reversion of rapidsai/raft#2090

The unintended consequence of rapidsai/raft#2090 was to cause conda builds of pylibraft to rebuild libraft. As a result, since mid-24.04 we've seen the following: pylibraft conda builds have gone from 15-30 mins to 75-90 mins; pylibraft conda packages are fully repackaging libraft; and therefore, pylibraft conda packages have gone from ~2-3MB to ~300MB. A future PR may attempt to improve the developer experience again, at which point we should keep an eye out for similar regressions.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ray Douglass (https://github.com/raydouglass)
  - Robert Maynard (https://github.com/robertmaynard)

URL: https://github.com/rapidsai/raft/pull/2300
---
 build.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/build.sh b/build.sh
index da5efa5183..148d23c9c1 100755
--- a/build.sh
+++ b/build.sh
@@ -305,7 +305,7 @@ if hasArg --allgpuarch; then
     BUILD_ALL_GPU_ARCH=1
 fi
 
-if hasArg --compile-lib || hasArg pylibraft || (( ${NUMARGS} == 0 )); then
+if hasArg --compile-lib || (( ${NUMARGS} == 0 )); then
     COMPILE_LIBRARY=ON
     CMAKE_TARGET="${CMAKE_TARGET};raft_lib"
 fi
@@ -405,7 +405,7 @@ fi
 
 ################################################################################
 # Configure for building all C++ targets
-if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench-prims || hasArg bench-ann || [[ ${COMPILE_LIBRARY} == ON ]]; then
+if (( ${NUMARGS} == 0 )) || hasArg libraft || hasArg docs || hasArg tests || hasArg bench-prims || hasArg bench-ann; then
     if (( ${BUILD_ALL_GPU_ARCH} == 0 )); then
         RAFT_CMAKE_CUDA_ARCHITECTURES="NATIVE"
         echo "Building for the architecture of the GPU in the system..."

From 7d80f04d74c6ed04e32817030df394fa5ff0bff7 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Mon, 13 May 2024 16:59:36 +0200
Subject: [PATCH 38/60] [REVIEW] Reenable raft-dask wheel tests requiring
 UCX-Py (#2307)

With https://github.com/rapidsai/ucx-py/pull/1041 merged, UCX wheels are now fixed and thus reenabling raft-dask wheel tests that require UCX-Py should be safe.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - James Lamb (https://github.com/jameslamb)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/raft/pull/2307
---
 ci/test_wheel_raft_dask.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh
index b8ca54f6e8..bd531e7e85 100755
--- a/ci/test_wheel_raft_dask.sh
+++ b/ci/test_wheel_raft_dask.sh
@@ -15,11 +15,11 @@ python -m pip install "raft_dask-${RAPIDS_PY_CUDA_SUFFIX}[test]>=0.0.0a0" --find
 
 test_dir="python/raft-dask/raft_dask/test"
 
-# rapids-logger "pytest raft-dask"
-# python -m pytest --import-mode=append ${test_dir}
+rapids-logger "pytest raft-dask"
+python -m pytest --import-mode=append ${test_dir}
 
-# rapids-logger "pytest raft-dask (ucx-py only)"
-# python -m pytest --import-mode=append ${test_dir} --run_ucx
+rapids-logger "pytest raft-dask (ucx-py only)"
+python -m pytest --import-mode=append ${test_dir} --run_ucx
 
 rapids-logger "pytest raft-dask (ucxx only)"
 python -m pytest --import-mode=append ${test_dir} --run_ucxx

From 6cd51b28d8d2e79c5cd7acdeadeacfc0be8d0e01 Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Mon, 13 May 2024 17:32:47 +0200
Subject: [PATCH 39/60] [REVIEW] Adjust UCX dependencies (#2304)

Add `distributed-ucxx` as a required dependency to `raft-dask` and remove direct dependencies to `ucx`/`ucx-proc` in favor of transitive installation from `ucx-py`/`distributed-ucxx`.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/raft/pull/2304
---
 conda/environments/all_cuda-118_arch-aarch64.yaml | 2 --
 conda/environments/all_cuda-118_arch-x86_64.yaml  | 2 --
 conda/environments/all_cuda-122_arch-aarch64.yaml | 2 --
 conda/environments/all_cuda-122_arch-x86_64.yaml  | 2 --
 conda/recipes/raft-dask/meta.yaml                 | 3 +--
 dependencies.yaml                                 | 6 +-----
 python/raft-dask/pyproject.toml                   | 3 +--
 7 files changed, 3 insertions(+), 17 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-aarch64.yaml b/conda/environments/all_cuda-118_arch-aarch64.yaml
index 7453df2593..590c3eb68b 100644
--- a/conda/environments/all_cuda-118_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-118_arch-aarch64.yaml
@@ -55,7 +55,5 @@ dependencies:
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sysroot_linux-aarch64==2.17
-- ucx-proc=*=gpu
 - ucx-py==0.38.*
-- ucx>=1.15.0,<1.16.0
 name: all_cuda-118_arch-aarch64
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index b983eb0388..00ed8fa65e 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -55,7 +55,5 @@ dependencies:
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sysroot_linux-64==2.17
-- ucx-proc=*=gpu
 - ucx-py==0.38.*
-- ucx>=1.15.0,<1.16.0
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-122_arch-aarch64.yaml b/conda/environments/all_cuda-122_arch-aarch64.yaml
index 7dacfc2d2b..f1f346706d 100644
--- a/conda/environments/all_cuda-122_arch-aarch64.yaml
+++ b/conda/environments/all_cuda-122_arch-aarch64.yaml
@@ -51,7 +51,5 @@ dependencies:
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sysroot_linux-aarch64==2.17
-- ucx-proc=*=gpu
 - ucx-py==0.38.*
-- ucx>=1.15.0,<1.16.0
 name: all_cuda-122_arch-aarch64
diff --git a/conda/environments/all_cuda-122_arch-x86_64.yaml b/conda/environments/all_cuda-122_arch-x86_64.yaml
index 1c16d2ea93..505a4f1a97 100644
--- a/conda/environments/all_cuda-122_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-122_arch-x86_64.yaml
@@ -51,7 +51,5 @@ dependencies:
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sysroot_linux-64==2.17
-- ucx-proc=*=gpu
 - ucx-py==0.38.*
-- ucx>=1.15.0,<1.16.0
 name: all_cuda-122_arch-x86_64
diff --git a/conda/recipes/raft-dask/meta.yaml b/conda/recipes/raft-dask/meta.yaml
index 50042780b4..af22c8853e 100644
--- a/conda/recipes/raft-dask/meta.yaml
+++ b/conda/recipes/raft-dask/meta.yaml
@@ -57,7 +57,6 @@ requirements:
     - scikit-build-core >=0.7.0
     - setuptools
     - ucx-py {{ ucx_py_version }}
-    - libucxx {{ ucxx_version }}
     - ucxx {{ ucxx_version }}
   run:
     {% if cuda_major == "11" %}
@@ -74,7 +73,7 @@ requirements:
     - python x.x
     - rmm ={{ minor_version }}
     - ucx-py {{ ucx_py_version }}
-    - ucxx {{ ucxx_version }}
+    - distributed-ucxx {{ ucxx_version }}
 
 tests:
   requirements:
diff --git a/dependencies.yaml b/dependencies.yaml
index 6ed26a3b14..98fc7fa8fc 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -46,7 +46,6 @@ files:
       - test_python_common
       - test_pylibraft
       - depends_on_cupy
-      - depends_on_distributed_ucxx
   checks:
     output: none
     includes:
@@ -100,7 +99,7 @@ files:
       table: project
     includes:
       - run_raft_dask
-      - depends_on_ucx_run
+      - depends_on_distributed_ucxx
   py_test_raft_dask:
     output: pyproject
     pyproject_dir: python/raft-dask
@@ -109,7 +108,6 @@ files:
       key: test
     includes:
       - test_python_common
-      - depends_on_distributed_ucxx
   py_build_raft_ann_bench:
     output: pyproject
     pyproject_dir: python/raft-ann-bench
@@ -447,8 +445,6 @@ dependencies:
           - ucx-py==0.38.*
       - output_types: conda
         packages:
-          - ucx>=1.15.0,<1.16.0
-          - ucx-proc=*=gpu
           - &ucx_py_conda ucx-py==0.38.*
       - output_types: pyproject
         packages:
diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml
index 4718d51a35..e8ded4cd4a 100644
--- a/python/raft-dask/pyproject.toml
+++ b/python/raft-dask/pyproject.toml
@@ -35,8 +35,8 @@ license = { text = "Apache 2.0" }
 requires-python = ">=3.9"
 dependencies = [
     "dask-cuda==24.6.*",
+    "distributed-ucxx==0.38.*",
     "joblib>=0.11",
-    "libucx>=1.15.0",
     "numba>=0.57",
     "numpy>=1.23,<2.0a0",
     "pylibraft==24.6.*",
@@ -53,7 +53,6 @@ classifiers = [
 
 [project.optional-dependencies]
 test = [
-    "distributed-ucxx==0.38.*",
     "pytest-cov",
     "pytest==7.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.

From 0b55c33a5c86b97e87fa02291604f05fe17c51d0 Mon Sep 17 00:00:00 2001
From: Rui Lan <abc99lr@gmail.com>
Date: Mon, 13 May 2024 11:44:17 -0700
Subject: [PATCH 40/60] Make thrust nosync execution policy the default thrust
 policy (#2302)

Testing. Do not merge.

Based on the discussions from https://github.com/rapidsai/raft/pull/2293, it's a good idea to test if we could use nosync thrust calls by default. This PR changes the current `rmm::exec_policy` to its async version `rmm::exec_policy_nosync`.

Authors:
  - Rui Lan (https://github.com/abc99lr)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2302
---
 cpp/include/raft/core/device_resources.hpp           | 2 +-
 cpp/include/raft/core/resource/thrust_policy.hpp     | 8 ++++----
 cpp/include/raft/spectral/detail/matrix_wrappers.hpp | 2 +-
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/include/raft/core/device_resources.hpp b/cpp/include/raft/core/device_resources.hpp
index 496c65d91f..856ecc96d7 100644
--- a/cpp/include/raft/core/device_resources.hpp
+++ b/cpp/include/raft/core/device_resources.hpp
@@ -121,7 +121,7 @@ class device_resources : public resources {
 
   cusparseHandle_t get_cusparse_handle() const { return resource::get_cusparse_handle(*this); }
 
-  rmm::exec_policy& get_thrust_policy() const { return resource::get_thrust_policy(*this); }
+  rmm::exec_policy_nosync& get_thrust_policy() const { return resource::get_thrust_policy(*this); }
 
   /**
    * @brief synchronize a stream on the current container
diff --git a/cpp/include/raft/core/resource/thrust_policy.hpp b/cpp/include/raft/core/resource/thrust_policy.hpp
index f81898be8a..c728f0a00e 100644
--- a/cpp/include/raft/core/resource/thrust_policy.hpp
+++ b/cpp/include/raft/core/resource/thrust_policy.hpp
@@ -24,7 +24,7 @@ namespace raft::resource {
 class thrust_policy_resource : public resource {
  public:
   thrust_policy_resource(rmm::cuda_stream_view stream_view)
-    : thrust_policy_(std::make_unique<rmm::exec_policy>(stream_view))
+    : thrust_policy_(std::make_unique<rmm::exec_policy_nosync>(stream_view))
   {
   }
   void* get_resource() override { return thrust_policy_.get(); }
@@ -32,7 +32,7 @@ class thrust_policy_resource : public resource {
   ~thrust_policy_resource() override {}
 
  private:
-  std::unique_ptr<rmm::exec_policy> thrust_policy_;
+  std::unique_ptr<rmm::exec_policy_nosync> thrust_policy_;
 };
 
 /**
@@ -60,13 +60,13 @@ class thrust_policy_resource_factory : public resource_factory {
  * @param res raft res object for managing resources
  * @return thrust execution policy
  */
-inline rmm::exec_policy& get_thrust_policy(resources const& res)
+inline rmm::exec_policy_nosync& get_thrust_policy(resources const& res)
 {
   if (!res.has_resource_factory(resource_type::THRUST_POLICY)) {
     rmm::cuda_stream_view stream = get_cuda_stream(res);
     res.add_resource_factory(std::make_shared<thrust_policy_resource_factory>(stream));
   }
-  return *res.get_resource<rmm::exec_policy>(resource_type::THRUST_POLICY);
+  return *res.get_resource<rmm::exec_policy_nosync>(resource_type::THRUST_POLICY);
 };
 
 /**
diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
index 30dd6e5e69..1fe078bd32 100644
--- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -129,7 +129,7 @@ class vector_t {
  private:
   using thrust_exec_policy_t =
     thrust::detail::execute_with_allocator<rmm::mr::thrust_allocator<char>,
-                                           thrust::cuda_cub::execute_on_stream_base>;
+                                           thrust::cuda_cub::execute_on_stream_nosync_base>;
   rmm::device_uvector<value_type> buffer_;
   const thrust_exec_policy_t thrust_policy;
 };

From 8083fb4519e136c16083c11b4f32b5ea6023a0aa Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Wed, 15 May 2024 06:45:47 +0200
Subject: [PATCH 41/60] ANN_BENCH: a global pool of result buffers across
 benchmark cases (#2312)

Introduce a global pool of host/device buffers for result neighbors and distances to keep the allocations across benchmark cases.
This slightly reduces the overheads of the benchmark setup, but, most importantly, reduces the number of cuda driver calls that may block the context. The latter is relevant for benchmarking persistent kernels - to let them run across multiple benchmark cases.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2312
---
 cpp/bench/ann/src/common/benchmark.hpp |  35 +++---
 cpp/bench/ann/src/common/util.hpp      | 141 +++++++++++++++----------
 2 files changed, 109 insertions(+), 67 deletions(-)

diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index d7bcd17a00..1f27c9d6a4 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -280,10 +280,16 @@ void bench_search(::benchmark::State& state,
   /**
    * Each thread will manage its own outputs
    */
-  std::shared_ptr<buf<float>> distances =
-    std::make_shared<buf<float>>(current_algo_props->query_memory_type, k * query_set_size);
-  std::shared_ptr<buf<std::size_t>> neighbors =
-    std::make_shared<buf<std::size_t>>(current_algo_props->query_memory_type, k * query_set_size);
+  using index_type                 = size_t;
+  constexpr size_t kAlignResultBuf = 64;
+  size_t result_elem_count         = k * query_set_size;
+  result_elem_count =
+    ((result_elem_count + kAlignResultBuf - 1) / kAlignResultBuf) * kAlignResultBuf;
+  auto& result_buf =
+    get_result_buffer_from_global_pool(result_elem_count * (sizeof(float) + sizeof(index_type)));
+  auto* neighbors_ptr =
+    reinterpret_cast<index_type*>(result_buf.data(current_algo_props->query_memory_type));
+  auto* distances_ptr = reinterpret_cast<float*>(neighbors_ptr + result_elem_count);
 
   {
     nvtx_case nvtx{state.name()};
@@ -305,8 +311,8 @@ void bench_search(::benchmark::State& state,
         algo->search(query_set + batch_offset * dataset->dim(),
                      n_queries,
                      k,
-                     neighbors->data + out_offset * k,
-                     distances->data + out_offset * k);
+                     neighbors_ptr + out_offset * k,
+                     distances_ptr + out_offset * k);
       } catch (const std::exception& e) {
         state.SkipWithError("Benchmark loop: " + std::string(e.what()));
         break;
@@ -338,12 +344,13 @@ void bench_search(::benchmark::State& state,
   // Each thread calculates recall on their partition of queries.
   // evaluate recall
   if (dataset->max_k() >= k) {
-    const std::int32_t* gt          = dataset->gt_set();
-    const std::uint32_t max_k       = dataset->max_k();
-    buf<std::size_t> neighbors_host = neighbors->move(MemoryType::Host);
-    std::size_t rows                = std::min(queries_processed, query_set_size);
-    std::size_t match_count         = 0;
-    std::size_t total_count         = rows * static_cast<size_t>(k);
+    const std::int32_t* gt    = dataset->gt_set();
+    const std::uint32_t max_k = dataset->max_k();
+    result_buf.transfer_data(MemoryType::Host, current_algo_props->query_memory_type);
+    auto* neighbors_host    = reinterpret_cast<index_type*>(result_buf.data(MemoryType::Host));
+    std::size_t rows        = std::min(queries_processed, query_set_size);
+    std::size_t match_count = 0;
+    std::size_t total_count = rows * static_cast<size_t>(k);
 
     // We go through the groundtruth with same stride as the benchmark loop.
     size_t out_offset   = 0;
@@ -354,7 +361,7 @@ void bench_search(::benchmark::State& state,
         size_t i_out_idx  = out_offset + i;
         if (i_out_idx < rows) {
           for (std::uint32_t j = 0; j < k; j++) {
-            auto act_idx = std::int32_t(neighbors_host.data[i_out_idx * k + j]);
+            auto act_idx = std::int32_t(neighbors_host[i_out_idx * k + j]);
             for (std::uint32_t l = 0; l < k; l++) {
               auto exp_idx = gt[i_orig_idx * max_k + l];
               if (act_idx == exp_idx) {
@@ -717,7 +724,7 @@ inline auto run_main(int argc, char** argv) -> int
   // to a shared library it depends on (dynamic benchmark executable).
   current_algo.reset();
   current_algo_props.reset();
-  reset_global_stream_pool();
+  reset_global_device_resources();
   return 0;
 }
 };  // namespace raft::bench::ann
diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp
index 6cdff316e9..ebcdf82e7c 100644
--- a/cpp/bench/ann/src/common/util.hpp
+++ b/cpp/bench/ann/src/common/util.hpp
@@ -56,57 +56,6 @@ inline thread_local int benchmark_thread_id = 0;
  */
 inline thread_local int benchmark_n_threads = 1;
 
-template <typename T>
-struct buf {
-  MemoryType memory_type;
-  std::size_t size;
-  T* data;
-  buf(MemoryType memory_type, std::size_t size)
-    : memory_type(memory_type), size(size), data(nullptr)
-  {
-    switch (memory_type) {
-#ifndef BUILD_CPU_ONLY
-      case MemoryType::Device: {
-        cudaMalloc(reinterpret_cast<void**>(&data), size * sizeof(T));
-        cudaMemset(data, 0, size * sizeof(T));
-      } break;
-#endif
-      default: {
-        data = reinterpret_cast<T*>(malloc(size * sizeof(T)));
-        std::memset(data, 0, size * sizeof(T));
-      }
-    }
-  }
-  ~buf() noexcept
-  {
-    if (data == nullptr) { return; }
-    switch (memory_type) {
-#ifndef BUILD_CPU_ONLY
-      case MemoryType::Device: {
-        cudaFree(data);
-      } break;
-#endif
-      default: {
-        free(data);
-      }
-    }
-  }
-
-  [[nodiscard]] auto move(MemoryType target_memory_type) -> buf<T>
-  {
-    buf<T> r{target_memory_type, size};
-#ifndef BUILD_CPU_ONLY
-    if ((memory_type == MemoryType::Device && target_memory_type != MemoryType::Device) ||
-        (memory_type != MemoryType::Device && target_memory_type == MemoryType::Device)) {
-      cudaMemcpy(r.data, data, size * sizeof(T), cudaMemcpyDefault);
-      return r;
-    }
-#endif
-    std::swap(data, r.data);
-    return r;
-  }
-};
-
 struct cuda_timer {
  private:
   std::optional<cudaStream_t> stream_;
@@ -242,16 +191,102 @@ inline auto get_stream_from_global_pool() -> cudaStream_t
 #endif
 }
 
+struct result_buffer {
+  explicit result_buffer(size_t size, cudaStream_t stream) : size_{size}, stream_{stream}
+  {
+    if (size_ == 0) { return; }
+    data_host_ = malloc(size_);
+#ifndef BUILD_CPU_ONLY
+    cudaMallocAsync(&data_device_, size_, stream_);
+    cudaStreamSynchronize(stream_);
+#endif
+  }
+  result_buffer()                                = delete;
+  result_buffer(result_buffer&&)                 = delete;
+  result_buffer& operator=(result_buffer&&)      = delete;
+  result_buffer(const result_buffer&)            = delete;
+  result_buffer& operator=(const result_buffer&) = delete;
+  ~result_buffer() noexcept
+  {
+    if (size_ == 0) { return; }
+#ifndef BUILD_CPU_ONLY
+    cudaFreeAsync(data_device_, stream_);
+    cudaStreamSynchronize(stream_);
+#endif
+    free(data_host_);
+  }
+
+  [[nodiscard]] auto size() const noexcept { return size_; }
+  [[nodiscard]] auto data(ann::MemoryType loc) const noexcept
+  {
+    switch (loc) {
+      case MemoryType::Device: return data_device_;
+      default: return data_host_;
+    }
+  }
+
+  void transfer_data(ann::MemoryType dst, ann::MemoryType src)
+  {
+    auto dst_ptr = data(dst);
+    auto src_ptr = data(src);
+    if (dst_ptr == src_ptr) { return; }
+#ifndef BUILD_CPU_ONLY
+    cudaMemcpyAsync(dst_ptr, src_ptr, size_, cudaMemcpyDefault, stream_);
+    cudaStreamSynchronize(stream_);
+#endif
+  }
+
+ private:
+  size_t size_{0};
+  cudaStream_t stream_ = nullptr;
+  void* data_host_     = nullptr;
+  void* data_device_   = nullptr;
+};
+
+namespace detail {
+inline std::vector<std::unique_ptr<result_buffer>> global_result_buffer_pool(0);
+inline std::mutex grp_mutex;
+}  // namespace detail
+
+/**
+ * Get a result buffer associated with the current benchmark thread.
+ *
+ * Note, the allocations are reused between the benchmark cases.
+ * This reduces the setup overhead and number of times the context is being blocked
+ * (this is relevant if there is a persistent kernel running across multiple benchmark cases).
+ */
+inline auto get_result_buffer_from_global_pool(size_t size) -> result_buffer&
+{
+  auto stream = get_stream_from_global_pool();
+  auto& rb    = [stream, size]() -> result_buffer& {
+    std::lock_guard guard(detail::grp_mutex);
+    if (static_cast<int>(detail::global_result_buffer_pool.size()) < benchmark_n_threads) {
+      detail::global_result_buffer_pool.resize(benchmark_n_threads);
+    }
+    auto& rb = detail::global_result_buffer_pool[benchmark_thread_id];
+    if (!rb || rb->size() < size) { rb = std::make_unique<result_buffer>(size, stream); }
+    return *rb;
+  }();
+
+  memset(rb.data(MemoryType::Host), 0, size);
+#ifndef BUILD_CPU_ONLY
+  cudaMemsetAsync(rb.data(MemoryType::Device), 0, size, stream);
+  cudaStreamSynchronize(stream);
+#endif
+  return rb;
+}
+
 /**
- * Delete all streams in the global pool.
+ * Delete all streams and memory allocations in the global pool.
  * It's called at the end of the `main` function - before global/static variables and cuda context
  * is destroyed - to make sure they are destroyed gracefully and correctly seen by analysis tools
  * such as nsys.
  */
-inline void reset_global_stream_pool()
+inline void reset_global_device_resources()
 {
 #ifndef BUILD_CPU_ONLY
   std::lock_guard guard(detail::gsp_mutex);
+  detail::global_result_buffer_pool.resize(0);
   detail::global_stream_pool.resize(0);
 #endif
 }

From f3806f1e5d02aa17e9a647f56065b26a72ec0d2d Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Wed, 15 May 2024 06:46:16 +0200
Subject: [PATCH 42/60] Remove the shared state and the mutex from NVTX
 internals (#2310)

Until now, raft has stored a map of NVTX colors (annotation -> color) to avoid using the same color for different annotations and keep using the same color for the same annotations. This map is a shared state.
During extensive ANN_BENCH throughput testing, it turned out that the mutex guarding the map can sometimes become a bottleneck when the number of concurrent threads is very large (roughly 256 or more). This PR replaces the unordered map and the mutex guarding it with a deterministic hash value of the annotation instead (which is stateless).

**Pros:**
  - No shared state, no mutexes.
  - Assigns the same colors to the same annotations across program runs.

**Cons:**
  - Sometimes different annotations can have the same color (hash collisions).

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2310
---
 cpp/include/raft/core/detail/nvtx.hpp | 40 +++++++++------------------
 1 file changed, 13 insertions(+), 27 deletions(-)

diff --git a/cpp/include/raft/core/detail/nvtx.hpp b/cpp/include/raft/core/detail/nvtx.hpp
index 82db75de84..253d8e5b93 100644
--- a/cpp/include/raft/core/detail/nvtx.hpp
+++ b/cpp/include/raft/core/detail/nvtx.hpp
@@ -24,23 +24,19 @@
 
 #include <cstdint>
 #include <cstdlib>
-#include <mutex>
+#include <limits>
 #include <string>
 #include <type_traits>
-#include <unordered_map>
 #include <vector>
 
 namespace raft::common::nvtx::detail {
 
 /**
- * @brief An internal struct to store associated state with the color
- * generator
+ * @brief An internal struct to initialize the color generator
  */
-struct color_gen_state {
-  /** collection of all tagged colors generated so far */
-  static inline std::unordered_map<std::string, uint32_t> all_colors_;
-  /** mutex for accessing the above map */
-  static inline std::mutex map_mutex_;
+struct color_gen {
+  /** This determines how many bits of the hash to use for the generator */
+  using hash_type = uint16_t;
   /** saturation */
   static inline constexpr float kS = 0.9f;
   /** value */
@@ -109,32 +105,22 @@ inline auto hsv2rgb(float h, float s, float v) -> uint32_t
 /**
  * @brief Helper method to generate 'visually distinct' colors.
  * Inspired from https://martin.ankerl.com/2009/12/09/how-to-create-random-colors-programmatically/
- * However, if an associated tag is passed, it will look up in its history for
- * any generated color against this tag and if found, just returns it, else
- * generates a new color, assigns a tag to it and stores it for future usage.
+ * It calculates a hash of the passed string and uses the result to generate
+ * distinct yet deterministic colors.
  * Such a thing is very useful for nvtx markers where the ranges associated
  * with a specific tag should ideally get the same color for the purpose of
  * visualizing it on nsight-systems timeline.
- * @param tag look for any previously generated colors with this tag or
- * associate the currently generated color with it
+ * @param tag a string used as an input to generate a distinct color.
  * @return returns 32b RGB integer with alpha channel set of 0xff
  */
 inline auto generate_next_color(const std::string& tag) -> uint32_t
 {
-  // std::unordered_map<std::string, uint32_t> color_gen_state::all_colors_;
-  // std::mutex color_gen_state::map_mutex_;
-
-  std::lock_guard<std::mutex> guard(color_gen_state::map_mutex_);
-  if (!tag.empty()) {
-    auto itr = color_gen_state::all_colors_.find(tag);
-    if (itr != color_gen_state::all_colors_.end()) { return itr->second; }
-  }
-  auto h = static_cast<float>(rand()) / static_cast<float>(RAND_MAX);
-  h += color_gen_state::kInvPhi;
+  auto x = static_cast<color_gen::hash_type>(std::hash<std::string>{}(tag));
+  auto u = std::numeric_limits<color_gen::hash_type>::max();
+  auto h = static_cast<float>(x) / static_cast<float>(u);
+  h += color_gen::kInvPhi;
   if (h >= 1.f) h -= 1.f;
-  auto rgb = hsv2rgb(h, color_gen_state::kS, color_gen_state::kV);
-  if (!tag.empty()) { color_gen_state::all_colors_[tag] = rgb; }
-  return rgb;
+  return hsv2rgb(h, color_gen::kS, color_gen::kV);
 }
 
 template <typename Domain, typename = Domain>

From ee2a0ba739407872b00c3fe19d45d1061711f4d6 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Wed, 15 May 2024 06:47:12 +0200
Subject: [PATCH 43/60] ANN_BENCH: enable move semantics for
 configured_raft_resources (#2311)

Re-enable move constructor for the `configured_raft_resources`. It was implicitly deleted before, which was exposed and made explicit in https://github.com/rapidsai/raft/pull/2269 .
Allowing move semantics here avoids unwanted extra overhead during algorithm preparation in the benchmarks tool.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2311
---
 cpp/bench/ann/src/raft/raft_ann_bench_utils.h | 22 ++++++++++++-------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h
index 72a2c0bb05..6cadb26736 100644
--- a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h
+++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h
@@ -122,7 +122,9 @@ class configured_raft_resources {
    * It's used by the copy constructor.
    */
   explicit configured_raft_resources(const std::shared_ptr<shared_raft_resources>& shared_res)
-    : shared_res_{shared_res}, res_{rmm::cuda_stream_view(get_stream_from_global_pool())}
+    : shared_res_{shared_res},
+      res_{std::make_unique<raft::device_resources>(
+        rmm::cuda_stream_view(get_stream_from_global_pool()))}
   {
   }
 
@@ -131,9 +133,9 @@ class configured_raft_resources {
   {
   }
 
-  configured_raft_resources(configured_raft_resources&&)            = delete;
-  configured_raft_resources& operator=(configured_raft_resources&&) = delete;
-  ~configured_raft_resources()                                      = default;
+  configured_raft_resources(configured_raft_resources&&);
+  configured_raft_resources& operator=(configured_raft_resources&&);
+  ~configured_raft_resources() = default;
   configured_raft_resources(const configured_raft_resources& res)
     : configured_raft_resources{res.shared_res_}
   {
@@ -144,11 +146,11 @@ class configured_raft_resources {
     return *this;
   }
 
-  operator raft::resources&() noexcept { return res_; }
-  operator const raft::resources&() const noexcept { return res_; }
+  operator raft::resources&() noexcept { return *res_; }
+  operator const raft::resources&() const noexcept { return *res_; }
 
   /** Get the main stream */
-  [[nodiscard]] auto get_sync_stream() const noexcept { return resource::get_cuda_stream(res_); }
+  [[nodiscard]] auto get_sync_stream() const noexcept { return resource::get_cuda_stream(*res_); }
 
  private:
   /** The resources shared among multiple raft handles / threads. */
@@ -157,7 +159,11 @@ class configured_raft_resources {
    * Until we make the use of copies of raft::resources thread-safe, each benchmark wrapper must
    * have its own copy of it.
    */
-  raft::device_resources res_;
+  std::unique_ptr<raft::device_resources> res_ = std::make_unique<raft::device_resources>();
 };
 
+inline configured_raft_resources::configured_raft_resources(configured_raft_resources&&) = default;
+inline configured_raft_resources& configured_raft_resources::operator=(
+  configured_raft_resources&&) = default;
+
 }  // namespace raft::bench::ann

From 92d4301ae166bf1542f14c24f6eed0e863a5cd7d Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Wed, 15 May 2024 07:03:17 +0200
Subject: [PATCH 44/60] ANN_BENCH: split instances of RaftCagra into multiple
 files (#2313)

Split instances of RaftCagra into multiple files to compile them in parallel and thus reduce the total benchmark compile time.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2313
---
 cpp/bench/ann/CMakeLists.txt                  |  5 ++++-
 cpp/bench/ann/src/raft/raft_cagra_float.cu    | 20 +++++++++++++++++++
 cpp/bench/ann/src/raft/raft_cagra_half.cu     | 20 +++++++++++++++++++
 cpp/bench/ann/src/raft/raft_cagra_int8_t.cu   | 20 +++++++++++++++++++
 .../{raft_cagra.cu => raft_cagra_uint8_t.cu}  |  3 ---
 5 files changed, 64 insertions(+), 4 deletions(-)
 create mode 100644 cpp/bench/ann/src/raft/raft_cagra_float.cu
 create mode 100644 cpp/bench/ann/src/raft/raft_cagra_half.cu
 create mode 100644 cpp/bench/ann/src/raft/raft_cagra_int8_t.cu
 rename cpp/bench/ann/src/raft/{raft_cagra.cu => raft_cagra_uint8_t.cu} (85%)

diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index ee84f7515a..f29d32ccde 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -266,7 +266,10 @@ if(RAFT_ANN_BENCH_USE_RAFT_CAGRA)
     RAFT_CAGRA
     PATH
     bench/ann/src/raft/raft_benchmark.cu
-    $<$<BOOL:${RAFT_ANN_BENCH_USE_RAFT_CAGRA}>:bench/ann/src/raft/raft_cagra.cu>
+    $<$<BOOL:${RAFT_ANN_BENCH_USE_RAFT_CAGRA}>:bench/ann/src/raft/raft_cagra_float.cu>
+    $<$<BOOL:${RAFT_ANN_BENCH_USE_RAFT_CAGRA}>:bench/ann/src/raft/raft_cagra_half.cu>
+    $<$<BOOL:${RAFT_ANN_BENCH_USE_RAFT_CAGRA}>:bench/ann/src/raft/raft_cagra_int8_t.cu>
+    $<$<BOOL:${RAFT_ANN_BENCH_USE_RAFT_CAGRA}>:bench/ann/src/raft/raft_cagra_uint8_t.cu>
     LINKS
     raft::compiled
   )
diff --git a/cpp/bench/ann/src/raft/raft_cagra_float.cu b/cpp/bench/ann/src/raft/raft_cagra_float.cu
new file mode 100644
index 0000000000..058f5bf34a
--- /dev/null
+++ b/cpp/bench/ann/src/raft/raft_cagra_float.cu
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "raft_cagra_wrapper.h"
+
+namespace raft::bench::ann {
+template class RaftCagra<float, uint32_t>;
+}  // namespace raft::bench::ann
diff --git a/cpp/bench/ann/src/raft/raft_cagra_half.cu b/cpp/bench/ann/src/raft/raft_cagra_half.cu
new file mode 100644
index 0000000000..a015819ec5
--- /dev/null
+++ b/cpp/bench/ann/src/raft/raft_cagra_half.cu
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "raft_cagra_wrapper.h"
+
+namespace raft::bench::ann {
+template class RaftCagra<half, uint32_t>;
+}  // namespace raft::bench::ann
diff --git a/cpp/bench/ann/src/raft/raft_cagra_int8_t.cu b/cpp/bench/ann/src/raft/raft_cagra_int8_t.cu
new file mode 100644
index 0000000000..be3b83ee60
--- /dev/null
+++ b/cpp/bench/ann/src/raft/raft_cagra_int8_t.cu
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "raft_cagra_wrapper.h"
+
+namespace raft::bench::ann {
+template class RaftCagra<int8_t, uint32_t>;
+}  // namespace raft::bench::ann
diff --git a/cpp/bench/ann/src/raft/raft_cagra.cu b/cpp/bench/ann/src/raft/raft_cagra_uint8_t.cu
similarity index 85%
rename from cpp/bench/ann/src/raft/raft_cagra.cu
rename to cpp/bench/ann/src/raft/raft_cagra_uint8_t.cu
index c0c1352a43..c9679e404d 100644
--- a/cpp/bench/ann/src/raft/raft_cagra.cu
+++ b/cpp/bench/ann/src/raft/raft_cagra_uint8_t.cu
@@ -17,7 +17,4 @@
 
 namespace raft::bench::ann {
 template class RaftCagra<uint8_t, uint32_t>;
-template class RaftCagra<int8_t, uint32_t>;
-template class RaftCagra<half, uint32_t>;
-template class RaftCagra<float, uint32_t>;
 }  // namespace raft::bench::ann

From 6cc71344f442a66ad3b72ce0ca429c7ea7e0fa82 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Wed, 15 May 2024 15:55:42 +0200
Subject: [PATCH 45/60] ANN_BENCH: AnnGPU::uses_stream() for optional algo GPU
 sync (#2314)

Introduce a new virtual member `uses_stream()` for the `AnnGPU` class. Overriding it allows an algorithm to inform the benchmark whether stream synchronization is needed between benchmark iterations.

This is relevant for a potential persistent kernel where the CPU threads use an independent mechanism to synchronize and get the results from the GPU.
This is different from just not implementing `AnnGPU` for an algorithm in that it allows the algorithm to decide whether the synchronization is needed (depending on input parameters at runtime), while still providing the `get_sync_stream()` functionality.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2314
---
 cpp/bench/ann/src/common/ann_types.hpp | 11 ++++++++++-
 cpp/bench/ann/src/common/util.hpp      |  4 +++-
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp
index c6213059dc..776d29a906 100644
--- a/cpp/bench/ann/src/common/ann_types.hpp
+++ b/cpp/bench/ann/src/common/ann_types.hpp
@@ -98,7 +98,16 @@ class AnnGPU {
    * end.
    */
   [[nodiscard]] virtual auto get_sync_stream() const noexcept -> cudaStream_t = 0;
-  virtual ~AnnGPU() noexcept                                                  = default;
+  /**
+   * By default a GPU algorithm uses a fixed stream to order GPU operations.
+   * However, an algorithm may need to synchronize with the host at the end of its execution.
+   * In that case, also synchronizing with a benchmark event would put it at a disadvantage.
+   *
+   * We can disable event sync by passing `false` here
+   *   - ONLY IF THE ALGORITHM HAS PRODUCED ITS OUTPUT BY THE TIME IT SYNCHRONIZES WITH CPU.
+   */
+  [[nodiscard]] virtual auto uses_stream() const noexcept -> bool { return true; }
+  virtual ~AnnGPU() noexcept = default;
 };
 
 template <typename T>
diff --git a/cpp/bench/ann/src/common/util.hpp b/cpp/bench/ann/src/common/util.hpp
index ebcdf82e7c..96185c79eb 100644
--- a/cpp/bench/ann/src/common/util.hpp
+++ b/cpp/bench/ann/src/common/util.hpp
@@ -67,7 +67,9 @@ struct cuda_timer {
   static inline auto extract_stream(AnnT* algo) -> std::optional<cudaStream_t>
   {
     auto gpu_ann = dynamic_cast<AnnGPU*>(algo);
-    if (gpu_ann != nullptr) { return std::make_optional(gpu_ann->get_sync_stream()); }
+    if (gpu_ann != nullptr && gpu_ann->uses_stream()) {
+      return std::make_optional(gpu_ann->get_sync_stream());
+    }
     return std::nullopt;
   }
 

From eb1333de98fd77394221c3950aa7dbffca557881 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Wed, 15 May 2024 21:51:34 +0200
Subject: [PATCH 46/60] ANN_BENCH: common AnnBase::index_type (#2315)

Replace the `size_t` type in the `AnnBase::search` for the output neighbor indices with a common `AnnBase::index_type`.
This PR stops short of changing the behavior of the benchmarks, since it keeps `using index_type = size_t`.

The introduction of the new type has a couple of benefits:
  - Makes the usage of the `index_type` clearer in the code, distinguishing it from the extents type, which is usually `size_t` as well.
  - Makes it possible to quickly change the alias to `uint32_t` during development and experiments. This is needed to avoid calling an extra `linalg::map` on the produced results when the algorithm output is not compatible with `size_t`.


As a small extra change, I've factored out the refinement code shared by IVF-PQ and CAGRA-Q into a separate `refine_helper` function.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Tamas Bela Feher (https://github.com/tfeher)

URL: https://github.com/rapidsai/raft/pull/2315
---
 cpp/bench/ann/src/common/ann_types.hpp        |   9 +-
 cpp/bench/ann/src/common/benchmark.hpp        |   2 +-
 cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h   |   9 +-
 cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h   |   9 +-
 cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh       |  16 ++-
 cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h   |  16 ++-
 cpp/bench/ann/src/raft/raft_ann_bench_utils.h |  72 +++++++++++
 .../ann/src/raft/raft_cagra_hnswlib_wrapper.h |  11 +-
 cpp/bench/ann/src/raft/raft_cagra_wrapper.h   |  97 +++++---------
 .../ann/src/raft/raft_ivf_flat_wrapper.h      |  32 ++++-
 cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h  | 121 +++++++++---------
 cpp/bench/ann/src/raft/raft_wrapper.h         |  16 ++-
 12 files changed, 246 insertions(+), 164 deletions(-)

diff --git a/cpp/bench/ann/src/common/ann_types.hpp b/cpp/bench/ann/src/common/ann_types.hpp
index 776d29a906..b010063dee 100644
--- a/cpp/bench/ann/src/common/ann_types.hpp
+++ b/cpp/bench/ann/src/common/ann_types.hpp
@@ -73,6 +73,8 @@ struct AlgoProperty {
 
 class AnnBase {
  public:
+  using index_type = size_t;
+
   inline AnnBase(Metric metric, int dim) : metric_(metric), dim_(dim) {}
   virtual ~AnnBase() noexcept = default;
 
@@ -127,8 +129,11 @@ class ANN : public AnnBase {
   virtual void set_search_param(const AnnSearchParam& param) = 0;
   // TODO: this assumes that an algorithm can always return k results.
   // This is not always possible.
-  virtual void search(
-    const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const = 0;
+  virtual void search(const T* queries,
+                      int batch_size,
+                      int k,
+                      AnnBase::index_type* neighbors,
+                      float* distances) const = 0;
 
   virtual void save(const std::string& file) const = 0;
   virtual void load(const std::string& file)       = 0;
diff --git a/cpp/bench/ann/src/common/benchmark.hpp b/cpp/bench/ann/src/common/benchmark.hpp
index 1f27c9d6a4..8762ccd1fe 100644
--- a/cpp/bench/ann/src/common/benchmark.hpp
+++ b/cpp/bench/ann/src/common/benchmark.hpp
@@ -280,7 +280,7 @@ void bench_search(::benchmark::State& state,
   /**
    * Each thread will manage its own outputs
    */
-  using index_type                 = size_t;
+  using index_type                 = AnnBase::index_type;
   constexpr size_t kAlignResultBuf = 64;
   size_t result_elem_count         = k * query_set_size;
   result_elem_count =
diff --git a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h
index 407f7148df..3caca15b7f 100644
--- a/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h
+++ b/cpp/bench/ann/src/faiss/faiss_cpu_wrapper.h
@@ -88,8 +88,11 @@ class FaissCpu : public ANN<T> {
 
   // TODO: if the number of results is less than k, the remaining elements of 'neighbors'
   // will be filled with (size_t)-1
-  void search(
-    const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const final;
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              AnnBase::index_type* neighbors,
+              float* distances) const final;
 
   AlgoProperty get_preference() const override
   {
@@ -169,7 +172,7 @@ void FaissCpu<T>::set_search_param(const AnnSearchParam& param)
 
 template <typename T>
 void FaissCpu<T>::search(
-  const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const
+  const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const
 {
   static_assert(sizeof(size_t) == sizeof(faiss::idx_t),
                 "sizes of size_t and faiss::idx_t are different");
diff --git a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h
index 633098fd1d..2effe631e5 100644
--- a/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h
+++ b/cpp/bench/ann/src/faiss/faiss_gpu_wrapper.h
@@ -111,8 +111,11 @@ class FaissGpu : public ANN<T>, public AnnGPU {
 
   // TODO: if the number of results is less than k, the remaining elements of 'neighbors'
   // will be filled with (size_t)-1
-  void search(
-    const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const final;
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              AnnBase::index_type* neighbors,
+              float* distances) const final;
 
   [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override
   {
@@ -196,7 +199,7 @@ void FaissGpu<T>::build(const T* dataset, size_t nrow)
 
 template <typename T>
 void FaissGpu<T>::search(
-  const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const
+  const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const
 {
   static_assert(sizeof(size_t) == sizeof(faiss::idx_t),
                 "sizes of size_t and faiss::idx_t are different");
diff --git a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh
index c89f02d974..59cf3df806 100644
--- a/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh
+++ b/cpp/bench/ann/src/ggnn/ggnn_wrapper.cuh
@@ -58,8 +58,11 @@ class Ggnn : public ANN<T>, public AnnGPU {
   void build(const T* dataset, size_t nrow) override { impl_->build(dataset, nrow); }
 
   void set_search_param(const AnnSearchParam& param) override { impl_->set_search_param(param); }
-  void search(
-    const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              AnnBase::index_type* neighbors,
+              float* distances) const override
   {
     impl_->search(queries, batch_size, k, neighbors, distances);
   }
@@ -123,8 +126,11 @@ class GgnnImpl : public ANN<T>, public AnnGPU {
   void build(const T* dataset, size_t nrow) override;
 
   void set_search_param(const AnnSearchParam& param) override;
-  void search(
-    const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override;
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              AnnBase::index_type* neighbors,
+              float* distances) const override;
   [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override { return stream_; }
 
   void save(const std::string& file) const override;
@@ -243,7 +249,7 @@ void GgnnImpl<T, measure, D, KBuild, KQuery, S>::set_search_param(const AnnSearc
 
 template <typename T, DistanceMeasure measure, int D, int KBuild, int KQuery, int S>
 void GgnnImpl<T, measure, D, KBuild, KQuery, S>::search(
-  const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const
+  const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const
 {
   static_assert(sizeof(size_t) == sizeof(int64_t), "sizes of size_t and GGNN's KeyT are different");
   if (k != KQuery) {
diff --git a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
index a8f7dd824f..5743632bf4 100644
--- a/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/hnswlib/hnswlib_wrapper.h
@@ -79,8 +79,11 @@ class HnswLib : public ANN<T> {
   void build(const T* dataset, size_t nrow) override;
 
   void set_search_param(const AnnSearchParam& param) override;
-  void search(
-    const T* query, int batch_size, int k, size_t* indices, float* distances) const override;
+  void search(const T* query,
+              int batch_size,
+              int k,
+              AnnBase::index_type* indices,
+              float* distances) const override;
 
   void save(const std::string& path_to_index) const override;
   void load(const std::string& path_to_index) override;
@@ -97,7 +100,10 @@ class HnswLib : public ANN<T> {
   void set_base_layer_only() { appr_alg_->base_layer_only = true; }
 
  private:
-  void get_search_knn_results_(const T* query, int k, size_t* indices, float* distances) const;
+  void get_search_knn_results_(const T* query,
+                               int k,
+                               AnnBase::index_type* indices,
+                               float* distances) const;
 
   std::shared_ptr<hnswlib::HierarchicalNSW<typename hnsw_dist_t<T>::type>> appr_alg_;
   std::shared_ptr<hnswlib::SpaceInterface<typename hnsw_dist_t<T>::type>> space_;
@@ -176,7 +182,7 @@ void HnswLib<T>::set_search_param(const AnnSearchParam& param_)
 
 template <typename T>
 void HnswLib<T>::search(
-  const T* query, int batch_size, int k, size_t* indices, float* distances) const
+  const T* query, int batch_size, int k, AnnBase::index_type* indices, float* distances) const
 {
   auto f = [&](int i) {
     // hnsw can only handle a single vector at a time.
@@ -217,7 +223,7 @@ void HnswLib<T>::load(const std::string& path_to_index)
 template <typename T>
 void HnswLib<T>::get_search_knn_results_(const T* query,
                                          int k,
-                                         size_t* indices,
+                                         AnnBase::index_type* indices,
                                          float* distances) const
 {
   auto result = appr_alg_->searchKnn(query, k);
diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h
index 6cadb26736..ffe8f8717b 100644
--- a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h
+++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h
@@ -19,9 +19,12 @@
 
 #include <raft/core/device_mdspan.hpp>
 #include <raft/core/device_resources.hpp>
+#include <raft/core/host_mdarray.hpp>
+#include <raft/core/host_mdspan.hpp>
 #include <raft/core/logger.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/distance/distance_types.hpp>
+#include <raft/neighbors/refine.cuh>
 #include <raft/util/cudart_utils.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
@@ -166,4 +169,73 @@ inline configured_raft_resources::configured_raft_resources(configured_raft_reso
 inline configured_raft_resources& configured_raft_resources::operator=(
   configured_raft_resources&&) = default;
 
+/** A helper to refine the neighbors when the data is on device or on host. */
+template <typename DatasetT, typename QueriesT, typename CandidatesT>
+void refine_helper(const raft::resources& res,
+                   DatasetT dataset,
+                   QueriesT queries,
+                   CandidatesT candidates,
+                   int k,
+                   AnnBase::index_type* neighbors,
+                   float* distances,
+                   raft::distance::DistanceType metric)
+{
+  using data_type    = typename DatasetT::value_type;
+  using index_type   = AnnBase::index_type;
+  using extents_type = index_type;  // device-side refine requires this
+
+  static_assert(std::is_same_v<data_type, typename QueriesT::value_type>);
+  static_assert(std::is_same_v<data_type, typename DatasetT::value_type>);
+  static_assert(std::is_same_v<index_type, typename CandidatesT::value_type>);
+
+  extents_type batch_size = queries.extent(0);
+  extents_type dim        = queries.extent(1);
+  extents_type k0         = candidates.extent(1);
+
+  if (raft::get_device_for_address(dataset.data_handle()) >= 0) {
+    auto dataset_device = raft::make_device_matrix_view<const data_type, extents_type>(
+      dataset.data_handle(), dataset.extent(0), dataset.extent(1));
+    auto queries_device = raft::make_device_matrix_view<const data_type, extents_type>(
+      queries.data_handle(), batch_size, dim);
+    auto candidates_device = raft::make_device_matrix_view<const index_type, extents_type>(
+      candidates.data_handle(), batch_size, k0);
+    auto neighbors_device =
+      raft::make_device_matrix_view<index_type, extents_type>(neighbors, batch_size, k);
+    auto distances_device =
+      raft::make_device_matrix_view<float, extents_type>(distances, batch_size, k);
+
+    raft::neighbors::refine<index_type, data_type, float, extents_type>(res,
+                                                                        dataset_device,
+                                                                        queries_device,
+                                                                        candidates_device,
+                                                                        neighbors_device,
+                                                                        distances_device,
+                                                                        metric);
+  } else {
+    auto dataset_host = raft::make_host_matrix_view<const data_type, extents_type>(
+      dataset.data_handle(), dataset.extent(0), dataset.extent(1));
+    auto queries_host    = raft::make_host_matrix<data_type, extents_type>(batch_size, dim);
+    auto candidates_host = raft::make_host_matrix<index_type, extents_type>(batch_size, k0);
+    auto neighbors_host  = raft::make_host_matrix<index_type, extents_type>(batch_size, k);
+    auto distances_host  = raft::make_host_matrix<float, extents_type>(batch_size, k);
+
+    auto stream = resource::get_cuda_stream(res);
+    raft::copy(queries_host.data_handle(), queries.data_handle(), queries_host.size(), stream);
+    raft::copy(
+      candidates_host.data_handle(), candidates.data_handle(), candidates_host.size(), stream);
+
+    raft::resource::sync_stream(res);  // wait for the queries and candidates
+    raft::neighbors::refine<index_type, data_type, float, extents_type>(res,
+                                                                        dataset_host,
+                                                                        queries_host.view(),
+                                                                        candidates_host.view(),
+                                                                        neighbors_host.view(),
+                                                                        distances_host.view(),
+                                                                        metric);
+
+    raft::copy(neighbors, neighbors_host.data_handle(), neighbors_host.size(), stream);
+    raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream);
+  }
+}
+
 }  // namespace raft::bench::ann
diff --git a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h
index ed9c120ed4..1c4b847d1a 100644
--- a/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_cagra_hnswlib_wrapper.h
@@ -41,10 +41,11 @@ class RaftCagraHnswlib : public ANN<T>, public AnnGPU {
 
   void set_search_param(const AnnSearchParam& param) override;
 
-  // TODO: if the number of results is less than k, the remaining elements of 'neighbors'
-  // will be filled with (size_t)-1
-  void search(
-    const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override;
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              AnnBase::index_type* neighbors,
+              float* distances) const override;
 
   [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override
   {
@@ -99,7 +100,7 @@ void RaftCagraHnswlib<T, IdxT>::load(const std::string& file)
 
 template <typename T, typename IdxT>
 void RaftCagraHnswlib<T, IdxT>::search(
-  const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const
+  const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const
 {
   hnswlib_search_.search(queries, batch_size, k, neighbors, distances);
 }
diff --git a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
index 46da8c52e6..0b892dec35 100644
--- a/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_cagra_wrapper.h
@@ -96,12 +96,16 @@ class RaftCagra : public ANN<T>, public AnnGPU {
 
   void set_search_dataset(const T* dataset, size_t nrow) override;
 
-  // TODO: if the number of results is less than k, the remaining elements of 'neighbors'
-  // will be filled with (size_t)-1
-  void search(
-    const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override;
-  void search_base(
-    const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const;
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              AnnBase::index_type* neighbors,
+              float* distances) const override;
+  void search_base(const T* queries,
+                   int batch_size,
+                   int k,
+                   AnnBase::index_type* neighbors,
+                   float* distances) const;
 
   [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override
   {
@@ -272,15 +276,18 @@ std::unique_ptr<ANN<T>> RaftCagra<T, IdxT>::copy()
 
 template <typename T, typename IdxT>
 void RaftCagra<T, IdxT>::search_base(
-  const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const
+  const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const
 {
+  static_assert(std::is_integral_v<AnnBase::index_type>);
+  static_assert(std::is_integral_v<IdxT>);
+
   IdxT* neighbors_IdxT;
-  rmm::device_uvector<IdxT> neighbors_storage(0, resource::get_cuda_stream(handle_));
-  if constexpr (std::is_same_v<IdxT, size_t>) {
-    neighbors_IdxT = neighbors;
+  std::optional<rmm::device_uvector<IdxT>> neighbors_storage{std::nullopt};
+  if constexpr (sizeof(IdxT) == sizeof(AnnBase::index_type)) {
+    neighbors_IdxT = reinterpret_cast<IdxT*>(neighbors);
   } else {
-    neighbors_storage.resize(batch_size * k, resource::get_cuda_stream(handle_));
-    neighbors_IdxT = neighbors_storage.data();
+    neighbors_storage.emplace(batch_size * k, resource::get_cuda_stream(handle_));
+    neighbors_IdxT = neighbors_storage->data();
   }
 
   auto queries_view =
@@ -291,76 +298,36 @@ void RaftCagra<T, IdxT>::search_base(
   raft::neighbors::cagra::search(
     handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
 
-  if constexpr (!std::is_same_v<IdxT, size_t>) {
+  if constexpr (sizeof(IdxT) != sizeof(AnnBase::index_type)) {
     raft::linalg::unaryOp(neighbors,
                           neighbors_IdxT,
                           batch_size * k,
-                          raft::cast_op<size_t>(),
+                          raft::cast_op<AnnBase::index_type>(),
                           raft::resource::get_cuda_stream(handle_));
   }
 }
 
 template <typename T, typename IdxT>
 void RaftCagra<T, IdxT>::search(
-  const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const
+  const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const
 {
   auto k0                       = static_cast<size_t>(refine_ratio_ * k);
   const bool disable_refinement = k0 <= static_cast<size_t>(k);
   const raft::resources& res    = handle_;
-  auto stream                   = resource::get_cuda_stream(res);
 
   if (disable_refinement) {
     search_base(queries, batch_size, k, neighbors, distances);
   } else {
-    auto candidate_ixs   = raft::make_device_matrix<int64_t, int64_t>(res, batch_size, k0);
-    auto candidate_dists = raft::make_device_matrix<float, int64_t>(res, batch_size, k0);
-    search_base(queries,
-                batch_size,
-                k0,
-                reinterpret_cast<size_t*>(candidate_ixs.data_handle()),
-                candidate_dists.data_handle());
-
-    if (raft::get_device_for_address(input_dataset_v_->data_handle()) >= 0) {
-      auto queries_v =
-        raft::make_device_matrix_view<const T, int64_t>(queries, batch_size, dimension_);
-      auto neighours_v = raft::make_device_matrix_view<int64_t, int64_t>(
-        reinterpret_cast<int64_t*>(neighbors), batch_size, k);
-      auto distances_v = raft::make_device_matrix_view<float, int64_t>(distances, batch_size, k);
-      raft::neighbors::refine<int64_t, T, float, int64_t>(
-        res,
-        *input_dataset_v_,
-        queries_v,
-        raft::make_const_mdspan(candidate_ixs.view()),
-        neighours_v,
-        distances_v,
-        index_->metric());
-    } else {
-      auto dataset_host = raft::make_host_matrix_view<const T, int64_t>(
-        input_dataset_v_->data_handle(), input_dataset_v_->extent(0), input_dataset_v_->extent(1));
-      auto queries_host    = raft::make_host_matrix<T, int64_t>(batch_size, dimension_);
-      auto candidates_host = raft::make_host_matrix<int64_t, int64_t>(batch_size, k0);
-      auto neighbors_host  = raft::make_host_matrix<int64_t, int64_t>(batch_size, k);
-      auto distances_host  = raft::make_host_matrix<float, int64_t>(batch_size, k);
-
-      raft::copy(queries_host.data_handle(), queries, queries_host.size(), stream);
-      raft::copy(
-        candidates_host.data_handle(), candidate_ixs.data_handle(), candidates_host.size(), stream);
-
-      raft::resource::sync_stream(res);  // wait for the queries and candidates
-      raft::neighbors::refine<int64_t, T, float, int64_t>(res,
-                                                          dataset_host,
-                                                          queries_host.view(),
-                                                          candidates_host.view(),
-                                                          neighbors_host.view(),
-                                                          distances_host.view(),
-                                                          index_->metric());
-
-      raft::copy(neighbors,
-                 reinterpret_cast<size_t*>(neighbors_host.data_handle()),
-                 neighbors_host.size(),
-                 stream);
-      raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream);
-    }
+    auto queries_v =
+      raft::make_device_matrix_view<const T, AnnBase::index_type>(queries, batch_size, dimension_);
+    auto candidate_ixs =
+      raft::make_device_matrix<AnnBase::index_type, AnnBase::index_type>(res, batch_size, k0);
+    auto candidate_dists =
+      raft::make_device_matrix<float, AnnBase::index_type>(res, batch_size, k0);
+    search_base(
+      queries, batch_size, k0, candidate_ixs.data_handle(), candidate_dists.data_handle());
+    refine_helper(
+      res, *input_dataset_v_, queries_v, candidate_ixs, k, neighbors, distances, index_->metric());
   }
 }
 }  // namespace raft::bench::ann
diff --git a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h
index 48d2b9de80..83a3a63aba 100644
--- a/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_ivf_flat_wrapper.h
@@ -61,10 +61,11 @@ class RaftIvfFlatGpu : public ANN<T>, public AnnGPU {
 
   void set_search_param(const AnnSearchParam& param) override;
 
-  // TODO: if the number of results is less than k, the remaining elements of 'neighbors'
-  // will be filled with (size_t)-1
-  void search(
-    const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override;
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              AnnBase::index_type* neighbors,
+              float* distances) const override;
 
   [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override
   {
@@ -131,17 +132,34 @@ std::unique_ptr<ANN<T>> RaftIvfFlatGpu<T, IdxT>::copy()
 
 template <typename T, typename IdxT>
 void RaftIvfFlatGpu<T, IdxT>::search(
-  const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const
+  const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const
 {
-  static_assert(sizeof(size_t) == sizeof(IdxT), "IdxT is incompatible with size_t");
+  static_assert(std::is_integral_v<AnnBase::index_type>);
+  static_assert(std::is_integral_v<IdxT>);
+
+  IdxT* neighbors_IdxT;
+  std::optional<rmm::device_uvector<IdxT>> neighbors_storage{std::nullopt};
+  if constexpr (sizeof(IdxT) == sizeof(AnnBase::index_type)) {
+    neighbors_IdxT = reinterpret_cast<IdxT*>(neighbors);
+  } else {
+    neighbors_storage.emplace(batch_size * k, resource::get_cuda_stream(handle_));
+    neighbors_IdxT = neighbors_storage->data();
+  }
   raft::neighbors::ivf_flat::search(handle_,
                                     search_params_,
                                     *index_,
                                     queries,
                                     batch_size,
                                     k,
-                                    (IdxT*)neighbors,
+                                    neighbors_IdxT,
                                     distances,
                                     resource::get_workspace_resource(handle_));
+  if constexpr (sizeof(IdxT) != sizeof(AnnBase::index_type)) {
+    raft::linalg::unaryOp(neighbors,
+                          neighbors_IdxT,
+                          batch_size * k,
+                          raft::cast_op<AnnBase::index_type>(),
+                          raft::resource::get_cuda_stream(handle_));
+  }
 }
 }  // namespace raft::bench::ann
diff --git a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
index 1d73bd2e51..7201467969 100644
--- a/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_ivf_pq_wrapper.h
@@ -61,10 +61,16 @@ class RaftIvfPQ : public ANN<T>, public AnnGPU {
   void set_search_param(const AnnSearchParam& param) override;
   void set_search_dataset(const T* dataset, size_t nrow) override;
 
-  // TODO: if the number of results is less than k, the remaining elements of 'neighbors'
-  // will be filled with (size_t)-1
-  void search(
-    const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const override;
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              AnnBase::index_type* neighbors,
+              float* distances) const override;
+  void search_base(const T* queries,
+                   int batch_size,
+                   int k,
+                   AnnBase::index_type* neighbors,
+                   float* distances) const;
 
   [[nodiscard]] auto get_sync_stream() const noexcept -> cudaStream_t override
   {
@@ -137,68 +143,61 @@ void RaftIvfPQ<T, IdxT>::set_search_dataset(const T* dataset, size_t nrow)
   dataset_ = raft::make_device_matrix_view<const T, IdxT>(dataset, nrow, index_->dim());
 }
 
+template <typename T, typename IdxT>
+void RaftIvfPQ<T, IdxT>::search_base(
+  const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const
+{
+  static_assert(std::is_integral_v<AnnBase::index_type>);
+  static_assert(std::is_integral_v<IdxT>);
+
+  IdxT* neighbors_IdxT;
+  std::optional<rmm::device_uvector<IdxT>> neighbors_storage{std::nullopt};
+  if constexpr (sizeof(IdxT) == sizeof(AnnBase::index_type)) {
+    neighbors_IdxT = reinterpret_cast<IdxT*>(neighbors);
+  } else {
+    neighbors_storage.emplace(batch_size * k, resource::get_cuda_stream(handle_));
+    neighbors_IdxT = neighbors_storage->data();
+  }
+
+  auto queries_view =
+    raft::make_device_matrix_view<const T, uint32_t>(queries, batch_size, dimension_);
+  auto neighbors_view =
+    raft::make_device_matrix_view<IdxT, uint32_t>(neighbors_IdxT, batch_size, k);
+  auto distances_view = raft::make_device_matrix_view<float, uint32_t>(distances, batch_size, k);
+
+  raft::neighbors::ivf_pq::search(
+    handle_, search_params_, *index_, queries_view, neighbors_view, distances_view);
+
+  if constexpr (sizeof(IdxT) != sizeof(AnnBase::index_type)) {
+    raft::linalg::unaryOp(neighbors,
+                          neighbors_IdxT,
+                          batch_size * k,
+                          raft::cast_op<AnnBase::index_type>(),
+                          raft::resource::get_cuda_stream(handle_));
+  }
+}
+
 template <typename T, typename IdxT>
 void RaftIvfPQ<T, IdxT>::search(
-  const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const
+  const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const
 {
-  if (refine_ratio_ > 1.0f) {
-    uint32_t k0 = static_cast<uint32_t>(refine_ratio_ * k);
-    auto queries_v =
-      raft::make_device_matrix_view<const T, uint32_t>(queries, batch_size, index_->dim());
-    auto distances_tmp = raft::make_device_matrix<float, uint32_t>(handle_, batch_size, k0);
-    auto candidates    = raft::make_device_matrix<IdxT, uint32_t>(handle_, batch_size, k0);
-
-    raft::neighbors::ivf_pq::search(
-      handle_, search_params_, *index_, queries_v, candidates.view(), distances_tmp.view());
-
-    if (raft::get_device_for_address(dataset_.data_handle()) >= 0) {
-      auto queries_v =
-        raft::make_device_matrix_view<const T, IdxT>(queries, batch_size, index_->dim());
-      auto neighbors_v = raft::make_device_matrix_view<IdxT, IdxT>((IdxT*)neighbors, batch_size, k);
-      auto distances_v = raft::make_device_matrix_view<float, IdxT>(distances, batch_size, k);
-
-      raft::neighbors::refine<IdxT, T, float, IdxT>(handle_,
-                                                    dataset_,
-                                                    queries_v,
-                                                    candidates.view(),
-                                                    neighbors_v,
-                                                    distances_v,
-                                                    index_->metric());
-    } else {
-      auto queries_host    = raft::make_host_matrix<T, IdxT>(batch_size, index_->dim());
-      auto candidates_host = raft::make_host_matrix<IdxT, IdxT>(batch_size, k0);
-      auto neighbors_host  = raft::make_host_matrix<IdxT, IdxT>(batch_size, k);
-      auto distances_host  = raft::make_host_matrix<float, IdxT>(batch_size, k);
-
-      auto stream = resource::get_cuda_stream(handle_);
-      raft::copy(queries_host.data_handle(), queries, queries_host.size(), stream);
-      raft::copy(
-        candidates_host.data_handle(), candidates.data_handle(), candidates_host.size(), stream);
-
-      auto dataset_v = raft::make_host_matrix_view<const T, IdxT>(
-        dataset_.data_handle(), dataset_.extent(0), dataset_.extent(1));
-
-      raft::resource::sync_stream(handle_);  // wait for the queries and candidates
-      raft::neighbors::refine<IdxT, T, float, IdxT>(handle_,
-                                                    dataset_v,
-                                                    queries_host.view(),
-                                                    candidates_host.view(),
-                                                    neighbors_host.view(),
-                                                    distances_host.view(),
-                                                    index_->metric());
-
-      raft::copy(neighbors, (size_t*)neighbors_host.data_handle(), neighbors_host.size(), stream);
-      raft::copy(distances, distances_host.data_handle(), distances_host.size(), stream);
-    }
+  auto k0                       = static_cast<size_t>(refine_ratio_ * k);
+  const bool disable_refinement = k0 <= static_cast<size_t>(k);
+  const raft::resources& res    = handle_;
+
+  if (disable_refinement) {
+    search_base(queries, batch_size, k, neighbors, distances);
   } else {
     auto queries_v =
-      raft::make_device_matrix_view<const T, uint32_t>(queries, batch_size, index_->dim());
-    auto neighbors_v =
-      raft::make_device_matrix_view<IdxT, uint32_t>((IdxT*)neighbors, batch_size, k);
-    auto distances_v = raft::make_device_matrix_view<float, uint32_t>(distances, batch_size, k);
-
-    raft::neighbors::ivf_pq::search(
-      handle_, search_params_, *index_, queries_v, neighbors_v, distances_v);
+      raft::make_device_matrix_view<const T, AnnBase::index_type>(queries, batch_size, dimension_);
+    auto candidate_ixs =
+      raft::make_device_matrix<AnnBase::index_type, AnnBase::index_type>(res, batch_size, k0);
+    auto candidate_dists =
+      raft::make_device_matrix<float, AnnBase::index_type>(res, batch_size, k0);
+    search_base(
+      queries, batch_size, k0, candidate_ixs.data_handle(), candidate_dists.data_handle());
+    refine_helper(
+      res, dataset_, queries_v, candidate_ixs, k, neighbors, distances, index_->metric());
   }
 }
 }  // namespace raft::bench::ann
diff --git a/cpp/bench/ann/src/raft/raft_wrapper.h b/cpp/bench/ann/src/raft/raft_wrapper.h
index 586b81ae06..2c996058b2 100644
--- a/cpp/bench/ann/src/raft/raft_wrapper.h
+++ b/cpp/bench/ann/src/raft/raft_wrapper.h
@@ -56,10 +56,11 @@ class RaftGpu : public ANN<T>, public AnnGPU {
 
   void set_search_param(const AnnSearchParam& param) override;
 
-  // TODO: if the number of results is less than k, the remaining elements of 'neighbors'
-  // will be filled with (size_t)-1
-  void search(
-    const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const final;
+  void search(const T* queries,
+              int batch_size,
+              int k,
+              AnnBase::index_type* neighbors,
+              float* distances) const final;
 
   // to enable dataset access from GPU memory
   AlgoProperty get_preference() const override
@@ -133,15 +134,16 @@ void RaftGpu<T>::load(const std::string& file)
 
 template <typename T>
 void RaftGpu<T>::search(
-  const T* queries, int batch_size, int k, size_t* neighbors, float* distances) const
+  const T* queries, int batch_size, int k, AnnBase::index_type* neighbors, float* distances) const
 {
   auto queries_view =
     raft::make_device_matrix_view<const T, int64_t>(queries, batch_size, this->dim_);
 
-  auto neighbors_view = raft::make_device_matrix_view<size_t, int64_t>(neighbors, batch_size, k);
+  auto neighbors_view =
+    raft::make_device_matrix_view<AnnBase::index_type, int64_t>(neighbors, batch_size, k);
   auto distances_view = raft::make_device_matrix_view<float, int64_t>(distances, batch_size, k);
 
-  raft::neighbors::brute_force::search<T, size_t>(
+  raft::neighbors::brute_force::search<T, AnnBase::index_type>(
     handle_, *index_, queries_view, neighbors_view, distances_view);
 }
 

From 8158538f74afaf57ae23f82ed2ebb4efab82c666 Mon Sep 17 00:00:00 2001
From: tsuki <12711693+enp1s0@users.noreply.github.com>
Date: Thu, 16 May 2024 06:05:34 +0200
Subject: [PATCH 47/60] Fix citation info (#2318)

I found incorrect citation information in README.md. This PR fixes that and also updates the CAGRA paper information.

Authors:
  - tsuki (https://github.com/enp1s0)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2318
---
 README.md | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index c501c37b2f..ae6591df00 100755
--- a/README.md
+++ b/README.md
@@ -354,10 +354,8 @@ If citing CAGRA, please consider the following bibtex:
 @misc{ootomo2023cagra,
       title={CAGRA: Highly Parallel Graph Construction and Approximate Nearest Neighbor Search for GPUs},
       author={Hiroyuki Ootomo and Akira Naruse and Corey Nolet and Ray Wang and Tamas Feher and Yong Wang},
-      year={2023},
-      eprint={2308.15136},
-      archivePrefix={arXiv},
-      primaryClass={cs.DS}
+      year={2024},
+      series = {ICDE '24}
 }
 ```
 
@@ -365,13 +363,14 @@ If citing the k-selection routines, please consider the following bibtex:
 
 ```bibtex
 @proceedings{10.1145/3581784,
-    title = {SC '23: Proceedings of the International Conference for High Performance Computing, Networking, Storage and Analysis},
+    title = {Parallel Top-K Algorithms on GPU: A Comprehensive Study and New Methods},
+    author={Jingrong Zhang and Akira Naruse and Xipeng Li and Yong Wang},
     year = {2023},
     isbn = {9798400701092},
     publisher = {Association for Computing Machinery},
     address = {New York, NY, USA},
-    abstract = {Started in 1988, the SC Conference has become the annual nexus for researchers and practitioners from academia, industry and government to share information and foster collaborations to advance the state of the art in High Performance Computing (HPC), Networking, Storage, and Analysis.},
-    location = {, Denver, CO, USA, }
+    location = {Denver, CO, USA},
+    series = {SC '23}
 }
 ```
 
@@ -394,4 +393,4 @@ If citing the nearest neighbors descent API, please consider the following bibte
     location = {Virtual Event, Queensland, Australia},
     series = {CIKM '21}
 }
-```
\ No newline at end of file
+```

From 7e374514cc29ec542286cc69c384b37dff1150b4 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Thu, 16 May 2024 11:55:30 -0400
Subject: [PATCH 48/60] Correct initializer list order found by cuvs (#2317)

Align our constructor initializer order to match what the C++ spec says will occur:
```
Then, non-static data members are initialized in the order they were declared in the class definition (again regardless of the order of the mem-initializers).
```

Otherwise consumers of raft will get warnings when building with `-Wall -Wextra` and using the `index` type.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2317
---
 cpp/include/raft/neighbors/ivf_flat_types.hpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/neighbors/ivf_flat_types.hpp b/cpp/include/raft/neighbors/ivf_flat_types.hpp
index 0908e3b0b0..7605bd82a3 100644
--- a/cpp/include/raft/neighbors/ivf_flat_types.hpp
+++ b/cpp/include/raft/neighbors/ivf_flat_types.hpp
@@ -272,10 +272,10 @@ struct index : ann::index {
       metric_(metric),
       adaptive_centers_(adaptive_centers),
       conservative_memory_allocation_{conservative_memory_allocation},
-      centers_(make_device_matrix<float, uint32_t>(res, n_lists, dim)),
-      center_norms_(std::nullopt),
       lists_{n_lists},
       list_sizes_{make_device_vector<uint32_t, uint32_t>(res, n_lists)},
+      centers_(make_device_matrix<float, uint32_t>(res, n_lists, dim)),
+      center_norms_(std::nullopt),
       data_ptrs_{make_device_vector<T*, uint32_t>(res, n_lists)},
       inds_ptrs_{make_device_vector<IdxT*, uint32_t>(res, n_lists)},
       accum_sorted_sizes_{make_host_vector<IdxT, uint32_t>(n_lists + 1)}

From 12f00962024a1947649e09cd57e2203da0d840bd Mon Sep 17 00:00:00 2001
From: Chuck Hastings <45364586+ChuckHastings@users.noreply.github.com>
Date: Fri, 17 May 2024 00:04:36 -0400
Subject: [PATCH 49/60] Refactor spectral scale_obs to use existing
 normalization function (#2319)

The scale_obs function was calling a custom kernel to scale the elements of a matrix column by the l2-norm of the column.

There were two issues:
1. The kernel launch parameters would go out of bounds if the graph was too large.  The Y dimension is limited to 65535, but there was no logic in the function to ensure that we didn't set the Y value larger than that.
2. A bug in the kernel: the column norm was not being calculated correctly. The outer loop was terminating, hence we were really only computing the column norm of the last column in the block, and then normalizing all columns in the block by that value instead of normalizing each column by its own norm.

To simplify (there's going to be some optimization work done this summer), I replaced this with a call to the existing `raft::linalg::row_normalize` function, which scales the values correctly.

Authors:
  - Chuck Hastings (https://github.com/ChuckHastings)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2319
---
 cpp/include/raft/linalg/normalize.cuh         |  2 +
 .../detail/modularity_maximization.hpp        |  6 +-
 .../raft/spectral/detail/spectral_util.cuh    | 81 +------------------
 3 files changed, 7 insertions(+), 82 deletions(-)

diff --git a/cpp/include/raft/linalg/normalize.cuh b/cpp/include/raft/linalg/normalize.cuh
index 1f60860c8c..de5f4e62ce 100644
--- a/cpp/include/raft/linalg/normalize.cuh
+++ b/cpp/include/raft/linalg/normalize.cuh
@@ -18,9 +18,11 @@
 
 #include "detail/normalize.cuh"
 
+#include <raft/core/device_mdspan.hpp>
 #include <raft/core/operators.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/linalg/norm_types.hpp>
+#include <raft/util/input_validation.hpp>
 
 namespace raft {
 namespace linalg {
diff --git a/cpp/include/raft/spectral/detail/modularity_maximization.hpp b/cpp/include/raft/spectral/detail/modularity_maximization.hpp
index 2a3b5cf36c..a4e504883a 100644
--- a/cpp/include/raft/spectral/detail/modularity_maximization.hpp
+++ b/cpp/include/raft/spectral/detail/modularity_maximization.hpp
@@ -19,6 +19,7 @@
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/linalg/normalize.cuh>
 #include <raft/spectral/cluster_solvers.cuh>
 #include <raft/spectral/detail/spectral_util.cuh>
 #include <raft/spectral/eigen_solvers.cuh>
@@ -101,8 +102,9 @@ std::tuple<vertex_t, weight_t, vertex_t> modularity_maximization(
 
   // notice that at this point the matrix has already been transposed, so we are scaling
   // columns
-  scale_obs(nEigVecs, n, eigVecs);
-  RAFT_CHECK_CUDA(stream);
+  auto dataset_view = raft::make_device_matrix_view(eigVecs, nEigVecs, n);
+  raft::linalg::row_normalize(
+    handle, raft::make_const_mdspan(dataset_view), dataset_view, raft::linalg::L2Norm);
 
   // Find partition clustering
   auto pair_cluster = cluster_solver.solve(handle, n, nEigVecs, eigVecs, clusters);
diff --git a/cpp/include/raft/spectral/detail/spectral_util.cuh b/cpp/include/raft/spectral/detail/spectral_util.cuh
index 736936a1f1..002fad9680 100644
--- a/cpp/include/raft/spectral/detail/spectral_util.cuh
+++ b/cpp/include/raft/spectral/detail/spectral_util.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -39,85 +39,6 @@
 namespace raft {
 namespace spectral {
 
-template <typename index_type_t, typename value_type_t>
-RAFT_KERNEL scale_obs_kernel(index_type_t m, index_type_t n, value_type_t* obs)
-{
-  index_type_t i, j, k, index, mm;
-  value_type_t alpha, v, last;
-  bool valid;
-  // ASSUMPTION: kernel is launched with either 2, 4, 8, 16 or 32 threads in x-dimension
-
-  // compute alpha
-  mm    = (((m + blockDim.x - 1) / blockDim.x) * blockDim.x);  // m in multiple of blockDim.x
-  alpha = 0.0;
-
-  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) {
-    for (i = threadIdx.x; i < mm; i += blockDim.x) {
-      // check if the thread is valid
-      valid = i < m;
-
-      // get the value of the last thread
-      last = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x);
-
-      // if you are valid read the value from memory, otherwise set your value to 0
-      alpha = (valid) ? obs[i + j * m] : 0.0;
-      alpha = alpha * alpha;
-
-      // do prefix sum (of size warpSize=blockDim.x =< 32)
-      for (k = 1; k < blockDim.x; k *= 2) {
-        v = __shfl_up_sync(warp_full_mask(), alpha, k, blockDim.x);
-        if (threadIdx.x >= k) alpha += v;
-      }
-      // shift by last
-      alpha += last;
-    }
-  }
-
-  // scale by alpha
-  alpha = __shfl_sync(warp_full_mask(), alpha, blockDim.x - 1, blockDim.x);
-  alpha = raft::sqrt(alpha);
-  for (j = threadIdx.y + blockIdx.y * blockDim.y; j < n; j += blockDim.y * gridDim.y) {
-    for (i = threadIdx.x; i < m; i += blockDim.x) {  // blockDim.x=32
-      index      = i + j * m;
-      obs[index] = obs[index] / alpha;
-    }
-  }
-}
-
-template <typename index_type_t>
-index_type_t next_pow2(index_type_t n)
-{
-  index_type_t v;
-  // Reference:
-  // http://graphics.stanford.edu/~seander/bithacks.html#RoundUpPowerOf2Float
-  v = n - 1;
-  v |= v >> 1;
-  v |= v >> 2;
-  v |= v >> 4;
-  v |= v >> 8;
-  v |= v >> 16;
-  return v + 1;
-}
-
-template <typename index_type_t, typename value_type_t>
-cudaError_t scale_obs(index_type_t m, index_type_t n, value_type_t* obs)
-{
-  index_type_t p2m;
-
-  // find next power of 2
-  p2m = next_pow2<index_type_t>(m);
-  // setup launch configuration
-  unsigned int xsize = std::max(2, std::min(p2m, 32));
-  dim3 nthreads{xsize, 256 / xsize, 1};
-
-  dim3 nblocks{1, (n + nthreads.y - 1) / nthreads.y, 1};
-
-  // launch scaling kernel (scale each column of obs by its norm)
-  scale_obs_kernel<index_type_t, value_type_t><<<nblocks, nthreads>>>(m, n, obs);
-
-  return cudaSuccess;
-}
-
 template <typename vertex_t, typename edge_t, typename weight_t>
 void transform_eigen_matrix(raft::resources const& handle,
                             edge_t n,

From 1e3ebaf4726311980a9b27b7221f9c768504f7a6 Mon Sep 17 00:00:00 2001
From: Jake Awe <50372925+AyodeAwe@users.noreply.github.com>
Date: Mon, 20 May 2024 09:36:19 -0500
Subject: [PATCH 50/60] Adds missing files to `update-version.sh` (#2255)

* add missing files to update-version.sh

* use alternate regex

* remove unneeded sed following #2285
---
 ci/release/update-version.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index ef9b3e4b83..9554a7dde8 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -92,6 +92,7 @@ sed_runner "s|branch-[0-9][0-9].[0-9][0-9]|branch-${NEXT_SHORT_TAG}|g" README.md
 find .devcontainer/ -type f -name devcontainer.json -print0 | while IFS= read -r -d '' filename; do
     sed_runner "s@rapidsai/devcontainers:[0-9.]*@rapidsai/devcontainers:${NEXT_SHORT_TAG}@g" "${filename}"
     sed_runner "s@rapidsai/devcontainers/features/ucx:[0-9.]*@rapidsai/devcontainers/features/ucx:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
+    sed_runner "s@rapidsai/devcontainers/features/cuda:[0-9.]*@rapidsai/devcontainers/features/cuda:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
     sed_runner "s@rapidsai/devcontainers/features/rapids-build-utils:[0-9.]*@rapidsai/devcontainers/features/rapids-build-utils:${NEXT_SHORT_TAG_PEP440}@" "${filename}"
     sed_runner "s@rapids-\${localWorkspaceFolderBasename}-${CURRENT_SHORT_TAG}@rapids-\${localWorkspaceFolderBasename}-${NEXT_SHORT_TAG}@g" "${filename}"
 done

From 5a8224cd0329e4b6350c78ae987d5bd668620399 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 21 May 2024 11:04:42 -0400
Subject: [PATCH 51/60] Support building faiss main statically (#2323)

Cleans up a collection of anti-patterns in the raft CMake code while also enabling building faiss from latest `main`

Authors:
  - Robert Maynard (https://github.com/robertmaynard)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2323
---
 .devcontainer/Dockerfile                |   7 +
 cpp/CMakeLists.txt                      |   8 +-
 cpp/bench/ann/CMakeLists.txt            | 117 ++----
 cpp/bench/prims/CMakeLists.txt          | 106 ++---
 cpp/cmake/patches/faiss_override.json   |   9 +
 cpp/cmake/patches/ggnn_override.json    |  16 +
 cpp/cmake/patches/hnswlib_override.json |  16 +
 cpp/cmake/thirdparty/get_faiss.cmake    | 188 +++++----
 cpp/cmake/thirdparty/get_ggnn.cmake     |  44 +-
 cpp/cmake/thirdparty/get_hnswlib.cmake  |  70 ++--
 cpp/test/CMakeLists.txt                 | 522 ++++++++++++------------
 11 files changed, 554 insertions(+), 549 deletions(-)
 create mode 100644 cpp/cmake/patches/faiss_override.json
 create mode 100644 cpp/cmake/patches/ggnn_override.json
 create mode 100644 cpp/cmake/patches/hnswlib_override.json

diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile
index 9d35e3f97f..594ba8c3c4 100644
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -5,6 +5,13 @@ ARG PYTHON_PACKAGE_MANAGER=conda
 
 FROM ${BASE} as pip-base
 
+RUN apt update -y \
+ && DEBIAN_FRONTEND=noninteractive apt install -y --no-install-recommends \
+    # faiss dependencies
+    libblas-dev \
+    liblapack-dev \
+ && rm -rf /tmp/* /var/tmp/* /var/cache/apt/* /var/lib/apt/lists/*;
+
 ENV DEFAULT_VIRTUAL_ENV=rapids
 
 FROM ${BASE} as conda-base
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 7270c5a12b..39472cae67 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -825,26 +825,26 @@ rapids_export(
 # * shared test/bench headers ------------------------------------------------
 
 if(BUILD_TESTS OR BUILD_PRIMS_BENCH)
-  include(internal/CMakeLists.txt)
+  add_subdirectory(internal)
 endif()
 
 # ##################################################################################################
 # * build test executable ----------------------------------------------------
 
 if(BUILD_TESTS)
-  include(test/CMakeLists.txt)
+  add_subdirectory(test)
 endif()
 
 # ##################################################################################################
 # * build benchmark executable -----------------------------------------------
 
 if(BUILD_PRIMS_BENCH)
-  include(bench/prims/CMakeLists.txt)
+  add_subdirectory(bench/prims/)
 endif()
 
 # ##################################################################################################
 # * build ann benchmark executable -----------------------------------------------
 
 if(BUILD_ANN_BENCH)
-  include(bench/ann/CMakeLists.txt)
+  add_subdirectory(bench/ann/)
 endif()
diff --git a/cpp/bench/ann/CMakeLists.txt b/cpp/bench/ann/CMakeLists.txt
index f29d32ccde..f489cc62c6 100644
--- a/cpp/bench/ann/CMakeLists.txt
+++ b/cpp/bench/ann/CMakeLists.txt
@@ -12,6 +12,8 @@
 # the License.
 # =============================================================================
 
+list(APPEND CMAKE_MODULE_PATH "${RAFT_SOURCE_DIR}")
+
 # ##################################################################################################
 # * benchmark options ------------------------------------------------------------------------------
 
@@ -40,48 +42,26 @@ option(RAFT_ANN_BENCH_SINGLE_EXE
 
 find_package(Threads REQUIRED)
 
+set(RAFT_ANN_BENCH_USE_FAISS ON)
+set(RAFT_FAISS_ENABLE_GPU ON)
+set(RAFT_USE_FAISS_STATIC ON)
+
 if(BUILD_CPU_ONLY)
 
   # Include necessary logging dependencies
-  include(cmake/thirdparty/get_fmt.cmake)
-  include(cmake/thirdparty/get_spdlog.cmake)
-
+  include(cmake/thirdparty/get_fmt)
+  include(cmake/thirdparty/get_spdlog)
   set(RAFT_FAISS_ENABLE_GPU OFF)
-  set(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT OFF)
-  set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT OFF)
-  set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ OFF)
   set(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT OFF)
   set(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ OFF)
   set(RAFT_ANN_BENCH_USE_RAFT_CAGRA OFF)
   set(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE OFF)
   set(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB OFF)
   set(RAFT_ANN_BENCH_USE_GGNN OFF)
-else()
+elseif(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0)
   # Disable faiss benchmarks on CUDA 12 since faiss is not yet CUDA 12-enabled.
   # https://github.com/rapidsai/raft/issues/1627
-  if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.0.0)
-    set(RAFT_FAISS_ENABLE_GPU OFF)
-    set(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT OFF)
-    set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT OFF)
-    set(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ OFF)
-    set(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT OFF)
-    set(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ OFF)
-    set(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT OFF)
-  else()
-    set(RAFT_FAISS_ENABLE_GPU ON)
-  endif()
-endif()
-
-set(RAFT_ANN_BENCH_USE_FAISS OFF)
-if(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT
-   OR RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ
-   OR RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT
-   OR RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT
-   OR RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ
-   OR RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT
-)
-  set(RAFT_ANN_BENCH_USE_FAISS ON)
-  set(RAFT_USE_FAISS_STATIC ON)
+  set(RAFT_FAISS_ENABLE_GPU OFF)
 endif()
 
 set(RAFT_ANN_BENCH_USE_RAFT OFF)
@@ -98,21 +78,17 @@ endif()
 # * Fetch requirements -------------------------------------------------------------
 
 if(RAFT_ANN_BENCH_USE_HNSWLIB OR RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB)
-  include(cmake/thirdparty/get_hnswlib.cmake)
+  include(cmake/thirdparty/get_hnswlib)
 endif()
 
-include(cmake/thirdparty/get_nlohmann_json.cmake)
+include(cmake/thirdparty/get_nlohmann_json)
 
 if(RAFT_ANN_BENCH_USE_GGNN)
-  include(cmake/thirdparty/get_ggnn.cmake)
+  include(cmake/thirdparty/get_ggnn)
 endif()
 
 if(RAFT_ANN_BENCH_USE_FAISS)
-  # We need to ensure that faiss has all the conda information. So we currently use the very ugly
-  # hammer of `link_libraries` to ensure that all targets in this directory and the faiss directory
-  # will have the conda includes/link dirs
-  link_libraries($<TARGET_NAME_IF_EXISTS:conda_env>)
-  include(cmake/thirdparty/get_faiss.cmake)
+  include(cmake/thirdparty/get_faiss)
 endif()
 
 # ##################################################################################################
@@ -173,8 +149,6 @@ function(ConfigureAnnBench)
             $<$<BOOL:${GPU_BUILD}>:${RAFT_CTK_MATH_DEPENDENCIES}>
             $<TARGET_NAME_IF_EXISTS:OpenMP::OpenMP_CXX>
             $<TARGET_NAME_IF_EXISTS:conda_env>
-            -static-libgcc
-            -static-libstdc++
             $<$<BOOL:${BUILD_CPU_ONLY}>:fmt::fmt-header-only>
             $<$<BOOL:${BUILD_CPU_ONLY}>:spdlog::spdlog_header_only>
   )
@@ -225,7 +199,7 @@ endfunction()
 
 if(RAFT_ANN_BENCH_USE_HNSWLIB)
   ConfigureAnnBench(
-    NAME HNSWLIB PATH bench/ann/src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib
+    NAME HNSWLIB PATH src/hnswlib/hnswlib_benchmark.cpp LINKS hnswlib::hnswlib
   )
 
 endif()
@@ -235,8 +209,8 @@ if(RAFT_ANN_BENCH_USE_RAFT_IVF_PQ)
     NAME
     RAFT_IVF_PQ
     PATH
-    bench/ann/src/raft/raft_benchmark.cu
-    $<$<BOOL:${RAFT_ANN_BENCH_USE_RAFT_IVF_PQ}>:bench/ann/src/raft/raft_ivf_pq.cu>
+    src/raft/raft_benchmark.cu
+    src/raft/raft_ivf_pq.cu
     LINKS
     raft::compiled
   )
@@ -247,8 +221,8 @@ if(RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT)
     NAME
     RAFT_IVF_FLAT
     PATH
-    bench/ann/src/raft/raft_benchmark.cu
-    $<$<BOOL:${RAFT_ANN_BENCH_USE_RAFT_IVF_FLAT}>:bench/ann/src/raft/raft_ivf_flat.cu>
+    src/raft/raft_benchmark.cu
+    src/raft/raft_ivf_flat.cu
     LINKS
     raft::compiled
   )
@@ -256,7 +230,7 @@ endif()
 
 if(RAFT_ANN_BENCH_USE_RAFT_BRUTE_FORCE)
   ConfigureAnnBench(
-    NAME RAFT_BRUTE_FORCE PATH bench/ann/src/raft/raft_benchmark.cu LINKS raft::compiled
+    NAME RAFT_BRUTE_FORCE PATH src/raft/raft_benchmark.cu LINKS raft::compiled
   )
 endif()
 
@@ -265,11 +239,11 @@ if(RAFT_ANN_BENCH_USE_RAFT_CAGRA)
     NAME
     RAFT_CAGRA
     PATH
-    bench/ann/src/raft/raft_benchmark.cu
-    $<$<BOOL:${RAFT_ANN_BENCH_USE_RAFT_CAGRA}>:bench/ann/src/raft/raft_cagra_float.cu>
-    $<$<BOOL:${RAFT_ANN_BENCH_USE_RAFT_CAGRA}>:bench/ann/src/raft/raft_cagra_half.cu>
-    $<$<BOOL:${RAFT_ANN_BENCH_USE_RAFT_CAGRA}>:bench/ann/src/raft/raft_cagra_int8_t.cu>
-    $<$<BOOL:${RAFT_ANN_BENCH_USE_RAFT_CAGRA}>:bench/ann/src/raft/raft_cagra_uint8_t.cu>
+    src/raft/raft_benchmark.cu
+    src/raft/raft_cagra_float.cu
+    src/raft/raft_cagra_half.cu
+    src/raft/raft_cagra_int8_t.cu
+    src/raft/raft_cagra_uint8_t.cu
     LINKS
     raft::compiled
   )
@@ -277,76 +251,63 @@ endif()
 
 if(RAFT_ANN_BENCH_USE_RAFT_CAGRA_HNSWLIB)
   ConfigureAnnBench(
-    NAME RAFT_CAGRA_HNSWLIB PATH bench/ann/src/raft/raft_cagra_hnswlib.cu LINKS raft::compiled
+    NAME RAFT_CAGRA_HNSWLIB PATH src/raft/raft_cagra_hnswlib.cu LINKS raft::compiled
     hnswlib::hnswlib
   )
 endif()
 
-set(RAFT_FAISS_TARGETS faiss::faiss)
-if(TARGET faiss::faiss_avx2)
-  set(RAFT_FAISS_TARGETS faiss::faiss_avx2)
-endif()
-
 message("RAFT_FAISS_TARGETS: ${RAFT_FAISS_TARGETS}")
 message("CUDAToolkit_LIBRARY_DIR: ${CUDAToolkit_LIBRARY_DIR}")
 if(RAFT_ANN_BENCH_USE_FAISS_CPU_FLAT)
   ConfigureAnnBench(
-    NAME FAISS_CPU_FLAT PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS
+    NAME FAISS_CPU_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
     ${RAFT_FAISS_TARGETS}
   )
 endif()
 
 if(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_FLAT)
   ConfigureAnnBench(
-    NAME FAISS_CPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS
+    NAME FAISS_CPU_IVF_FLAT PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
     ${RAFT_FAISS_TARGETS}
   )
 endif()
 
 if(RAFT_ANN_BENCH_USE_FAISS_CPU_IVF_PQ)
   ConfigureAnnBench(
-    NAME FAISS_CPU_IVF_PQ PATH bench/ann/src/faiss/faiss_cpu_benchmark.cpp LINKS
+    NAME FAISS_CPU_IVF_PQ PATH src/faiss/faiss_cpu_benchmark.cpp LINKS
     ${RAFT_FAISS_TARGETS}
   )
 endif()
 
-if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT)
+if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_FLAT AND RAFT_FAISS_ENABLE_GPU)
   ConfigureAnnBench(
-    NAME FAISS_GPU_IVF_FLAT PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS
+    NAME FAISS_GPU_IVF_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS
     ${RAFT_FAISS_TARGETS}
   )
 endif()
 
-if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ)
+if(RAFT_ANN_BENCH_USE_FAISS_GPU_IVF_PQ AND RAFT_FAISS_ENABLE_GPU)
   ConfigureAnnBench(
-    NAME FAISS_GPU_IVF_PQ PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS
+    NAME FAISS_GPU_IVF_PQ PATH src/faiss/faiss_gpu_benchmark.cu LINKS
     ${RAFT_FAISS_TARGETS}
   )
 endif()
 
-if(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT)
+if(RAFT_ANN_BENCH_USE_FAISS_GPU_FLAT AND RAFT_FAISS_ENABLE_GPU)
   ConfigureAnnBench(
-    NAME FAISS_GPU_FLAT PATH bench/ann/src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS}
+    NAME FAISS_GPU_FLAT PATH src/faiss/faiss_gpu_benchmark.cu LINKS ${RAFT_FAISS_TARGETS}
   )
 endif()
 
 if(RAFT_ANN_BENCH_USE_GGNN)
-  include(cmake/thirdparty/get_glog.cmake)
-  ConfigureAnnBench(NAME GGNN PATH bench/ann/src/ggnn/ggnn_benchmark.cu LINKS glog::glog ggnn::ggnn)
+  include(cmake/thirdparty/get_glog)
+  ConfigureAnnBench(NAME GGNN PATH src/ggnn/ggnn_benchmark.cu LINKS glog::glog ggnn::ggnn)
 endif()
 
 # ##################################################################################################
 # * Dynamically-loading ANN_BENCH executable -------------------------------------------------------
 if(RAFT_ANN_BENCH_SINGLE_EXE)
-  add_executable(ANN_BENCH bench/ann/src/common/benchmark.cpp)
-
-  # Build and link static version of the GBench to keep ANN_BENCH self-contained.
-  get_target_property(TMP_PROP benchmark::benchmark SOURCES)
-  add_library(benchmark_static STATIC ${TMP_PROP})
-  get_target_property(TMP_PROP benchmark::benchmark INCLUDE_DIRECTORIES)
-  target_include_directories(benchmark_static PUBLIC ${TMP_PROP})
-  get_target_property(TMP_PROP benchmark::benchmark LINK_LIBRARIES)
-  target_link_libraries(benchmark_static PUBLIC ${TMP_PROP})
+  add_executable(ANN_BENCH src/common/benchmark.cpp)
 
   target_include_directories(ANN_BENCH PRIVATE ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
 
@@ -354,7 +315,7 @@ if(RAFT_ANN_BENCH_SINGLE_EXE)
     ANN_BENCH
     PRIVATE raft::raft
             nlohmann_json::nlohmann_json
-            benchmark_static
+            benchmark::benchmark
             dl
             -static-libgcc
             fmt::fmt-header-only
diff --git a/cpp/bench/prims/CMakeLists.txt b/cpp/bench/prims/CMakeLists.txt
index 0c5521d447..0771a60e58 100644
--- a/cpp/bench/prims/CMakeLists.txt
+++ b/cpp/bench/prims/CMakeLists.txt
@@ -75,31 +75,31 @@ endfunction()
 
 if(BUILD_PRIMS_BENCH)
   ConfigureBench(
-    NAME CORE_BENCH PATH bench/prims/core/bitset.cu bench/prims/core/copy.cu bench/prims/main.cpp
+    NAME CORE_BENCH PATH core/bitset.cu core/copy.cu main.cpp
   )
 
   ConfigureBench(
-    NAME CLUSTER_BENCH PATH bench/prims/cluster/kmeans_balanced.cu bench/prims/cluster/kmeans.cu
-    bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
+    NAME CLUSTER_BENCH PATH cluster/kmeans_balanced.cu cluster/kmeans.cu
+    main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
   )
 
   ConfigureBench(
-    NAME TUNE_DISTANCE PATH bench/prims/distance/tune_pairwise/kernel.cu
-    bench/prims/distance/tune_pairwise/bench.cu bench/prims/main.cpp
+    NAME TUNE_DISTANCE PATH distance/tune_pairwise/kernel.cu
+    distance/tune_pairwise/bench.cu main.cpp
   )
 
   ConfigureBench(
     NAME
     DISTANCE_BENCH
     PATH
-    bench/prims/distance/distance_cosine.cu
-    bench/prims/distance/distance_exp_l2.cu
-    bench/prims/distance/distance_l1.cu
-    bench/prims/distance/distance_unexp_l2.cu
-    bench/prims/distance/fused_l2_nn.cu
-    bench/prims/distance/masked_nn.cu
-    bench/prims/distance/kernels.cu
-    bench/prims/main.cpp
+    distance/distance_cosine.cu
+    distance/distance_exp_l2.cu
+    distance/distance_l1.cu
+    distance/distance_unexp_l2.cu
+    distance/fused_l2_nn.cu
+    distance/masked_nn.cu
+    distance/kernels.cu
+    main.cpp
     OPTIONAL
     LIB
     EXPLICIT_INSTANTIATE_ONLY
@@ -109,64 +109,64 @@ if(BUILD_PRIMS_BENCH)
     NAME
     LINALG_BENCH
     PATH
-    bench/prims/linalg/add.cu
-    bench/prims/linalg/map_then_reduce.cu
-    bench/prims/linalg/matrix_vector_op.cu
-    bench/prims/linalg/norm.cu
-    bench/prims/linalg/normalize.cu
-    bench/prims/linalg/reduce_cols_by_key.cu
-    bench/prims/linalg/reduce_rows_by_key.cu
-    bench/prims/linalg/reduce.cu
-    bench/prims/linalg/sddmm.cu
-    bench/prims/main.cpp
+    linalg/add.cu
+    linalg/map_then_reduce.cu
+    linalg/matrix_vector_op.cu
+    linalg/norm.cu
+    linalg/normalize.cu
+    linalg/reduce_cols_by_key.cu
+    linalg/reduce_rows_by_key.cu
+    linalg/reduce.cu
+    linalg/sddmm.cu
+    main.cpp
   )
 
   ConfigureBench(
-    NAME MATRIX_BENCH PATH bench/prims/matrix/argmin.cu bench/prims/matrix/gather.cu
-    bench/prims/matrix/select_k.cu bench/prims/main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
+    NAME MATRIX_BENCH PATH matrix/argmin.cu matrix/gather.cu
+    matrix/select_k.cu main.cpp OPTIONAL LIB EXPLICIT_INSTANTIATE_ONLY
   )
 
   ConfigureBench(
-    NAME RANDOM_BENCH PATH bench/prims/random/make_blobs.cu bench/prims/random/permute.cu
-   bench/prims/random/rng.cu bench/prims/random/subsample.cu bench/prims/main.cpp
+    NAME RANDOM_BENCH PATH random/make_blobs.cu random/permute.cu
+   random/rng.cu random/subsample.cu main.cpp
   )
 
   ConfigureBench(
     NAME
     SPARSE_BENCH
     PATH
-    bench/prims/sparse/bitmap_to_csr.cu
-    bench/prims/sparse/convert_csr.cu
-    bench/prims/sparse/select_k_csr.cu
-    bench/prims/main.cpp
+    sparse/bitmap_to_csr.cu
+    sparse/convert_csr.cu
+    sparse/select_k_csr.cu
+    main.cpp
   )
 
   ConfigureBench(
     NAME
     NEIGHBORS_BENCH
     PATH
-    bench/prims/neighbors/knn/brute_force_float_int64_t.cu
-    bench/prims/neighbors/knn/brute_force_float_uint32_t.cu
-    bench/prims/neighbors/knn/cagra_float_uint32_t.cu
-    bench/prims/neighbors/knn/ivf_flat_filter_float_int64_t.cu
-    bench/prims/neighbors/knn/ivf_flat_float_int64_t.cu
-    bench/prims/neighbors/knn/ivf_flat_int8_t_int64_t.cu
-    bench/prims/neighbors/knn/ivf_flat_uint8_t_int64_t.cu
-    bench/prims/neighbors/knn/ivf_pq_float_int64_t.cu
-    bench/prims/neighbors/knn/ivf_pq_filter_float_int64_t.cu
-    bench/prims/neighbors/knn/ivf_pq_int8_t_int64_t.cu
-    bench/prims/neighbors/knn/ivf_pq_uint8_t_int64_t.cu
-    src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu
-    bench/prims/neighbors/refine_float_int64_t.cu
-    bench/prims/neighbors/refine_uint8_t_int64_t.cu
-    bench/prims/main.cpp
+    neighbors/knn/brute_force_float_int64_t.cu
+    neighbors/knn/brute_force_float_uint32_t.cu
+    neighbors/knn/cagra_float_uint32_t.cu
+    neighbors/knn/ivf_flat_filter_float_int64_t.cu
+    neighbors/knn/ivf_flat_float_int64_t.cu
+    neighbors/knn/ivf_flat_int8_t_int64_t.cu
+    neighbors/knn/ivf_flat_uint8_t_int64_t.cu
+    neighbors/knn/ivf_pq_float_int64_t.cu
+    neighbors/knn/ivf_pq_filter_float_int64_t.cu
+    neighbors/knn/ivf_pq_int8_t_int64_t.cu
+    neighbors/knn/ivf_pq_uint8_t_int64_t.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu
+    neighbors/refine_float_int64_t.cu
+    neighbors/refine_uint8_t_int64_t.cu
+    main.cpp
     OPTIONAL
     LIB
     EXPLICIT_INSTANTIATE_ONLY
diff --git a/cpp/cmake/patches/faiss_override.json b/cpp/cmake/patches/faiss_override.json
new file mode 100644
index 0000000000..19dad362b9
--- /dev/null
+++ b/cpp/cmake/patches/faiss_override.json
@@ -0,0 +1,9 @@
+{
+  "packages" : {
+    "faiss" : {
+      "version": "1.7.4",
+      "git_url": "https://github.com/facebookresearch/faiss.git",
+      "git_tag": "main"
+    }
+  }
+}
diff --git a/cpp/cmake/patches/ggnn_override.json b/cpp/cmake/patches/ggnn_override.json
new file mode 100644
index 0000000000..768fae8b0c
--- /dev/null
+++ b/cpp/cmake/patches/ggnn_override.json
@@ -0,0 +1,16 @@
+{
+  "packages" : {
+    "ggnn" : {
+      "version": "0.5",
+      "git_url": "https://github.com/cgtuebingen/ggnn.git",
+      "git_tag": "release_${version}",
+      "patches" : [
+        {
+          "file" : "${current_json_dir}/ggnn.diff",
+          "issue" : "Correct compilation issues",
+          "fixed_in" : ""
+        }
+      ]
+    }
+  }
+}
diff --git a/cpp/cmake/patches/hnswlib_override.json b/cpp/cmake/patches/hnswlib_override.json
new file mode 100644
index 0000000000..d6ab8a18a5
--- /dev/null
+++ b/cpp/cmake/patches/hnswlib_override.json
@@ -0,0 +1,16 @@
+{
+  "packages" : {
+    "hnswlib" : {
+      "version": "0.6.2",
+      "git_url": "https://github.com/nmslib/hnswlib.git",
+      "git_tag": "v${version}",
+      "patches" : [
+        {
+          "file" : "${current_json_dir}/hnswlib.diff",
+          "issue" : "Correct compilation issues",
+          "fixed_in" : ""
+        }
+      ]
+    }
+  }
+}
diff --git a/cpp/cmake/thirdparty/get_faiss.cmake b/cpp/cmake/thirdparty/get_faiss.cmake
index 85829554ae..288da763bf 100644
--- a/cpp/cmake/thirdparty/get_faiss.cmake
+++ b/cpp/cmake/thirdparty/get_faiss.cmake
@@ -1,5 +1,5 @@
 #=============================================================================
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -15,96 +15,104 @@
 #=============================================================================
 
 function(find_and_configure_faiss)
-    set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL ENABLE_GPU)
-    cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
-            "${multiValueArgs}" ${ARGN} )
+  set(oneValueArgs VERSION REPOSITORY PINNED_TAG BUILD_STATIC_LIBS EXCLUDE_FROM_ALL ENABLE_GPU)
+  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN} )
+
+  rapids_find_generate_module(faiss
+    HEADER_NAMES  faiss/IndexFlat.h
+    LIBRARY_NAMES faiss
+    )
+
+  set(patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../patches")
+  rapids_cpm_package_override("${patch_dir}/faiss_override.json")
+
+  include("${rapids-cmake-dir}/cpm/detail/package_details.cmake")
+  rapids_cpm_package_details(faiss version repository tag shallow exclude)
+
+  include("${rapids-cmake-dir}/cpm/detail/generate_patch_command.cmake")
+  rapids_cpm_generate_patch_command(faiss ${version} patch_command)
+
+  set(BUILD_SHARED_LIBS ON)
+  if (PKG_BUILD_STATIC_LIBS)
+    set(BUILD_SHARED_LIBS OFF)
+    set(CPM_DOWNLOAD_faiss ON)
+  endif()
+
+  include(cmake/modules/FindAVX)
+  # Link against AVX CPU lib if it exists
+  set(RAFT_FAISS_OPT_LEVEL "generic")
+  if(CXX_AVX2_FOUND)
+    set(RAFT_FAISS_OPT_LEVEL "avx2")
+  endif()
+
+  rapids_cpm_find(faiss ${version}
+    GLOBAL_TARGETS faiss faiss_avx2 faiss_gpu faiss::faiss faiss::faiss_avx2
+    CPM_ARGS
+    GIT_REPOSITORY ${repository}
+    GIT_TAG ${tag}
+    GIT_SHALLOW ${shallow} ${patch_command}
+    EXCLUDE_FROM_ALL ${exclude}
+    OPTIONS
+    "FAISS_ENABLE_GPU ${PKG_ENABLE_GPU}"
+    "FAISS_ENABLE_PYTHON OFF"
+    "FAISS_OPT_LEVEL ${RAFT_FAISS_OPT_LEVEL}"
+    "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}"
+    "BUILD_TESTING OFF"
+    "CMAKE_MESSAGE_LOG_LEVEL VERBOSE"
+    )
+
+  include("${rapids-cmake-dir}/cpm/detail/display_patch_status.cmake")
+  rapids_cpm_display_patch_status(faiss)
+
+  if(TARGET faiss AND NOT TARGET faiss::faiss)
+    add_library(faiss::faiss ALIAS faiss)
+    # faiss must see the conda environment's include/link dirs, so link conda_env (when that
+    # target exists) privately into the faiss target itself.
+    target_link_libraries(faiss PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>)
+  endif()
+  if(TARGET faiss_avx2 AND NOT TARGET faiss::faiss_avx2)
+    add_library(faiss::faiss_avx2 ALIAS faiss_avx2)
+    # faiss must see the conda environment's include/link dirs, so link conda_env (when that
+    # target exists) privately into the faiss_avx2 target itself.
+    target_link_libraries(faiss_avx2 PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>)
+  endif()
+  if(TARGET faiss_gpu AND NOT TARGET faiss::faiss_gpu)
+    add_library(faiss::faiss_gpu ALIAS faiss_gpu)
+    # faiss must see the conda environment's include/link dirs, so link conda_env (when that
+    # target exists) privately into the faiss_gpu target itself.
+    target_link_libraries(faiss_gpu PRIVATE $<TARGET_NAME_IF_EXISTS:conda_env>)
+  endif()
+
+  if(faiss_ADDED)
+    rapids_export(BUILD faiss
+                  EXPORT_SET faiss-targets
+                  GLOBAL_TARGETS ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS}
+                  NAMESPACE faiss::)
+  endif()
+
+  # Need to tell CMake to rescan the link group of faiss::faiss_gpu and faiss
+  # so that we get proper link order when they are static
+  #
+  # We don't look at the existence of `faiss_avx2` as it will always exist
+  # even when CXX_AVX2_FOUND is false. In addition for arm builds the
+  # faiss_avx2 is marked as `EXCLUDE_FROM_ALL` so we don't want to add
+  # a dependency to it. Adding a dependency will cause it to compile,
+  # and fail due to invalid compiler flags.
+  if(PKG_ENABLE_GPU AND PKG_BUILD_STATIC_LIBS AND CXX_AVX2_FOUND)
+    set(RAFT_FAISS_TARGETS "$<LINK_GROUP:RESCAN,$<LINK_LIBRARY:WHOLE_ARCHIVE,faiss_gpu>,faiss::faiss_avx2>" PARENT_SCOPE)
+  elseif(PKG_ENABLE_GPU AND  PKG_BUILD_STATIC_LIBS)
+    set(RAFT_FAISS_TARGETS "$<LINK_GROUP:RESCAN,$<LINK_LIBRARY:WHOLE_ARCHIVE,faiss_gpu>,faiss::faiss>" PARENT_SCOPE)
+  elseif(CXX_AVX2_FOUND)
+    set(RAFT_FAISS_TARGETS faiss::faiss_avx2 PARENT_SCOPE)
+  else()
+    set(RAFT_FAISS_TARGETS faiss::faiss PARENT_SCOPE)
+  endif()
 
-        rapids_find_generate_module(faiss
-                HEADER_NAMES  faiss/IndexFlat.h
-                LIBRARY_NAMES faiss
-                )
-
-        set(BUILD_SHARED_LIBS ON)
-        if (PKG_BUILD_STATIC_LIBS)
-            set(BUILD_SHARED_LIBS OFF)
-            set(CPM_DOWNLOAD_faiss ON)
-        endif()
-
-        include(cmake/modules/FindAVX.cmake)
-
-        # Link against AVX CPU lib if it exists
-        set(RAFT_FAISS_GLOBAL_TARGETS faiss::faiss)
-        set(RAFT_FAISS_EXPORT_GLOBAL_TARGETS faiss)
-        set(RAFT_FAISS_OPT_LEVEL "generic")
-        if(CXX_AVX_FOUND)
-            set(RAFT_FAISS_OPT_LEVEL "avx2")
-            list(APPEND RAFT_FAISS_GLOBAL_TARGETS faiss::faiss_avx2)
-            list(APPEND RAFT_FAISS_EXPORT_GLOBAL_TARGETS faiss_avx2)
-        endif()
-
-        rapids_cpm_find(faiss ${PKG_VERSION}
-                GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS}
-                CPM_ARGS
-                GIT_REPOSITORY   ${PKG_REPOSITORY}
-                GIT_TAG          ${PKG_PINNED_TAG}
-                EXCLUDE_FROM_ALL ${PKG_EXCLUDE_FROM_ALL}
-                OPTIONS
-                "FAISS_ENABLE_GPU ${PKG_ENABLE_GPU}"
-                "FAISS_ENABLE_PYTHON OFF"
-                "FAISS_OPT_LEVEL ${RAFT_FAISS_OPT_LEVEL}"
-                "FAISS_USE_CUDA_TOOLKIT_STATIC ${CUDA_STATIC_RUNTIME}"
-                "BUILD_TESTING OFF"
-                "CMAKE_MESSAGE_LOG_LEVEL VERBOSE"
-                )
-
-        if(TARGET faiss AND NOT TARGET faiss::faiss)
-            add_library(faiss::faiss ALIAS faiss)
-        endif()
-
-    if(CXX_AVX_FOUND)
-
-        if(TARGET faiss_avx2 AND NOT TARGET faiss::faiss_avx2)
-            add_library(faiss::faiss_avx2 ALIAS faiss_avx2)
-        endif()
-    endif()
-
-
-    if(faiss_ADDED)
-            rapids_export(BUILD faiss
-                    EXPORT_SET faiss-targets
-                    GLOBAL_TARGETS ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS}
-                    NAMESPACE faiss::)
-        endif()
-
-    # We generate the faiss-config files when we built faiss locally, so always do `find_dependency`
-    rapids_export_package(BUILD OpenMP raft-ann-bench-exports) # faiss uses openMP but doesn't export a need for it
-    rapids_export_package(BUILD faiss raft-ann-bench-exports GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS})
-    rapids_export_package(INSTALL faiss raft-ann-bench-exports GLOBAL_TARGETS ${RAFT_FAISS_GLOBAL_TARGETS} ${RAFT_FAISS_EXPORT_GLOBAL_TARGETS})
-
-    # Tell cmake where it can find the generated faiss-config.cmake we wrote.
-    include("${rapids-cmake-dir}/export/find_package_root.cmake")
-    rapids_export_find_package_root(BUILD faiss [=[${CMAKE_CURRENT_LIST_DIR}]=]
-                                    EXPORT_SET raft-ann-bench-exports)
 endfunction()
 
-if(NOT RAFT_FAISS_GIT_TAG)
-    # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC
-    # (https://github.com/facebookresearch/faiss/pull/2446)
-    set(RAFT_FAISS_GIT_TAG fea/statically-link-ctk)
-    # set(RAFT_FAISS_GIT_TAG bde7c0027191f29c9dadafe4f6e68ca0ee31fb30)
-endif()
-
-if(NOT RAFT_FAISS_GIT_REPOSITORY)
-    # TODO: Remove this once faiss supports FAISS_USE_CUDA_TOOLKIT_STATIC
-    # (https://github.com/facebookresearch/faiss/pull/2446)
-    set(RAFT_FAISS_GIT_REPOSITORY https://github.com/cjnolet/faiss.git)
-    # set(RAFT_FAISS_GIT_REPOSITORY https://github.com/facebookresearch/faiss.git)
-endif()
-
-find_and_configure_faiss(VERSION    1.7.4
-        REPOSITORY  ${RAFT_FAISS_GIT_REPOSITORY}
-        PINNED_TAG  ${RAFT_FAISS_GIT_TAG}
-        BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC}
-        EXCLUDE_FROM_ALL ${RAFT_EXCLUDE_FAISS_FROM_ALL}
-        ENABLE_GPU ${RAFT_FAISS_ENABLE_GPU})
 
+find_and_configure_faiss(
+  BUILD_STATIC_LIBS ${RAFT_USE_FAISS_STATIC}
+  ENABLE_GPU ${RAFT_FAISS_ENABLE_GPU}
+)
diff --git a/cpp/cmake/thirdparty/get_ggnn.cmake b/cpp/cmake/thirdparty/get_ggnn.cmake
index 8137ef84eb..d8af4971a7 100644
--- a/cpp/cmake/thirdparty/get_ggnn.cmake
+++ b/cpp/cmake/thirdparty/get_ggnn.cmake
@@ -15,29 +15,31 @@
 #=============================================================================
 
 function(find_and_configure_ggnn)
-  set(oneValueArgs VERSION REPOSITORY PINNED_TAG)
-  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
-          "${multiValueArgs}" ${ARGN} )
 
+  include(${rapids-cmake-dir}/cpm/package_override.cmake)
+  set(patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../patches")
+  rapids_cpm_package_override("${patch_dir}/ggnn_override.json")
 
-  set(patch_files_to_run "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/ggnn.diff")
-  set(patch_issues_to_ref "fix compile issues")
-  set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/ggnn/patch.cmake")
-  set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/ggnn/log")
-  string(TIMESTAMP current_year "%Y" UTC)
-  configure_file(${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}"
-                @ONLY)
+  include("${rapids-cmake-dir}/cpm/detail/package_details.cmake")
+  rapids_cpm_package_details(ggnn version repository tag shallow exclude)
+
+  include("${rapids-cmake-dir}/cpm/detail/generate_patch_command.cmake")
+  rapids_cpm_generate_patch_command(ggnn ${version} patch_command)
 
   rapids_cpm_find(
-    ggnn ${PKG_VERSION}
+    ggnn ${version}
     GLOBAL_TARGETS ggnn::ggnn
     CPM_ARGS
-    GIT_REPOSITORY ${PKG_REPOSITORY}
-    GIT_TAG ${PKG_PINNED_TAG}
-    GIT_SHALLOW TRUE
+    GIT_REPOSITORY ${repository}
+    GIT_TAG ${tag}
+    GIT_SHALLOW ${shallow} ${patch_command}
+    EXCLUDE_FROM_ALL ${exclude}
     DOWNLOAD_ONLY ON
-    PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script}
   )
+
+  include("${rapids-cmake-dir}/cpm/detail/display_patch_status.cmake")
+  rapids_cpm_display_patch_status(ggnn)
+
   if(NOT TARGET ggnn::ggnn)
     add_library(ggnn INTERFACE)
     target_include_directories(ggnn INTERFACE "$<BUILD_INTERFACE:${ggnn_SOURCE_DIR}/include>")
@@ -45,14 +47,4 @@ function(find_and_configure_ggnn)
   endif()
 
 endfunction()
-if(NOT RAFT_GGNN_GIT_TAG)
-  set(RAFT_GGNN_GIT_TAG release_0.5)
-endif()
-
-if(NOT RAFT_GGNN_GIT_REPOSITORY)
-  set(RAFT_GGNN_GIT_REPOSITORY https://github.com/cgtuebingen/ggnn.git)
-endif()
-find_and_configure_ggnn(VERSION 0.5
-        REPOSITORY       ${RAFT_GGNN_GIT_REPOSITORY}
-        PINNED_TAG       ${RAFT_GGNN_GIT_TAG}
-        )
+find_and_configure_ggnn()
diff --git a/cpp/cmake/thirdparty/get_hnswlib.cmake b/cpp/cmake/thirdparty/get_hnswlib.cmake
index 4d28e9a064..6ef493336f 100644
--- a/cpp/cmake/thirdparty/get_hnswlib.cmake
+++ b/cpp/cmake/thirdparty/get_hnswlib.cmake
@@ -15,78 +15,74 @@
 #=============================================================================
 
 function(find_and_configure_hnswlib)
-  set(oneValueArgs VERSION REPOSITORY PINNED_TAG EXCLUDE_FROM_ALL)
-  cmake_parse_arguments(PKG "${options}" "${oneValueArgs}"
-          "${multiValueArgs}" ${ARGN} )
+  set(oneValueArgs)
 
-  set(patch_files_to_run "${CMAKE_CURRENT_SOURCE_DIR}/cmake/patches/hnswlib.diff")
-  set(patch_issues_to_ref "fix compile issues")
-  set(patch_script "${CMAKE_BINARY_DIR}/rapids-cmake/patches/hnswlib/patch.cmake")
-  set(log_file "${CMAKE_BINARY_DIR}/rapids-cmake/patches/hnswlib/log")
-  string(TIMESTAMP current_year "%Y" UTC)
-  configure_file(${rapids-cmake-dir}/cpm/patches/command_template.cmake.in "${patch_script}"
-                @ONLY)
+  include(${rapids-cmake-dir}/cpm/package_override.cmake)
+  set(patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/../patches")
+  rapids_cpm_package_override("${patch_dir}/hnswlib_override.json")
+
+  include("${rapids-cmake-dir}/cpm/detail/package_details.cmake")
+  rapids_cpm_package_details(hnswlib version repository tag shallow exclude)
+
+  include("${rapids-cmake-dir}/cpm/detail/generate_patch_command.cmake")
+  rapids_cpm_generate_patch_command(hnswlib ${version} patch_command)
 
   rapids_cpm_find(
-    hnswlib ${PKG_VERSION}
-    GLOBAL_TARGETS hnswlib::hnswlib
-    BUILD_EXPORT_SET raft-exports
-    INSTALL_EXPORT_SET raft-exports
+    hnswlib ${version}
+    GLOBAL_TARGETS hnswlib hnswlib::hnswlib
     CPM_ARGS
-    GIT_REPOSITORY ${PKG_REPOSITORY}
-    GIT_TAG ${PKG_PINNED_TAG}
-    GIT_SHALLOW TRUE
+    GIT_REPOSITORY ${repository}
+    GIT_TAG ${tag}
+    GIT_SHALLOW ${shallow} ${patch_command}
+    EXCLUDE_FROM_ALL ${exclude}
     DOWNLOAD_ONLY ON
-    PATCH_COMMAND ${CMAKE_COMMAND} -P ${patch_script}
   )
+
+  include("${rapids-cmake-dir}/cpm/detail/display_patch_status.cmake")
+  rapids_cpm_display_patch_status(hnswlib)
+
   if(NOT TARGET hnswlib::hnswlib)
     add_library(hnswlib INTERFACE )
     add_library(hnswlib::hnswlib ALIAS hnswlib)
     target_include_directories(hnswlib INTERFACE
      "$<BUILD_INTERFACE:${hnswlib_SOURCE_DIR}>"
      "$<INSTALL_INTERFACE:include>")
+  endif()
 
-    if(NOT PKG_EXCLUDE_FROM_ALL)
-      install(TARGETS hnswlib EXPORT hnswlib-exports)
+  if(hnswlib_ADDED)
+    # write build export rules
+    install(TARGETS hnswlib EXPORT hnswlib-exports)
+    if(NOT exclude)
       install(DIRECTORY "${hnswlib_SOURCE_DIR}/hnswlib/" DESTINATION include/hnswlib)
 
       # write install export rules
       rapids_export(
         INSTALL hnswlib
-        VERSION ${PKG_VERSION}
+        VERSION ${version}
         EXPORT_SET hnswlib-exports
         GLOBAL_TARGETS hnswlib
         NAMESPACE hnswlib::)
     endif()
 
-    # write build export rules
     rapids_export(
       BUILD hnswlib
-      VERSION ${PKG_VERSION}
+      VERSION ${version}
       EXPORT_SET hnswlib-exports
       GLOBAL_TARGETS hnswlib
       NAMESPACE hnswlib::)
 
-    include("${rapids-cmake-dir}/export/find_package_root.cmake")
+    include("${rapids-cmake-dir}/export/package.cmake")
+    rapids_export_package(INSTALL hnswlib raft-exports VERSION ${version} GLOBAL_TARGETS hnswlib hnswlib::hnswlib)
+    rapids_export_package(BUILD hnswlib raft-exports VERSION ${version} GLOBAL_TARGETS hnswlib hnswlib::hnswlib)
+
 
     # When using RAFT from the build dir, ensure hnswlib is also found in RAFT's build dir. This
     # line adds `set(hnswlib_ROOT "${CMAKE_CURRENT_LIST_DIR}")` to build/raft-dependencies.cmake
+    include("${rapids-cmake-dir}/export/find_package_root.cmake")
     rapids_export_find_package_root(
       BUILD hnswlib [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET raft-exports
     )
   endif()
 endfunction()
 
-
-if(NOT RAFT_HNSWLIB_GIT_TAG)
-  set(RAFT_HNSWLIB_GIT_TAG v0.6.2)
-endif()
-
-if(NOT RAFT_HNSWLIB_GIT_REPOSITORY)
-  set(RAFT_HNSWLIB_GIT_REPOSITORY https://github.com/nmslib/hnswlib.git)
-endif()
-find_and_configure_hnswlib(VERSION 0.6.2
-        REPOSITORY       ${RAFT_HNSWLIB_GIT_REPOSITORY}
-        PINNED_TAG       ${RAFT_HNSWLIB_GIT_TAG}
-        EXCLUDE_FROM_ALL OFF
-        )
+find_and_configure_hnswlib()
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index 752dffdc16..dac3418c8e 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -99,12 +99,12 @@ if(BUILD_TESTS)
     NAME
     CLUSTER_TEST
     PATH
-    test/cluster/kmeans.cu
-    test/cluster/kmeans_balanced.cu
-    test/cluster/kmeans_find_k.cu
-    test/cluster/cluster_solvers.cu
-    test/cluster/linkage.cu
-    test/cluster/spectral.cu
+    cluster/kmeans.cu
+    cluster/kmeans_balanced.cu
+    cluster/kmeans_find_k.cu
+    cluster/cluster_solvers.cu
+    cluster/linkage.cu
+    cluster/spectral.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
   )
@@ -113,37 +113,37 @@ if(BUILD_TESTS)
     NAME
     CORE_TEST
     PATH
-    test/core/bitset.cu
-    test/core/device_resources_manager.cpp
-    test/core/device_setter.cpp
-    test/core/logger.cpp
-    test/core/math_device.cu
-    test/core/math_host.cpp
-    test/core/operators_device.cu
-    test/core/operators_host.cpp
-    test/core/handle.cpp
-    test/core/interruptible.cu
-    test/core/nvtx.cpp
-    test/core/mdarray.cu
-    test/core/mdbuffer.cu
-    test/core/mdspan_copy.cpp
-    test/core/mdspan_copy.cu
-    test/core/mdspan_utils.cu
-    test/core/numpy_serializer.cu
-    test/core/memory_type.cpp
-    test/core/sparse_matrix.cu
-    test/core/sparse_matrix.cpp
-    test/core/span.cpp
-    test/core/span.cu
-    test/core/stream_view.cpp
-    test/core/temporary_device_buffer.cu
-    test/test.cpp
+    core/bitset.cu
+    core/device_resources_manager.cpp
+    core/device_setter.cpp
+    core/logger.cpp
+    core/math_device.cu
+    core/math_host.cpp
+    core/operators_device.cu
+    core/operators_host.cpp
+    core/handle.cpp
+    core/interruptible.cu
+    core/nvtx.cpp
+    core/mdarray.cu
+    core/mdbuffer.cu
+    core/mdspan_copy.cpp
+    core/mdspan_copy.cu
+    core/mdspan_utils.cu
+    core/numpy_serializer.cu
+    core/memory_type.cpp
+    core/sparse_matrix.cu
+    core/sparse_matrix.cpp
+    core/span.cpp
+    core/span.cu
+    core/stream_view.cpp
+    core/temporary_device_buffer.cu
+    test.cpp
     LIB
     EXPLICIT_INSTANTIATE_ONLY
   )
 
   ConfigureTest(
-    NAME CORE_TEST PATH test/core/stream_view.cpp test/core/mdspan_copy.cpp LIB
+    NAME CORE_TEST PATH core/stream_view.cpp core/mdspan_copy.cpp LIB
     EXPLICIT_INSTANTIATE_ONLY NOCUDA
   )
 
@@ -151,28 +151,28 @@ if(BUILD_TESTS)
     NAME
     DISTANCE_TEST
     PATH
-    test/distance/dist_adj.cu
-    test/distance/dist_adj_distance_instance.cu
-    test/distance/dist_canberra.cu
-    test/distance/dist_correlation.cu
-    test/distance/dist_cos.cu
-    test/distance/dist_hamming.cu
-    test/distance/dist_hellinger.cu
-    test/distance/dist_inner_product.cu
-    test/distance/dist_jensen_shannon.cu
-    test/distance/dist_kl_divergence.cu
-    test/distance/dist_l1.cu
-    test/distance/dist_l2_exp.cu
-    test/distance/dist_l2_unexp.cu
-    test/distance/dist_l2_sqrt_exp.cu
-    test/distance/dist_l_inf.cu
-    test/distance/dist_lp_unexp.cu
-    test/distance/dist_russell_rao.cu
-    test/distance/masked_nn.cu
-    test/distance/masked_nn_compress_to_bits.cu
-    test/distance/fused_l2_nn.cu
-    test/distance/fused_cosine_nn.cu
-    test/distance/gram.cu
+    distance/dist_adj.cu
+    distance/dist_adj_distance_instance.cu
+    distance/dist_canberra.cu
+    distance/dist_correlation.cu
+    distance/dist_cos.cu
+    distance/dist_hamming.cu
+    distance/dist_hellinger.cu
+    distance/dist_inner_product.cu
+    distance/dist_jensen_shannon.cu
+    distance/dist_kl_divergence.cu
+    distance/dist_l1.cu
+    distance/dist_l2_exp.cu
+    distance/dist_l2_unexp.cu
+    distance/dist_l2_sqrt_exp.cu
+    distance/dist_l_inf.cu
+    distance/dist_lp_unexp.cu
+    distance/dist_russell_rao.cu
+    distance/masked_nn.cu
+    distance/masked_nn_compress_to_bits.cu
+    distance/fused_l2_nn.cu
+    distance/fused_cosine_nn.cu
+    distance/gram.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
   )
@@ -180,22 +180,22 @@ if(BUILD_TESTS)
   list(
     APPEND
     EXT_HEADER_TEST_SOURCES
-    test/ext_headers/raft_neighbors_brute_force.cu
-    test/ext_headers/raft_distance_distance.cu
-    test/ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
-    test/ext_headers/raft_matrix_detail_select_k.cu
-    test/ext_headers/raft_neighbors_ball_cover.cu
-    test/ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
-    test/ext_headers/raft_distance_fused_l2_nn.cu
-    test/ext_headers/raft_neighbors_ivf_pq.cu
-    test/ext_headers/raft_neighbors_ivf_flat.cu
-    test/ext_headers/raft_core_logger.cpp
-    test/ext_headers/raft_neighbors_refine.cu
-    test/ext_headers/raft_neighbors_detail_ivf_flat_search.cu
-    test/ext_headers/raft_linalg_detail_coalesced_reduction.cu
-    test/ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
-    test/ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
-    test/ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
+    ext_headers/raft_neighbors_brute_force.cu
+    ext_headers/raft_distance_distance.cu
+    ext_headers/raft_distance_detail_pairwise_matrix_dispatch.cu
+    ext_headers/raft_matrix_detail_select_k.cu
+    ext_headers/raft_neighbors_ball_cover.cu
+    ext_headers/raft_spatial_knn_detail_fused_l2_knn.cu
+    ext_headers/raft_distance_fused_l2_nn.cu
+    ext_headers/raft_neighbors_ivf_pq.cu
+    ext_headers/raft_neighbors_ivf_flat.cu
+    ext_headers/raft_core_logger.cpp
+    ext_headers/raft_neighbors_refine.cu
+    ext_headers/raft_neighbors_detail_ivf_flat_search.cu
+    ext_headers/raft_linalg_detail_coalesced_reduction.cu
+    ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
+    ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
+    ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
   )
 
   # Test that the split headers compile in isolation with:
@@ -210,134 +210,134 @@ if(BUILD_TESTS)
   ConfigureTest(NAME EXT_HEADERS_TEST_COMPILED_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES} LIB)
   ConfigureTest(NAME EXT_HEADERS_TEST_IMPLICIT PATH ${EXT_HEADER_TEST_SOURCES})
 
-  ConfigureTest(NAME LABEL_TEST PATH test/label/label.cu test/label/merge_labels.cu)
+  ConfigureTest(NAME LABEL_TEST PATH label/label.cu label/merge_labels.cu)
 
   ConfigureTest(
     NAME
     LINALG_TEST
     PATH
-    test/linalg/add.cu
-    test/linalg/axpy.cu
-    test/linalg/binary_op.cu
-    test/linalg/cholesky_r1.cu
-    test/linalg/coalesced_reduction.cu
-    test/linalg/divide.cu
-    test/linalg/dot.cu
-    test/linalg/eig.cu
-    test/linalg/eig_sel.cu
-    test/linalg/gemm_layout.cu
-    test/linalg/gemv.cu
-    test/linalg/map.cu
-    test/linalg/map_then_reduce.cu
-    test/linalg/matrix_vector.cu
-    test/linalg/matrix_vector_op.cu
-    test/linalg/mean_squared_error.cu
-    test/linalg/multiply.cu
-    test/linalg/norm.cu
-    test/linalg/normalize.cu
-    test/linalg/power.cu
-    test/linalg/randomized_svd.cu
-    test/linalg/reduce.cu
-    test/linalg/reduce_cols_by_key.cu
-    test/linalg/reduce_rows_by_key.cu
-    test/linalg/rsvd.cu
-    test/linalg/sqrt.cu
-    test/linalg/strided_reduction.cu
-    test/linalg/subtract.cu
-    test/linalg/svd.cu
-    test/linalg/ternary_op.cu
-    test/linalg/transpose.cu
-    test/linalg/unary_op.cu
+    linalg/add.cu
+    linalg/axpy.cu
+    linalg/binary_op.cu
+    linalg/cholesky_r1.cu
+    linalg/coalesced_reduction.cu
+    linalg/divide.cu
+    linalg/dot.cu
+    linalg/eig.cu
+    linalg/eig_sel.cu
+    linalg/gemm_layout.cu
+    linalg/gemv.cu
+    linalg/map.cu
+    linalg/map_then_reduce.cu
+    linalg/matrix_vector.cu
+    linalg/matrix_vector_op.cu
+    linalg/mean_squared_error.cu
+    linalg/multiply.cu
+    linalg/norm.cu
+    linalg/normalize.cu
+    linalg/power.cu
+    linalg/randomized_svd.cu
+    linalg/reduce.cu
+    linalg/reduce_cols_by_key.cu
+    linalg/reduce_rows_by_key.cu
+    linalg/rsvd.cu
+    linalg/sqrt.cu
+    linalg/strided_reduction.cu
+    linalg/subtract.cu
+    linalg/svd.cu
+    linalg/ternary_op.cu
+    linalg/transpose.cu
+    linalg/unary_op.cu
   )
 
   ConfigureTest(
     NAME
     MATRIX_TEST
     PATH
-    test/matrix/argmax.cu
-    test/matrix/argmin.cu
-    test/matrix/columnSort.cu
-    test/matrix/diagonal.cu
-    test/matrix/gather.cu
-    test/matrix/scatter.cu
-    test/matrix/eye.cu
-    test/matrix/linewise_op.cu
-    test/matrix/math.cu
-    test/matrix/matrix.cu
-    test/matrix/norm.cu
-    test/matrix/reverse.cu
-    test/matrix/sample_rows.cu
-    test/matrix/slice.cu
-    test/matrix/triangular.cu
-    test/sparse/spectral_matrix.cu
+    matrix/argmax.cu
+    matrix/argmin.cu
+    matrix/columnSort.cu
+    matrix/diagonal.cu
+    matrix/gather.cu
+    matrix/scatter.cu
+    matrix/eye.cu
+    matrix/linewise_op.cu
+    matrix/math.cu
+    matrix/matrix.cu
+    matrix/norm.cu
+    matrix/reverse.cu
+    matrix/sample_rows.cu
+    matrix/slice.cu
+    matrix/triangular.cu
+    sparse/spectral_matrix.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
   )
 
-  ConfigureTest(NAME MATRIX_SELECT_TEST PATH test/matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY)
+  ConfigureTest(NAME MATRIX_SELECT_TEST PATH matrix/select_k.cu LIB EXPLICIT_INSTANTIATE_ONLY)
 
   ConfigureTest(
-    NAME MATRIX_SELECT_LARGE_TEST PATH test/matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY
+    NAME MATRIX_SELECT_LARGE_TEST PATH matrix/select_large_k.cu LIB EXPLICIT_INSTANTIATE_ONLY
   )
 
   ConfigureTest(
     NAME
     RANDOM_TEST
     PATH
-    test/random/make_blobs.cu
-    test/random/make_regression.cu
-    test/random/multi_variable_gaussian.cu
-    test/random/rng_pcg_host_api.cu
-    test/random/permute.cu
-    test/random/rng.cu
-    test/random/rng_discrete.cu
-    test/random/rng_int.cu
-    test/random/rmat_rectangular_generator.cu
-    test/random/sample_without_replacement.cu
-    test/random/excess_sampling.cu
+    random/make_blobs.cu
+    random/make_regression.cu
+    random/multi_variable_gaussian.cu
+    random/rng_pcg_host_api.cu
+    random/permute.cu
+    random/rng.cu
+    random/rng_discrete.cu
+    random/rng_int.cu
+    random/rmat_rectangular_generator.cu
+    random/sample_without_replacement.cu
+    random/excess_sampling.cu
   )
 
   ConfigureTest(
-    NAME SOLVERS_TEST PATH test/cluster/cluster_solvers_deprecated.cu test/linalg/eigen_solvers.cu
-    test/lap/lap.cu test/sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY
+    NAME SOLVERS_TEST PATH cluster/cluster_solvers_deprecated.cu linalg/eigen_solvers.cu
+    lap/lap.cu sparse/mst.cu LIB EXPLICIT_INSTANTIATE_ONLY
   )
 
   ConfigureTest(
     NAME
     SPARSE_TEST
     PATH
-    test/sparse/add.cu
-    test/sparse/convert_coo.cu
-    test/sparse/convert_csr.cu
-    test/sparse/csr_row_slice.cu
-    test/sparse/csr_to_dense.cu
-    test/sparse/csr_transpose.cu
-    test/sparse/degree.cu
-    test/sparse/filter.cu
-    test/sparse/norm.cu
-    test/sparse/normalize.cu
-    test/sparse/reduce.cu
-    test/sparse/row_op.cu
-    test/sparse/sddmm.cu
-    test/sparse/select_k_csr.cu
-    test/sparse/sort.cu
-    test/sparse/spgemmi.cu
-    test/sparse/spmm.cu
-    test/sparse/symmetrize.cu
+    sparse/add.cu
+    sparse/convert_coo.cu
+    sparse/convert_csr.cu
+    sparse/csr_row_slice.cu
+    sparse/csr_to_dense.cu
+    sparse/csr_transpose.cu
+    sparse/degree.cu
+    sparse/filter.cu
+    sparse/norm.cu
+    sparse/normalize.cu
+    sparse/reduce.cu
+    sparse/row_op.cu
+    sparse/sddmm.cu
+    sparse/select_k_csr.cu
+    sparse/sort.cu
+    sparse/spgemmi.cu
+    sparse/spmm.cu
+    sparse/symmetrize.cu
   )
 
   ConfigureTest(
-    NAME SPARSE_DIST_TEST PATH test/sparse/dist_coo_spmv.cu test/sparse/distance.cu
-    test/sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY
+    NAME SPARSE_DIST_TEST PATH sparse/dist_coo_spmv.cu sparse/distance.cu
+    sparse/gram.cu LIB EXPLICIT_INSTANTIATE_ONLY
   )
 
   ConfigureTest(
     NAME
     SPARSE_NEIGHBORS_TEST
     PATH
-    test/sparse/neighbors/cross_component_nn.cu
-    test/sparse/neighbors/brute_force.cu
-    test/sparse/neighbors/knn_graph.cu
+    sparse/neighbors/cross_component_nn.cu
+    sparse/neighbors/brute_force.cu
+    sparse/neighbors/knn_graph.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
   )
@@ -346,19 +346,19 @@ if(BUILD_TESTS)
     NAME
     NEIGHBORS_TEST
     PATH
-    test/neighbors/knn.cu
-    test/neighbors/fused_l2_knn.cu
-    test/neighbors/tiled_knn.cu
-    test/neighbors/haversine.cu
-    test/neighbors/ball_cover.cu
-    test/neighbors/epsilon_neighborhood.cu
-    test/neighbors/refine.cu
+    neighbors/knn.cu
+    neighbors/fused_l2_knn.cu
+    neighbors/tiled_knn.cu
+    neighbors/haversine.cu
+    neighbors/ball_cover.cu
+    neighbors/epsilon_neighborhood.cu
+    neighbors/refine.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
   )
 
   ConfigureTest(
-    NAME NEIGHBORS_ANN_BRUTE_FORCE_TEST PATH test/neighbors/ann_brute_force/test_float.cu LIB
+    NAME NEIGHBORS_ANN_BRUTE_FORCE_TEST PATH neighbors/ann_brute_force/test_float.cu LIB
     EXPLICIT_INSTANTIATE_ONLY GPUS 1 PERCENT 100
   )
 
@@ -366,30 +366,30 @@ if(BUILD_TESTS)
     NAME
     NEIGHBORS_ANN_CAGRA_TEST
     PATH
-    test/neighbors/ann_cagra/test_float_uint32_t.cu
-    test/neighbors/ann_cagra/test_half_uint32_t.cu
-    test/neighbors/ann_cagra/test_int8_t_uint32_t.cu
-    test/neighbors/ann_cagra/test_uint8_t_uint32_t.cu
-    test/neighbors/ann_cagra/test_float_int64_t.cu
-    test/neighbors/ann_cagra/test_half_int64_t.cu
-    test/neighbors/ann_cagra_vpq/test_float_int64_t.cu
-    test/neighbors/ann_cagra_vpq/test_float_uint32_t.cu
-    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
-    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
-    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
-    src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
-    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
-    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
-    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
-    src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
-    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
-    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
-    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
-    src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
-    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
-    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
-    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
-    src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
+    neighbors/ann_cagra/test_float_uint32_t.cu
+    neighbors/ann_cagra/test_half_uint32_t.cu
+    neighbors/ann_cagra/test_int8_t_uint32_t.cu
+    neighbors/ann_cagra/test_uint8_t_uint32_t.cu
+    neighbors/ann_cagra/test_float_int64_t.cu
+    neighbors/ann_cagra/test_half_int64_t.cu
+    neighbors/ann_cagra_vpq/test_float_int64_t.cu
+    neighbors/ann_cagra_vpq/test_float_uint32_t.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim128_t8.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim256_t16.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim512_t32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_multi_cta_float_uint64_dim1024_t32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim128_t8.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim256_t16.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim512_t32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_single_cta_float_uint64_dim1024_t32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim128_t8.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim256_t16.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim512_t32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_multi_cta_half_uint64_dim1024_t32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim128_t8.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim256_t16.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim512_t32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/cagra/search_single_cta_half_uint64_dim1024_t32.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
     GPUS
@@ -402,40 +402,40 @@ if(BUILD_TESTS)
     NAME
     NEIGHBORS_ANN_IVF_TEST
     PATH
-    test/neighbors/ann_ivf_flat/test_filter_float_int64_t.cu
-    test/neighbors/ann_ivf_flat/test_float_int64_t.cu
-    test/neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
-    test/neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
-    test/neighbors/ann_ivf_pq/ivf_pq_build_float_uint32_t.cu
-    test/neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu
-    src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_float_filt32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_filt32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_filt32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_half_filt32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_filt32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_filt32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_half_filt32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset32.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu
-    src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu
-    test/neighbors/ann_ivf_pq/test_float_uint32_t.cu
-    test/neighbors/ann_ivf_pq/test_float_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
-    test/neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
+    neighbors/ann_ivf_flat/test_filter_float_int64_t.cu
+    neighbors/ann_ivf_flat/test_float_int64_t.cu
+    neighbors/ann_ivf_flat/test_int8_t_int64_t.cu
+    neighbors/ann_ivf_flat/test_uint8_t_int64_t.cu
+    neighbors/ann_ivf_pq/ivf_pq_build_float_uint32_t.cu
+    neighbors/ann_ivf_pq/ivf_pq_search_float_uint32_t.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_search_filtering_float_int64_t.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_float_filt32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_filt32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_filt32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_half_filt32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_filt32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_filt32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_half_filt32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset32.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_float_bitset64.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_false_bitset64.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_fp8_true_bitset64.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_float_half_bitset64.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_false_bitset64.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_fp8_true_bitset64.cu
+    ${RAFT_SOURCE_DIR}/src/neighbors/detail/ivf_pq_compute_similarity_half_half_bitset64.cu
+    neighbors/ann_ivf_pq/test_float_uint32_t.cu
+    neighbors/ann_ivf_pq/test_float_int64_t.cu
+    neighbors/ann_ivf_pq/test_int8_t_int64_t.cu
+    neighbors/ann_ivf_pq/test_uint8_t_int64_t.cu
+    neighbors/ann_ivf_pq/test_filter_float_int64_t.cu
+    neighbors/ann_ivf_pq/test_filter_int8_t_int64_t.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
     GPUS
@@ -448,9 +448,9 @@ if(BUILD_TESTS)
     NAME
     NEIGHBORS_ANN_NN_DESCENT_TEST
     PATH
-    test/neighbors/ann_nn_descent/test_float_uint32_t.cu
-    test/neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
-    test/neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
+    neighbors/ann_nn_descent/test_float_uint32_t.cu
+    neighbors/ann_nn_descent/test_int8_t_uint32_t.cu
+    neighbors/ann_nn_descent/test_uint8_t_uint32_t.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
     GPUS
@@ -463,32 +463,32 @@ if(BUILD_TESTS)
     NAME
     STATS_TEST
     PATH
-    test/stats/accuracy.cu
-    test/stats/adjusted_rand_index.cu
-    test/stats/completeness_score.cu
-    test/stats/contingencyMatrix.cu
-    test/stats/cov.cu
-    test/stats/dispersion.cu
-    test/stats/entropy.cu
-    test/stats/histogram.cu
-    test/stats/homogeneity_score.cu
-    test/stats/information_criterion.cu
-    test/stats/kl_divergence.cu
-    test/stats/mean.cu
-    test/stats/meanvar.cu
-    test/stats/mean_center.cu
-    test/stats/minmax.cu
-    test/stats/mutual_info_score.cu
-    test/stats/neighborhood_recall.cu
-    test/stats/r2_score.cu
-    test/stats/rand_index.cu
-    test/stats/regression_metrics.cu
-    test/stats/silhouette_score.cu
-    test/stats/stddev.cu
-    test/stats/sum.cu
-    test/stats/trustworthiness.cu
-    test/stats/weighted_mean.cu
-    test/stats/v_measure.cu
+    stats/accuracy.cu
+    stats/adjusted_rand_index.cu
+    stats/completeness_score.cu
+    stats/contingencyMatrix.cu
+    stats/cov.cu
+    stats/dispersion.cu
+    stats/entropy.cu
+    stats/histogram.cu
+    stats/homogeneity_score.cu
+    stats/information_criterion.cu
+    stats/kl_divergence.cu
+    stats/mean.cu
+    stats/meanvar.cu
+    stats/mean_center.cu
+    stats/minmax.cu
+    stats/mutual_info_score.cu
+    stats/neighborhood_recall.cu
+    stats/r2_score.cu
+    stats/rand_index.cu
+    stats/regression_metrics.cu
+    stats/silhouette_score.cu
+    stats/stddev.cu
+    stats/sum.cu
+    stats/trustworthiness.cu
+    stats/weighted_mean.cu
+    stats/v_measure.cu
     LIB
     EXPLICIT_INSTANTIATE_ONLY
   )
@@ -497,15 +497,15 @@ if(BUILD_TESTS)
     NAME
     UTILS_TEST
     PATH
-    test/core/seive.cu
-    test/util/bitonic_sort.cu
-    test/util/cudart_utils.cpp
-    test/util/device_atomics.cu
-    test/util/integer_utils.cpp
-    test/util/integer_utils.cu
-    test/util/memory_type_dispatcher.cu
-    test/util/pow2_utils.cu
-    test/util/reduction.cu
+    core/seive.cu
+    util/bitonic_sort.cu
+    util/cudart_utils.cpp
+    util/device_atomics.cu
+    util/integer_utils.cpp
+    util/integer_utils.cu
+    util/memory_type_dispatcher.cu
+    util/pow2_utils.cu
+    util/reduction.cu
   )
 endif()
 

From efcd11f02445f91ef6ddccbbb6067c4d38a82002 Mon Sep 17 00:00:00 2001
From: "Artem M. Chirkin" <9253178+achirkin@users.noreply.github.com>
Date: Tue, 21 May 2024 17:05:30 +0200
Subject: [PATCH 52/60] Scaling workspace resources (#2322)

### Brief

Add another workspace memory resource that does not have an explicit memory limit. That is, after the change we have the following:

1. `rmm::mr::get_current_device_resource()` is the default for all allocations, as before. It is used for allocations with unlimited lifetime, e.g. those returned to the user.
2. `raft::get_workspace_resource()` is for temporary allocations and is forced to have a fixed size, as before. However, it becomes smaller and should be used only for allocations that do not scale with the problem size. It defaults to a thin layer on top of the `current_device_resource`.
3. `raft::get_large_workspace_resource()` _(new)_ is for temporary allocations that can scale with the problem size. Unlike `workspace_resource`, its size is not fixed. By default, it points to the `current_device_resource`, but the user can set it to something backed by the host memory (e.g. managed memory) to avoid OOM exceptions when there's not enough device memory left.

## Problem

We have a list of issues, preferences, and requirements, some of which contradict others:

1. We rely on RMM to handle all allocations and we often use [`rmm::mr::pool_memory_resource`](https://github.com/rapidsai/raft/blob/9fb05a2ab3d72760a09f1b7051e711d773682ef1/cpp/bench/ann/src/raft/raft_ann_bench_utils.h#L73) for performance reasons (to avoid lots of cudaMalloc calls in the loops)
2. Historically, we've used managed memory allocators as a workaround to [avoid OOM errors](https://github.com/rapidsai/raft/blob/5e80c1d2159e00a204ab5db0f5ca3f9ec43187c7/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh#L1788-L1795) or [improve speed (by increasing batch sizes)](https://github.com/rapidsai/raft/blob/5e80c1d2159e00a204ab5db0f5ca3f9ec43187c7/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh#L1596-L1603).
3. However, the design goal is to avoid setting allocators on our own and to give the full control to the user (hence the workaround in 2 [was removed](https://github.com/rapidsai/raft/commit/addb059975478375a422d32e9bec30e1aeb16527#diff-f7f070424d71da5321d470416d1a4ca3605c4290c34c4a1c1d8b2240747000d2)).
4. We introduced the [workspace resource](https://github.com/rapidsai/raft/pull/1356) earlier to allow querying the available memory reliably and maximize the batch sizes accordingly (see also issue [#1310](https://github.com/rapidsai/raft/issues/1310)). Without this, some of our batched algorithms either fail with OOM or severely underperform due to small batch sizes.
5. However, we cannot just put all of RAFT temporary allocations into the limited `workspace_resource`, because some of them scale with the problem size and would inevitably fail with OOM at some point.
6. Setting the workspace resource to the managed memory is also not advisable, for performance reasons: we have lots of small allocations in performance-critical sections, so we need a pool, but a pool in the managed memory inevitably outgrows the device memory and makes the whole program slow.

## Solution
I propose to split the workspace memory into two:

1. small, fixed-size workspace for small, frequent allocations
2. large workspace for the allocations that scale with the problem size

Notes:
- We still leave the full control over the allocator types to the user.
- Neither of the workspace resources should have an unlimited lifetime or be returned to the user. As a result, if the user sets the managed memory as the large workspace resource, the memory is guaranteed to be released after the function call.
- We have the option to use the slow managed memory without a pool for large allocations, while still using a fast pool for small allocations.
- We have more flexible control over which allocations are "large" and which are "small", so hopefully using the managed memory is not so bad for performance.

Authors:
  - Artem M. Chirkin (https://github.com/achirkin)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2322
---
 cpp/bench/ann/src/raft/raft_ann_bench_utils.h | 20 +++++--
 .../core/resource/device_memory_resource.hpp  | 52 ++++++++++++++++++-
 .../raft/core/resource/resource_types.hpp     | 35 +++++++------
 .../raft/matrix/detail/select_radix.cuh       |  2 +-
 4 files changed, 86 insertions(+), 23 deletions(-)

diff --git a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h
index ffe8f8717b..9b086fdb23 100644
--- a/cpp/bench/ann/src/raft/raft_ann_bench_utils.h
+++ b/cpp/bench/ann/src/raft/raft_ann_bench_utils.h
@@ -31,6 +31,7 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 #include <rmm/mr/device/failure_callback_resource_adaptor.hpp>
+#include <rmm/mr/device/managed_memory_resource.hpp>
 #include <rmm/mr/device/pool_memory_resource.hpp>
 
 #include <memory>
@@ -74,13 +75,14 @@ inline auto rmm_oom_callback(std::size_t bytes, void*) -> bool
  */
 class shared_raft_resources {
  public:
-  using pool_mr_type = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
-  using mr_type      = rmm::mr::failure_callback_resource_adaptor<pool_mr_type>;
+  using pool_mr_type  = rmm::mr::pool_memory_resource<rmm::mr::device_memory_resource>;
+  using mr_type       = rmm::mr::failure_callback_resource_adaptor<pool_mr_type>;
+  using large_mr_type = rmm::mr::managed_memory_resource;
 
   shared_raft_resources()
   try : orig_resource_{rmm::mr::get_current_device_resource()},
     pool_resource_(orig_resource_, 1024 * 1024 * 1024ull),
-    resource_(&pool_resource_, rmm_oom_callback, nullptr) {
+    resource_(&pool_resource_, rmm_oom_callback, nullptr), large_mr_() {
     rmm::mr::set_current_device_resource(&resource_);
   } catch (const std::exception& e) {
     auto cuda_status = cudaGetLastError();
@@ -103,10 +105,16 @@ class shared_raft_resources {
 
   ~shared_raft_resources() noexcept { rmm::mr::set_current_device_resource(orig_resource_); }
 
+  auto get_large_memory_resource() noexcept
+  {
+    return static_cast<rmm::mr::device_memory_resource*>(&large_mr_);
+  }
+
  private:
   rmm::mr::device_memory_resource* orig_resource_;
   pool_mr_type pool_resource_;
   mr_type resource_;
+  large_mr_type large_mr_;
 };
 
 /**
@@ -129,6 +137,12 @@ class configured_raft_resources {
       res_{std::make_unique<raft::device_resources>(
         rmm::cuda_stream_view(get_stream_from_global_pool()))}
   {
+    // set the large workspace resource to the raft handle, but without the deleter
+    // (this resource is managed by the shared_res).
+    raft::resource::set_large_workspace_resource(
+      *res_,
+      std::shared_ptr<rmm::mr::device_memory_resource>(shared_res_->get_large_memory_resource(),
+                                                       raft::void_op{}));
   }
 
   /** Default constructor creates all resources anew. */
diff --git a/cpp/include/raft/core/resource/device_memory_resource.hpp b/cpp/include/raft/core/resource/device_memory_resource.hpp
index 9aa9e4fb85..b785010a0a 100644
--- a/cpp/include/raft/core/resource/device_memory_resource.hpp
+++ b/cpp/include/raft/core/resource/device_memory_resource.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -35,6 +35,16 @@ namespace raft::resource {
  * @{
  */
 
+class device_memory_resource : public resource {
+ public:
+  explicit device_memory_resource(std::shared_ptr<rmm::mr::device_memory_resource> mr) : mr_(mr) {}
+  ~device_memory_resource() override = default;
+  auto get_resource() -> void* override { return mr_.get(); }
+
+ private:
+  std::shared_ptr<rmm::mr::device_memory_resource> mr_;
+};
+
 class limiting_memory_resource : public resource {
  public:
   limiting_memory_resource(std::shared_ptr<rmm::mr::device_memory_resource> mr,
@@ -66,6 +76,29 @@ class limiting_memory_resource : public resource {
   }
 };
 
+/**
+ * Factory that knows how to construct a specific raft::resource to populate
+ * the resources instance.
+ */
+class large_workspace_resource_factory : public resource_factory {
+ public:
+  explicit large_workspace_resource_factory(
+    std::shared_ptr<rmm::mr::device_memory_resource> mr = {nullptr})
+    : mr_{mr ? mr
+             : std::shared_ptr<rmm::mr::device_memory_resource>{
+                 rmm::mr::get_current_device_resource(), void_op{}}}
+  {
+  }
+  auto get_resource_type() -> resource_type override
+  {
+    return resource_type::LARGE_WORKSPACE_RESOURCE;
+  }
+  auto make_resource() -> resource* override { return new device_memory_resource(mr_); }
+
+ private:
+  std::shared_ptr<rmm::mr::device_memory_resource> mr_;
+};
+
 /**
  * Factory that knows how to construct a specific raft::resource to populate
  * the resources instance.
@@ -144,7 +177,7 @@ class workspace_resource_factory : public resource_factory {
     // Note, the workspace does not claim all this memory from the start, so it's still usable by
     // the main resource as well.
     // This limit is merely an order for algorithm internals to plan the batching accordingly.
-    return total_size / 2;
+    return total_size / 4;
   }
 };
 
@@ -241,6 +274,21 @@ inline void set_workspace_to_global_resource(
     workspace_resource_factory::default_plain_resource(), allocation_limit, std::nullopt));
 };
 
+inline auto get_large_workspace_resource(resources const& res) -> rmm::mr::device_memory_resource*
+{
+  if (!res.has_resource_factory(resource_type::LARGE_WORKSPACE_RESOURCE)) {
+    res.add_resource_factory(std::make_shared<large_workspace_resource_factory>());
+  }
+  return res.get_resource<rmm::mr::device_memory_resource>(resource_type::LARGE_WORKSPACE_RESOURCE);
+};
+
+inline void set_large_workspace_resource(resources const& res,
+                                         std::shared_ptr<rmm::mr::device_memory_resource> mr = {
+                                           nullptr})
+{
+  res.add_resource_factory(std::make_shared<large_workspace_resource_factory>(mr));
+};
+
 /** @} */
 
 }  // namespace raft::resource
diff --git a/cpp/include/raft/core/resource/resource_types.hpp b/cpp/include/raft/core/resource/resource_types.hpp
index d2021728c4..d9126251c9 100644
--- a/cpp/include/raft/core/resource/resource_types.hpp
+++ b/cpp/include/raft/core/resource/resource_types.hpp
@@ -28,23 +28,24 @@ namespace raft::resource {
  */
 enum resource_type {
   // device-specific resource types
-  CUBLAS_HANDLE = 0,       // cublas handle
-  CUSOLVER_DN_HANDLE,      // cusolver dn handle
-  CUSOLVER_SP_HANDLE,      // cusolver sp handle
-  CUSPARSE_HANDLE,         // cusparse handle
-  CUDA_STREAM_VIEW,        // view of a cuda stream
-  CUDA_STREAM_POOL,        // cuda stream pool
-  CUDA_STREAM_SYNC_EVENT,  // cuda event for syncing streams
-  COMMUNICATOR,            // raft communicator
-  SUB_COMMUNICATOR,        // raft sub communicator
-  DEVICE_PROPERTIES,       // cuda device properties
-  DEVICE_ID,               // cuda device id
-  STREAM_VIEW,             // view of a cuda stream or a placeholder in
-                           // CUDA-free builds
-  THRUST_POLICY,           // thrust execution policy
-  WORKSPACE_RESOURCE,      // rmm device memory resource
-  CUBLASLT_HANDLE,         // cublasLt handle
-  CUSTOM,                  // runtime-shared default-constructible resource
+  CUBLAS_HANDLE = 0,         // cublas handle
+  CUSOLVER_DN_HANDLE,        // cusolver dn handle
+  CUSOLVER_SP_HANDLE,        // cusolver sp handle
+  CUSPARSE_HANDLE,           // cusparse handle
+  CUDA_STREAM_VIEW,          // view of a cuda stream
+  CUDA_STREAM_POOL,          // cuda stream pool
+  CUDA_STREAM_SYNC_EVENT,    // cuda event for syncing streams
+  COMMUNICATOR,              // raft communicator
+  SUB_COMMUNICATOR,          // raft sub communicator
+  DEVICE_PROPERTIES,         // cuda device properties
+  DEVICE_ID,                 // cuda device id
+  STREAM_VIEW,               // view of a cuda stream or a placeholder in
+                             // CUDA-free builds
+  THRUST_POLICY,             // thrust execution policy
+  WORKSPACE_RESOURCE,        // rmm device memory resource for small temporary allocations
+  CUBLASLT_HANDLE,           // cublasLt handle
+  CUSTOM,                    // runtime-shared default-constructible resource
+  LARGE_WORKSPACE_RESOURCE,  // rmm device memory resource for somewhat large temporary allocations
 
   LAST_KEY  // reserved for the last key
 };
diff --git a/cpp/include/raft/matrix/detail/select_radix.cuh b/cpp/include/raft/matrix/detail/select_radix.cuh
index 9480c8e202..2207b0216e 100644
--- a/cpp/include/raft/matrix/detail/select_radix.cuh
+++ b/cpp/include/raft/matrix/detail/select_radix.cuh
@@ -894,7 +894,7 @@ void radix_topk(const T* in,
                 unsigned grid_dim,
                 int sm_cnt,
                 rmm::cuda_stream_view stream,
-                rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource())
+                rmm::device_async_resource_ref mr)
 {
   // TODO: is it possible to relax this restriction?
   static_assert(calc_num_passes<T, BitsPerPass>() > 1);

From ac6be9eddb9e1bda549c6f69cba6c9b387605ed6 Mon Sep 17 00:00:00 2001
From: Aaron Siddhartha Mondal <aaron.mondal@gmail.com>
Date: Tue, 21 May 2024 21:37:39 +0200
Subject: [PATCH 53/60] Fix clang intrinsic warning (#2292)

Store operations are void.

Authors:
  - Aaron Siddhartha Mondal (https://github.com/aaronmondal)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2292
---
 cpp/scripts/__clang_cuda_additional_intrinsics.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/scripts/__clang_cuda_additional_intrinsics.h b/cpp/scripts/__clang_cuda_additional_intrinsics.h
index b9c032dc45..8b1335e5d1 100644
--- a/cpp/scripts/__clang_cuda_additional_intrinsics.h
+++ b/cpp/scripts/__clang_cuda_additional_intrinsics.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2022, NVIDIA CORPORATION.
+// Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #ifndef __CLANG_CUDA_ADDITIONAL_INTRINSICS_H__
 #define __CLANG_CUDA_ADDITIONAL_INTRINSICS_H__
 #ifndef __CUDA__
@@ -233,7 +233,7 @@ __MAKE_LD4(cv, float4, float, "f32", "f", : "memory")
   }
 
 #define __MAKE_ST4(cop, c_typ, int_typ, ptx_typ, inl_typ)                       \
-  __device__ __forceinline__ c_typ __st##cop(c_typ* addr, c_typ v)              \
+  __device__ __forceinline__ void __st##cop(c_typ* addr, c_typ v)               \
   {                                                                             \
     int_typ v1 = v.x, v2 = v.y, v3 = v.z, v4 = v.w;                             \
     asm("st." #cop ".v4." ptx_typ " [%0], {%1, %2, %3, %4};" ::__LDG_PTR(addr), \

From a539c3244641920aa2e4b20ffecb973c49771474 Mon Sep 17 00:00:00 2001
From: Tamas Bela Feher <tfeher@nvidia.com>
Date: Tue, 21 May 2024 23:47:18 +0200
Subject: [PATCH 54/60] Replace too long index file name with hash in ANN bench
 (#2280)

An overly long index file name leads to a crash when calling the index serialization routines. Such long file names can occur when many parameters are specialized for a CAGRA ANN index. This PR fixes the issue by replacing any index file name of 128 characters or more with a hash of the name. The drawback is that the resulting file name is not descriptive.

Authors:
  - Tamas Bela Feher (https://github.com/tfeher)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2280
---
 .../raft-ann-bench/src/raft-ann-bench/run/__main__.py  | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
index 52d536c2e8..c34377d733 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
+++ b/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -553,10 +553,14 @@ def add_algo_group(group_list):
                             index["build_param"], conf_file["dataset"]["dims"]
                         ):
                             continue
-
+                index_filename = (
+                    index_name
+                    if len(index_name) < 128
+                    else str(hash(index_name))
+                )
                 index["name"] = index_name
                 index["file"] = os.path.join(
-                    args.dataset_path, args.dataset, "index", index_name
+                    args.dataset_path, args.dataset, "index", index_filename
                 )
                 index["search_params"] = []
                 all_search_params = itertools.product(*search_param_lists)

From 0b6f5425b4f3f08e4bbaf9f71cc4d04b6866e21d Mon Sep 17 00:00:00 2001
From: Ikko Eltociear Ashimine <eltociear@gmail.com>
Date: Thu, 23 May 2024 00:49:49 +0900
Subject: [PATCH 55/60] docs: update README.md (#2308)

jupyer -> jupyter

Authors:
  - Ikko Eltociear Ashimine (https://github.com/eltociear)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2308
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ae6591df00..fc56859557 100755
--- a/README.md
+++ b/README.md
@@ -27,7 +27,7 @@
 - [RAFT Reference Documentation](https://docs.rapids.ai/api/raft/stable/): API Documentation.
 - [RAFT Getting Started](./docs/source/quick_start.md): Getting started with RAFT.
 - [Build and Install RAFT](./docs/source/build.md): Instructions for installing and building RAFT.
-- [Example Notebooks](./notebooks): Example jupyer notebooks
+- [Example Notebooks](./notebooks): Example jupyter notebooks
 - [RAPIDS Community](https://rapids.ai/community.html): Get help, contribute, and collaborate.
 - [GitHub repository](https://github.com/rapidsai/raft): Download the RAFT source code.
 - [Issue tracker](https://github.com/rapidsai/raft/issues): Report issues or request features.

From 64827fc643b27443094374501ab7d1b7a28a604d Mon Sep 17 00:00:00 2001
From: Divye Gala <divyegala@gmail.com>
Date: Thu, 23 May 2024 14:31:52 -0400
Subject: [PATCH 56/60] InnerProduct testing for CAGRA+HNSW (#2297)

Authors:
  - Divye Gala (https://github.com/divyegala)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Corey J. Nolet (https://github.com/cjnolet)

URL: https://github.com/rapidsai/raft/pull/2297
---
 python/pylibraft/pylibraft/test/test_hnsw.py | 27 +++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/python/pylibraft/pylibraft/test/test_hnsw.py b/python/pylibraft/pylibraft/test/test_hnsw.py
index 487f190e4e..8cdf8c904f 100644
--- a/python/pylibraft/pylibraft/test/test_hnsw.py
+++ b/python/pylibraft/pylibraft/test/test_hnsw.py
@@ -29,6 +29,7 @@ def run_hnsw_build_search_test(
     k=10,
     dtype=np.float32,
     metric="sqeuclidean",
+    build_algo="ivf_pq",
     intermediate_graph_degree=128,
     graph_degree=64,
     search_params={},
@@ -36,11 +37,18 @@ def run_hnsw_build_search_test(
     dataset = generate_data((n_rows, n_cols), dtype)
     if metric == "inner_product":
         dataset = normalize(dataset, norm="l2", axis=1)
+        if dtype in [np.int8, np.uint8]:
+            pytest.skip(
+                "inner_product metric is not supported for int8/uint8 data"
+            )
+        if build_algo == "nn_descent":
+            pytest.skip("inner_product metric is not supported for nn_descent")
 
     build_params = cagra.IndexParams(
         metric=metric,
         intermediate_graph_degree=intermediate_graph_degree,
         graph_degree=graph_degree,
+        build_algo=build_algo,
     )
 
     index = cagra.build(build_params, dataset)
@@ -57,7 +65,14 @@ def run_hnsw_build_search_test(
     out_dist, out_idx = hnsw.search(search_params, hnsw_index, queries, k)
 
     # Calculate reference values with sklearn
-    nn_skl = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
+    skl_metric = {
+        "sqeuclidean": "sqeuclidean",
+        "inner_product": "cosine",
+        "euclidean": "euclidean",
+    }[metric]
+    nn_skl = NearestNeighbors(
+        n_neighbors=k, algorithm="brute", metric=skl_metric
+    )
     nn_skl.fit(dataset)
     skl_idx = nn_skl.kneighbors(queries, return_distance=False)
 
@@ -69,9 +84,15 @@ def run_hnsw_build_search_test(
 @pytest.mark.parametrize("k", [10, 20])
 @pytest.mark.parametrize("ef", [30, 40])
 @pytest.mark.parametrize("num_threads", [2, 4])
-def test_hnsw(dtype, k, ef, num_threads):
+@pytest.mark.parametrize("metric", ["sqeuclidean", "inner_product"])
+@pytest.mark.parametrize("build_algo", ["ivf_pq", "nn_descent"])
+def test_hnsw(dtype, k, ef, num_threads, metric, build_algo):
     # Note that inner_product tests use normalized input which we cannot
     # represent in int8, therefore we test only sqeuclidean metric here.
     run_hnsw_build_search_test(
-        dtype=dtype, k=k, search_params={"ef": ef, "num_threads": num_threads}
+        dtype=dtype,
+        k=k,
+        metric=metric,
+        build_algo=build_algo,
+        search_params={"ef": ef, "num_threads": num_threads},
     )

From 9c8d111044affcab08c7e4a1b8a4830e621373d2 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Thu, 23 May 2024 17:32:55 -0400
Subject: [PATCH 57/60] Rename raft-ann-bench module to raft_ann_bench (#2333)

Replace hyphens with underscores in `raft-ann-bench` to make it a valid Python identifier. Also add a Python 3.11 tag to `raft-ann-bench`, and use the `VERSION` file instead of an attribute.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Divye Gala (https://github.com/divyegala)
  - Mike Sarahan (https://github.com/msarahan)

URL: https://github.com/rapidsai/raft/pull/2333
---
 docs/source/ann_benchmarks_dataset.md         |  6 +++---
 docs/source/ann_benchmarks_low_level.md       |  2 +-
 docs/source/raft_ann_benchmarks.md            | 20 +++++++++----------
 python/raft-ann-bench/pyproject.toml          |  3 ++-
 .../VERSION                                   |  0
 .../__init__.py                               |  0
 .../_version.py                               |  0
 .../constraints/__init__.py                   |  0
 .../data_export/__main__.py                   |  0
 .../generate_groundtruth/__main__.py          |  6 +++---
 .../generate_groundtruth/utils.py             |  0
 .../get_dataset/__main__.py                   |  0
 .../get_dataset/fbin_to_f16bin.py             |  0
 .../get_dataset/hdf5_to_fbin.py               |  0
 .../plot/__main__.py                          |  0
 .../run/__main__.py                           |  0
 .../run/algos.yaml                            |  0
 .../run/conf/algos/faiss_cpu_flat.yaml        |  0
 .../run/conf/algos/faiss_gpu_flat.yaml        |  0
 .../run/conf/algos/faiss_gpu_ivf_flat.yaml    |  0
 .../run/conf/algos/faiss_gpu_ivf_pq.yaml      |  0
 .../run/conf/algos/hnswlib.yaml               |  0
 .../run/conf/algos/raft_brute_force.yaml      |  0
 .../run/conf/algos/raft_cagra.yaml            |  0
 .../run/conf/algos/raft_cagra_hnswlib.yaml    |  0
 .../run/conf/algos/raft_ivf_flat.yaml         |  0
 .../run/conf/algos/raft_ivf_pq.yaml           |  0
 .../run/conf/bigann-100M.json                 |  0
 .../run/conf/datasets.yaml                    |  0
 .../run/conf/deep-100M.json                   |  0
 .../run/conf/deep-1B.json                     |  0
 .../run/conf/deep-image-96-inner.json         |  0
 .../run/conf/fashion-mnist-784-euclidean.json |  0
 .../run/conf/gist-960-euclidean.json          |  0
 .../run/conf/glove-100-angular.json           |  0
 .../run/conf/glove-100-inner.json             |  0
 .../run/conf/glove-50-angular.json            |  0
 .../run/conf/glove-50-inner.json              |  0
 .../run/conf/lastfm-65-angular.json           |  0
 .../run/conf/mnist-784-euclidean.json         |  0
 .../run/conf/nytimes-256-angular.json         |  0
 .../run/conf/nytimes-256-inner.json           |  0
 .../run/conf/sift-128-euclidean.json          |  0
 .../run/conf/wiki_all_10M.json                |  0
 .../run/conf/wiki_all_1M.json                 |  0
 .../run/conf/wiki_all_88M.json                |  0
 .../split_groundtruth/__main__.py             |  0
 .../split_groundtruth/split_groundtruth.pl    |  0
 48 files changed, 19 insertions(+), 18 deletions(-)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/VERSION (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/__init__.py (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/_version.py (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/constraints/__init__.py (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/data_export/__main__.py (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/generate_groundtruth/__main__.py (97%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/generate_groundtruth/utils.py (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/get_dataset/__main__.py (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/get_dataset/fbin_to_f16bin.py (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/get_dataset/hdf5_to_fbin.py (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/plot/__main__.py (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/__main__.py (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/algos.yaml (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/algos/faiss_cpu_flat.yaml (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/algos/faiss_gpu_flat.yaml (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/algos/faiss_gpu_ivf_flat.yaml (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/algos/faiss_gpu_ivf_pq.yaml (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/algos/hnswlib.yaml (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/algos/raft_brute_force.yaml (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/algos/raft_cagra.yaml (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/algos/raft_cagra_hnswlib.yaml (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/algos/raft_ivf_flat.yaml (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/algos/raft_ivf_pq.yaml (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/bigann-100M.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/datasets.yaml (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/deep-100M.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/deep-1B.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/deep-image-96-inner.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/fashion-mnist-784-euclidean.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/gist-960-euclidean.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/glove-100-angular.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/glove-100-inner.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/glove-50-angular.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/glove-50-inner.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/lastfm-65-angular.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/mnist-784-euclidean.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/nytimes-256-angular.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/nytimes-256-inner.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/sift-128-euclidean.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/wiki_all_10M.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/wiki_all_1M.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/run/conf/wiki_all_88M.json (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/split_groundtruth/__main__.py (100%)
 rename python/raft-ann-bench/src/{raft-ann-bench => raft_ann_bench}/split_groundtruth/split_groundtruth.pl (100%)

diff --git a/docs/source/ann_benchmarks_dataset.md b/docs/source/ann_benchmarks_dataset.md
index 821345b07c..26c1559504 100644
--- a/docs/source/ann_benchmarks_dataset.md
+++ b/docs/source/ann_benchmarks_dataset.md
@@ -52,12 +52,12 @@ If you have a dataset, but no corresponding ground truth file, then you can gene
 
 ```bash
 # With existing query file
-python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
+python -m raft_ann_bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
 
 # With randomly generated queries
-python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=random --n_queries=10000
+python -m raft_ann_bench.generate_groundtruth --dataset /dataset/base.fbin --output=groundtruth_dir --queries=random --n_queries=10000
 
 # Using only a subset of the dataset. Define queries by randomly
 # selecting vectors from the (subset of the) dataset.
-python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.fbin --nrows=2000000 --output=groundtruth_dir --queries=random-choice --n_queries=10000
+python -m raft_ann_bench.generate_groundtruth --dataset /dataset/base.fbin --nrows=2000000 --output=groundtruth_dir --queries=random-choice --n_queries=10000
 ```
\ No newline at end of file
diff --git a/docs/source/ann_benchmarks_low_level.md b/docs/source/ann_benchmarks_low_level.md
index 55238954ba..cb583b119b 100644
--- a/docs/source/ann_benchmarks_low_level.md
+++ b/docs/source/ann_benchmarks_low_level.md
@@ -8,7 +8,7 @@ cd raft
 
 # (1) prepare a dataset
 export PYTHONPATH=python/raft-ann-bench/src:$PYTHONPATH
-python -m raft-ann-bench.get_dataset --dataset glove-100-angular --normalize
+python -m raft_ann_bench.get_dataset --dataset glove-100-angular --normalize
 
 # option --normalize is used here to normalize vectors so cosine distance is converted
 # to inner product; don't use -n for l2 distance
diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index 3eaa72beae..146cc104d1 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -265,16 +265,16 @@ The steps below demonstrate how to download, install, and run benchmarks on a su
 ```bash
 
 # (1) prepare dataset.
-python -m raft-ann-bench.get_dataset --dataset deep-image-96-angular --normalize
+python -m raft_ann_bench.get_dataset --dataset deep-image-96-angular --normalize
 
 # (2) build and search index
-python -m raft-ann-bench.run --dataset deep-image-96-inner --algorithms raft_cagra --batch-size 10 -k 10
+python -m raft_ann_bench.run --dataset deep-image-96-inner --algorithms raft_cagra --batch-size 10 -k 10
 
 # (3) export data
-python -m raft-ann-bench.data_export --dataset deep-image-96-inner
+python -m raft_ann_bench.data_export --dataset deep-image-96-inner
 
 # (4) plot results
-python -m raft-ann-bench.plot --dataset deep-image-96-inner
+python -m raft_ann_bench.plot --dataset deep-image-96-inner
 ```
 
 Configuration files already exist for the following list of the million-scale datasets. Please refer to [ann-benchmarks datasets](https://github.com/erikbern/ann-benchmarks/#data-sets) for more information, including actual train and sizes. These all work out-of-the-box with the `--dataset` argument. Other million-scale datasets from `ann-benchmarks.com` will work, but will require a json configuration file to be created in `$CONDA_PREFIX/lib/python3.xx/site-packages/raft-ann-bench/run/conf`, or you can specify the `--configuration` option to use a specific file.
@@ -308,20 +308,20 @@ mkdir -p datasets/deep-1B
 # (1) prepare dataset
 # download manually "Ground Truth" file of "Yandex DEEP"
 # suppose the file name is deep_new_groundtruth.public.10K.bin
-python -m raft-ann-bench.split_groundtruth --groundtruth datasets/deep-1B/deep_new_groundtruth.public.10K.bin
+python -m raft_ann_bench.split_groundtruth --groundtruth datasets/deep-1B/deep_new_groundtruth.public.10K.bin
 # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced
 
 # (2) build and search index
-python -m raft-ann-bench.run --dataset deep-1B --algorithms raft_cagra --batch-size 10 -k 10
+python -m raft_ann_bench.run --dataset deep-1B --algorithms raft_cagra --batch-size 10 -k 10
 
 # (3) export data
-python -m raft-ann-bench.data_export --dataset deep-1B
+python -m raft_ann_bench.data_export --dataset deep-1B
 
 # (4) plot results
-python -m raft-ann-bench.plot --dataset deep-1B
+python -m raft_ann_bench.plot --dataset deep-1B
 ```
 
-The usage of `python -m raft-ann-bench.split_groundtruth` is:
+The usage of `python -m raft_ann_bench.split_groundtruth` is:
 ```bash
 usage: split_groundtruth.py [-h] --groundtruth GROUNDTRUTH
 
@@ -395,7 +395,7 @@ docker run --gpus all --rm -it -u $(id -u)          \
 This will drop you into a command line in the container, with the `raft-ann-bench` python package ready to use, as described in the [Running the benchmarks](#running-the-benchmarks) section above:
 
 ```
-(base) root@00b068fbb862:/data/benchmarks# python -m raft-ann-bench.get_dataset --dataset deep-image-96-angular --normalize
+(base) root@00b068fbb862:/data/benchmarks# python -m raft_ann_bench.get_dataset --dataset deep-image-96-angular --normalize
 ```
 
 Additionally, the containers can be run in detached mode without any issue.
diff --git a/python/raft-ann-bench/pyproject.toml b/python/raft-ann-bench/pyproject.toml
index 9bb7ae0468..e1f0e18304 100644
--- a/python/raft-ann-bench/pyproject.toml
+++ b/python/raft-ann-bench/pyproject.toml
@@ -26,6 +26,7 @@ classifiers = [
     "Programming Language :: Python",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
 ]
 
 [project.urls]
@@ -59,4 +60,4 @@ skip = [
 ]
 
 [tool.setuptools.dynamic]
-version = { attr = "raft-ann-bench.__version__" }
+version = { file = "raft_ann_bench/VERSION" }
diff --git a/python/raft-ann-bench/src/raft-ann-bench/VERSION b/python/raft-ann-bench/src/raft_ann_bench/VERSION
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/VERSION
rename to python/raft-ann-bench/src/raft_ann_bench/VERSION
diff --git a/python/raft-ann-bench/src/raft-ann-bench/__init__.py b/python/raft-ann-bench/src/raft_ann_bench/__init__.py
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/__init__.py
rename to python/raft-ann-bench/src/raft_ann_bench/__init__.py
diff --git a/python/raft-ann-bench/src/raft-ann-bench/_version.py b/python/raft-ann-bench/src/raft_ann_bench/_version.py
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/_version.py
rename to python/raft-ann-bench/src/raft_ann_bench/_version.py
diff --git a/python/raft-ann-bench/src/raft-ann-bench/constraints/__init__.py b/python/raft-ann-bench/src/raft_ann_bench/constraints/__init__.py
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/constraints/__init__.py
rename to python/raft-ann-bench/src/raft_ann_bench/constraints/__init__.py
diff --git a/python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py b/python/raft-ann-bench/src/raft_ann_bench/data_export/__main__.py
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/data_export/__main__.py
rename to python/raft-ann-bench/src/raft_ann_bench/data_export/__main__.py
diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py b/python/raft-ann-bench/src/raft_ann_bench/generate_groundtruth/__main__.py
similarity index 97%
rename from python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py
rename to python/raft-ann-bench/src/raft_ann_bench/generate_groundtruth/__main__.py
index a5ebb76635..e6f7aaf99c 100644
--- a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/__main__.py
+++ b/python/raft-ann-bench/src/raft_ann_bench/generate_groundtruth/__main__.py
@@ -96,16 +96,16 @@ def main():
         "The input and output files are in big-ann-benchmark's binary format.",
         epilog="""Example usage
     # With existing query file
-    python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\
+    python -m raft_ann_bench.generate_groundtruth --dataset /dataset/base.\
 fbin --output=groundtruth_dir --queries=/dataset/query.public.10K.fbin
 
     # With randomly generated queries
-    python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\
+    python -m raft_ann_bench.generate_groundtruth --dataset /dataset/base.\
 fbin --output=groundtruth_dir --queries=random --n_queries=10000
 
     # Using only a subset of the dataset. Define queries by randomly
     # selecting vectors from the (subset of the) dataset.
-    python -m raft-ann-bench.generate_groundtruth --dataset /dataset/base.\
+    python -m raft_ann_bench.generate_groundtruth --dataset /dataset/base.\
 fbin --nrows=2000000 --cols=128 --output=groundtruth_dir \
 --queries=random-choice --n_queries=10000
     """,
diff --git a/python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py b/python/raft-ann-bench/src/raft_ann_bench/generate_groundtruth/utils.py
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/generate_groundtruth/utils.py
rename to python/raft-ann-bench/src/raft_ann_bench/generate_groundtruth/utils.py
diff --git a/python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py b/python/raft-ann-bench/src/raft_ann_bench/get_dataset/__main__.py
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/get_dataset/__main__.py
rename to python/raft-ann-bench/src/raft_ann_bench/get_dataset/__main__.py
diff --git a/python/raft-ann-bench/src/raft-ann-bench/get_dataset/fbin_to_f16bin.py b/python/raft-ann-bench/src/raft_ann_bench/get_dataset/fbin_to_f16bin.py
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/get_dataset/fbin_to_f16bin.py
rename to python/raft-ann-bench/src/raft_ann_bench/get_dataset/fbin_to_f16bin.py
diff --git a/python/raft-ann-bench/src/raft-ann-bench/get_dataset/hdf5_to_fbin.py b/python/raft-ann-bench/src/raft_ann_bench/get_dataset/hdf5_to_fbin.py
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/get_dataset/hdf5_to_fbin.py
rename to python/raft-ann-bench/src/raft_ann_bench/get_dataset/hdf5_to_fbin.py
diff --git a/python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py b/python/raft-ann-bench/src/raft_ann_bench/plot/__main__.py
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/plot/__main__.py
rename to python/raft-ann-bench/src/raft_ann_bench/plot/__main__.py
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/__main__.py b/python/raft-ann-bench/src/raft_ann_bench/run/__main__.py
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/__main__.py
rename to python/raft-ann-bench/src/raft_ann_bench/run/__main__.py
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/algos.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/algos.yaml
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/algos.yaml
rename to python/raft-ann-bench/src/raft_ann_bench/run/algos.yaml
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_cpu_flat.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_flat.yaml
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_cpu_flat.yaml
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_cpu_flat.yaml
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_flat.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_flat.yaml
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_flat.yaml
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_flat.yaml
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_ivf_flat.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_flat.yaml
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_ivf_flat.yaml
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_flat.yaml
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_ivf_pq.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_pq.yaml
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/faiss_gpu_ivf_pq.yaml
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/faiss_gpu_ivf_pq.yaml
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/hnswlib.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/hnswlib.yaml
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/hnswlib.yaml
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/hnswlib.yaml
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_brute_force.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_brute_force.yaml
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_brute_force.yaml
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_brute_force.yaml
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_cagra.yaml
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra.yaml
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_cagra.yaml
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra_hnswlib.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_cagra_hnswlib.yaml
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_cagra_hnswlib.yaml
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_cagra_hnswlib.yaml
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_ivf_flat.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_flat.yaml
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_ivf_flat.yaml
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_flat.yaml
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_ivf_pq.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/algos/raft_ivf_pq.yaml
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/bigann-100M.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/bigann-100M.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/bigann-100M.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/bigann-100M.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/datasets.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/datasets.yaml
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/datasets.yaml
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/datasets.yaml
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-100M.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/deep-100M.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-100M.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/deep-100M.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-1B.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/deep-1B.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-1B.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/deep-1B.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-image-96-inner.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/deep-image-96-inner.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/deep-image-96-inner.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/deep-image-96-inner.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/fashion-mnist-784-euclidean.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/fashion-mnist-784-euclidean.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/fashion-mnist-784-euclidean.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/fashion-mnist-784-euclidean.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/gist-960-euclidean.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/gist-960-euclidean.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/gist-960-euclidean.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/gist-960-euclidean.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-angular.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/glove-100-angular.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-angular.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/glove-100-angular.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-inner.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/glove-100-inner.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-inner.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/glove-100-inner.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-angular.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/glove-50-angular.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-angular.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/glove-50-angular.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-inner.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/glove-50-inner.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-50-inner.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/glove-50-inner.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/lastfm-65-angular.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/lastfm-65-angular.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/lastfm-65-angular.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/lastfm-65-angular.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/mnist-784-euclidean.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/mnist-784-euclidean.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/mnist-784-euclidean.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-angular.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/nytimes-256-angular.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-angular.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/nytimes-256-angular.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-inner.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/nytimes-256-inner.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/nytimes-256-inner.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/nytimes-256-inner.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/sift-128-euclidean.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/sift-128-euclidean.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/sift-128-euclidean.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/sift-128-euclidean.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/wiki_all_10M.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/wiki_all_10M.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/wiki_all_10M.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/wiki_all_10M.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/wiki_all_1M.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/wiki_all_1M.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/wiki_all_1M.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/wiki_all_1M.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/run/conf/wiki_all_88M.json b/python/raft-ann-bench/src/raft_ann_bench/run/conf/wiki_all_88M.json
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/run/conf/wiki_all_88M.json
rename to python/raft-ann-bench/src/raft_ann_bench/run/conf/wiki_all_88M.json
diff --git a/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py b/python/raft-ann-bench/src/raft_ann_bench/split_groundtruth/__main__.py
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/__main__.py
rename to python/raft-ann-bench/src/raft_ann_bench/split_groundtruth/__main__.py
diff --git a/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/split_groundtruth.pl b/python/raft-ann-bench/src/raft_ann_bench/split_groundtruth/split_groundtruth.pl
similarity index 100%
rename from python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/split_groundtruth.pl
rename to python/raft-ann-bench/src/raft_ann_bench/split_groundtruth/split_groundtruth.pl

From 5c6cd927fe40e7cb4a0732bbc3745ef3ca2f2b00 Mon Sep 17 00:00:00 2001
From: Kyle Edwards <kyedwards@nvidia.com>
Date: Thu, 23 May 2024 21:54:01 -0400
Subject: [PATCH 58/60] Fix import of VERSION file in raft-ann-bench (#2338)

Change the imported package name to reflect the new name as of #2333.

Authors:
  - Kyle Edwards (https://github.com/KyleFromNVIDIA)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/raft/pull/2338
---
 docs/source/ann_benchmarks_low_level.md       | 12 +++++------
 docs/source/raft_ann_benchmarks.md            | 20 +++++++++----------
 .../src/raft_ann_bench/_version.py            |  2 +-
 .../run/conf/algos/hnswlib.yaml               |  2 +-
 .../run/conf/algos/raft_cagra.yaml            |  4 ++--
 .../run/conf/algos/raft_cagra_hnswlib.yaml    |  2 +-
 .../run/conf/algos/raft_ivf_pq.yaml           |  4 ++--
 7 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/docs/source/ann_benchmarks_low_level.md b/docs/source/ann_benchmarks_low_level.md
index cb583b119b..7ba13dec8d 100644
--- a/docs/source/ann_benchmarks_low_level.md
+++ b/docs/source/ann_benchmarks_low_level.md
@@ -18,7 +18,7 @@ $CONDA_PREFIX/bin/ann/RAFT_IVF_FLAT_ANN_BENCH \
   --data_prefix=datasets \
   --build \
   --benchmark_filter="raft_ivf_flat\..*" \
-  python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-inner.json 
+  python/raft-ann-bench/src/raft_ann_bench/run/conf/glove-100-inner.json 
 
 # (3) search
 $CONDA_PREFIX/bin/ann/RAFT_IVF_FLAT_ANN_BENCH\
@@ -29,7 +29,7 @@ $CONDA_PREFIX/bin/ann/RAFT_IVF_FLAT_ANN_BENCH\
   --benchmark_counters_tabular \
   --search \
   --benchmark_filter="raft_ivf_flat\..*" \
-    python/raft-ann-bench/src/raft-ann-bench/run/conf/glove-100-inner.json 
+    python/raft-ann-bench/src/raft_ann_bench/run/conf/glove-100-inner.json 
 
 
 # optional step: plot QPS-Recall figure using data in ivf_flat_search.csv with your favorite tool
@@ -43,12 +43,12 @@ A dataset usually has 4 binary files containing database vectors, query vectors,
 The file suffixes `.fbin`, `.f16bin`, `.ibin`, `.u8bin`, and `.i8bin` denote that the data type of vectors stored in the file are `float32`, `float16`(a.k.a `half`), `int`, `uint8`, and `int8`, respectively.
 These binary files are little-endian and the format is: the first 8 bytes are `num_vectors` (`uint32_t`) and `num_dimensions` (`uint32_t`), and the following `num_vectors * num_dimensions * sizeof(type)` bytes are vectors stored in row-major order.
 
-Some implementation can take `float16` database and query vectors as inputs and will have better performance. Use `python/raft-ann-bench/src/raft-ann-bench/get_dataset/fbin_to_f16bin.py` to transform dataset from `float32` to `float16` type.
+Some implementation can take `float16` database and query vectors as inputs and will have better performance. Use `python/raft-ann-bench/src/raft_ann_bench/get_dataset/fbin_to_f16bin.py` to transform dataset from `float32` to `float16` type.
 
 Commonly used datasets can be downloaded from two websites:
 1. Million-scale datasets can be found at the [Data sets](https://github.com/erikbern/ann-benchmarks#data-sets) section of [`ann-benchmarks`](https://github.com/erikbern/ann-benchmarks).
 
-    However, these datasets are in HDF5 format. Use `python/raft-ann-bench/src/raft-ann-bench/get_dataset/fbin_to_f16bin.py/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it:
+    However, these datasets are in HDF5 format. Use `python/raft-ann-bench/src/raft_ann_bench/get_dataset/hdf5_to_fbin.py` to transform the format. A few Python packages are required to run it:
     ```bash
     pip3 install numpy h5py
     ```
@@ -68,7 +68,7 @@ Commonly used datasets can be downloaded from two websites:
 
 2. Billion-scale datasets can be found at [`big-ann-benchmarks`](http://big-ann-benchmarks.com). The ground truth file contains both neighbors and distances, thus should be split. A script is provided for this:
     ```bash
-    $ python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/split_groundtruth.pl
+    $ python/raft-ann-bench/src/raft_ann_bench/split_groundtruth/split_groundtruth.pl
     usage: split_groundtruth.pl input output_prefix
     ```
     Take Deep-1B dataset as an example:
@@ -78,7 +78,7 @@ Commonly used datasets can be downloaded from two websites:
     mkdir -p data/deep-1B && cd data/deep-1B
     # download manually "Ground Truth" file of "Yandex DEEP"
     # suppose the file name is deep_new_groundtruth.public.10K.bin
-    /path/to/raft/python/raft-ann-bench/src/raft-ann-bench/split_groundtruth/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth
+    /path/to/raft/python/raft-ann-bench/src/raft_ann_bench/split_groundtruth/split_groundtruth.pl deep_new_groundtruth.public.10K.bin groundtruth
     # two files 'groundtruth.neighbors.ibin' and 'groundtruth.distances.fbin' should be produced
     popd
     ```
diff --git a/docs/source/raft_ann_benchmarks.md b/docs/source/raft_ann_benchmarks.md
index 146cc104d1..4b3aef5600 100644
--- a/docs/source/raft_ann_benchmarks.md
+++ b/docs/source/raft_ann_benchmarks.md
@@ -96,7 +96,7 @@ We provide a collection of lightweight Python scripts to run the benchmarks. The
 4. Plot Results
 
 ### Step 1: Prepare Dataset
-The script `raft-ann-bench.get_dataset` will download and unpack the dataset in directory
+The script `raft_ann_bench.get_dataset` will download and unpack the dataset in directory
 that the user provides. As of now, only million-scale datasets are supported by this
 script. For more information on [datasets and formats](ann_benchmarks_dataset.md).
 
@@ -117,10 +117,10 @@ will be normalized to inner product. So, for example, the dataset `glove-100-ang
 will be written at location `datasets/glove-100-inner/`.
 
 ### Step 2: Build and Search Index
-The script `raft-ann-bench.run` will build and search indices for a given dataset and its
+The script `raft_ann_bench.run` will build and search indices for a given dataset and its
 specified configuration.
 
-The usage of the script `raft-ann-bench.run` is:
+The usage of the script `raft_ann_bench.run` is:
 ```bash
 usage: __main__.py [-h] [--subset-size SUBSET_SIZE] [-k COUNT] [-bs BATCH_SIZE] [--dataset-configuration DATASET_CONFIGURATION] [--configuration CONFIGURATION] [--dataset DATASET]
                    [--dataset-path DATASET_PATH] [--build] [--search] [--algorithms ALGORITHMS] [--groups GROUPS] [--algo-groups ALGO_GROUPS] [-f] [-m SEARCH_MODE]
@@ -186,8 +186,8 @@ it is assumed both are `True`.
 is available in `algos.yaml` and not disabled, as well as having an associated executable.
 
 ### Step 3: Data Export
-The script `raft-ann-bench.data_export` will convert the intermediate JSON outputs produced by `raft-ann-bench.run` to more
-easily readable CSV files, which are needed to build charts made by `raft-ann-bench.plot`.
+The script `raft_ann_bench.data_export` will convert the intermediate JSON outputs produced by `raft_ann_bench.run` to more
+easily readable CSV files, which are needed to build charts made by `raft_ann_bench.plot`.
 
 ```bash
 usage: data_export.py [-h] [--dataset DATASET] [--dataset-path DATASET_PATH]
@@ -206,7 +206,7 @@ and index search statistics CSV file in `<dataset-path/<dataset>/result/search/<
 
 
 ### Step 4: Plot Results
-The script `raft-ann-bench.plot` will plot results for all algorithms found in index search statistics
+The script `raft_ann_bench.plot` will plot results for all algorithms found in index search statistics
 CSV files `<dataset-path/<dataset>/result/search/*.csv`.
 
 The usage of this script is:
@@ -277,7 +277,7 @@ python -m raft_ann_bench.data_export --dataset deep-image-96-inner
 python -m raft_ann_bench.plot --dataset deep-image-96-inner
 ```
 
-Configuration files already exist for the following list of the million-scale datasets. Please refer to [ann-benchmarks datasets](https://github.com/erikbern/ann-benchmarks/#data-sets) for more information, including actual train and sizes. These all work out-of-the-box with the `--dataset` argument. Other million-scale datasets from `ann-benchmarks.com` will work, but will require a json configuration file to be created in `$CONDA_PREFIX/lib/python3.xx/site-packages/raft-ann-bench/run/conf`, or you can specify the `--configuration` option to use a specific file.
+Configuration files already exist for the following list of the million-scale datasets. Please refer to [ann-benchmarks datasets](https://github.com/erikbern/ann-benchmarks/#data-sets) for more information, including actual train and sizes. These all work out-of-the-box with the `--dataset` argument. Other million-scale datasets from `ann-benchmarks.com` will work, but will require a json configuration file to be created in `$CONDA_PREFIX/lib/python3.xx/site-packages/raft_ann_bench/run/conf`, or you can specify the `--configuration` option to use a specific file.
 
 | Dataset Name | Train Rows | Columns | Test Rows      | Distance   | 
 |-----|------------|----|----------------|------------|
@@ -293,7 +293,7 @@ All of the datasets above contain ground test datasets with 100 neighbors. Thus
 
 ### End to end: large-scale benchmarks (>10M vectors)
 
-`raft-ann-bench.get_dataset` cannot be used to download the [billion-scale datasets](ann_benchmarks_dataset.md#billion-scale)
+`raft_ann_bench.get_dataset` cannot be used to download the [billion-scale datasets](ann_benchmarks_dataset.md#billion-scale)
 due to their size. You should instead use our billion-scale datasets guide to download and prepare them.
 All other python commands mentioned below work as intended once the
 billion-scale dataset has been downloaded.
@@ -441,7 +441,7 @@ Note the following:
 
 A single configuration will often define a set of algorithms, with associated index and search parameters, that can be generalize across datasets. We use YAML to define dataset specific and algorithm specific configurations.
 
-<a id='yaml-dataset-config'></a>A default `datasets.yaml` is provided by RAFT in `${RAFT_HOME}/python/raft-ann-bench/src/raft-ann-bench/run/conf` with configurations available for several datasets. Here's a simple example entry for the `sift-128-euclidean` dataset:
+<a id='yaml-dataset-config'></a>A default `datasets.yaml` is provided by RAFT in `${RAFT_HOME}/python/raft-ann-bench/src/raft_ann_bench/run/conf` with configurations available for several datasets. Here's a simple example entry for the `sift-128-euclidean` dataset:
 
 ```yaml
 - name: sift-128-euclidean
@@ -452,7 +452,7 @@ A single configuration will often define a set of algorithms, with associated in
   distance: euclidean
 ```
 
-<a id='yaml-algo-config'></a>Configuration files for ANN algorithms supported by `raft-ann-bench` are provided in `${RAFT_HOME}/python/raft-ann-bench/src/raft-ann-bench/run/conf`. `raft_cagra` algorithm configuration looks like:
+<a id='yaml-algo-config'></a>Configuration files for ANN algorithms supported by `raft-ann-bench` are provided in `${RAFT_HOME}/python/raft-ann-bench/src/raft_ann_bench/run/conf`. `raft_cagra` algorithm configuration looks like:
 ```yaml
 name: raft_cagra
 groups:
diff --git a/python/raft-ann-bench/src/raft_ann_bench/_version.py b/python/raft-ann-bench/src/raft_ann_bench/_version.py
index 6dbb8e81b0..394acd755d 100644
--- a/python/raft-ann-bench/src/raft_ann_bench/_version.py
+++ b/python/raft-ann-bench/src/raft_ann_bench/_version.py
@@ -17,7 +17,7 @@
 import importlib.resources
 
 __version__ = (
-    importlib.resources.files("raft-ann-bench")
+    importlib.resources.files("raft_ann_bench")
     .joinpath("VERSION")
     .read_text()
     .strip()
diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/hnswlib.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/hnswlib.yaml
index 9268c4cb08..e7a4e6b506 100644
--- a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/hnswlib.yaml
+++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/hnswlib.yaml
@@ -1,6 +1,6 @@
 name: hnswlib
 constraints:
-  search: raft-ann-bench.constraints.hnswlib_search_constraints
+  search: raft_ann_bench.constraints.hnswlib_search_constraints
 groups:
   base:
     build:
diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_cagra.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_cagra.yaml
index 374458989a..bb66b4b232 100644
--- a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_cagra.yaml
+++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_cagra.yaml
@@ -1,7 +1,7 @@
 name: raft_cagra
 constraints:
-  build: raft-ann-bench.constraints.raft_cagra_build_constraints
-  search: raft-ann-bench.constraints.raft_cagra_search_constraints
+  build: raft_ann_bench.constraints.raft_cagra_build_constraints
+  search: raft_ann_bench.constraints.raft_cagra_search_constraints
 groups:
   base:
     build:
diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_cagra_hnswlib.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_cagra_hnswlib.yaml
index 787675d65d..3ac2d16b68 100644
--- a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_cagra_hnswlib.yaml
+++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_cagra_hnswlib.yaml
@@ -1,6 +1,6 @@
 name: raft_cagra_hnswlib
 constraints:
-  search: raft-ann-bench.constraints.hnswlib_search_constraints
+  search: raft_ann_bench.constraints.hnswlib_search_constraints
 groups:
   base:
     build:
diff --git a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml
index fac383119a..7eaec2b77b 100644
--- a/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml
+++ b/python/raft-ann-bench/src/raft_ann_bench/run/conf/algos/raft_ivf_pq.yaml
@@ -1,7 +1,7 @@
 name: raft_ivf_pq
 constraints:
-  build: raft-ann-bench.constraints.raft_ivf_pq_build_constraints
-  search: raft-ann-bench.constraints.raft_ivf_pq_search_constraints
+  build: raft_ann_bench.constraints.raft_ivf_pq_build_constraints
+  search: raft_ann_bench.constraints.raft_ivf_pq_search_constraints
 groups:
   base:
     build:

From 5f0dfeded3e8bc63832dc6ab37fda1e62910d423 Mon Sep 17 00:00:00 2001
From: rhdong <hrong@nvidia.com>
Date: Thu, 23 May 2024 21:35:53 -0700
Subject: [PATCH 59/60] [FEA] support of prefiltered brute force (#2294)

- This PR is one part of the feature tracked in #1969.
- Add the `search_with_filtering` API for brute force.

Authors:
  - James Rong (https://github.com/rhdong)

```shell
***WARNING*** CPU scaling is enabled, the benchmark real time measurements may be noisy and will incur extra overhead.
-----------------------------------------------------------------------------------------------------
Benchmark                                                           Time             CPU   Iterations
-----------------------------------------------------------------------------------------------------
KNN/float/int64_t/brute_force_filter_knn/0/0/0/manual_time       33.1 ms         69.9 ms           21 1000000#128#1000#255#0#InnerProduct#NO_COPY#SEARCH
KNN/float/int64_t/brute_force_filter_knn/1/0/0/manual_time       38.0 ms         74.8 ms           18 1000000#128#1000#255#0#L2Expanded#NO_COPY#SEARCH
KNN/float/int64_t/brute_force_filter_knn/2/0/0/manual_time       41.7 ms         78.5 ms           17 1000000#128#1000#255#0.8#InnerProduct#NO_COPY#SEARCH
KNN/float/int64_t/brute_force_filter_knn/3/0/0/manual_time       57.5 ms         94.3 ms           12 1000000#128#1000#255#0.8#L2Expanded#NO_COPY#SEARCH
KNN/float/int64_t/brute_force_filter_knn/4/0/0/manual_time       19.7 ms         56.4 ms           35 1000000#128#1000#255#0.9#InnerProduct#NO_COPY#SEARCH
KNN/float/int64_t/brute_force_filter_knn/5/0/0/manual_time       26.1 ms         62.8 ms           27 1000000#128#1000#255#0.9#L2Expanded#NO_COPY#SEARCH
```

Authors:
  - rhdong (https://github.com/rhdong)
  - Artem M. Chirkin (https://github.com/achirkin)
  - Corey J. Nolet (https://github.com/cjnolet)

Approvers:
  - Robert Maynard (https://github.com/robertmaynard)
  - Corey J. Nolet (https://github.com/cjnolet)
  - Divye Gala (https://github.com/divyegala)

URL: https://github.com/rapidsai/raft/pull/2294
---
 cpp/include/raft/core/bitmap.cuh              | 116 +++-------------
 cpp/include/raft/core/bitmap.hpp              | 123 +++++++++++++++++
 cpp/include/raft/core/bitset.cuh              |  42 ++----
 cpp/include/raft/core/detail/popc.cuh         |  75 +++++++++++
 .../sparse/convert/detail/bitmap_to_csr.cuh   |  10 +-
 .../raft/sparse/distance/detail/utils.cuh     | 127 +++++++++++++++++-
 .../sparse/matrix/detail/select_k-ext.cuh     |   2 +-
 .../raft/sparse/matrix/detail/select_k.cuh    |   3 +-
 .../matrix/detail/select_k_double_int64_t.cu  |  32 -----
 .../matrix/detail/select_k_double_uint32_t.cu |  34 -----
 .../matrix/detail/select_k_float_int32.cu     |  32 -----
 .../matrix/detail/select_k_float_int64_t.cu   |  32 -----
 .../matrix/detail/select_k_float_uint32_t.cu  |  32 -----
 .../matrix/detail/select_k_half_int64_t.cu    |  32 -----
 .../matrix/detail/select_k_half_uint32_t.cu   |  32 -----
 cpp/test/CMakeLists.txt                       |   1 +
 cpp/test/ext_headers/00_generate.py           |   1 +
 .../raft_sparse_matrix_detail_select_k.cu     |  27 ++++
 18 files changed, 388 insertions(+), 365 deletions(-)
 create mode 100644 cpp/include/raft/core/bitmap.hpp
 create mode 100644 cpp/include/raft/core/detail/popc.cuh
 delete mode 100644 cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu
 delete mode 100644 cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu
 delete mode 100644 cpp/src/sparse/matrix/detail/select_k_float_int32.cu
 delete mode 100644 cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu
 delete mode 100644 cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu
 delete mode 100644 cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu
 delete mode 100644 cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu
 create mode 100644 cpp/test/ext_headers/raft_sparse_matrix_detail_select_k.cu

diff --git a/cpp/include/raft/core/bitmap.cuh b/cpp/include/raft/core/bitmap.cuh
index 829c84ed25..2c23a77e47 100644
--- a/cpp/include/raft/core/bitmap.cuh
+++ b/cpp/include/raft/core/bitmap.cuh
@@ -16,112 +16,30 @@
 
 #pragma once
 
+#include <raft/core/bitmap.hpp>
 #include <raft/core/bitset.cuh>
 #include <raft/core/detail/mdspan_util.cuh>
 #include <raft/core/device_container_policy.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/resources.hpp>
 
-namespace raft::core {
-/**
- * @defgroup bitmap Bitmap
- * @{
- */
-/**
- * @brief View of a RAFT Bitmap.
- *
- * This lightweight structure which represents and manipulates a two-dimensional bitmap matrix view
- * with row major order. This class provides functionality for handling a matrix where each element
- * is represented as a bit in a bitmap.
- *
- * @tparam bitmap_t Underlying type of the bitmap array. Default is uint32_t.
- * @tparam index_t Indexing type used. Default is uint32_t.
- */
-template <typename bitmap_t = uint32_t, typename index_t = uint32_t>
-struct bitmap_view : public bitset_view<bitmap_t, index_t> {
-  static_assert((std::is_same<bitmap_t, uint32_t>::value ||
-                 std::is_same<bitmap_t, uint64_t>::value),
-                "The bitmap_t must be uint32_t or uint64_t.");
-  /**
-   * @brief Create a bitmap view from a device raw pointer.
-   *
-   * @param bitmap_ptr Device raw pointer
-   * @param rows Number of row in the matrix.
-   * @param cols Number of col in the matrix.
-   */
-  _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr, index_t rows, index_t cols)
-    : bitset_view<bitmap_t, index_t>(bitmap_ptr, rows * cols), rows_(rows), cols_(cols)
-  {
-  }
-
-  /**
-   * @brief Create a bitmap view from a device vector view of the bitset.
-   *
-   * @param bitmap_span Device vector view of the bitmap
-   * @param rows Number of row in the matrix.
-   * @param cols Number of col in the matrix.
-   */
-  _RAFT_HOST_DEVICE bitmap_view(raft::device_vector_view<bitmap_t, index_t> bitmap_span,
-                                index_t rows,
-                                index_t cols)
-    : bitset_view<bitmap_t, index_t>(bitmap_span, rows * cols), rows_(rows), cols_(cols)
-  {
-  }
+#include <type_traits>
 
- private:
-  // Hide the constructors of bitset_view.
-  _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr, index_t bitmap_len)
-    : bitset_view<bitmap_t, index_t>(bitmap_ptr, bitmap_len)
-  {
-  }
-
-  _RAFT_HOST_DEVICE bitmap_view(raft::device_vector_view<bitmap_t, index_t> bitmap_span,
-                                index_t bitmap_len)
-    : bitset_view<bitmap_t, index_t>(bitmap_span, bitmap_len)
-  {
-  }
-
- public:
-  /**
-   * @brief Device function to test if a given row and col are set in the bitmap.
-   *
-   * @param row Row index of the bit to test
-   * @param col Col index of the bit to test
-   * @return bool True if index has not been unset in the bitset
-   */
-  inline _RAFT_DEVICE auto test(const index_t row, const index_t col) const -> bool
-  {
-    return test(row * cols_ + col);
-  }
-
-  /**
-   * @brief Device function to set a given row and col to set_value in the bitset.
-   *
-   * @param row Row index of the bit to set
-   * @param col Col index of the bit to set
-   * @param new_value Value to set the bit to (true or false)
-   */
-  inline _RAFT_DEVICE void set(const index_t row, const index_t col, bool new_value) const
-  {
-    set(row * cols_ + col, &new_value);
-  }
-
-  /**
-   * @brief Get the total number of rows
-   * @return index_t The total number of rows
-   */
-  inline _RAFT_HOST_DEVICE index_t get_n_rows() const { return rows_; }
-
-  /**
-   * @brief Get the total number of columns
-   * @return index_t The total number of columns
-   */
-  inline _RAFT_HOST_DEVICE index_t get_n_cols() const { return cols_; }
+namespace raft::core {
 
- private:
-  index_t rows_;
-  index_t cols_;
-};
+template <typename bitmap_t, typename index_t>
+_RAFT_HOST_DEVICE inline bool bitmap_view<bitmap_t, index_t>::test(const index_t row,
+                                                                   const index_t col) const
+{  // Qualified call: the 2-D overload hides the base-class 1-D test() for unqualified lookup.
+  return bitset_view<bitmap_t, index_t>::test(row * cols_ + col);  // row-major flat index
+}
+
+template <typename bitmap_t, typename index_t>
+_RAFT_HOST_DEVICE void bitmap_view<bitmap_t, index_t>::set(const index_t row,
+                                                           const index_t col,
+                                                           bool new_value) const
+{  // Forward the value itself; `&new_value` is a non-null pointer and would always read as true.
+  bitset_view<bitmap_t, index_t>::set(row * cols_ + col, new_value);  // row-major flat index
+}
 
-/** @} */
 }  // end namespace raft::core
diff --git a/cpp/include/raft/core/bitmap.hpp b/cpp/include/raft/core/bitmap.hpp
new file mode 100644
index 0000000000..5c77866164
--- /dev/null
+++ b/cpp/include/raft/core/bitmap.hpp
@@ -0,0 +1,123 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <raft/core/bitset.hpp>
+#include <raft/core/detail/mdspan_util.cuh>
+#include <raft/core/device_container_policy.hpp>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/resources.hpp>
+
+#include <type_traits>
+
+namespace raft::core {
+/**
+ * @defgroup bitmap Bitmap
+ * @{
+ */
+/**
+ * @brief View of a RAFT Bitmap.
+ *
+ * A lightweight structure that represents and manipulates a two-dimensional bitmap matrix view
+ * in row-major order. This class provides functionality for handling a matrix where each element
+ * is represented as a bit in a bitmap.
+ *
+ * @tparam bitmap_t Underlying type of the bitmap array. Default is uint32_t.
+ * @tparam index_t Indexing type used. Default is uint32_t.
+ */
+template <typename bitmap_t = uint32_t, typename index_t = uint32_t>
+struct bitmap_view : public bitset_view<bitmap_t, index_t> {
+  static_assert((std::is_same<typename std::remove_const<bitmap_t>::type, uint32_t>::value ||
+                 std::is_same<typename std::remove_const<bitmap_t>::type, uint64_t>::value),
+                "The bitmap_t must be uint32_t or uint64_t.");
+  /**
+   * @brief Create a bitmap view from a device raw pointer.
+   *
+   * @param bitmap_ptr Device raw pointer
+   * @param rows Number of rows in the matrix.
+   * @param cols Number of columns in the matrix.
+   */
+  _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr, index_t rows, index_t cols)
+    : bitset_view<bitmap_t, index_t>(bitmap_ptr, rows * cols), rows_(rows), cols_(cols)
+  {
+  }
+
+  /**
+   * @brief Create a bitmap view from a device vector view of the bitset.
+   *
+   * @param bitmap_span Device vector view of the bitmap
+   * @param rows Number of rows in the matrix.
+   * @param cols Number of columns in the matrix.
+   */
+  _RAFT_HOST_DEVICE bitmap_view(raft::device_vector_view<bitmap_t, index_t> bitmap_span,
+                                index_t rows,
+                                index_t cols)
+    : bitset_view<bitmap_t, index_t>(bitmap_span, rows * cols), rows_(rows), cols_(cols)
+  {
+  }
+
+ private:
+  // Hide the constructors of bitset_view.
+  _RAFT_HOST_DEVICE bitmap_view(bitmap_t* bitmap_ptr, index_t bitmap_len)
+    : bitset_view<bitmap_t, index_t>(bitmap_ptr, bitmap_len)
+  {
+  }
+
+  _RAFT_HOST_DEVICE bitmap_view(raft::device_vector_view<bitmap_t, index_t> bitmap_span,
+                                index_t bitmap_len)
+    : bitset_view<bitmap_t, index_t>(bitmap_span, bitmap_len)
+  {
+  }
+
+ public:
+  /**
+   * @brief Device function to test if a given row and col are set in the bitmap.
+   *
+   * @param row Row index of the bit to test
+   * @param col Col index of the bit to test
+   * @return bool True if index has not been unset in the bitset
+   */
+  inline _RAFT_HOST_DEVICE bool test(const index_t row, const index_t col) const;
+
+  /**
+   * @brief Device function to set a given row and col to set_value in the bitset.
+   *
+   * @param row Row index of the bit to set
+   * @param col Col index of the bit to set
+   * @param new_value Value to set the bit to (true or false)
+   */
+  inline _RAFT_HOST_DEVICE void set(const index_t row, const index_t col, bool new_value) const;
+
+  /**
+   * @brief Get the total number of rows
+   * @return index_t The total number of rows
+   */
+  inline _RAFT_HOST_DEVICE index_t get_n_rows() const { return rows_; }
+
+  /**
+   * @brief Get the total number of columns
+   * @return index_t The total number of columns
+   */
+  inline _RAFT_HOST_DEVICE index_t get_n_cols() const { return cols_; }
+
+ private:
+  index_t rows_;
+  index_t cols_;
+};
+
+/** @} */
+}  // end namespace raft::core
diff --git a/cpp/include/raft/core/bitset.cuh b/cpp/include/raft/core/bitset.cuh
index cdfbe0b8dd..d7eedee92e 100644
--- a/cpp/include/raft/core/bitset.cuh
+++ b/cpp/include/raft/core/bitset.cuh
@@ -17,7 +17,7 @@
 #pragma once
 
 #include <raft/core/bitset.hpp>
-#include <raft/core/detail/mdspan_util.cuh>  // native_popc
+#include <raft/core/detail/popc.cuh>
 #include <raft/core/device_container_policy.hpp>
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/resource/thrust_policy.hpp>
@@ -60,6 +60,12 @@ _RAFT_HOST_DEVICE void bitset_view<bitset_t, index_t>::set(const index_t sample_
   }
 }
 
+template <typename bitset_t, typename index_t>
+_RAFT_HOST_DEVICE inline index_t bitset_view<bitset_t, index_t>::n_elements() const
+{
+  return raft::ceildiv(bitset_len_, bitset_element_size);
+}
+
 template <typename bitset_t, typename index_t>
 bitset<bitset_t, index_t>::bitset(const raft::resources& res,
                                   raft::device_vector_view<const index_t, index_t> mask_index,
@@ -161,37 +167,9 @@ template <typename bitset_t, typename index_t>
 void bitset<bitset_t, index_t>::count(const raft::resources& res,
                                       raft::device_scalar_view<index_t> count_gpu_scalar)
 {
-  auto n_elements_ = n_elements();
-  auto count_gpu =
-    raft::make_device_vector_view<index_t, index_t>(count_gpu_scalar.data_handle(), 1);
-  auto bitset_matrix_view = raft::make_device_matrix_view<const bitset_t, index_t, raft::col_major>(
-    bitset_.data(), n_elements_, 1);
-
-  bitset_t n_last_element = (bitset_len_ % bitset_element_size);
-  bitset_t last_element_mask =
-    n_last_element ? (bitset_t)((bitset_t{1} << n_last_element) - bitset_t{1}) : ~bitset_t{0};
-  raft::linalg::coalesced_reduction(
-    res,
-    bitset_matrix_view,
-    count_gpu,
-    index_t{0},
-    false,
-    [last_element_mask, n_elements_] __device__(bitset_t element, index_t index) {
-      index_t result = 0;
-      if constexpr (bitset_element_size == 64) {
-        if (index == n_elements_ - 1)
-          result = index_t(raft::detail::popc(element & last_element_mask));
-        else
-          result = index_t(raft::detail::popc(element));
-      } else {  // Needed because popc is not overloaded for 16 and 8 bit elements
-        if (index == n_elements_ - 1)
-          result = index_t(raft::detail::popc(uint32_t{element} & last_element_mask));
-        else
-          result = index_t(raft::detail::popc(uint32_t{element}));
-      }
-
-      return result;
-    });
+  auto values =
+    raft::make_device_vector_view<const bitset_t, index_t>(bitset_.data(), n_elements());
+  raft::detail::popc(res, values, bitset_len_, count_gpu_scalar);
 }
 
 }  // end namespace raft::core
diff --git a/cpp/include/raft/core/detail/popc.cuh b/cpp/include/raft/core/detail/popc.cuh
new file mode 100644
index 0000000000..d74b68b715
--- /dev/null
+++ b/cpp/include/raft/core/detail/popc.cuh
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <raft/core/detail/mdspan_util.cuh>
+#include <raft/core/device_mdarray.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/linalg/coalesced_reduction.cuh>
+
+namespace raft::detail {
+
+/**
+ * @brief Count the number of bits that are set to 1 in a vector.
+ *
+ * @tparam value_t the value type of the vector.
+ * @tparam index_t the index type of vector and scalar.
+ *
+ * @param[in] res raft handle for managing expensive resources
+ * @param[in] values Device vector view of the elements whose set bits are counted.
+ * @param[in] max_len Maximum number of bits to count.
+ * @param[out] counter Number of bits that are set to 1.
+ */
+template <typename value_t, typename index_t>
+void popc(const raft::resources& res,
+          device_vector_view<value_t, index_t> values,
+          index_t max_len,
+          raft::device_scalar_view<index_t> counter)
+{
+  auto values_size   = values.size();
+  auto values_matrix = raft::make_device_matrix_view<value_t, index_t, col_major>(
+    values.data_handle(), values_size, 1);
+  auto counter_vector = raft::make_device_vector_view<index_t, index_t>(counter.data_handle(), 1);
+
+  static constexpr index_t len_per_item = sizeof(value_t) * 8;
+
+  value_t tail_len  = (max_len % len_per_item);
+  value_t tail_mask = tail_len ? (value_t)((value_t{1} << tail_len) - value_t{1}) : ~value_t{0};
+  raft::linalg::coalesced_reduction(
+    res,
+    values_matrix,
+    counter_vector,
+    index_t{0},
+    false,
+    [tail_mask, values_size] __device__(value_t value, index_t index) {
+      index_t result = 0;
+      if constexpr (len_per_item == 64) {
+        if (index == values_size - 1)
+          result = index_t(raft::detail::popc(value & tail_mask));
+        else
+          result = index_t(raft::detail::popc(value));
+      } else {  // Needed because popc is not overloaded for 16 and 8 bit elements
+        if (index == values_size - 1)
+          result = index_t(raft::detail::popc(uint32_t{value} & tail_mask));
+        else
+          result = index_t(raft::detail::popc(uint32_t{value}));
+      }
+
+      return result;
+    });
+}
+
+}  // end namespace raft::detail
\ No newline at end of file
diff --git a/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh b/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
index b0315486ff..b1b0291a85 100644
--- a/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/bitmap_to_csr.cuh
@@ -67,8 +67,8 @@ RAFT_KERNEL __launch_bounds__(calc_nnz_by_rows_tpb) calc_nnz_by_rows_kernel(cons
     index_t l_sum  = 0;
 
     while (offset < num_cols) {
-      index_t bitmap_idx = lane_id + (s_bit + offset) / BITS_PER_BITMAP;
-      bitmap_t l_bitmap  = bitmap_t(0);
+      index_t bitmap_idx                     = lane_id + (s_bit + offset) / BITS_PER_BITMAP;
+      std::remove_const_t<bitmap_t> l_bitmap = 0;
 
       if (bitmap_idx * BITS_PER_BITMAP < e_bit) { l_bitmap = bitmap[bitmap_idx]; }
 
@@ -176,9 +176,9 @@ RAFT_KERNEL __launch_bounds__(fill_indices_by_rows_tpb)
 
 #pragma unroll
     for (index_t offset = 0; offset < num_cols; offset += BITS_PER_BITMAP * warpSize) {
-      index_t bitmap_idx = lane_id + (s_bit + offset) / BITS_PER_BITMAP;
-      bitmap_t l_bitmap  = bitmap_t(0);
-      index_t l_offset   = offset + lane_id * BITS_PER_BITMAP - (s_bit % BITS_PER_BITMAP);
+      index_t bitmap_idx                     = lane_id + (s_bit + offset) / BITS_PER_BITMAP;
+      std::remove_const_t<bitmap_t> l_bitmap = 0;
+      index_t l_offset = offset + lane_id * BITS_PER_BITMAP - (s_bit % BITS_PER_BITMAP);
 
       if (bitmap_idx * BITS_PER_BITMAP < e_bit) { l_bitmap = bitmap[bitmap_idx]; }
 
diff --git a/cpp/include/raft/sparse/distance/detail/utils.cuh b/cpp/include/raft/sparse/distance/detail/utils.cuh
index ed2b414c70..42b545180b 100644
--- a/cpp/include/raft/sparse/distance/detail/utils.cuh
+++ b/cpp/include/raft/sparse/distance/detail/utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,7 +16,11 @@
 
 #pragma once
 
+#include <raft/core/math.hpp>
+#include <raft/distance/distance_types.hpp>
+
 #include <cub/cub.cuh>
+#include <cuda_pipeline.h>
 
 namespace raft {
 namespace sparse {
@@ -37,6 +41,127 @@ inline int max_cols_per_block()
          sizeof(value_t);
 }
 
+template <typename value_idx, typename value_t>
+RAFT_KERNEL faster_dot_on_csr_kernel(value_t* __restrict__ dot,
+                                     const value_idx* __restrict__ indptr,
+                                     const value_idx* __restrict__ cols,
+                                     const value_t* __restrict__ A,
+                                     const value_t* __restrict__ B,
+                                     const value_idx nnz,
+                                     const value_idx n_rows,
+                                     const value_idx dim)
+{  // For each CSR entry dot_id of `row`: dot[dot_id] = <A[row, :], B[cols[dot_id], :]>.
+  auto vec_id  = threadIdx.x;
+  auto lane_id = threadIdx.x & 0x1f;  // lane index within the warp
+
+  extern __shared__ char smem[];
+  value_t* s_A      = (value_t*)smem;  // staging buffer for the current row of A (dim values)
+  value_idx cur_row = -1;              // row currently held in s_A; -1 means none yet
+
+  for (int row = blockIdx.x; row < n_rows; row += gridDim.x) {
+    for (int dot_id = blockIdx.y + indptr[row]; dot_id < indptr[row + 1]; dot_id += gridDim.y) {
+      if (dot_id >= nnz) { return; }  // depends only on block indices, so the whole block exits
+      const value_idx col               = cols[dot_id] * dim;
+      const value_t* __restrict__ B_col = B + col;
+
+      if (threadIdx.x == 0) { dot[dot_id] = 0.0; }
+      __syncthreads();  // the zeroed output must be in place before any warp accumulates into it
+
+      if (cur_row != row) {  // reload s_A only when the row changes
+        for (value_idx k = vec_id; k < dim; k += blockDim.x) {
+          s_A[k] = A[row * dim + k];
+        }
+        cur_row = row;
+      }
+
+      value_t l_dot_ = 0.0;  // per-thread partial sum; each thread reads only the s_A[k] it wrote
+      for (value_idx k = vec_id; k < dim; k += blockDim.x) {
+        asm("prefetch.global.L2 [%0];" ::"l"(B_col + k + blockDim.x));
+        l_dot_ += s_A[k] * __ldcg(B_col + k);
+      }
+      l_dot_ += __shfl_down_sync(0xffffffff, l_dot_, 16);  // warp tree reduction; all 32 lanes
+      l_dot_ += __shfl_down_sync(0xffffffff, l_dot_, 8);   // execute every step, so each call
+      l_dot_ += __shfl_down_sync(0xffffffff, l_dot_, 4);   // must name the full warp in its mask
+      l_dot_ += __shfl_down_sync(0xffffffff, l_dot_, 2);   // (a narrower mask that excludes a
+      l_dot_ += __shfl_down_sync(0xffffffff, l_dot_, 1);   // calling lane is undefined behaviour)
+
+      if (lane_id == 0) { atomicAdd_block(dot + dot_id, l_dot_); }  // lane 0 holds the warp sum
+    }
+  }
+}
+
+template <typename value_idx, typename value_t>
+void faster_dot_on_csr(raft::resources const& handle,
+                       value_t* dot,
+                       const value_idx nnz,
+                       const value_idx* indptr,
+                       const value_idx* cols,
+                       const value_t* A,
+                       const value_t* B,
+                       const value_idx n_rows,
+                       const value_idx dim)
+{  // Launches faster_dot_on_csr_kernel with a block size chosen from `dim`.
+  if (nnz == 0 || n_rows == 0) return;  // nothing to compute
+
+  auto stream = resource::get_cuda_stream(handle);
+
+  constexpr value_idx MAX_ROW_PER_ITER = 500;  // cap on gridDim.x
+  int dev_id, sm_count, blocks_per_sm;
+
+  const int smem_size = dim * sizeof(value_t);  // dynamic shared memory: one row of A
+  RAFT_CUDA_TRY(cudaGetDevice(&dev_id));  // checked: a failure would leave the sizes below unset
+  RAFT_CUDA_TRY(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, dev_id));
+
+  if (dim < 128) {
+    constexpr int tpb = 64;
+    RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, faster_dot_on_csr_kernel<value_idx, value_t>, tpb, smem_size));
+    auto block_x = std::min(n_rows, MAX_ROW_PER_ITER);  // grid size in x
+    auto block_y =                                      // grid size in y
+      (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x;
+    dim3 blocks(block_x, block_y, 1);
+
+    faster_dot_on_csr_kernel<value_idx, value_t>
+      <<<blocks, tpb, smem_size, stream>>>(dot, indptr, cols, A, B, nnz, n_rows, dim);
+
+  } else if (dim < 256) {
+    constexpr int tpb = 128;
+    RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, faster_dot_on_csr_kernel<value_idx, value_t>, tpb, smem_size));
+    auto block_x = std::min(n_rows, MAX_ROW_PER_ITER);
+    auto block_y =
+      (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x;
+    dim3 blocks(block_x, block_y, 1);
+
+    faster_dot_on_csr_kernel<value_idx, value_t>
+      <<<blocks, tpb, smem_size, stream>>>(dot, indptr, cols, A, B, nnz, n_rows, dim);
+  } else if (dim < 512) {
+    constexpr int tpb = 256;
+    RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, faster_dot_on_csr_kernel<value_idx, value_t>, tpb, smem_size));
+    auto block_x = std::min(n_rows, MAX_ROW_PER_ITER);
+    auto block_y =
+      (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x;
+    dim3 blocks(block_x, block_y, 1);
+
+    faster_dot_on_csr_kernel<value_idx, value_t>
+      <<<blocks, tpb, smem_size, stream>>>(dot, indptr, cols, A, B, nnz, n_rows, dim);
+  } else {
+    constexpr int tpb = 512;
+    RAFT_CUDA_TRY(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, faster_dot_on_csr_kernel<value_idx, value_t>, tpb, smem_size));
+    auto block_x = std::min(n_rows, MAX_ROW_PER_ITER);
+    auto block_y =
+      (std::min(value_idx(blocks_per_sm * sm_count * 16), nnz) + block_x - 1) / block_x;
+    dim3 blocks(block_x, block_y, 1);
+
+    faster_dot_on_csr_kernel<value_idx, value_t>
+      <<<blocks, tpb, smem_size, stream>>>(dot, indptr, cols, A, B, nnz, n_rows, dim);
+  }
+
+  RAFT_CUDA_TRY(cudaPeekAtLastError());  // surfaces launch-configuration errors
+}
+
 }  // namespace detail
 }  // namespace distance
 }  // namespace sparse
diff --git a/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh b/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
index 922356b040..01625a0ce8 100644
--- a/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
+++ b/cpp/include/raft/sparse/matrix/detail/select_k-ext.cuh
@@ -37,7 +37,7 @@ void select_k(raft::resources const& handle,
               raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,
               bool select_min,
               bool sorted                   = false,
-              raft::matrix::SelectAlgo algo = SelectAlgo::kAuto) RAFT_EXPLICIT;
+              raft::matrix::SelectAlgo algo = raft::matrix::SelectAlgo::kAuto) RAFT_EXPLICIT;
 }  // namespace raft::sparse::matrix::detail
 
 #endif  // RAFT_EXPLICIT_INSTANTIATE_ONLY
diff --git a/cpp/include/raft/sparse/matrix/detail/select_k.cuh b/cpp/include/raft/sparse/matrix/detail/select_k.cuh
index 711169984b..5d52b94b2f 100644
--- a/cpp/include/raft/sparse/matrix/detail/select_k.cuh
+++ b/cpp/include/raft/sparse/matrix/detail/select_k.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,6 +17,7 @@
 
 #ifndef RAFT_EXPLICIT_INSTANTIATE_ONLY
 #include "select_k-inl.cuh"
+
 #endif
 
 #ifdef RAFT_COMPILED
diff --git a/cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu b/cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu
deleted file mode 100644
index c784b50dad..0000000000
--- a/cpp/src/sparse/matrix/detail/select_k_double_int64_t.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/sparse/matrix/detail/select_k-inl.cuh>
-
-#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
-  template void raft::sparse::matrix::detail::select_k(               \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_sparse_matrix_detail_select_k(double, int64_t);
-
-#undef instantiate_raft_sparse_matrix_detail_select_k
\ No newline at end of file
diff --git a/cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu b/cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu
deleted file mode 100644
index 98bab9a504..0000000000
--- a/cpp/src/sparse/matrix/detail/select_k_double_uint32_t.cu
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/sparse/matrix/detail/select_k-inl.cuh>
-
-#include <cstdint>  // uint32_t
-
-#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
-  template void raft::sparse::matrix::detail::select_k(               \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_sparse_matrix_detail_select_k(double, uint32_t);
-
-#undef instantiate_raft_sparse_matrix_detail_select_k
\ No newline at end of file
diff --git a/cpp/src/sparse/matrix/detail/select_k_float_int32.cu b/cpp/src/sparse/matrix/detail/select_k_float_int32.cu
deleted file mode 100644
index bff213ae69..0000000000
--- a/cpp/src/sparse/matrix/detail/select_k_float_int32.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/sparse/matrix/detail/select_k-inl.cuh>
-
-#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
-  template void raft::matrix::detail::select_k(                       \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_sparse_matrix_detail_select_k(float, int);
-
-#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu b/cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu
deleted file mode 100644
index 412b06e587..0000000000
--- a/cpp/src/sparse/matrix/detail/select_k_float_int64_t.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/sparse/matrix/detail/select_k-inl.cuh>
-
-#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
-  template void raft::sparse::matrix::detail::select_k(               \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_sparse_matrix_detail_select_k(float, int64_t);
-
-#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu b/cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu
deleted file mode 100644
index 8ba3f0e22b..0000000000
--- a/cpp/src/sparse/matrix/detail/select_k_float_uint32_t.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/sparse/matrix/detail/select_k-inl.cuh>
-
-#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
-  template void raft::sparse::matrix::detail::select_k(               \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_sparse_matrix_detail_select_k(float, uint32_t);
-
-#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu b/cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu
deleted file mode 100644
index 24c844f8c8..0000000000
--- a/cpp/src/sparse/matrix/detail/select_k_half_int64_t.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/sparse/matrix/detail/select_k-inl.cuh>
-
-#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
-  template void raft::sparse::matrix::detail::select_k(               \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_sparse_matrix_detail_select_k(__half, int64_t);
-
-#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu b/cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu
deleted file mode 100644
index d63dc64933..0000000000
--- a/cpp/src/sparse/matrix/detail/select_k_half_uint32_t.cu
+++ /dev/null
@@ -1,32 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <raft/sparse/matrix/detail/select_k-inl.cuh>
-
-#define instantiate_raft_sparse_matrix_detail_select_k(T, IdxT)       \
-  template void raft::sparse::matrix::detail::select_k(               \
-    raft::resources const& handle,                                    \
-    raft::device_csr_matrix_view<const T, IdxT, IdxT, IdxT> in_val,   \
-    std::optional<raft::device_vector_view<const IdxT, IdxT>> in_idx, \
-    raft::device_matrix_view<T, IdxT, raft::row_major> out_val,       \
-    raft::device_matrix_view<IdxT, IdxT, raft::row_major> out_idx,    \
-    bool select_min,                                                  \
-    bool sorted,                                                      \
-    raft::matrix::SelectAlgo algo)
-
-instantiate_raft_sparse_matrix_detail_select_k(__half, uint32_t);
-
-#undef instantiate_raft_sparse_matrix_detail_select_k
diff --git a/cpp/test/CMakeLists.txt b/cpp/test/CMakeLists.txt
index dac3418c8e..ff0518a4d0 100644
--- a/cpp/test/CMakeLists.txt
+++ b/cpp/test/CMakeLists.txt
@@ -193,6 +193,7 @@ if(BUILD_TESTS)
     ext_headers/raft_neighbors_refine.cu
     ext_headers/raft_neighbors_detail_ivf_flat_search.cu
     ext_headers/raft_linalg_detail_coalesced_reduction.cu
+    ext_headers/raft_sparse_matrix_detail_select_k.cu
     ext_headers/raft_spatial_knn_detail_ball_cover_registers.cu
     ext_headers/raft_neighbors_detail_ivf_flat_interleaved_scan.cu
     ext_headers/raft_neighbors_detail_ivf_pq_compute_similarity.cu
diff --git a/cpp/test/ext_headers/00_generate.py b/cpp/test/ext_headers/00_generate.py
index d9c766979b..1e1106f8bf 100644
--- a/cpp/test/ext_headers/00_generate.py
+++ b/cpp/test/ext_headers/00_generate.py
@@ -54,6 +54,7 @@
     "raft/neighbors/refine-ext.cuh",
     "raft/neighbors/detail/ivf_flat_search-ext.cuh",
     "raft/linalg/detail/coalesced_reduction-ext.cuh",
+    "raft/sparse/matrix/detail/select_k-ext.cuh",
     "raft/spatial/knn/detail/ball_cover/registers-ext.cuh",
     "raft/neighbors/detail/ivf_flat_interleaved_scan-ext.cuh",
     "raft/neighbors/detail/ivf_pq_compute_similarity-ext.cuh",
diff --git a/cpp/test/ext_headers/raft_sparse_matrix_detail_select_k.cu b/cpp/test/ext_headers/raft_sparse_matrix_detail_select_k.cu
new file mode 100644
index 0000000000..b748a31a5b
--- /dev/null
+++ b/cpp/test/ext_headers/raft_sparse_matrix_detail_select_k.cu
@@ -0,0 +1,27 @@
+
+/*
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+ * NOTE: this file is generated by 00_generate.py
+ *
+ * Make changes there and run in this directory:
+ *
+ * > python 00_generate.py
+ *
+ */
+
+#include <raft/sparse/matrix/detail/select_k.cuh>

From 02e4504c22fdc5bf251bbc3edae3868d17e18214 Mon Sep 17 00:00:00 2001
From: Ray Douglass <ray@raydouglass.com>
Date: Wed, 5 Jun 2024 10:13:14 -0400
Subject: [PATCH 60/60] Update Changelog [skip ci]

---
 CHANGELOG.md | 79 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 79 insertions(+)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 6a4da6197e..e0599dae8a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,82 @@
+# raft 24.06.00 (5 Jun 2024)
+
+## 🚨 Breaking Changes
+
+- Rename raft-ann-bench module to raft_ann_bench ([#2333](https://github.com/rapidsai/raft/pull/2333)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Scaling workspace resources ([#2322](https://github.com/rapidsai/raft/pull/2322)) [@achirkin](https://github.com/achirkin)
+- [REVIEW] Adjust UCX dependencies ([#2304](https://github.com/rapidsai/raft/pull/2304)) [@pentschev](https://github.com/pentschev)
+- Convert device_memory_resource* to device_async_resource_ref ([#2269](https://github.com/rapidsai/raft/pull/2269)) [@harrism](https://github.com/harrism)
+
+## 🐛 Bug Fixes
+
+- Fix import of VERSION file in raft-ann-bench ([#2338](https://github.com/rapidsai/raft/pull/2338)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Rename raft-ann-bench module to raft_ann_bench ([#2333](https://github.com/rapidsai/raft/pull/2333)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Support building faiss main statically ([#2323](https://github.com/rapidsai/raft/pull/2323)) [@robertmaynard](https://github.com/robertmaynard)
+- Refactor spectral scale_obs to use existing normalization function ([#2319](https://github.com/rapidsai/raft/pull/2319)) [@ChuckHastings](https://github.com/ChuckHastings)
+- Correct initializer list order found by cuvs ([#2317](https://github.com/rapidsai/raft/pull/2317)) [@robertmaynard](https://github.com/robertmaynard)
+- ANN_BENCH: enable move semantics for configured_raft_resources ([#2311](https://github.com/rapidsai/raft/pull/2311)) [@achirkin](https://github.com/achirkin)
+- Revert &quot;Build C++ wheel (#2264)&quot; ([#2305](https://github.com/rapidsai/raft/pull/2305)) [@vyasr](https://github.com/vyasr)
+- Revert &quot;Add `compile-library` by default on pylibraft build&quot; ([#2300](https://github.com/rapidsai/raft/pull/2300)) [@vyasr](https://github.com/vyasr)
+- Add VERSION to raft-ann-bench package ([#2299](https://github.com/rapidsai/raft/pull/2299)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Remove nonexistent job from workflow ([#2298](https://github.com/rapidsai/raft/pull/2298)) [@vyasr](https://github.com/vyasr)
+- `libucx` should be run dependency of `raft-dask` ([#2296](https://github.com/rapidsai/raft/pull/2296)) [@divyegala](https://github.com/divyegala)
+- Fix clang intrinsic warning ([#2292](https://github.com/rapidsai/raft/pull/2292)) [@aaronmondal](https://github.com/aaronmondal)
+- Replace too long index file name with hash in ANN bench ([#2280](https://github.com/rapidsai/raft/pull/2280)) [@tfeher](https://github.com/tfeher)
+- Fix build command for C++ compilation ([#2270](https://github.com/rapidsai/raft/pull/2270)) [@lowener](https://github.com/lowener)
+- Fix a compilation error in CAGRA when enabling log output ([#2262](https://github.com/rapidsai/raft/pull/2262)) [@enp1s0](https://github.com/enp1s0)
+- Correct member initialization order ([#2254](https://github.com/rapidsai/raft/pull/2254)) [@robertmaynard](https://github.com/robertmaynard)
+- Fix time computation in CAGRA notebook ([#2231](https://github.com/rapidsai/raft/pull/2231)) [@lowener](https://github.com/lowener)
+
+## 📖 Documentation
+
+- Fix citation info ([#2318](https://github.com/rapidsai/raft/pull/2318)) [@enp1s0](https://github.com/enp1s0)
+
+## 🚀 New Features
+
+- Scaling workspace resources ([#2322](https://github.com/rapidsai/raft/pull/2322)) [@achirkin](https://github.com/achirkin)
+- ANN_BENCH: AnnGPU::uses_stream() for optional algo GPU sync ([#2314](https://github.com/rapidsai/raft/pull/2314)) [@achirkin](https://github.com/achirkin)
+- [FEA] Split Bitset code ([#2295](https://github.com/rapidsai/raft/pull/2295)) [@lowener](https://github.com/lowener)
+- [FEA] support of prefiltered brute force ([#2294](https://github.com/rapidsai/raft/pull/2294)) [@rhdong](https://github.com/rhdong)
+- Always use a static gtest and gbench ([#2265](https://github.com/rapidsai/raft/pull/2265)) [@robertmaynard](https://github.com/robertmaynard)
+- Build C++ wheel ([#2264](https://github.com/rapidsai/raft/pull/2264)) [@vyasr](https://github.com/vyasr)
+- InnerProduct Distance Metric for CAGRA search ([#2260](https://github.com/rapidsai/raft/pull/2260)) [@tarang-jain](https://github.com/tarang-jain)
+- [FEA] Add support for `select_k` on CSR matrix ([#2140](https://github.com/rapidsai/raft/pull/2140)) [@rhdong](https://github.com/rhdong)
+
+## 🛠️ Improvements
+
+- ANN_BENCH: common AnnBase::index_type ([#2315](https://github.com/rapidsai/raft/pull/2315)) [@achirkin](https://github.com/achirkin)
+- ANN_BENCH: split instances of RaftCagra into multiple files ([#2313](https://github.com/rapidsai/raft/pull/2313)) [@achirkin](https://github.com/achirkin)
+- ANN_BENCH: a global pool of result buffers across benchmark cases ([#2312](https://github.com/rapidsai/raft/pull/2312)) [@achirkin](https://github.com/achirkin)
+- Remove the shared state and the mutex from NVTX internals ([#2310](https://github.com/rapidsai/raft/pull/2310)) [@achirkin](https://github.com/achirkin)
+- docs: update README.md ([#2308](https://github.com/rapidsai/raft/pull/2308)) [@eltociear](https://github.com/eltociear)
+- [REVIEW] Reenable raft-dask wheel tests requiring UCX-Py ([#2307](https://github.com/rapidsai/raft/pull/2307)) [@pentschev](https://github.com/pentschev)
+- [REVIEW] Adjust UCX dependencies ([#2304](https://github.com/rapidsai/raft/pull/2304)) [@pentschev](https://github.com/pentschev)
+- Overhaul ops-codeowners ([#2303](https://github.com/rapidsai/raft/pull/2303)) [@raydouglass](https://github.com/raydouglass)
+- Make thrust nosync execution policy the default thrust policy ([#2302](https://github.com/rapidsai/raft/pull/2302)) [@abc99lr](https://github.com/abc99lr)
+- InnerProduct testing for CAGRA+HNSW ([#2297](https://github.com/rapidsai/raft/pull/2297)) [@divyegala](https://github.com/divyegala)
+- Enable warnings as errors for Python tests ([#2288](https://github.com/rapidsai/raft/pull/2288)) [@mroeschke](https://github.com/mroeschke)
+- Normalize dataset vectors in the CAGRA InnerProduct tests ([#2287](https://github.com/rapidsai/raft/pull/2287)) [@enp1s0](https://github.com/enp1s0)
+- Use dynamic version for raft-ann-bench ([#2285](https://github.com/rapidsai/raft/pull/2285)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Make &#39;librmm&#39; a &#39;host&#39; dependency for conda packages ([#2284](https://github.com/rapidsai/raft/pull/2284)) [@jameslamb](https://github.com/jameslamb)
+- Fix comments in cpp/include/raft/neighbors/cagra_serialize.cuh ([#2283](https://github.com/rapidsai/raft/pull/2283)) [@jiangyinzuo](https://github.com/jiangyinzuo)
+- Only use functions in the limited API ([#2282](https://github.com/rapidsai/raft/pull/2282)) [@vyasr](https://github.com/vyasr)
+- define &#39;ucx&#39; pytest marker ([#2281](https://github.com/rapidsai/raft/pull/2281)) [@jameslamb](https://github.com/jameslamb)
+- Migrate to `{{ stdlib("c") }}` ([#2278](https://github.com/rapidsai/raft/pull/2278)) [@hcho3](https://github.com/hcho3)
+- add --rm and --name to devcontainer run args ([#2275](https://github.com/rapidsai/raft/pull/2275)) [@trxcllnt](https://github.com/trxcllnt)
+- Update pip devcontainers to UCX v1.15.0 ([#2274](https://github.com/rapidsai/raft/pull/2274)) [@trxcllnt](https://github.com/trxcllnt)
+- `#ifdef` out pragma deprecation warning messages ([#2271](https://github.com/rapidsai/raft/pull/2271)) [@trxcllnt](https://github.com/trxcllnt)
+- Convert device_memory_resource* to device_async_resource_ref ([#2269](https://github.com/rapidsai/raft/pull/2269)) [@harrism](https://github.com/harrism)
+- Update the developer&#39;s guide with new copyright hook ([#2266](https://github.com/rapidsai/raft/pull/2266)) [@KyleFromNVIDIA](https://github.com/KyleFromNVIDIA)
+- Improve coalesced reduction performance for tall and thin matrices (up to 2.6x faster) ([#2259](https://github.com/rapidsai/raft/pull/2259)) [@Nyrio](https://github.com/Nyrio)
+- Adds missing files to `update-version.sh` ([#2255](https://github.com/rapidsai/raft/pull/2255)) [@AyodeAwe](https://github.com/AyodeAwe)
+- Enable all tests for `arm64` jobs ([#2248](https://github.com/rapidsai/raft/pull/2248)) [@galipremsagar](https://github.com/galipremsagar)
+- Update nvtx3 link in cmake ([#2246](https://github.com/rapidsai/raft/pull/2246)) [@lowener](https://github.com/lowener)
+- Add CAGRA-Q subspace dim = 4 support ([#2244](https://github.com/rapidsai/raft/pull/2244)) [@enp1s0](https://github.com/enp1s0)
+- Get rid of `cuco::sentinel` namespace ([#2243](https://github.com/rapidsai/raft/pull/2243)) [@PointKernel](https://github.com/PointKernel)
+- Replace usages of raw `get_upstream` with `get_upstream_resource()` ([#2207](https://github.com/rapidsai/raft/pull/2207)) [@miscco](https://github.com/miscco)
+- Set the import mode for dask tests ([#2142](https://github.com/rapidsai/raft/pull/2142)) [@vyasr](https://github.com/vyasr)
+- Add UCXX support ([#1983](https://github.com/rapidsai/raft/pull/1983)) [@pentschev](https://github.com/pentschev)
+
 # raft 24.04.00 (10 Apr 2024)
 
 ## 🐛 Bug Fixes