From 73266e23ddfacf5fee3091b8da2a44af851a68cb Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 25 Sep 2024 09:11:35 -0500 Subject: [PATCH] bump NCCL floor to 2.18.1.1, relax PyTorch pin (#218) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contributes to https://github.com/rapidsai/build-planning/issues/102 Fixes #217 ## Notes for Reviewers ### How I tested this Temporarily added a CUDA 11.4.3 test job to CI here (the same specs as the failing nightly), by pointing at the branch from https://github.com/rapidsai/shared-workflows/pull/246. Observed the exact same failures with CUDA 11.4 reported in https://github.com/rapidsai/build-planning/issues/102. ```text ... + nccl 2.10.3.1 hcad2f07_0 rapidsai-nightly 125MB ... ./WHOLEGRAPH_CSR_WEIGHTED_SAMPLE_WITHOUT_REPLACEMENT_TEST: symbol lookup error: /opt/conda/envs/test/bin/gtests/libwholegraph/../../../lib/libwholegraph.so: undefined symbol: ncclCommSplit sh -c exec "$0" ./WHOLEMEMORY_HANDLE_TEST ./WHOLEMEMORY_HANDLE_TEST: symbol lookup error: /opt/conda/envs/test/bin/gtests/libwholegraph/../../../lib/libwholegraph.so: undefined symbol: ncclCommSplit sh -c exec "$0" ./GRAPH_APPEND_UNIQUE_TEST ``` ([build link](https://github.com/rapidsai/wholegraph/actions/runs/10966022370/job/30453393224?pr=218)) Pushed a commit adding a floor of `nccl>=2.18.1.1`. Saw all tests pass with CUDA 11.4 😁 ```text ... + nccl 2.22.3.1 hee583db_1 conda-forge 131MB ... (various log messages showing all tests passed) ``` ([build link](https://github.com/rapidsai/wholegraph/actions/runs/10966210441/job/30454147250?pr=218)) Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - https://github.com/linhu-nv - https://github.com/jakirkham URL: https://github.com/rapidsai/wholegraph/pull/218 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 4 ++-- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/libwholegraph/conda_build_config.yaml | 2 +- dependencies.yaml | 12 ++++++------ 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index d989d880e..f20d98977 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -28,7 +28,7 @@ dependencies: - librmm==24.10.*,>=0.0.0a0 - nanobind>=0.2.0 - nbsphinx -- nccl +- nccl>=2.18.1.1 - ninja - numpy>=1.23,<3.0a0 - numpydoc @@ -40,7 +40,7 @@ dependencies: - pytest-xdist - python>=3.10,<3.13 - pytorch-cuda=11.8 -- pytorch=2.0.0 +- pytorch>=2.0,<2.4.0a0 - rapids-build-backend>=0.3.0,<0.4.0.dev0 - recommonmark - scikit-build-core>=0.10.0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 5b152cd31..5988a9893 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -30,7 +30,7 @@ dependencies: - librmm==24.10.*,>=0.0.0a0 - nanobind>=0.2.0 - nbsphinx -- nccl +- nccl>=2.18.1.1 - ninja - numpy>=1.23,<3.0a0 - numpydoc diff --git a/conda/recipes/libwholegraph/conda_build_config.yaml b/conda/recipes/libwholegraph/conda_build_config.yaml index 35b1d6b62..8b6dd3439 100644 --- a/conda/recipes/libwholegraph/conda_build_config.yaml +++ b/conda/recipes/libwholegraph/conda_build_config.yaml @@ -17,7 +17,7 @@ doxygen_version: - ">=1.8.11" nccl_version: - - ">=2.9.9" + - ">=2.18.1.1" c_stdlib: - sysroot diff --git a/dependencies.yaml b/dependencies.yaml index 8aaf92cd9..950e1979a 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -87,7 +87,7 @@ dependencies: - libraft-headers==24.10.*,>=0.0.0a0 - librmm==24.10.*,>=0.0.0a0 - nanobind>=0.2.0 - - nccl + - &nccl nccl>=2.18.1.1 specific: - output_types: conda matrices: @@ -216,14 +216,14 @@ dependencies: common: - output_types: [conda] packages: - - nccl + - *nccl test_python: common: - output_types: [conda] packages: - c-compiler - cxx-compiler - - nccl + - *nccl - output_types: [conda, requirements] packages: - ninja @@ -285,13 +285,13 @@ dependencies: # If conda-forge supports the new cuda-* packages for CUDA 11.8 # at some point, then we can fully support/properly specify # this environment. - - pytorch=2.0.0 + - &pytorch pytorch>=2.0,<2.4.0a0 - pytorch-cuda=11.8 - matrix: arch: aarch64 cuda: "11.8" packages: - - pytorch=2.0.0 + - *pytorch - pytorch-cuda=11.8 - matrix: packages: @@ -318,7 +318,7 @@ dependencies: common: - output_types: [conda] packages: - - pytorch=2.0.0 + - *pytorch - cpuonly clang_tools: common: