Fix sync issues with calls to CUSV APIs on aarch64 (#823)

### Before submitting Please complete the following checklist when submitting a PR: - [x] All new features must include a unit test. If you've fixed a bug or added code that should be tested, add a test to the [`tests`](../tests) directory! - [x] All new functions and code must be clearly commented and documented. If you do make documentation changes, make sure that the docs build and render correctly by running `make docs`. - [x] Ensure that the test suite passes, by running `make test`. - [x] Add a new entry to the `.github/CHANGELOG.md` file, summarizing the change, and including a link back to the PR. - [x] Ensure that code is properly formatted by running `make format`. When all the above are checked, delete everything above the dashed line and fill in the pull request template. ------------------------------------------------------------------------------------------------------------ **Context:** Fixes the known sync issues on aarch64 + GraceHopper when using custatevec API calls. **Description of the Change:** Adds stream sync to all CUSV async API calls. **Benefits:** Fixes #793 **Possible Drawbacks:** Potential sync point may introduce (minimal) overhead for smaller problems. **Related GitHub Issues:** --------- Co-authored-by: Lee J. O'Riordan <lee@xanadu.au> Co-authored-by: ringo-but-quantum <github-ringo-but-quantum@xanadu.ai> Co-authored-by: Ali Asadi <10773383+maliasadi@users.noreply.github.com>
PennyLaneAI · Aug 2, 2024 · 619b807 · 619b807
1 parent cfc3006
commit 619b807
Show file tree

Hide file tree

Showing 8 changed files with 41 additions and 6 deletions.
diff --git a/.github/CHANGELOG.md b/.github/CHANGELOG.md
@@ -73,6 +73,12 @@
 
 ### Bug fixes
 
+* Fix cuQuantum SDK path pass-through in CMake.
+  [(#831)](https://github.com/PennyLaneAI/pennylane-lightning/pull/831)
+
+* Fix CUDA sync issues on aarch64+GraceHopper.
+  [(#823)](https://github.com/PennyLaneAI/pennylane-lightning/pull/823)
+
 * Check for the number of wires for Hermitian observables in Lightning-Tensor. Only 1-wire Hermitian observables are supported as of `cuTensorNet-v24.03.0`.
   [(#806)](https://github.com/PennyLaneAI/pennylane-lightning/pull/806)
 

diff --git a/.github/workflows/tests_lgpumpi_cpp.yml b/.github/workflows/tests_lgpumpi_cpp.yml
@@ -123,7 +123,7 @@ jobs:
       - name: Build and run unit tests
         run: |
           source /etc/profile.d/modules.sh && module use /opt/modules/ && module load ${{ matrix.mpilib }}/cuda-${{ matrix.cuda_version_maj }}.${{ matrix.cuda_version_min }}
-          export CUQUANTUM_SDK=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum/lib')")
+          export CUQUANTUM_SDK=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")
           cmake . -BBuild \
             -DPL_BACKEND=lightning_gpu \
             -DENABLE_PYTHON=OFF \
@@ -134,6 +134,7 @@ jobs:
             -DCMAKE_CXX_COMPILER=mpicxx \
             -DCMAKE_CUDA_COMPILER=$(which nvcc) \
             -DCMAKE_CUDA_ARCHITECTURES="86" \
+            -DCUQUANTUM_SDK=${CUQUANTUM_SDK} \
             -DPython_EXECUTABLE:FILE="${{ steps.python_path.outputs.python }}" \
             -G Ninja
           cmake --build ./Build

diff --git a/.github/workflows/tests_lgpumpi_python.yml b/.github/workflows/tests_lgpumpi_python.yml
@@ -129,7 +129,7 @@ jobs:
 
       - name: Build and install package
         env:
-          CUQUANTUM_SDK: $(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum/lib')")
+          CUQUANTUM_SDK: $(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")
         run: |
           source /etc/profile.d/modules.sh && module use /opt/modules/ && module load ${{ matrix.mpilib }}/cuda-${{ matrix.cuda_version_maj }}.${{ matrix.cuda_version_min }}
           CMAKE_ARGS="-DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpicxx -DENABLE_MPI=ON -DCMAKE_CUDA_COMPILER=$(which nvcc) -DCMAKE_CUDA_ARCHITECTURES=${{ env.CI_CUDA_ARCH }} -DPython_EXECUTABLE=${{ steps.python_path.outputs.python }}" \

diff --git a/README.rst b/README.rst
@@ -249,7 +249,7 @@ Then the `cuStateVec`_ library can be installed and set a ``CUQUANTUM_SDK`` envi
 .. code-block:: console
 
     python -m pip install wheel custatevec-cu12
-    export CUQUANTUM_SDK=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum/lib')")
+    export CUQUANTUM_SDK=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")
 
 The Lightning-GPU can then be installed with ``pip``:
 
@@ -386,7 +386,7 @@ Then the `cutensornet`_ library can be installed and set a ``CUQUANTUM_SDK`` env
 .. code-block:: console
 
     pip install cutensornet-cu12
-    export CUQUANTUM_SDK=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum/lib')")
+    export CUQUANTUM_SDK=$(python -c "import site; print( f'{site.getsitepackages()[0]}/cuquantum')")
 
 The Lightning-Tensor can then be installed with ``pip``:
 

diff --git a/cmake/support_pllgpu.cmake b/cmake/support_pllgpu.cmake
@@ -47,6 +47,7 @@ endmacro()
 
 # Macro to aid in finding cuStateVec lib
 macro(findCustatevec external_libs)
+    set(CUQUANTUM_ENV "$ENV{CUQUANTUM_SDK}")
     find_library(CUSTATEVEC_LIB
         NAMES   libcustatevec.so.1 custatevec.so.1
         HINTS   /usr/lib
@@ -58,6 +59,8 @@ macro(findCustatevec external_libs)
             lib64
             ${CUQUANTUM_SDK}/lib
             ${CUQUANTUM_SDK}/lib64
+            ${CUQUANTUM_ENV}/lib
+            ${CUQUANTUM_ENV}/lib64
             ${CUDAToolkit_LIBRARY_DIR}
             ${CUDA_TOOLKIT_ROOT_DIR}/lib
             ${CUDA_TOOLKIT_ROOT_DIR}/lib64
@@ -74,6 +77,7 @@ macro(findCustatevec external_libs)
             /opt/cuda
             include
             ${CUQUANTUM_SDK}/include
+            ${CUQUANTUM_ENV}/include
             ${CUDAToolkit_INCLUDE_DIRS}
             ${CUDA_TOOLKIT_ROOT_DIR}/include
             ${Python_SITELIB}/cuquantum/include

diff --git a/pennylane_lightning/core/_version.py b/pennylane_lightning/core/_version.py
@@ -16,4 +16,4 @@
    Version number (major.minor.patch[-label])
 """
 
-__version__ = "0.38.0-dev24"
+__version__ = "0.38.0-dev25"
diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/StateVectorCudaManaged.hpp
@@ -1359,6 +1359,8 @@ class StateVectorCudaManaged
             /* const int32_t* */ ctrlsInt.data(),
             /* const int32_t* */ nullptr,
             /* const uint32_t */ ctrls.size()));
+        PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
+            BaseType::getDataBuffer().getDevTag().getStreamID()));
     }
 
     /**
@@ -1419,6 +1421,9 @@ class StateVectorCudaManaged
             /* custatevecComputeType_t */ compute_type,
             /* std::size_t* */ &extraWorkspaceSizeInBytes));
 
+        PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
+            BaseType::getDataBuffer().getDevTag().getStreamID()));
+
         // allocate external workspace if necessary
         // LCOV_EXCL_START
         if (extraWorkspaceSizeInBytes > 0) {
@@ -1445,6 +1450,9 @@ class StateVectorCudaManaged
             /* custatevecComputeType_t */ compute_type,
             /* void* */ extraWorkspace,
             /* std::size_t */ extraWorkspaceSizeInBytes));
+
+        PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
+            BaseType::getDataBuffer().getDevTag().getStreamID()));
         // LCOV_EXCL_START
         if (extraWorkspaceSizeInBytes)
             PL_CUDA_IS_SUCCESS(cudaFree(extraWorkspace));

diff --git a/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp b/pennylane_lightning/core/src/simulators/lightning_gpu/measurements/MeasurementsGPU.hpp
@@ -135,6 +135,8 @@ class Measurements final
             /* const int32_t* */ maskBitString,
             /* const int32_t* */ maskOrdering,
             /* const uint32_t */ maskLen));
+        PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
+            this->_statevector.getDataBuffer().getDevTag().getStreamID()));
 
         if constexpr (std::is_same_v<CFP_t, cuDoubleComplex> ||
                       std::is_same_v<CFP_t, double2>) {
@@ -252,6 +254,8 @@ class Measurements final
             this->_statevector.getCusvHandle(), this->_statevector.getData(),
             data_type, num_qubits, &sampler, num_samples,
             &extraWorkspaceSizeInBytes));
+        PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
+            this->_statevector.getDataBuffer().getDevTag().getStreamID()));
 
         // allocate external workspace if necessary
         if (extraWorkspaceSizeInBytes > 0)
@@ -262,12 +266,16 @@ class Measurements final
         PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerPreprocess(
             this->_statevector.getCusvHandle(), sampler, extraWorkspace,
             extraWorkspaceSizeInBytes));
+        PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
+            this->_statevector.getDataBuffer().getDevTag().getStreamID()));
 
         // sample bit strings
         PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerSample(
             this->_statevector.getCusvHandle(), sampler, bitStrings.data(),
             bitOrdering.data(), bitStringLen, rand_nums.data(), num_samples,
             CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER));
+        PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
+            this->_statevector.getDataBuffer().getDevTag().getStreamID()));
 
         // destroy descriptor and handle
         PL_CUSTATEVEC_IS_SUCCESS(custatevecSamplerDestroy(sampler));
@@ -497,6 +505,9 @@ class Measurements final
             const_cast<const int32_t **>(basisBits_ptr.data()),
             /* const uint32_t */ n_basisBits.data()));
 
+        PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
+            this->_statevector.getDataBuffer().getDevTag().getStreamID()));
+
         std::complex<PrecisionT> result{0, 0};
 
         if constexpr (std::is_same_v<PrecisionT, double>) {
@@ -804,6 +815,8 @@ class Measurements final
             /* const uint32_t */ tgtsInt.size(),
             /* custatevecComputeType_t */ compute_type,
             /* std::size_t* */ &extraWorkspaceSizeInBytes));
+        PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
+            this->_statevector.getDataBuffer().getDevTag().getStreamID()));
 
         // LCOV_EXCL_START
         if (extraWorkspaceSizeInBytes > 0) {
@@ -832,6 +845,9 @@ class Measurements final
             /* void* */ extraWorkspace,
             /* std::size_t */ extraWorkspaceSizeInBytes));
 
+        PL_CUDA_IS_SUCCESS(cudaStreamSynchronize(
+            this->_statevector.getDataBuffer().getDevTag().getStreamID()));
+
         // LCOV_EXCL_START
         if (extraWorkspaceSizeInBytes)
             PL_CUDA_IS_SUCCESS(cudaFree(extraWorkspace));
@@ -840,4 +856,4 @@ class Measurements final
         return static_cast<PrecisionT>(expect.x);
     }
 }; // class Measurements
-} // namespace Pennylane::LightningGPU::Measures
+} // namespace Pennylane::LightningGPU::Measures