From 5ea9d0819e4e61eece145feec3437a585cd13899 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Juan=20Gonz=C3=A1lez?= Date: Mon, 16 Sep 2024 14:44:26 +0200 Subject: [PATCH] GPU geometry transfer fix and stack limit configuration option (#312) - Fix for the copy of the surface model to GPU, which was being called from all threads - Removed the hardcoded device stack limit and added a configuration option in the macro - Added the line `SET(CMAKE_EXE_LINKER_FLAGS "-Wl,--disable-new-dtags")` to `Example1/CMakeLists.txt` which solves an issue where Geant4 libraries would not be properly linked in some systems --- examples/Example1/CMakeLists.txt | 5 +- examples/Example1/macros/example1.mac.in | 1 + .../macros/example1_large_stack.mac.in | 78 +++++++++++++++++++ .../Example1/macros/example1_ttbar.mac.in | 1 + .../macros/example1_ttbar_LHCb.mac.in | 2 + include/AdePT/core/AdePTConfiguration.hh | 13 ++-- include/AdePT/core/AdePTTransport.cuh | 21 ++--- include/AdePT/core/AdePTTransport.h | 3 + include/AdePT/core/AdePTTransport.icc | 14 +++- .../AdePTConfigurationMessenger.hh | 1 + src/AdePTConfigurationMessenger.cc | 6 ++ src/AdePTTrackingManager.cc | 1 + 12 files changed, 127 insertions(+), 19 deletions(-) create mode 100644 examples/Example1/macros/example1_large_stack.mac.in diff --git a/examples/Example1/CMakeLists.txt b/examples/Example1/CMakeLists.txt index 7d767d9c..595d455f 100644 --- a/examples/Example1/CMakeLists.txt +++ b/examples/Example1/CMakeLists.txt @@ -1,6 +1,8 @@ # SPDX-FileCopyrightText: 2023 CERN # SPDX-License-Identifier: Apache-2.0 +SET(CMAKE_EXE_LINKER_FLAGS "-Wl,--disable-new-dtags") + if(NOT TARGET G4HepEm::g4HepEm) message(STATUS "Disabling example1 (needs G4HepEm)") return() @@ -65,6 +67,7 @@ target_link_libraries(example1 # Install macros and geometry file SET(GDML ${PROJECT_BINARY_DIR}/cms2018_sd.gdml) configure_file("macros/example1.mac.in" "${PROJECT_BINARY_DIR}/example1.mac") +configure_file("macros/example1_large_stack.mac.in" "${PROJECT_BINARY_DIR}/example1_large_stack.mac") configure_file("macros/example1_ttbar.mac.in" "${PROJECT_BINARY_DIR}/example1_ttbar.mac") configure_file("macros/example1_ttbar_LHCb.mac.in" "${PROJECT_BINARY_DIR}/example1_ttbar_LHCb.mac") configure_file("macros/example1_ttbar_noadept.mac.in" "${PROJECT_BINARY_DIR}/example1_ttbar_noadept.mac") @@ -72,5 +75,5 @@ configure_file("macros/example1_ttbar_noadept.mac.in" "${PROJECT_BINARY_DIR}/exa # Tests add_test(NAME example1 - COMMAND $ -m ${PROJECT_BINARY_DIR}/example1.mac + COMMAND $ -m ${PROJECT_BINARY_DIR}/example1_large_stack.mac ) \ No newline at end of file diff --git a/examples/Example1/macros/example1.mac.in b/examples/Example1/macros/example1.mac.in index 25074c65..e5497696 100644 --- a/examples/Example1/macros/example1.mac.in +++ b/examples/Example1/macros/example1.mac.in @@ -25,6 +25,7 @@ ## Total number of GPU track slots (not per thread) /adept/setMillionsOfTrackSlots 1 /adept/setMillionsOfHitSlots 1 +# /adept/setCUDAStackLimit 4096 # If true, particles are transported on the GPU across the whole geometry, GPU regions are ignored /adept/setTrackInAllRegions true diff --git a/examples/Example1/macros/example1_large_stack.mac.in b/examples/Example1/macros/example1_large_stack.mac.in new file mode 100644 index 00000000..ca684e14 --- /dev/null +++ b/examples/Example1/macros/example1_large_stack.mac.in @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: 2023 CERN +# SPDX-License-Identifier: Apache-2.0 +# example23.in +# + +## ============================================================================= +## Geant4 macro for modelling simplified sampling calorimeters +## ============================================================================= +## +/run/numberOfThreads 1 +/control/verbose 0 +/run/verbose 0 +/process/verbose 0 +/tracking/verbose 0 +/event/verbose 0 +## +/adept/setSeed 1 + +/detector/filename @GDML@ +# Temporary workaround since we don't have a G4 to VecGeom converter +/adept/setVecGeomGDML @GDML@ +/adept/setVerbosity 0 +## Threshold for buffering tracks before sending to GPU +/adept/setTransportBufferThreshold 2000 +## Total number of GPU track slots (not per thread) +/adept/setMillionsOfTrackSlots 1 +/adept/setMillionsOfHitSlots 1 +/adept/setCUDAStackLimit 8192 + +# If true, particles are transported on the GPU across the whole geometry, GPU regions are ignored +/adept/setTrackInAllRegions true +# In order to do the GPU transport only in specific regions +/adept/addGPURegion EcalRegion +/adept/addGPURegion HcalRegion + + +## ----------------------------------------------------------------------------- +## Optionally, set a constant magnetic filed: +## ----------------------------------------------------------------------------- +/detector/setField 0 0 0 tesla +#/detector/setField 0 0 3.8 tesla + +## ----------------------------------------------------------------------------- +## Set secondary production threshold, init. the run and set primary properties +## ----------------------------------------------------------------------------- +/run/setCut 0.7 mm +/run/initialize + +## User-defined Event verbosity: 1 = total edep, 2 = energy deposit per placed sensitive volume +/eventAction/verbose 2 + +/gun/setDefault +/gun/particle e- +/gun/energy 10 GeV +/gun/number 200 +/gun/position 0 0 0 +/gun/print true + +# If false, the following parameters are ignored +/gun/randomizeGun true +# Usage: /gun/addParticle type ["weight" weight] ["energy" energy unit] +/gun/addParticle e- weight 1 energy 10 GeV +/gun/addParticle proton weight 0 energy 10 GeV +/gun/minPhi 0 deg +/gun/maxPhi 360 deg +/gun/minTheta 10 deg +/gun/maxTheta 170 deg + +## ----------------------------------------------------------------------------- +## Run the simulation with the given number of events and print list of processes +## ----------------------------------------------------------------------------- + +/adept/setSeed 1 + +# run events with parametrised simulation +# by default all created models are active +/run/beamOn 1 + diff --git a/examples/Example1/macros/example1_ttbar.mac.in b/examples/Example1/macros/example1_ttbar.mac.in index 68c4fc4f..14e9ba04 100644 --- a/examples/Example1/macros/example1_ttbar.mac.in +++ b/examples/Example1/macros/example1_ttbar.mac.in @@ -25,6 +25,7 @@ ## Total number of GPU track slots (not per thread) /adept/setMillionsOfTrackSlots 8 /adept/setMillionsOfHitSlots 1 +# /adept/setCUDAStackLimit 4096 # If true, particles are transported on the GPU across the whole geometry, GPU regions are ignored /adept/setTrackInAllRegions true diff --git a/examples/Example1/macros/example1_ttbar_LHCb.mac.in b/examples/Example1/macros/example1_ttbar_LHCb.mac.in index 468c5378..bfbf963f 100644 --- a/examples/Example1/macros/example1_ttbar_LHCb.mac.in +++ b/examples/Example1/macros/example1_ttbar_LHCb.mac.in @@ -25,6 +25,8 @@ ## Total number of GPU track slots (not per thread) /adept/setMillionsOfTrackSlots 8 /adept/setMillionsOfHitSlots 1 +## Device stack limit +# /adept/setCUDAStackLimit 4096 # If true, particles are transported on the GPU across the whole geometry, GPU regions are ignored /adept/setTrackInAllRegions true diff --git a/include/AdePT/core/AdePTConfiguration.hh b/include/AdePT/core/AdePTConfiguration.hh index 509b87e0..0e4652c0 100644 --- a/include/AdePT/core/AdePTConfiguration.hh +++ b/include/AdePT/core/AdePTConfiguration.hh @@ -21,32 +21,35 @@ public: void SetMillionsOfTrackSlots(double millionSlots) { fMillionsOfTrackSlots = millionSlots; } void SetMillionsOfHitSlots(double millionSlots) { fMillionsOfHitSlots = millionSlots; } void SetHitBufferFlushThreshold(float threshold) { fHitBufferFlushThreshold = threshold; } + void SetCUDAStackLimit(int limit) { fCUDAStackLimit = limit; } // We temporarily load VecGeom geometry from GDML void SetVecGeomGDML(std::string filename) { fVecGeomGDML = filename; } bool GetTrackInAllRegions() { return fTrackInAllRegions; } - std::vector *GetGPURegionNames() { return &fGPURegionNames; } bool IsAdePTActivated() { return fAdePTActivated; } int GetVerbosity() { return fVerbosity; }; int GetTransportBufferThreshold() { return fTransportBufferThreshold; } + int GetCUDAStackLimit() { return fCUDAStackLimit; } + float GetHitBufferFlushThreshold() { return fHitBufferFlushThreshold; } double GetMillionsOfTrackSlots() { return fMillionsOfTrackSlots; } double GetMillionsOfHitSlots() { return fMillionsOfHitSlots; } - float GetHitBufferFlushThreshold() { return fHitBufferFlushThreshold; } + std::vector *GetGPURegionNames() { return &fGPURegionNames; } // Temporary std::string GetVecGeomGDML() { return fVecGeomGDML; } private: - int fRandomSeed; bool fTrackInAllRegions{false}; - std::vector fGPURegionNames{}; bool fAdePTActivated{true}; + int fRandomSeed; int fVerbosity{0}; int fTransportBufferThreshold{200}; + int fCUDAStackLimit{0}; + float fHitBufferFlushThreshold{0.8}; double fMillionsOfTrackSlots{1}; double fMillionsOfHitSlots{1}; - float fHitBufferFlushThreshold{0.8}; + std::vector fGPURegionNames{}; std::string fVecGeomGDML{""}; diff --git a/include/AdePT/core/AdePTTransport.cuh b/include/AdePT/core/AdePTTransport.cuh index 15a665d0..cbc76a98 100644 --- a/include/AdePT/core/AdePTTransport.cuh +++ b/include/AdePT/core/AdePTTransport.cuh @@ -144,9 +144,9 @@ __global__ void InitTracks(adeptint::TrackData *trackinfo, int ntracks, int star track.navState.SetBoundaryState(true); // nextState is initialized as needed. #ifndef ADEPT_USE_SURF - int lvolID = track.navState.Top()->GetLogicalVolume()->id(); + int lvolID = track.navState.Top()->GetLogicalVolume()->id(); #else - int lvolID = track.navState.GetLogicalId(); + int lvolID = track.navState.GetLogicalId(); #endif assert(auxDataArray[lvolID].fGPUregion); } @@ -202,8 +202,6 @@ __global__ void ClearLeakedQueues(LeakedTracks all) bool InitializeField(double bz) { - // Try 16384 if debug mode is crashing - COPCORE_CUDA_CHECK(vecgeom::cxx::CudaDeviceSetStackLimit(8192 * 2)); // Initialize field COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(BzFieldValue, &bz, sizeof(double))); return true; @@ -225,13 +223,9 @@ void PrepareLeakedBuffers(int numLeaked, adeptint::TrackBuffer &buffer, GPUstate } } -GPUstate *InitializeGPU(adeptint::TrackBuffer &buffer, int capacity, int maxbatch) +void CopySurfaceModelToGPU() { - using TrackData = adeptint::TrackData; - auto gpuState_ptr = new GPUstate; - auto &gpuState = *gpuState_ptr; - - // Copy surface data to GPU +// Copy surface data to GPU #ifdef ADEPT_USE_SURF #ifdef ADEPT_USE_SURF_SINGLE using SurfData = vgbrep::SurfData; @@ -243,6 +237,13 @@ GPUstate *InitializeGPU(adeptint::TrackBuffer &buffer, int capacity, int maxbatc BrepCudaManager::Instance().TransferSurfData(SurfData::Instance()); printf("== Surface data transferred to GPU\n"); #endif +} + +GPUstate *InitializeGPU(adeptint::TrackBuffer &buffer, int capacity, int maxbatch) +{ + using TrackData = adeptint::TrackData; + auto gpuState_ptr = new GPUstate; + auto &gpuState = *gpuState_ptr; // Allocate track managers, streams and synchronization events. const size_t kQueueSize = MParrayTracks::SizeOfInstance(capacity); // Create a stream to synchronize kernels of all particle types. diff --git a/include/AdePT/core/AdePTTransport.h b/include/AdePT/core/AdePTTransport.h index 1dea28dd..c741c3ea 100644 --- a/include/AdePT/core/AdePTTransport.h +++ b/include/AdePT/core/AdePTTransport.h @@ -64,6 +64,8 @@ class AdePTTransport { bool GetTrackInAllRegions() { return fTrackInAllRegions; } /// @brief Set Geant4 region to which it applies void SetGPURegionNames(std::vector *regionNames) { fGPURegionNames = regionNames; } + /// @brief Set CUDA device stack limit + void SetCUDAStackLimit(int limit) { fCUDAStackLimit = limit; } std::vector *GetGPURegionNames() { return fGPURegionNames; } /// @brief Create material-cut couple index array /// @brief Initialize service and copy geometry & physics data on device @@ -83,6 +85,7 @@ class AdePTTransport { int fNumSensitive{0}; ///< Total number of sensitive volumes int fBufferThreshold{20}; ///< Buffer threshold for flushing AdePT transport buffer int fDebugLevel{1}; ///< Debug level + int fCUDAStackLimit{0}; ///< CUDA device stack limit GPUstate *fGPUstate{nullptr}; ///< CUDA state placeholder AdeptScoring *fScoring{nullptr}; ///< User scoring object AdeptScoring *fScoring_dev{nullptr}; ///< Device ptr for scoring data diff --git a/include/AdePT/core/AdePTTransport.icc b/include/AdePT/core/AdePTTransport.icc index a7addf18..117e3dce 100644 --- a/include/AdePT/core/AdePTTransport.icc +++ b/include/AdePT/core/AdePTTransport.icc @@ -30,6 +30,7 @@ void FreeVolAuxArray(adeptint::VolAuxArray &); G4HepEmState *InitG4HepEm(); GPUstate *InitializeGPU(TrackBuffer &, int, int); AdeptScoring *InitializeScoringGPU(AdeptScoring *scoring); +void CopySurfaceModelToGPU(); void FreeGPU(GPUstate &, G4HepEmState *); template void ShowerGPU(IntegrationLayer &integration, int event, TrackBuffer &buffer, GPUstate &gpuState, AdeptScoring *scoring, @@ -67,11 +68,18 @@ template bool AdePTTransport::InitializeGeometry(const vecgeom::cxx::VPlacedVolume *world) { auto &cudaManager = vecgeom::cxx::CudaManager::Instance(); - bool success = true; + if(fCUDAStackLimit > 0) + { + std::cout << "CUDA Device stack limit: " << fCUDAStackLimit << "\n"; + cudaDeviceSetLimit(cudaLimitStackSize, fCUDAStackLimit); + } + bool success = true; #ifdef ADEPT_USE_SURF #ifdef ADEPT_USE_SURF_SINGLE + using SurfData = vgbrep::SurfData; using BrepHelper = vgbrep::BrepHelper; #else + using SurfData = vgbrep::SurfData; using BrepHelper = vgbrep::BrepHelper; #endif vecgeom::Stopwatch timer; @@ -81,15 +89,15 @@ bool AdePTTransport::InitializeGeometry(const vecgeom::cxx::VP std::cout << "== Conversion to surface model done in " << timer.Stop() << " [s]\n"; // Upload only navigation table to the GPU cudaManager.SynchronizeNavigationTable(); + adept_impl::CopySurfaceModelToGPU(); #else // Upload solid geometry to GPU. - cudaDeviceSetLimit(cudaLimitStackSize, 1024 * 8); cudaManager.LoadGeometry(world); auto world_dev = cudaManager.Synchronize(); success = world_dev != nullptr; -#endif // Initialize BVH InitBVH(); +#endif return success; } diff --git a/include/AdePT/integration/AdePTConfigurationMessenger.hh b/include/AdePT/integration/AdePTConfigurationMessenger.hh index 9523c9cf..beecf06f 100644 --- a/include/AdePT/integration/AdePTConfigurationMessenger.hh +++ b/include/AdePT/integration/AdePTConfigurationMessenger.hh @@ -34,6 +34,7 @@ private: G4UIdirectory *fDir; G4UIcmdWithAnInteger *fSetSeedCmd; + G4UIcmdWithAnInteger *fSetCUDAStackLimitCmd; G4UIcmdWithABool *fSetTrackInAllRegionsCmd; G4UIcmdWithAString *fAddRegionCmd; G4UIcmdWithABool *fActivateAdePTCmd; diff --git a/src/AdePTConfigurationMessenger.cc b/src/AdePTConfigurationMessenger.cc index 6df5061a..f5d27def 100644 --- a/src/AdePTConfigurationMessenger.cc +++ b/src/AdePTConfigurationMessenger.cc @@ -58,6 +58,9 @@ AdePTConfigurationMessenger::AdePTConfigurationMessenger(AdePTConfiguration *ade fSetGDMLCmd = new G4UIcmdWithAString("/adept/setVecGeomGDML", this); fSetGDMLCmd->SetGuidance("Temporary method for setting the geometry to use with VecGeom"); + + fSetCUDAStackLimitCmd = new G4UIcmdWithAnInteger("/adept/setCUDAStackLimit", this); + fSetCUDAStackLimitCmd->SetGuidance("Set the CUDA device stack limit"); } //....oooOO0OOooo........oooOO0OOooo........oooOO0OOooo........oooOO0OOooo...... @@ -66,6 +69,7 @@ AdePTConfigurationMessenger::~AdePTConfigurationMessenger() { delete fDir; delete fSetSeedCmd; + delete fSetCUDAStackLimitCmd; delete fSetTrackInAllRegionsCmd; delete fAddRegionCmd; delete fActivateAdePTCmd; @@ -101,6 +105,8 @@ void AdePTConfigurationMessenger::SetNewValue(G4UIcommand *command, G4String new fAdePTConfiguration->SetHitBufferFlushThreshold(fSetHitBufferFlushThresholdCmd->GetNewDoubleValue(newValue)); } else if (command == fSetGDMLCmd) { fAdePTConfiguration->SetVecGeomGDML(newValue); + } else if (command == fSetCUDAStackLimitCmd) { + fAdePTConfiguration->SetCUDAStackLimit(fSetCUDAStackLimitCmd->GetNewIntValue(newValue)); } } diff --git a/src/AdePTTrackingManager.cc b/src/AdePTTrackingManager.cc index e25566e0..78e959e7 100644 --- a/src/AdePTTrackingManager.cc +++ b/src/AdePTTrackingManager.cc @@ -32,6 +32,7 @@ void AdePTTrackingManager::InitializeAdePT() fAdeptTransport->SetMaxBatch(2 * fAdePTConfiguration->GetTransportBufferThreshold()); fAdeptTransport->SetTrackInAllRegions(fAdePTConfiguration->GetTrackInAllRegions()); fAdeptTransport->SetGPURegionNames(fAdePTConfiguration->GetGPURegionNames()); + fAdeptTransport->SetCUDAStackLimit(fAdePTConfiguration->GetCUDAStackLimit()); // Check if this is a sequential run G4RunManager::RMType rmType = G4RunManager::GetRunManager()->GetRunManagerType();