From 5ea9d0819e4e61eece145feec3437a585cd13899 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juan=20Gonz=C3=A1lez?= <j.gonzalez@cern.ch>
Date: Mon, 16 Sep 2024 14:44:26 +0200
Subject: [PATCH] GPU geometry transfer fix and stack limit configuration
 option (#312)

- Fix for the copy of the surface model to GPU, which was being called
from all threads
- Removed the hardcoded device stack limit and added a configuration
option in the macro
- Added the line `SET(CMAKE_EXE_LINKER_FLAGS "-Wl,--disable-new-dtags")`
to `Example1/CMakeLists.txt` which solves an issue where Geant4
libraries would not be properly linked in some systems
---
 examples/Example1/CMakeLists.txt              |  5 +-
 examples/Example1/macros/example1.mac.in      |  1 +
 .../macros/example1_large_stack.mac.in        | 78 +++++++++++++++++++
 .../Example1/macros/example1_ttbar.mac.in     |  1 +
 .../macros/example1_ttbar_LHCb.mac.in         |  2 +
 include/AdePT/core/AdePTConfiguration.hh      | 13 ++--
 include/AdePT/core/AdePTTransport.cuh         | 21 ++---
 include/AdePT/core/AdePTTransport.h           |  3 +
 include/AdePT/core/AdePTTransport.icc         | 14 +++-
 .../AdePTConfigurationMessenger.hh            |  1 +
 src/AdePTConfigurationMessenger.cc            |  6 ++
 src/AdePTTrackingManager.cc                   |  1 +
 12 files changed, 127 insertions(+), 19 deletions(-)
 create mode 100644 examples/Example1/macros/example1_large_stack.mac.in

diff --git a/examples/Example1/CMakeLists.txt b/examples/Example1/CMakeLists.txt
index 7d767d9c..595d455f 100644
--- a/examples/Example1/CMakeLists.txt
+++ b/examples/Example1/CMakeLists.txt
@@ -1,6 +1,8 @@
 # SPDX-FileCopyrightText: 2023 CERN
 # SPDX-License-Identifier: Apache-2.0
 
+SET(CMAKE_EXE_LINKER_FLAGS "-Wl,--disable-new-dtags")
+
 if(NOT TARGET G4HepEm::g4HepEm)
   message(STATUS "Disabling example1 (needs G4HepEm)")
   return()
@@ -65,6 +67,7 @@ target_link_libraries(example1
 # Install macros and geometry file
 SET(GDML ${PROJECT_BINARY_DIR}/cms2018_sd.gdml)
 configure_file("macros/example1.mac.in" "${PROJECT_BINARY_DIR}/example1.mac")
+configure_file("macros/example1_large_stack.mac.in" "${PROJECT_BINARY_DIR}/example1_large_stack.mac")
 configure_file("macros/example1_ttbar.mac.in" "${PROJECT_BINARY_DIR}/example1_ttbar.mac")
 configure_file("macros/example1_ttbar_LHCb.mac.in" "${PROJECT_BINARY_DIR}/example1_ttbar_LHCb.mac")
 configure_file("macros/example1_ttbar_noadept.mac.in" "${PROJECT_BINARY_DIR}/example1_ttbar_noadept.mac")
@@ -72,5 +75,5 @@ configure_file("macros/example1_ttbar_noadept.mac.in" "${PROJECT_BINARY_DIR}/exa
 # Tests
 
 add_test(NAME example1
-  COMMAND $<TARGET_FILE:example1> -m ${PROJECT_BINARY_DIR}/example1.mac
+  COMMAND $<TARGET_FILE:example1> -m ${PROJECT_BINARY_DIR}/example1_large_stack.mac
 )
\ No newline at end of file
diff --git a/examples/Example1/macros/example1.mac.in b/examples/Example1/macros/example1.mac.in
index 25074c65..e5497696 100644
--- a/examples/Example1/macros/example1.mac.in
+++ b/examples/Example1/macros/example1.mac.in
@@ -25,6 +25,7 @@
 ## Total number of GPU track slots (not per thread)
 /adept/setMillionsOfTrackSlots 1
 /adept/setMillionsOfHitSlots 1
+# /adept/setCUDAStackLimit 4096
 
 # If true, particles are transported on the GPU across the whole geometry, GPU regions are ignored
 /adept/setTrackInAllRegions true
diff --git a/examples/Example1/macros/example1_large_stack.mac.in b/examples/Example1/macros/example1_large_stack.mac.in
new file mode 100644
index 00000000..ca684e14
--- /dev/null
+++ b/examples/Example1/macros/example1_large_stack.mac.in
@@ -0,0 +1,78 @@
+# SPDX-FileCopyrightText: 2023 CERN
+# SPDX-License-Identifier: Apache-2.0
+#  example23.in
+#
+
+## =============================================================================
+## Geant4 macro for modelling simplified sampling calorimeters
+## =============================================================================
+##
+/run/numberOfThreads 1
+/control/verbose 0
+/run/verbose 0
+/process/verbose 0
+/tracking/verbose 0
+/event/verbose 0
+##
+/adept/setSeed 1
+
+/detector/filename @GDML@
+# Temporary workaround since we don't have a G4 to VecGeom converter
+/adept/setVecGeomGDML @GDML@
+/adept/setVerbosity 0
+## Threshold for buffering tracks before sending to GPU
+/adept/setTransportBufferThreshold 2000
+## Total number of GPU track slots (not per thread)
+/adept/setMillionsOfTrackSlots 1
+/adept/setMillionsOfHitSlots 1
+/adept/setCUDAStackLimit 8192
+
+# If true, particles are transported on the GPU across the whole geometry, GPU regions are ignored
+/adept/setTrackInAllRegions true
+# In order to do the GPU transport only in specific regions
+/adept/addGPURegion EcalRegion
+/adept/addGPURegion HcalRegion
+
+
+## -----------------------------------------------------------------------------
+## Optionally, set a constant magnetic filed:
+## -----------------------------------------------------------------------------
+/detector/setField 0 0 0 tesla
+#/detector/setField 0 0 3.8 tesla
+
+## -----------------------------------------------------------------------------
+## Set secondary production threshold, init. the run and set primary properties
+## -----------------------------------------------------------------------------
+/run/setCut 0.7 mm
+/run/initialize
+
+## User-defined Event verbosity: 1 = total edep, 2 = energy deposit per placed sensitive volume
+/eventAction/verbose 2
+
+/gun/setDefault
+/gun/particle e-
+/gun/energy 10 GeV
+/gun/number 200
+/gun/position 0 0 0
+/gun/print true
+
+# If false, the following parameters are ignored
+/gun/randomizeGun true
+# Usage: /gun/addParticle type ["weight" weight] ["energy" energy unit]
+/gun/addParticle e- weight 1 energy 10 GeV
+/gun/addParticle proton weight 0 energy 10 GeV
+/gun/minPhi 0 deg
+/gun/maxPhi 360 deg
+/gun/minTheta 10 deg
+/gun/maxTheta 170 deg
+
+## -----------------------------------------------------------------------------
+## Run the simulation with the given number of events and print list of processes
+## -----------------------------------------------------------------------------
+
+/adept/setSeed 1
+
+# run events with parametrised simulation
+# by default all created models are active
+/run/beamOn 1
+
diff --git a/examples/Example1/macros/example1_ttbar.mac.in b/examples/Example1/macros/example1_ttbar.mac.in
index 68c4fc4f..14e9ba04 100644
--- a/examples/Example1/macros/example1_ttbar.mac.in
+++ b/examples/Example1/macros/example1_ttbar.mac.in
@@ -25,6 +25,7 @@
 ## Total number of GPU track slots (not per thread)
 /adept/setMillionsOfTrackSlots 8
 /adept/setMillionsOfHitSlots 1
+# /adept/setCUDAStackLimit 4096
 
 # If true, particles are transported on the GPU across the whole geometry, GPU regions are ignored
 /adept/setTrackInAllRegions true
diff --git a/examples/Example1/macros/example1_ttbar_LHCb.mac.in b/examples/Example1/macros/example1_ttbar_LHCb.mac.in
index 468c5378..bfbf963f 100644
--- a/examples/Example1/macros/example1_ttbar_LHCb.mac.in
+++ b/examples/Example1/macros/example1_ttbar_LHCb.mac.in
@@ -25,6 +25,8 @@
 ## Total number of GPU track slots (not per thread)
 /adept/setMillionsOfTrackSlots 8
 /adept/setMillionsOfHitSlots 1
+## Device stack limit
+# /adept/setCUDAStackLimit 4096
 
 # If true, particles are transported on the GPU across the whole geometry, GPU regions are ignored
 /adept/setTrackInAllRegions true
diff --git a/include/AdePT/core/AdePTConfiguration.hh b/include/AdePT/core/AdePTConfiguration.hh
index 509b87e0..0e4652c0 100644
--- a/include/AdePT/core/AdePTConfiguration.hh
+++ b/include/AdePT/core/AdePTConfiguration.hh
@@ -21,32 +21,35 @@ public:
   void SetMillionsOfTrackSlots(double millionSlots) { fMillionsOfTrackSlots = millionSlots; }
   void SetMillionsOfHitSlots(double millionSlots) { fMillionsOfHitSlots = millionSlots; }
   void SetHitBufferFlushThreshold(float threshold) { fHitBufferFlushThreshold = threshold; }
+  void SetCUDAStackLimit(int limit) { fCUDAStackLimit = limit; }
 
   // We temporarily load VecGeom geometry from GDML
   void SetVecGeomGDML(std::string filename) { fVecGeomGDML = filename; }
 
   bool GetTrackInAllRegions() { return fTrackInAllRegions; }
-  std::vector<std::string> *GetGPURegionNames() { return &fGPURegionNames; }
   bool IsAdePTActivated() { return fAdePTActivated; }
   int GetVerbosity() { return fVerbosity; };
   int GetTransportBufferThreshold() { return fTransportBufferThreshold; }
+  int GetCUDAStackLimit() { return fCUDAStackLimit; }
+  float GetHitBufferFlushThreshold() { return fHitBufferFlushThreshold; }
   double GetMillionsOfTrackSlots() { return fMillionsOfTrackSlots; }
   double GetMillionsOfHitSlots() { return fMillionsOfHitSlots; }
-  float GetHitBufferFlushThreshold() { return fHitBufferFlushThreshold; }
+  std::vector<std::string> *GetGPURegionNames() { return &fGPURegionNames; }
 
   // Temporary
   std::string GetVecGeomGDML() { return fVecGeomGDML; }
 
 private:
-  int fRandomSeed;
   bool fTrackInAllRegions{false};
-  std::vector<std::string> fGPURegionNames{};
   bool fAdePTActivated{true};
+  int fRandomSeed;
   int fVerbosity{0};
   int fTransportBufferThreshold{200};
+  int fCUDAStackLimit{0};
+  float fHitBufferFlushThreshold{0.8};
   double fMillionsOfTrackSlots{1};
   double fMillionsOfHitSlots{1};
-  float fHitBufferFlushThreshold{0.8};
+  std::vector<std::string> fGPURegionNames{};
 
   std::string fVecGeomGDML{""};
 
diff --git a/include/AdePT/core/AdePTTransport.cuh b/include/AdePT/core/AdePTTransport.cuh
index 15a665d0..cbc76a98 100644
--- a/include/AdePT/core/AdePTTransport.cuh
+++ b/include/AdePT/core/AdePTTransport.cuh
@@ -144,9 +144,9 @@ __global__ void InitTracks(adeptint::TrackData *trackinfo, int ntracks, int star
     track.navState.SetBoundaryState(true);
     // nextState is initialized as needed.
 #ifndef ADEPT_USE_SURF
-    int lvolID  = track.navState.Top()->GetLogicalVolume()->id();
+    int lvolID = track.navState.Top()->GetLogicalVolume()->id();
 #else
-    int lvolID  = track.navState.GetLogicalId();
+    int lvolID = track.navState.GetLogicalId();
 #endif
     assert(auxDataArray[lvolID].fGPUregion);
   }
@@ -202,8 +202,6 @@ __global__ void ClearLeakedQueues(LeakedTracks all)
 
 bool InitializeField(double bz)
 {
-  // Try 16384 if debug mode is crashing
-  COPCORE_CUDA_CHECK(vecgeom::cxx::CudaDeviceSetStackLimit(8192 * 2));
   // Initialize field
   COPCORE_CUDA_CHECK(cudaMemcpyToSymbol(BzFieldValue, &bz, sizeof(double)));
   return true;
@@ -225,13 +223,9 @@ void PrepareLeakedBuffers(int numLeaked, adeptint::TrackBuffer &buffer, GPUstate
   }
 }
 
-GPUstate *InitializeGPU(adeptint::TrackBuffer &buffer, int capacity, int maxbatch)
+void CopySurfaceModelToGPU()
 {
-  using TrackData   = adeptint::TrackData;
-  auto gpuState_ptr = new GPUstate;
-  auto &gpuState    = *gpuState_ptr;
-
-  // Copy surface data to GPU
+// Copy surface data to GPU
 #ifdef ADEPT_USE_SURF
 #ifdef ADEPT_USE_SURF_SINGLE
   using SurfData        = vgbrep::SurfData<float>;
@@ -243,6 +237,13 @@ GPUstate *InitializeGPU(adeptint::TrackBuffer &buffer, int capacity, int maxbatc
   BrepCudaManager::Instance().TransferSurfData(SurfData::Instance());
   printf("== Surface data transferred to GPU\n");
 #endif
+}
+
+GPUstate *InitializeGPU(adeptint::TrackBuffer &buffer, int capacity, int maxbatch)
+{
+  using TrackData   = adeptint::TrackData;
+  auto gpuState_ptr = new GPUstate;
+  auto &gpuState    = *gpuState_ptr;
   // Allocate track managers, streams and synchronization events.
   const size_t kQueueSize = MParrayTracks::SizeOfInstance(capacity);
   // Create a stream to synchronize kernels of all particle types.
diff --git a/include/AdePT/core/AdePTTransport.h b/include/AdePT/core/AdePTTransport.h
index 1dea28dd..c741c3ea 100644
--- a/include/AdePT/core/AdePTTransport.h
+++ b/include/AdePT/core/AdePTTransport.h
@@ -64,6 +64,8 @@ class AdePTTransport {
   bool GetTrackInAllRegions() { return fTrackInAllRegions; }
   /// @brief Set Geant4 region to which it applies
   void SetGPURegionNames(std::vector<std::string> *regionNames) { fGPURegionNames = regionNames; }
+  /// @brief Set CUDA device stack limit
+  void SetCUDAStackLimit(int limit) { fCUDAStackLimit = limit; }
   std::vector<std::string> *GetGPURegionNames() { return fGPURegionNames; }
   /// @brief Create material-cut couple index array
   /// @brief Initialize service and copy geometry & physics data on device
@@ -83,6 +85,7 @@ class AdePTTransport {
   int fNumSensitive{0};                                ///< Total number of sensitive volumes
   int fBufferThreshold{20};                            ///< Buffer threshold for flushing AdePT transport buffer
   int fDebugLevel{1};                                  ///< Debug level
+  int fCUDAStackLimit{0};                              ///< CUDA device stack limit
   GPUstate *fGPUstate{nullptr};                        ///< CUDA state placeholder
   AdeptScoring *fScoring{nullptr};                     ///< User scoring object
   AdeptScoring *fScoring_dev{nullptr};                 ///< Device ptr for scoring data
diff --git a/include/AdePT/core/AdePTTransport.icc b/include/AdePT/core/AdePTTransport.icc
index a7addf18..117e3dce 100644
--- a/include/AdePT/core/AdePTTransport.icc
+++ b/include/AdePT/core/AdePTTransport.icc
@@ -30,6 +30,7 @@ void FreeVolAuxArray(adeptint::VolAuxArray &);
 G4HepEmState *InitG4HepEm();
 GPUstate *InitializeGPU(TrackBuffer &, int, int);
 AdeptScoring *InitializeScoringGPU(AdeptScoring *scoring);
+void CopySurfaceModelToGPU();
 void FreeGPU(GPUstate &, G4HepEmState *);
 template <typename IntegrationLayer>
 void ShowerGPU(IntegrationLayer &integration, int event, TrackBuffer &buffer, GPUstate &gpuState, AdeptScoring *scoring,
@@ -67,11 +68,18 @@ template <typename IntegrationLayer>
 bool AdePTTransport<IntegrationLayer>::InitializeGeometry(const vecgeom::cxx::VPlacedVolume *world)
 {
   auto &cudaManager = vecgeom::cxx::CudaManager::Instance();
-  bool success      = true;
+  if(fCUDAStackLimit > 0)
+  {
+    std::cout <<  "CUDA Device stack limit: " <<  fCUDAStackLimit << "\n";
+    cudaDeviceSetLimit(cudaLimitStackSize, fCUDAStackLimit);
+  }
+  bool success = true;
 #ifdef ADEPT_USE_SURF
 #ifdef ADEPT_USE_SURF_SINGLE
+  using SurfData   = vgbrep::SurfData<float>;
   using BrepHelper = vgbrep::BrepHelper<float>;
 #else
+  using SurfData   = vgbrep::SurfData<double>;
   using BrepHelper = vgbrep::BrepHelper<double>;
 #endif
   vecgeom::Stopwatch timer;
@@ -81,15 +89,15 @@ bool AdePTTransport<IntegrationLayer>::InitializeGeometry(const vecgeom::cxx::VP
   std::cout << "== Conversion to surface model done in " << timer.Stop() << " [s]\n";
   // Upload only navigation table to the GPU
   cudaManager.SynchronizeNavigationTable();
+  adept_impl::CopySurfaceModelToGPU();
 #else
   // Upload solid geometry to GPU.
-  cudaDeviceSetLimit(cudaLimitStackSize, 1024 * 8);
   cudaManager.LoadGeometry(world);
   auto world_dev = cudaManager.Synchronize();
   success        = world_dev != nullptr;
-#endif
   // Initialize BVH
   InitBVH();
+#endif
   return success;
 }
 
diff --git a/include/AdePT/integration/AdePTConfigurationMessenger.hh b/include/AdePT/integration/AdePTConfigurationMessenger.hh
index 9523c9cf..beecf06f 100644
--- a/include/AdePT/integration/AdePTConfigurationMessenger.hh
+++ b/include/AdePT/integration/AdePTConfigurationMessenger.hh
@@ -34,6 +34,7 @@ private:
 
   G4UIdirectory *fDir;
   G4UIcmdWithAnInteger *fSetSeedCmd;
+  G4UIcmdWithAnInteger *fSetCUDAStackLimitCmd;
   G4UIcmdWithABool *fSetTrackInAllRegionsCmd;
   G4UIcmdWithAString *fAddRegionCmd;
   G4UIcmdWithABool *fActivateAdePTCmd;
diff --git a/src/AdePTConfigurationMessenger.cc b/src/AdePTConfigurationMessenger.cc
index 6df5061a..f5d27def 100644
--- a/src/AdePTConfigurationMessenger.cc
+++ b/src/AdePTConfigurationMessenger.cc
@@ -58,6 +58,9 @@ AdePTConfigurationMessenger::AdePTConfigurationMessenger(AdePTConfiguration *ade
 
   fSetGDMLCmd = new G4UIcmdWithAString("/adept/setVecGeomGDML", this);
   fSetGDMLCmd->SetGuidance("Temporary method for setting the geometry to use with VecGeom");
+
+  fSetCUDAStackLimitCmd = new G4UIcmdWithAnInteger("/adept/setCUDAStackLimit", this);
+  fSetCUDAStackLimitCmd->SetGuidance("Set the CUDA device stack limit");
 }
 
 //....oooOO0OOooo........oooOO0OOooo........oooOO0OOooo........oooOO0OOooo......
@@ -66,6 +69,7 @@ AdePTConfigurationMessenger::~AdePTConfigurationMessenger()
 {
   delete fDir;
   delete fSetSeedCmd;
+  delete fSetCUDAStackLimitCmd;
   delete fSetTrackInAllRegionsCmd;
   delete fAddRegionCmd;
   delete fActivateAdePTCmd;
@@ -101,6 +105,8 @@ void AdePTConfigurationMessenger::SetNewValue(G4UIcommand *command, G4String new
     fAdePTConfiguration->SetHitBufferFlushThreshold(fSetHitBufferFlushThresholdCmd->GetNewDoubleValue(newValue));
   } else if (command == fSetGDMLCmd) {
     fAdePTConfiguration->SetVecGeomGDML(newValue);
+  } else if (command == fSetCUDAStackLimitCmd) {
+    fAdePTConfiguration->SetCUDAStackLimit(fSetCUDAStackLimitCmd->GetNewIntValue(newValue));
   }
 }
 
diff --git a/src/AdePTTrackingManager.cc b/src/AdePTTrackingManager.cc
index e25566e0..78e959e7 100644
--- a/src/AdePTTrackingManager.cc
+++ b/src/AdePTTrackingManager.cc
@@ -32,6 +32,7 @@ void AdePTTrackingManager::InitializeAdePT()
   fAdeptTransport->SetMaxBatch(2 * fAdePTConfiguration->GetTransportBufferThreshold());
   fAdeptTransport->SetTrackInAllRegions(fAdePTConfiguration->GetTrackInAllRegions());
   fAdeptTransport->SetGPURegionNames(fAdePTConfiguration->GetGPURegionNames());
+  fAdeptTransport->SetCUDAStackLimit(fAdePTConfiguration->GetCUDAStackLimit());
 
   // Check if this is a sequential run
   G4RunManager::RMType rmType = G4RunManager::GetRunManager()->GetRunManagerType();