Skip to content

Commit

Permalink
ITS-GPU: put standalone version at state of the art (AliceO2Group#11558)
Browse files Browse the repository at this point in the history
> Fine for me to merge it as is for now. We should just keep the possibility to run it inside the gpuworkflow operational.

Agreed, my actual plan is to move asap to have the GpuWF in a working state and to have the standalone version as a backup.

I am therefore merging this for now.

* Add skeleton for road finding

* Add gpu array keeping all pointers to separate buffers

* Add arrays for indexed access to cells and friends

* Fix deltaphi calculation on gpu code

* Make Road a template class

* Improve metrics logging

* Add tracker metrics

* Add CMakeLists entry
  • Loading branch information
mconcas authored Jun 27, 2023
1 parent da7103d commit 1b7ea0d
Show file tree
Hide file tree
Showing 12 changed files with 362 additions and 103 deletions.
13 changes: 13 additions & 0 deletions Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,18 @@ class GpuTimeFrameChunk
int* getDeviceTrackletsLookupTables(const int);
Cell* getDeviceCells(const int);
int* getDeviceCellsLookupTables(const int);
Road<nLayers - 2>* getDeviceRoads() { return mRoadsDevice; }
int* getDeviceRoadsLookupTables(const int);
TimeFrameGPUParameters* getTimeFrameGPUParameters() const { return mTFGPUParams; }

int* getDeviceCUBTmpBuffer() { return mCUBTmpBufferDevice; }
int* getDeviceFoundTracklets() { return mFoundTrackletsDevice; }
int* getDeviceNFoundCells() { return mNFoundCellsDevice; }
int* getDeviceCellNeigboursLookupTables(const int);
int* getDeviceCellNeighbours(const int);
Cell** getDeviceArrayCells() const { return mCellsDeviceArray; }
int** getDeviceArrayNeighboursCell() const { return mNeighboursCellDeviceArray; }
int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLookupTablesDeviceArray; }

/// Vertexer only
int* getDeviceNTrackletCluster(const int combid) { return mNTrackletsPerClusterDevice[combid]; }
Expand All @@ -133,10 +138,18 @@ class GpuTimeFrameChunk
std::array<Tracklet*, nLayers - 1> mTrackletsDevice;
std::array<int*, nLayers - 1> mTrackletsLookupTablesDevice;
std::array<Cell*, nLayers - 2> mCellsDevice;
Road<nLayers - 2>* mRoadsDevice;
std::array<int*, nLayers - 2> mCellsLookupTablesDevice;
std::array<int*, nLayers - 3> mNeighboursCellDevice;
std::array<int*, nLayers - 3> mNeighboursCellLookupTablesDevice;
std::array<int*, nLayers - 2> mRoadsLookupTablesDevice;

// These are to make them accessible using layer index
Cell** mCellsDeviceArray;
int** mNeighboursCellDeviceArray;
int** mNeighboursCellLookupTablesDeviceArray;

// Small accessory buffers
int* mCUBTmpBufferDevice;
int* mFoundTrackletsDevice;
int* mNFoundCellsDevice;
Expand Down
24 changes: 24 additions & 0 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,21 @@ GpuTimeFrameChunk<nLayers>::~GpuTimeFrameChunk()
if (i < nLayers - 2) {
checkGPUError(cudaFree(mCellsDevice[i]));
checkGPUError(cudaFree(mCellsLookupTablesDevice[i]));
checkGPUError(cudaFree(mRoadsLookupTablesDevice[i]));
if (i < nLayers - 3) {
checkGPUError(cudaFree(mNeighboursCellLookupTablesDevice[i]));
checkGPUError(cudaFree(mNeighboursCellDevice[i]));
}
}
}
}
checkGPUError(cudaFree(mRoadsDevice));
checkGPUError(cudaFree(mCUBTmpBufferDevice));
checkGPUError(cudaFree(mFoundTrackletsDevice));
checkGPUError(cudaFree(mNFoundCellsDevice));
checkGPUError(cudaFree(mCellsDeviceArray));
checkGPUError(cudaFree(mNeighboursCellDeviceArray));
checkGPUError(cudaFree(mNeighboursCellLookupTablesDeviceArray));
}
}

Expand All @@ -84,6 +89,7 @@ void GpuTimeFrameChunk<nLayers>::allocate(const size_t nrof, Stream& stream)
if (i < nLayers - 2) {
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&(mCellsLookupTablesDevice[i])), sizeof(int) * mTFGPUParams->validatedTrackletsCapacity * nrof, stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&(mCellsDevice[i])), sizeof(Cell) * mTFGPUParams->maxNeighboursSize * nrof, stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mRoadsLookupTablesDevice[i]), sizeof(int) * mTFGPUParams->maxNeighboursSize * nrof, stream.get()));
if (i < nLayers - 3) {
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&(mNeighboursCellLookupTablesDevice[i])), sizeof(int) * mTFGPUParams->maxNeighboursSize * nrof, stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&(mNeighboursCellDevice[i])), sizeof(int) * mTFGPUParams->maxNeighboursSize * nrof, stream.get()));
Expand All @@ -100,10 +106,19 @@ void GpuTimeFrameChunk<nLayers>::allocate(const size_t nrof, Stream& stream)
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mNExclusiveFoundLinesDevice), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * nrof + 1, stream.get())); // + 1 for cub::DeviceScan::ExclusiveSum, to cover cases where we have maximum number of clusters per ROF
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mUsedTrackletsDevice), sizeof(unsigned char) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * nrof, stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mClusteredLinesDevice), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mTFGPUParams->maxTrackletsPerCluster * nrof, stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mRoadsDevice), sizeof(Road<nLayers - 2>) * mTFGPUParams->maxRoadPerRofSize * nrof, stream.get()));

/// Invariant allocations
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mFoundTrackletsDevice), (nLayers - 1) * sizeof(int) * nrof, stream.get())); // No need to reset, we always read it after writing
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mNFoundCellsDevice), (nLayers - 2) * sizeof(int) * nrof, stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(Cell*), stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mNeighboursCellDeviceArray), (nLayers - 3) * sizeof(int*), stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mNeighboursCellLookupTablesDeviceArray), (nLayers - 3) * sizeof(int*), stream.get()));

/// Copy pointers of allocated memory to regrouping arrays
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(Cell*), cudaMemcpyHostToDevice, stream.get()));
checkGPUError(cudaMemcpyAsync(mNeighboursCellDeviceArray, mNeighboursCellDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get()));
checkGPUError(cudaMemcpyAsync(mNeighboursCellLookupTablesDeviceArray, mNeighboursCellLookupTablesDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get()));

mAllocated = true;
}
Expand All @@ -130,6 +145,7 @@ void GpuTimeFrameChunk<nLayers>::reset(const Task task, Stream& stream)
thrust::fill(THRUST_NAMESPACE::par.on(stream.get()), thrustTrackletsBegin, thrustTrackletsEnd, Tracklet{});
if (i < nLayers - 2) {
checkGPUError(cudaMemsetAsync(mCellsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->cellsLUTsize * mNRof, stream.get()));
checkGPUError(cudaMemsetAsync(mRoadsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get()));
if (i < nLayers - 3) {
checkGPUError(cudaMemsetAsync(mNeighboursCellLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get()));
checkGPUError(cudaMemsetAsync(mNeighboursCellDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get()));
Expand Down Expand Up @@ -157,6 +173,8 @@ size_t GpuTimeFrameChunk<nLayers>::computeScalingSizeBytes(const int nrof, const
rofsize += (nLayers - 2) * sizeof(Cell) * config.maxNeighboursSize; // cells
rofsize += (nLayers - 3) * sizeof(int) * config.maxNeighboursSize; // cell neighbours lookup tables
rofsize += (nLayers - 3) * sizeof(int) * config.maxNeighboursSize; // cell neighbours
rofsize += sizeof(Road<nLayers - 2>) * config.maxRoadPerRofSize; // roads
rofsize += (nLayers - 2) * sizeof(int) * config.maxNeighboursSize; // road LUT
rofsize += sizeof(Line) * config.maxTrackletsPerCluster * config.clustersPerROfCapacity; // lines
rofsize += sizeof(int) * config.clustersPerROfCapacity; // found lines
rofsize += sizeof(int) * config.clustersPerROfCapacity; // found lines exclusive sum
Expand Down Expand Up @@ -243,6 +261,12 @@ int* GpuTimeFrameChunk<nLayers>::getDeviceCellNeighbours(const int layer)
return mNeighboursCellDevice[layer];
}

/// Accessor: returns the device-resident roads lookup-table buffer for the
/// requested layer. The pointer refers to GPU memory owned by this chunk;
/// no bounds checking is performed on `layer`.
template <int nLayers>
int* GpuTimeFrameChunk<nLayers>::getDeviceRoadsLookupTables(const int layer)
{
  auto* lutBuffer = mRoadsLookupTablesDevice[layer];
  return lutBuffer;
}

// Load data
template <int nLayers>
size_t GpuTimeFrameChunk<nLayers>::loadDataOnDevice(const size_t startRof, const size_t maxRof, const int maxLayers, Stream& stream)
Expand Down
Loading

0 comments on commit 1b7ea0d

Please sign in to comment.