Skip to content

Commit

Permalink
ITS-GPU: put standalone version at state of the art (AliceO2Group#11558)
Browse files Browse the repository at this point in the history
> Fine for me to merge it as is for now. We should just keep the possibility to run it inside the gpuworkflow operational.

Agreed, my actual plan is to move asap to have the GpuWF in a working state and to have the standalone version as a backup.

I am therefore merging this for now.

* Add skeleton for road finding

* Add gpu array keeping all pointers to separate buffers

* Add arrays for indexed access to cells and friends

* Fix deltaphi calculation on gpu code

* Make Road a template class

* Improve metrics logging

* Add tracker metrics

* Add CMakeLists entry
  • Loading branch information
mconcas authored Jun 27, 2023
1 parent da7103d commit 1b7ea0d
Show file tree
Hide file tree
Showing 12 changed files with 362 additions and 103 deletions.
13 changes: 13 additions & 0 deletions Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,18 @@ class GpuTimeFrameChunk
int* getDeviceTrackletsLookupTables(const int);
Cell* getDeviceCells(const int);
int* getDeviceCellsLookupTables(const int);
Road<nLayers - 2>* getDeviceRoads() { return mRoadsDevice; }
int* getDeviceRoadsLookupTables(const int);
TimeFrameGPUParameters* getTimeFrameGPUParameters() const { return mTFGPUParams; }

int* getDeviceCUBTmpBuffer() { return mCUBTmpBufferDevice; }
int* getDeviceFoundTracklets() { return mFoundTrackletsDevice; }
int* getDeviceNFoundCells() { return mNFoundCellsDevice; }
int* getDeviceCellNeigboursLookupTables(const int);
int* getDeviceCellNeighbours(const int);
Cell** getDeviceArrayCells() const { return mCellsDeviceArray; }
int** getDeviceArrayNeighboursCell() const { return mNeighboursCellDeviceArray; }
int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLookupTablesDeviceArray; }

/// Vertexer only
int* getDeviceNTrackletCluster(const int combid) { return mNTrackletsPerClusterDevice[combid]; }
Expand All @@ -133,10 +138,18 @@ class GpuTimeFrameChunk
std::array<Tracklet*, nLayers - 1> mTrackletsDevice;
std::array<int*, nLayers - 1> mTrackletsLookupTablesDevice;
std::array<Cell*, nLayers - 2> mCellsDevice;
Road<nLayers - 2>* mRoadsDevice;
std::array<int*, nLayers - 2> mCellsLookupTablesDevice;
std::array<int*, nLayers - 3> mNeighboursCellDevice;
std::array<int*, nLayers - 3> mNeighboursCellLookupTablesDevice;
std::array<int*, nLayers - 2> mRoadsLookupTablesDevice;

// These are to make them accessible using layer index
Cell** mCellsDeviceArray;
int** mNeighboursCellDeviceArray;
int** mNeighboursCellLookupTablesDeviceArray;

// Small accessory buffers
int* mCUBTmpBufferDevice;
int* mFoundTrackletsDevice;
int* mNFoundCellsDevice;
Expand Down
24 changes: 24 additions & 0 deletions Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
Original file line number Diff line number Diff line change
Expand Up @@ -55,16 +55,21 @@ GpuTimeFrameChunk<nLayers>::~GpuTimeFrameChunk()
if (i < nLayers - 2) {
checkGPUError(cudaFree(mCellsDevice[i]));
checkGPUError(cudaFree(mCellsLookupTablesDevice[i]));
checkGPUError(cudaFree(mRoadsLookupTablesDevice[i]));
if (i < nLayers - 3) {
checkGPUError(cudaFree(mNeighboursCellLookupTablesDevice[i]));
checkGPUError(cudaFree(mNeighboursCellDevice[i]));
}
}
}
}
checkGPUError(cudaFree(mRoadsDevice));
checkGPUError(cudaFree(mCUBTmpBufferDevice));
checkGPUError(cudaFree(mFoundTrackletsDevice));
checkGPUError(cudaFree(mNFoundCellsDevice));
checkGPUError(cudaFree(mCellsDeviceArray));
checkGPUError(cudaFree(mNeighboursCellDeviceArray));
checkGPUError(cudaFree(mNeighboursCellLookupTablesDeviceArray));
}
}

Expand All @@ -84,6 +89,7 @@ void GpuTimeFrameChunk<nLayers>::allocate(const size_t nrof, Stream& stream)
if (i < nLayers - 2) {
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&(mCellsLookupTablesDevice[i])), sizeof(int) * mTFGPUParams->validatedTrackletsCapacity * nrof, stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&(mCellsDevice[i])), sizeof(Cell) * mTFGPUParams->maxNeighboursSize * nrof, stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mRoadsLookupTablesDevice[i]), sizeof(int) * mTFGPUParams->maxNeighboursSize * nrof, stream.get()));
if (i < nLayers - 3) {
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&(mNeighboursCellLookupTablesDevice[i])), sizeof(int) * mTFGPUParams->maxNeighboursSize * nrof, stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&(mNeighboursCellDevice[i])), sizeof(int) * mTFGPUParams->maxNeighboursSize * nrof, stream.get()));
Expand All @@ -100,10 +106,19 @@ void GpuTimeFrameChunk<nLayers>::allocate(const size_t nrof, Stream& stream)
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mNExclusiveFoundLinesDevice), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * nrof + 1, stream.get())); // + 1 for cub::DeviceScan::ExclusiveSum, to cover cases where we have maximum number of clusters per ROF
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mUsedTrackletsDevice), sizeof(unsigned char) * mTFGPUParams->maxTrackletsPerCluster * mTFGPUParams->clustersPerROfCapacity * nrof, stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mClusteredLinesDevice), sizeof(int) * mTFGPUParams->clustersPerROfCapacity * mTFGPUParams->maxTrackletsPerCluster * nrof, stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mRoadsDevice), sizeof(Road<nLayers - 2>) * mTFGPUParams->maxRoadPerRofSize * nrof, stream.get()));

/// Invariant allocations
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mFoundTrackletsDevice), (nLayers - 1) * sizeof(int) * nrof, stream.get())); // No need to reset, we always read it after writing
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mNFoundCellsDevice), (nLayers - 2) * sizeof(int) * nrof, stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mCellsDeviceArray), (nLayers - 2) * sizeof(Cell*), stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mNeighboursCellDeviceArray), (nLayers - 3) * sizeof(int*), stream.get()));
checkGPUError(cudaMallocAsync(reinterpret_cast<void**>(&mNeighboursCellLookupTablesDeviceArray), (nLayers - 3) * sizeof(int*), stream.get()));

/// Copy pointers of allocated memory to regrouping arrays
checkGPUError(cudaMemcpyAsync(mCellsDeviceArray, mCellsDevice.data(), (nLayers - 2) * sizeof(Cell*), cudaMemcpyHostToDevice, stream.get()));
checkGPUError(cudaMemcpyAsync(mNeighboursCellDeviceArray, mNeighboursCellDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get()));
checkGPUError(cudaMemcpyAsync(mNeighboursCellLookupTablesDeviceArray, mNeighboursCellLookupTablesDevice.data(), (nLayers - 3) * sizeof(int*), cudaMemcpyHostToDevice, stream.get()));

mAllocated = true;
}
Expand All @@ -130,6 +145,7 @@ void GpuTimeFrameChunk<nLayers>::reset(const Task task, Stream& stream)
thrust::fill(THRUST_NAMESPACE::par.on(stream.get()), thrustTrackletsBegin, thrustTrackletsEnd, Tracklet{});
if (i < nLayers - 2) {
checkGPUError(cudaMemsetAsync(mCellsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->cellsLUTsize * mNRof, stream.get()));
checkGPUError(cudaMemsetAsync(mRoadsLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get()));
if (i < nLayers - 3) {
checkGPUError(cudaMemsetAsync(mNeighboursCellLookupTablesDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get()));
checkGPUError(cudaMemsetAsync(mNeighboursCellDevice[i], 0, sizeof(int) * mTFGPUParams->maxNeighboursSize * mNRof, stream.get()));
Expand Down Expand Up @@ -157,6 +173,8 @@ size_t GpuTimeFrameChunk<nLayers>::computeScalingSizeBytes(const int nrof, const
rofsize += (nLayers - 2) * sizeof(Cell) * config.maxNeighboursSize; // cells
rofsize += (nLayers - 3) * sizeof(int) * config.maxNeighboursSize; // cell neighbours lookup tables
rofsize += (nLayers - 3) * sizeof(int) * config.maxNeighboursSize; // cell neighbours
rofsize += sizeof(Road<nLayers - 2>) * config.maxRoadPerRofSize; // roads
rofsize += (nLayers - 2) * sizeof(int) * config.maxNeighboursSize; // road LUT
rofsize += sizeof(Line) * config.maxTrackletsPerCluster * config.clustersPerROfCapacity; // lines
rofsize += sizeof(int) * config.clustersPerROfCapacity; // found lines
rofsize += sizeof(int) * config.clustersPerROfCapacity; // found lines exclusive sum
Expand Down Expand Up @@ -243,6 +261,12 @@ int* GpuTimeFrameChunk<nLayers>::getDeviceCellNeighbours(const int layer)
return mNeighboursCellDevice[layer];
}

/// Accessor: returns the device-resident roads lookup-table buffer for the
/// requested layer. The pointer refers to GPU memory owned by this chunk;
/// no bounds checking is performed on `layer`.
template <int nLayers>
int* GpuTimeFrameChunk<nLayers>::getDeviceRoadsLookupTables(const int layer)
{
  auto* lutBuffer = mRoadsLookupTablesDevice[layer];
  return lutBuffer;
}

// Load data
template <int nLayers>
size_t GpuTimeFrameChunk<nLayers>::loadDataOnDevice(const size_t startRof, const size_t maxRof, const int maxLayers, Stream& stream)
Expand Down
Loading

0 comments on commit 1b7ea0d

Please sign in to comment.