Merge branch 'master' into gb_cuda_examples2
Showing 26 changed files with 890 additions and 85 deletions.
@@ -0,0 +1,108 @@
/**
 * Copyright (c) 2023 by Contributors
 * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
 * @file cuda/gpu_cache.cu
 * @brief GPUCache implementation on CUDA.
 */
#include <numeric>

#include "./common.h"
#include "./gpu_cache.h"

namespace graphbolt {
namespace cuda {

GpuCache::GpuCache(const std::vector<int64_t> &shape, torch::ScalarType dtype) {
  TORCH_CHECK(shape.size() >= 2, "Shape must at least have 2 dimensions.");
  const auto num_items = shape[0];
  const int64_t num_feats =
      std::accumulate(shape.begin() + 1, shape.end(), 1ll, std::multiplies<>());
  const int element_size =
      torch::empty(1, torch::TensorOptions().dtype(dtype)).element_size();
  // Each item occupies num_bytes_ bytes, rounded up to a whole number of
  // floats because the underlying HugeCTR cache stores rows of floats.
  num_bytes_ = num_feats * element_size;
  num_float_feats_ = (num_bytes_ + sizeof(float) - 1) / sizeof(float);
  // The cache capacity is specified in buckets of bucket_size slots.
  cache_ = std::make_unique<gpu_cache_t>(
      (num_items + bucket_size - 1) / bucket_size, num_float_feats_);
  shape_ = shape;
  // The leading dimension is determined by how many keys are queried.
  shape_[0] = -1;
  dtype_ = dtype;
  device_id_ = cuda::GetCurrentStream().device_index();
}

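// Storage layout: every item is stored as num_float_feats_ floats regardless
// of dtype_. For example, with shape = {N, 3} and dtype = torch::kInt16,
// num_bytes_ = 3 * 2 = 6, so num_float_feats_ = ceil(6 / 4) = 2 and each
// cached row occupies 8 bytes, the last 2 of which are padding. Query and
// Replace below convert between this float-packed layout and the user-facing
// dtype through byte views.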
std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> GpuCache::Query(
    torch::Tensor keys) {
  TORCH_CHECK(keys.device().is_cuda(), "Keys should be on a CUDA device.");
  TORCH_CHECK(
      keys.device().index() == device_id_,
      "Keys should be on the correct CUDA device.");
  TORCH_CHECK(keys.sizes().size() == 1, "Keys should be a 1D tensor.");
  keys = keys.to(torch::kLong);
  auto values = torch::empty(
      {keys.size(0), num_float_feats_}, keys.options().dtype(torch::kFloat));
  auto missing_index =
      torch::empty(keys.size(0), keys.options().dtype(torch::kLong));
  auto missing_keys =
      torch::empty(keys.size(0), keys.options().dtype(torch::kLong));
  cuda::CopyScalar<size_t> missing_len;
  auto stream = cuda::GetCurrentStream();
  cache_->Query(
      reinterpret_cast<const key_t *>(keys.data_ptr()), keys.size(0),
      values.data_ptr<float>(),
      reinterpret_cast<uint64_t *>(missing_index.data_ptr()),
      reinterpret_cast<key_t *>(missing_keys.data_ptr()), missing_len.get(),
      stream);
  // Reinterpret the float-packed rows as the user-facing dtype and shape,
  // dropping the padding bytes at the end of each row.
  values = values.view(torch::kByte)
               .slice(1, 0, num_bytes_)
               .view(dtype_)
               .view(shape_);
  // missing_len is copied from the device, so synchronize before reading it.
  stream.synchronize();
  missing_index = missing_index.slice(0, 0, static_cast<size_t>(missing_len));
  missing_keys = missing_keys.slice(0, 0, static_cast<size_t>(missing_len));
  return std::make_tuple(values, missing_index, missing_keys);
}

void GpuCache::Replace(torch::Tensor keys, torch::Tensor values) {
  TORCH_CHECK(keys.device().is_cuda(), "Keys should be on a CUDA device.");
  TORCH_CHECK(
      keys.device().index() == device_id_,
      "Keys should be on the correct CUDA device.");
  TORCH_CHECK(values.device().is_cuda(), "Values should be on a CUDA device.");
  TORCH_CHECK(
      values.device().index() == device_id_,
      "Values should be on the correct CUDA device.");
  TORCH_CHECK(
      keys.size(0) == values.size(0),
      "The first dimensions of keys and values must match.");
  TORCH_CHECK(
      std::equal(shape_.begin() + 1, shape_.end(), values.sizes().begin() + 1),
      "Values should have the correct dimensions.");
  TORCH_CHECK(
      values.scalar_type() == dtype_, "Values should have the correct dtype.");
  keys = keys.to(torch::kLong);
  torch::Tensor float_values;
  if (num_bytes_ % sizeof(float) != 0) {
    // The item size is not a multiple of sizeof(float): copy the payload into
    // float-packed rows, leaving padding bytes at the end of each row.
    float_values = torch::empty(
        {values.size(0), num_float_feats_},
        values.options().dtype(torch::kFloat));
    float_values.view(torch::kByte)
        .slice(1, 0, num_bytes_)
        .copy_(values.view(torch::kByte).view({values.size(0), -1}));
  } else {
    // The item size is already a multiple of sizeof(float): reinterpret the
    // bytes directly as floats.
    float_values = values.view(torch::kByte)
                       .view({values.size(0), -1})
                       .view(torch::kFloat)
                       .contiguous();
  }
  cache_->Replace(
      reinterpret_cast<const key_t *>(keys.data_ptr()), keys.size(0),
      float_values.data_ptr<float>(), cuda::GetCurrentStream());
}

c10::intrusive_ptr<GpuCache> GpuCache::Create(
    const std::vector<int64_t> &shape, torch::ScalarType dtype) {
  return c10::make_intrusive<GpuCache>(shape, dtype);
}

}  // namespace cuda
}  // namespace graphbolt
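Query and Replace together support a read-through lookup: query the cache, gather the rows that missed from a backing feature store, scatter them into the result, and insert them back so later queries hit. The sketch below is a minimal illustration of that flow and is not part of this commit; the ReadThrough helper, the cpu_features tensor (assumed to have the cache's dtype and trailing dimensions), and the include paths are assumptions made for the example.

#include <torch/torch.h>

#include "./gpu_cache.h"

// Hypothetical read-through helper (illustrative only): serve hits from the
// cache and fill misses from a CPU-resident feature tensor.
torch::Tensor ReadThrough(
    graphbolt::cuda::GpuCache &cache, torch::Tensor cpu_features,
    torch::Tensor keys) {  // keys: 1D int64 tensor on the cache's CUDA device
  auto [values, missing_index, missing_keys] = cache.Query(keys);
  // Gather the rows that were not found in the cache and move them to the GPU.
  auto missing_values =
      cpu_features.index_select(0, missing_keys.to(torch::kCPU))
          .to(keys.device());
  // Scatter the fetched rows into the result at the positions of the misses.
  values.index_copy_(0, missing_index, missing_values);
  // Insert the fetched rows so subsequent queries can hit in the cache.
  cache.Replace(missing_keys, missing_values);
  return values;
}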
@@ -0,0 +1,66 @@
/**
 * Copyright (c) 2023 by Contributors
 * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek)
 * @file cuda/gpu_cache.h
 * @brief Header file of HugeCTR gpu_cache wrapper.
 */

#ifndef GRAPHBOLT_GPU_CACHE_H_
#define GRAPHBOLT_GPU_CACHE_H_

#include <torch/custom_class.h>
#include <torch/torch.h>

#include <limits>
#include <nv_gpu_cache.hpp>

namespace graphbolt {
namespace cuda {

class GpuCache : public torch::CustomClassHolder {
  // HugeCTR's gpu_cache uses long long keys (see the static_assert below).
  using key_t = long long;
  constexpr static int set_associativity = 2;
  constexpr static int WARP_SIZE = 32;
  // Each cache bucket holds WARP_SIZE * set_associativity slots.
  constexpr static int bucket_size = WARP_SIZE * set_associativity;
  using gpu_cache_t = ::gpu_cache::gpu_cache<
      key_t, uint64_t, std::numeric_limits<key_t>::max(), set_associativity,
      WARP_SIZE>;

 public:
  /**
   * @brief Constructor for the GpuCache class.
   *
   * @param shape The shape of the GPU cache.
   * @param dtype The datatype of items to be stored.
   */
  GpuCache(const std::vector<int64_t>& shape, torch::ScalarType dtype);

  GpuCache() = default;

  /**
   * @brief Queries the cache for the given keys.
   *
   * @param keys A 1D tensor of keys on the cache's CUDA device.
   *
   * @return A tuple (values, missing_index, missing_keys), where values holds
   * the cached rows, and missing_index and missing_keys identify the query
   * positions and keys that were not found in the cache.
   */
  std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> Query(
      torch::Tensor keys);

  /**
   * @brief Inserts or updates the given key-value pairs in the cache.
   *
   * @param keys A 1D tensor of keys on the cache's CUDA device.
   * @param values The values to store; the trailing dimensions and dtype must
   * match the cache configuration.
   */
  void Replace(torch::Tensor keys, torch::Tensor values);

  static c10::intrusive_ptr<GpuCache> Create(
      const std::vector<int64_t>& shape, torch::ScalarType dtype);

 private:
  std::vector<int64_t> shape_;
  torch::ScalarType dtype_;
  std::unique_ptr<gpu_cache_t> cache_;
  int64_t num_bytes_;
  int64_t num_float_feats_;
  torch::DeviceIndex device_id_;
};

// The cu file in HugeCTR gpu cache uses unsigned int and long long.
// Changing to int64_t results in a mismatch of template arguments.
static_assert(
    sizeof(long long) == sizeof(int64_t),
    "long long and int64_t need to have the same size.");  // NOLINT

}  // namespace cuda
}  // namespace graphbolt

#endif  // GRAPHBOLT_GPU_CACHE_H_
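Because GpuCache derives from torch::CustomClassHolder and Create returns a c10::intrusive_ptr, the class can be exposed to Python as a torch custom class. The fragment below sketches what such a registration could look like; the library name graphbolt_example, the operator name gpu_cache, and the method names are illustrative assumptions, not the registration used by this commit.

#include <torch/custom_class.h>

#include "./gpu_cache.h"

// Illustrative registration sketch (names are assumptions, not part of this
// commit): expose GpuCache and its factory through the torch custom-class API.
TORCH_LIBRARY(graphbolt_example, m) {
  m.class_<graphbolt::cuda::GpuCache>("GpuCache")
      .def("query", &graphbolt::cuda::GpuCache::Query)
      .def("replace", &graphbolt::cuda::GpuCache::Replace);
  // Create returns a c10::intrusive_ptr<GpuCache>, which the custom-class
  // machinery can pass across the Python boundary.
  m.def("gpu_cache", &graphbolt::cuda::GpuCache::Create);
}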