diff --git a/energon/kernel/cuda_native/csrc/get_ncclid.cpp b/energon/kernel/cuda_native/csrc/get_ncclid.cpp
new file mode 100644
index 0000000..7b2e8c6
--- /dev/null
+++ b/energon/kernel/cuda_native/csrc/get_ncclid.cpp
@@ -0,0 +1,134 @@
+#include <torch/extension.h>
+#include <torch/csrc/distributed/c10d/ProcessGroup.hpp>
+#include <torch/csrc/distributed/c10d/Types.hpp>
+#include <c10/util/intrusive_ptr.h>
+#include <torch/torch.h>
+
+#include "nccl.h"
+#include <cstdio>
+#include <string>
+#include <vector>
+
+// Send the NCCL unique id (packed into a tensor) to dstRank through the c10d process group.
+void sendNcclUniqueId(at::Tensor& ncclid, int dstRank, const c10::intrusive_ptr<c10d::ProcessGroup>& pg)
+{
+    // The id is already packed into `ncclid`; earlier experiments kept for reference:
+    // auto tensor = torch::from_blob(ncclId->internal, {int(32)}, torch::TensorOptions(torch::kCUDA).dtype(torch::kFloat32).requires_grad(false));
+    // at::Tensor tensor = torch::zeros({int(32)}, torch::TensorOptions(torch::kCUDA).dtype(torch::kFloat32));
+    std::vector<at::Tensor> tensors = {ncclid};
+    printf("[INFO] rank start send \n");
+
+    // Only issue the send when a valid process group was handed in.
+    if (pg != nullptr)
+    {
+        auto ret = pg->send(tensors, dstRank, 0);
+        ret->wait();
+    }
+
+    printf("[INFO] rank finish send \n");
+    // return ret;
+}
+
+// Receive the NCCL unique id from srcRank into `ncclid`.
+void recvNcclUniqueId(at::Tensor& ncclid, int srcRank, const c10::intrusive_ptr<c10d::ProcessGroup>& pg)
+{
+    // at::Tensor tensor = torch::zeros({int(32)}, torch::TensorOptions(torch::kCUDA).dtype(torch::kFloat32));
+    // auto tensor = torch::from_blob(ncclId->internal, {int(32)}, torch::TensorOptions(torch::kCUDA).dtype(torch::kFloat32).requires_grad(false));
+    std::vector<at::Tensor> tensors = {ncclid};
+    printf("[INFO] rank start recv \n");
+
+    if (pg != nullptr)
+    {
+        auto ret = pg->recv(tensors, srcRank, 0);
+        ret->wait();
+    }
+
+    printf("[INFO] rank finish recv \n");
+    // at::Tensor tensor = tensors[0];
+    // float* temp = tensor.data_ptr<float>();
+    // ncclId->internal
+    // char* x = reinterpret_cast<char*>(temp);
+    // get_ptr(tensor);
+}
+
+// Point-to-point alternative kept for reference:
+// if(local_rank == 0)
+// {
+//     for(int i = 1; i
+
+// Broadcast the NCCL unique id tensor from rank 0 to every rank in the process group.
+void broadcastUniqueId(at::Tensor& ncclid, int local_rank, const c10::intrusive_ptr<c10d::ProcessGroup>& pg)
+{
+    std::vector<at::Tensor> tensors = {ncclid};
+
+    printf("[INFO] rank start ncclid broadcast \n");
+
+    if (pg != nullptr)
+    {
+        auto ret = pg->broadcast(tensors, c10d::BroadcastOptions());
+        ret->wait();
+    }
+
+    printf("[INFO] rank finish ncclid broadcast in func \n");
+
+    // char* temp = reinterpret_cast<char*>(cpuNCCLID.data_ptr<float>());
+    // for(int i = 0; i
+}
+
+// Create an NCCL unique id, broadcast rank 0's copy to every rank, and hand it back on the CPU.
+torch::Tensor getNcclUniqueId(int local_rank, const c10::intrusive_ptr<c10d::ProcessGroup>& pg)
+{
+    ncclUniqueId tensor_para_nccl_uid;
+    ncclGetUniqueId(&tensor_para_nccl_uid);
+    // Pack the 128-byte id into a 32-element float tensor so it can travel through c10d collectives.
+    auto tensor = torch::from_blob(tensor_para_nccl_uid.internal, {int(32)}, torch::TensorOptions(torch::kCPU).dtype(torch::kFloat32).requires_grad(false));
+    torch::Tensor gpuNCCLID = tensor.to(torch::kCUDA);
+    broadcastUniqueId(gpuNCCLID, local_rank, pg);
+    torch::Tensor cpuNCCLID = gpuNCCLID.to(torch::kCPU);
+
+    // char* temp = reinterpret_cast<char*>(cpuNCCLID.data_ptr<float>());
+    // for(int i = 0; i
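
The helpers in this patch only move the `ncclUniqueId` between ranks; creating the communicator from it is left to the caller. As a rough sketch (not part of the patch), once every rank holds the id returned by the broadcast path above, it would typically be passed to NCCL's `ncclCommInitRank`. The helper name, the `device` parameter, and the one-GPU-per-rank assumption below are illustrative only:

    #include "nccl.h"
    #include <cuda_runtime.h>

    // Illustrative helper (not in this patch): build a communicator from an
    // id that has already been exchanged, assuming one GPU per local rank.
    ncclComm_t initCommFromUniqueId(const ncclUniqueId& id, int world_size, int rank, int device)
    {
        ncclComm_t comm;
        cudaSetDevice(device);                       // bind this rank to its GPU first
        ncclCommInitRank(&comm, world_size, id, rank);
        return comm;
    }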