This repository has been archived by the owner on Oct 16, 2023. It is now read-only.

Merge pull request #8 from hpcaitech/feature/giant_op
gen new ncclid and broadcast to devices in the same Tensor Parallelis…
dujiangsu authored Mar 4, 2022
2 parents b020672 + 8bc5749 commit ca5514a
Showing 2 changed files with 143 additions and 3 deletions.
134 changes: 134 additions & 0 deletions energon/kernel/cuda_native/csrc/get_ncclid.cpp
@@ -0,0 +1,134 @@
#include <c10/util/intrusive_ptr.h>
#include <c10d/Types.hpp>
#include <c10d/NCCLUtils.hpp>
#include <c10d/ProcessGroupNCCL.hpp>
#include <c10d/ProcessGroup.hpp>

#include "nccl.h"
#include <iostream>
#include <string>
#include <torch/extension.h>

// Point-to-point helper: send the packed ncclUniqueId tensor to dstRank
// through an existing process group.
void sendNcclUniqueId(at::Tensor& ncclid, int dstRank, const c10::intrusive_ptr<c10d::ProcessGroupNCCL>& pg)
{
    std::vector<at::Tensor> tensors = {ncclid};
    printf("[INFO] rank start send \n");

    if (pg != nullptr) {
        auto ret = pg->send(tensors, dstRank, 0);
        ret->wait();
    }

    printf("[INFO] rank finish send \n");
}

// Point-to-point helper: receive the packed ncclUniqueId tensor from srcRank,
// writing into the passed-in tensor.
void recvNcclUniqueId(at::Tensor& ncclid, int srcRank, const c10::intrusive_ptr<c10d::ProcessGroupNCCL>& pg)
{
    std::vector<at::Tensor> tensors = {ncclid};
    printf("[INFO] rank start recv \n");

    if (pg != nullptr) {
        auto ret = pg->recv(tensors, srcRank, 0);
        ret->wait();
    }

    printf("[INFO] rank finish recv \n");
}
// Send/recv pairing (point-to-point alternative to the broadcast below):
// rank 0 of each tensor-parallel group sends the uid to ranks
// rank+1 .. rank+tensor_para_size-1; every other rank receives it from
// rank - local_rank.

// Collective helper: broadcast the packed ncclUniqueId from rank 0 of the
// process group to every member (local_rank is currently unused).
void broadcastUniqueId(at::Tensor& ncclid, int local_rank, const c10::intrusive_ptr<c10d::ProcessGroupNCCL>& pg)
{
    std::vector<at::Tensor> tensors = {ncclid};

    printf("[INFO] rank start ncclid broadcast \n");

    if (pg != nullptr) {
        auto ret = pg->broadcast(tensors, c10d::BroadcastOptions());
        ret->wait();
    }

    printf("[INFO] rank finish ncclid broadcast in func \n");
}

// ncclUniqueId is an opaque 128-byte blob:
//   #define NCCL_UNIQUE_ID_BYTES 128
//   typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
// It is packed below as 32 float32 values (32 * 4 bytes = 128 bytes) so it
// can travel through c10d tensor collectives.
at::Tensor getNCCLInitID(int64_t tensor_para_size, int64_t local_rank, const c10::intrusive_ptr<c10d::ProcessGroupNCCL>& pg)
{
    // Every rank generates a fresh id, then rank 0's copy overwrites the rest
    // via the broadcast, so all members of the tensor-parallel group agree.
    // (tensor_para_size is currently unused; group membership comes from pg.)
    ncclUniqueId tensor_para_nccl_uid;
    ncclGetUniqueId(&tensor_para_nccl_uid);
    auto tensor = torch::from_blob(tensor_para_nccl_uid.internal, {32},
                                   torch::TensorOptions(torch::kCPU).dtype(torch::kFloat32).requires_grad(false));
    // ProcessGroupNCCL collectives operate on CUDA tensors, so stage through the GPU.
    torch::Tensor gpuNCCLID = tensor.to(torch::kCUDA);
    broadcastUniqueId(gpuNCCLID, local_rank, pg);
    torch::Tensor cpuNCCLID = gpuNCCLID.to(torch::kCPU);

    return cpuNCCLID;
}


PYBIND11_MODULE(TORCH_EXTENSION_NAME, m){
m.def("init_nccl", &getNCCLInitID, "GET NCCL UNIQUE ID");
}


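A minimal Python-side sketch of how the new binding might be driven (illustrative only, not part of this commit; the subgroup construction, the tp_size value, and passing a torch.distributed group handle straight into the extension are all assumptions):

import torch.distributed as dist
import energon_nccl  # built from get_ncclid.cpp by the cuda_ext_helper('energon_nccl', ...) entry in setup.py

dist.init_process_group(backend='nccl')
rank = dist.get_rank()
tp_size = 2                      # devices per tensor-parallel group (assumed)
local_rank = rank % tp_size
first = rank - local_rank        # rank 0 of this tensor-parallel group
group = dist.new_group(ranks=list(range(first, first + tp_size)))
# init_nccl returns a CPU float32 tensor of 32 elements holding the 128-byte
# ncclUniqueId; after the internal broadcast, every member of `group` holds
# the id generated by the group's rank 0.
ncclid = energon_nccl.init_nccl(tp_size, local_rank, group)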
12 changes: 9 additions & 3 deletions setup.py
@@ -84,7 +84,7 @@ def fetch_requirements(path):
# and
# https://github.com/NVIDIA/apex/issues/456
# https://github.com/pytorch/pytorch/commit/eb7b39e02f7d75c26d8a795ea8c7fd911334da7e#diff-4632522f237f1e4e728cb824300403ac
version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5']
version_dependent_macros = ['-DVERSION_GE_1_1', '-DVERSION_GE_1_3', '-DVERSION_GE_1_5', '-DUSE_C10D_NCCL']

if "--cuda_ext" in sys.argv:
sys.argv.remove("--cuda_ext")
@@ -99,7 +99,7 @@ def cuda_ext_helper(name, sources, extra_cuda_flags):
return CUDAExtension(name=name,
sources=[os.path.join('energon/kernel/cuda_native/csrc', path) for path in sources],
include_dirs=[os.path.join(
this_dir, 'energon/kernel/cuda_native/csrc/kernels/include')],
this_dir, 'energon/kernel/cuda_native/csrc/kernels/include'), '/opt/lcsoftware/spack/opt/spack/linux-ubuntu20.04-zen2/gcc-9.3.0/nccl-2.9.6-1-ysovaavjkgjez2fwms4dkvatu5yrxbec/include'],
extra_compile_args={'cxx': ['-O3'] + version_dependent_macros,
'nvcc': append_nvcc_threads(['-O3',
'--use_fast_math'] + version_dependent_macros + extra_cuda_flags)})
@@ -143,7 +143,8 @@ def cuda_ext_helper(name, sources, extra_cuda_flags):
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__',
'-DTHRUST_IGNORE_CUB_VERSION_CHECK']
'-DTHRUST_IGNORE_CUB_VERSION_CHECK'
]

# ext_modules.append(cuda_ext_helper('colossal_multihead_attention',
# ['multihead_attention_1d.cpp',
@@ -161,6 +162,11 @@ def cuda_ext_helper(name, sources, extra_cuda_flags):
extra_cuda_flags + cc_flag))


ext_modules.append(cuda_ext_helper('energon_nccl',
['get_ncclid.cpp'],
extra_cuda_flags + cc_flag))


install_requires = fetch_requirements('requirements.txt')

setup(
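For reference, a rough sketch of the CUDAExtension that the new cuda_ext_helper('energon_nccl', ...) call expands to (illustrative; flag lists are abbreviated, and the NCCL include path below stands in for the site-specific Spack path added above):

from torch.utils.cpp_extension import CUDAExtension

ext = CUDAExtension(
    name='energon_nccl',
    sources=['energon/kernel/cuda_native/csrc/get_ncclid.cpp'],
    include_dirs=['energon/kernel/cuda_native/csrc/kernels/include',
                  '/path/to/nccl/include'],  # site-specific NCCL headers
    extra_compile_args={'cxx': ['-O3', '-DUSE_C10D_NCCL'],
                        'nvcc': ['-O3', '--use_fast_math', '-DUSE_C10D_NCCL']})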
