Remove all the submodules introduced by GPU features. (#151)
* Remove submodules for GRAPE-GPU.
Signed-off-by: septicmk <mengke.mk@alibaba-inc.com>
mengke-mk committed Jul 19, 2023
1 parent 45281c4 commit 40ecead
Showing 18 changed files with 77 additions and 156 deletions.
9 changes: 0 additions & 9 deletions .gitmodules
@@ -1,10 +1 @@

[submodule "thirdparty/cub"]
path = thirdparty/cub
url = https://github.com/NVIDIA/cub.git
[submodule "thirdparty/moderngpu"]
path = thirdparty/moderngpu
url = https://github.com/moderngpu/moderngpu.git
[submodule "thirdparty/thrust"]
path = thirdparty/thrust
url = https://github.com/NVIDIA/thrust.git
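Why the submodules can go (an assumption the diff itself does not state, but consistent with the switch to `#include <cub/cub.cuh>` later in this commit): the build relies on the CUDA Toolkit, which has bundled Thrust and CUB since CUDA 11, so the vendored copies are redundant. A minimal sketch to verify that the toolkit headers resolve without the submodules:

```cuda
// Minimal check (assumption: CUDA 11 or newer, which bundles Thrust and CUB)
// that the headers formerly vendored as submodules now come from the toolkit.
// Build with: nvcc check_toolkit.cu -o check_toolkit
#include <cub/version.cuh>   // defines CUB_VERSION
#include <thrust/version.h>  // defines THRUST_VERSION

#include <cstdio>

int main() {
  std::printf("THRUST_VERSION=%d CUB_VERSION=%d\n", THRUST_VERSION,
              CUB_VERSION);
  return 0;
}
```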
16 changes: 0 additions & 16 deletions CMakeLists.txt
@@ -105,11 +105,6 @@ else()
if (NCCL_VERSION VERSION_LESS "2.7")
message(WARNING "Disable GPU support because NCCL >= 2.7 not found")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
elseif ((NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/cub/.git") OR
(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/thrust/.git") OR
(NOT EXISTS "${CMAKE_CURRENT_LIST_DIR}/thirdparty/moderngpu/.git"))
message(WARNING "Disable GPU support because dependencies not found, please run 'git submodules update --init --recursive'")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11")
else ()
option(WITH_CUDA "Whether to enable cuda support" ON)
message(STATUS "Build with CUDA support")
@@ -252,7 +247,6 @@ else ()

if (WITH_CUDA)
cuda_add_executable(gpu_analytical_apps examples/analytical_apps/flags.cc examples/analytical_apps/run_cuda_app.cu)
target_include_directories(gpu_analytical_apps SYSTEM BEFORE PRIVATE thirdparty/cub thirdparty/thrust thirdparty/moderngpu/src)
target_include_directories(gpu_analytical_apps PRIVATE examples/analytical_apps)
set_target_properties(gpu_analytical_apps PROPERTIES OUTPUT_NAME run_cuda_app)
target_link_libraries(gpu_analytical_apps grape-lite ${GFLAGS_LIBRARIES} ${CUDA_LIBS} ${NCCL_LIBRARIES} ${CMAKE_DL_LIBS})
@@ -295,16 +289,6 @@ install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/flat_hash_map
)

if (WITH_CUDA)
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/cub/cub
${PROJECT_SOURCE_DIR}/thirdparty/thrust/thrust
${PROJECT_SOURCE_DIR}/thirdparty/moderngpu/src/moderngpu
DESTINATION include
FILES_MATCHING
PATTERN "*.h"
PATTERN "*.cuh"
PATTERN "*.hpp"
PATTERN "*.hxx"
)
install(DIRECTORY ${PROJECT_SOURCE_DIR}/thirdparty/cuda_hashmap
DESTINATION include/cuda_hashmap
FILES_MATCHING
8 changes: 2 additions & 6 deletions README.md
@@ -60,12 +60,8 @@ make gnn_sampler

### Building libgrape-lite with GPU support

libgrape-lite supports deploying graph algorithms to GPUs. To enable the support for GPUs, you first
need initialize the dependencies with the following command before building.

```bash
git submodule update --init --recursive
```
libgrape-lite supports deploying graph algorithms to GPUs.
When CUDA is detected on the machine and NCCL >= 2.7, GPU support will be enabled automatically.

## Running libgrape-lite applications

5 changes: 0 additions & 5 deletions grape/config.h
@@ -22,7 +22,6 @@ limitations under the License.

#ifdef __CUDACC__
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#endif

#include "grape/utils/default_allocator.h"
@@ -56,10 +55,6 @@ using Allocator = DefaultAllocator<T>;
#define MAX_GRID_SIZE 768
#define TID_1D (threadIdx.x + blockIdx.x * blockDim.x)
#define TOTAL_THREADS_1D (gridDim.x * blockDim.x)

template <typename T>
using pinned_vector =
thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>;
#else
#define DEV_HOST
#define DEV_HOST_INLINE inline
14 changes: 8 additions & 6 deletions grape/cuda/fragment/host_fragment.h
@@ -161,7 +161,7 @@ class HostFragment
stream.cuda_stream()));

auto prefix_sum = compute_prefix_sum(ieoffset);
ArrayView<VID_T> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
ArrayView<VID_T> d_prefix_sum(prefix_sum);

CalculateOffsetWithPrefixSum<nbr_t, vid_t>(
stream, d_prefix_sum, thrust::raw_pointer_cast(d_ie_.data()),
@@ -176,7 +176,7 @@ class HostFragment
stream.cuda_stream()));

auto prefix_sum = compute_prefix_sum(oeoffset);
ArrayView<VID_T> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
ArrayView<VID_T> d_prefix_sum(prefix_sum);

CalculateOffsetWithPrefixSum<nbr_t, vid_t>(
stream, d_prefix_sum, thrust::raw_pointer_cast(d_oe_.data()),
@@ -354,7 +354,7 @@ class HostFragment
cudaMemcpyHostToDevice, stream.cuda_stream()));

auto prefix_sum = compute_prefix_sum(ieoffset);
ArrayView<VID_T> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
ArrayView<VID_T> d_prefix_sum(prefix_sum);

CalculateOffsetWithPrefixSum<nbr_t, vid_t>(
stream, d_prefix_sum, thrust::raw_pointer_cast(d_ie_.data()),
@@ -370,7 +370,7 @@ class HostFragment
cudaMemcpyHostToDevice, stream.cuda_stream()));

auto prefix_sum = compute_prefix_sum(oeoffset);
ArrayView<VID_T> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
ArrayView<VID_T> d_prefix_sum(prefix_sum);

CalculateOffsetWithPrefixSum<nbr_t, vid_t>(
stream, d_prefix_sum, thrust::raw_pointer_cast(d_oe_.data()),
@@ -414,6 +414,7 @@ class HostFragment
[] __device__(VID_T * gids, VID_T * lids, VID_T size,
CUDASTL::HashMap<VID_T, VID_T> * ovg2l) {
auto tid = TID_1D;
gids = thrust::raw_pointer_cast(gids);
auto nthreads = TOTAL_THREADS_1D;

for (VID_T idx = 0 + tid; idx < size; idx += nthreads) {
@@ -423,7 +424,8 @@ class HostFragment
(*ovg2l)[gid] = lid;
}
},
gids.data(), lids.data(), size, d_ovg2l_.get());
thrust::raw_pointer_cast(gids.data()),
thrust::raw_pointer_cast(lids.data()), size, d_ovg2l_.get());
}

d_mirrors_of_frag_holder_.resize(fnum_);
@@ -633,7 +635,7 @@ class HostFragment
thrust::device_vector<fid_t>& d_fid_list,
thrust::device_vector<fid_t*>& d_fid_list_offset) {
pinned_vector<size_t> prefix_sum(ivnum_ + 1, 0);
ArrayView<size_t> d_prefix_sum(prefix_sum.data(), prefix_sum.size());
ArrayView<size_t> d_prefix_sum(prefix_sum);

for (VID_T i = 0; i < ivnum_; ++i) {
prefix_sum[i + 1] =
13 changes: 6 additions & 7 deletions grape/cuda/parallel/parallel_engine.h
@@ -18,13 +18,12 @@ limitations under the License.

#include <cuda_profiler_api.h>

#include <iostream>
#include <unordered_set>
#pragma push
#pragma diag_suppress = initialization_not_reachable
#include <thrust/binary_search.h>

#include <cub/cub.cuh>
#include <moderngpu/kernel_sortedsearch.hxx>
#pragma pop

#include "grape/config.h"
@@ -33,6 +32,7 @@ limitations under the License.
#include "grape/cuda/utils/launcher.h"
#include "grape/cuda/utils/shared_value.h"
#include "grape/cuda/utils/sorted_search.h"
#include "grape/cuda/utils/stream.h"
#include "grape/cuda/utils/work_source.h"

// TODO(liang): we may split this to multiple headers
@@ -930,7 +930,7 @@ DEV_INLINE void LBSTRICT(const FRAG_T& dev_frag, const ArrayView<size_t>& sidx,
while (block_output_processed < block_output_size &&
iter_input_start < block_input_end) {
size_t iter_input_size =
min((size_t)(blockDim.x - 1), block_input_end - iter_input_start);
min((size_t) (blockDim.x - 1), block_input_end - iter_input_start);
size_t iter_input_end = iter_input_start + iter_input_size;
size_t iter_output_end =
iter_input_end < size ? row_offset[iter_input_end] : total_edges;
@@ -1362,10 +1362,9 @@ class ParallelEngine {
},
ArrayView<size_t>(seid_per_block));

sorted_search<mgpu::bounds_lower>(
stream, thrust::raw_pointer_cast(seid_per_block.data()), block_num,
thrust::raw_pointer_cast(prefix_sum_.data()), size,
thrust::raw_pointer_cast(sidx.data()), mgpu::less_t<size_t>());
sorted_search(stream, thrust::raw_pointer_cast(seid_per_block.data()),
block_num, thrust::raw_pointer_cast(prefix_sum_.data()), size,
thrust::raw_pointer_cast(sidx.data()));

KernelWrapper<<<block_num, block_size, calc_shmem_size(block_size),
stream.cuda_stream()>>>(
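For context: the removed `mgpu::sorted_search<mgpu::bounds_lower>` call computes, for each element of the sorted `seid_per_block` array, its lower-bound insertion index into the sorted `prefix_sum_` array; the in-tree replacement from `grape/cuda/utils/sorted_search.h` is assumed to keep those semantics. A minimal sketch of the same operation using Thrust's vectorized `lower_bound` (the header this hunk adds):

```cuda
// For each sorted "needle", find its lower-bound index in a sorted
// "haystack": the operation the replaced mgpu::sorted_search performed.
#include <thrust/binary_search.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>

#include <cstdio>
#include <vector>

int main() {
  std::vector<size_t> h_hay{0, 4, 9, 15};   // stand-in for prefix_sum_
  std::vector<size_t> h_needle{3, 4, 10};   // stand-in for seid_per_block
  thrust::device_vector<size_t> haystack(h_hay.begin(), h_hay.end());
  thrust::device_vector<size_t> needles(h_needle.begin(), h_needle.end());
  thrust::device_vector<size_t> indices(needles.size());

  // Vectorized binary search: one lower-bound query per needle.
  thrust::lower_bound(thrust::device, haystack.begin(), haystack.end(),
                      needles.begin(), needles.end(), indices.begin());

  for (size_t i = 0; i < indices.size(); ++i)
    std::printf("%zu ", (size_t) indices[i]);  // prints: 1 1 3
  std::printf("\n");
  return 0;
}
```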
10 changes: 3 additions & 7 deletions grape/cuda/utils/array_view.h
@@ -15,12 +15,8 @@ limitations under the License.

#ifndef GRAPE_CUDA_UTILS_ARRAY_VIEW_H_
#define GRAPE_CUDA_UTILS_ARRAY_VIEW_H_
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/swap.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

#include "grape/config.h"
#include "grape/cuda/utils/cuda_utils.h"

namespace grape {
namespace cuda {
Expand All @@ -33,8 +29,7 @@ class ArrayView {
: data_(const_cast<T*>(thrust::raw_pointer_cast(vec.data()))),
size_(vec.size()) {}

explicit ArrayView(const thrust::host_vector<
T, thrust::cuda::experimental::pinned_allocator<T>>& vec)
explicit ArrayView(const pinned_vector<T>& vec)
: data_(const_cast<T*>(thrust::raw_pointer_cast(vec.data()))),
size_(vec.size()) {}

@@ -69,6 +64,7 @@ class ArrayView {
T* data_{};
size_t size_{};
};

} // namespace cuda
} // namespace grape
#endif // GRAPE_CUDA_UTILS_ARRAY_VIEW_H_
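The change above lets `ArrayView` accept the `pinned_vector` alias directly instead of spelling out the deprecated allocator type. For context, `ArrayView` is a non-owning `(pointer, size)` view that is trivially copyable into kernels; a self-contained sketch of that pattern (the `View` type below is an illustrative stand-in, not the repo's class):

```cuda
// Stand-in for grape::cuda::ArrayView: a trivially copyable (pointer, size)
// pair that borrows storage from an owning container, so the view itself can
// be passed by value to a kernel while the container keeps ownership.
#include <thrust/device_vector.h>

#include <cstdio>

template <typename T>
struct View {
  T* data{};
  size_t size{};
  explicit View(thrust::device_vector<T>& v)
      : data(thrust::raw_pointer_cast(v.data())), size(v.size()) {}
  __device__ T& operator[](size_t i) const { return data[i]; }
};

__global__ void Scale(View<int> v, int k) {
  size_t i = threadIdx.x + blockIdx.x * blockDim.x;
  if (i < v.size) v[i] *= k;
}

int main() {
  thrust::device_vector<int> d(8, 2);
  View<int> view(d);          // borrows, no copy of the elements
  Scale<<<1, 32>>>(view, 3);  // only the small view struct is copied
  cudaDeviceSynchronize();
  std::printf("%d\n", (int) d[0]);  // 6
  return 0;
}
```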
27 changes: 24 additions & 3 deletions grape/cuda/utils/cuda_utils.h
@@ -19,11 +19,18 @@ limitations under the License.
#include <nccl.h>
#include <sys/resource.h>
#include <sys/time.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/swap.h>
#include <thrust/transform_reduce.h>
#include <cub/cub.cuh>

#include "cub/cub.cuh"
#include "grape/config.h"
#if THRUST_VERSION > 101700
#include <thrust/system/cuda/memory_resource.h>
#else
#include <thrust/system/cuda/experimental/pinned_allocator.h>
#endif

#if defined(__unix__) || defined(__unix) || defined(unix) || \
(defined(__APPLE__) && defined(__MACH__))
@@ -48,6 +55,8 @@ limitations under the License.
#include <sys/stat.h>
#include <sys/types.h>

#include "grape/config.h"

#define CHECK_CUDA(err) \
do { \
cudaError_t errr = (err); \
@@ -161,7 +170,7 @@ size_t get_rss(bool include_shared_memory) {
if (include_shared_memory) {
return (size_t) rss * (size_t) sysconf(_SC_PAGESIZE);
} else {
return (size_t)(rss - shared_rss) * (size_t) sysconf(_SC_PAGESIZE);
return (size_t) (rss - shared_rss) * (size_t) sysconf(_SC_PAGESIZE);
}
#else
/* Unknown OS ----------------------------------------------- */
@@ -210,6 +219,18 @@ static cudaError_t SortKeys64(void* d_temp_storage, size_t& temp_storage_bytes,
#endif
}

#if THRUST_VERSION > 101700
using memory_resource =
thrust::system::cuda::universal_host_pinned_memory_resource;
template <typename T>
using pinned_vector = thrust::host_vector<
T, thrust::mr::stateless_resource_allocator<T, memory_resource>>;
#else
template <typename T>
using pinned_vector =
thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>;
#endif

template <typename InputIteratorT, typename OutputIteratorT>
static cudaError_t PrefixSumKernel64(void* d_temp_storage,
size_t& temp_storage_bytes,
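The version guard above tracks a real API change: per the `THRUST_VERSION > 101700` check, newer Thrust releases drop the experimental `pinned_allocator` in favor of memory resources, but both branches yield a host vector in pinned (page-locked) memory. Pinning is what makes `cudaMemcpyAsync` genuinely asynchronous. A minimal usage sketch mirroring the diff's guard (not repo code):

```cuda
// Portable pinned_vector alias plus an async host-to-device copy; the alias
// definition mirrors the guard added in this commit.
#include <cuda_runtime.h>
#include <thrust/host_vector.h>
#include <thrust/memory.h>  // thrust::raw_pointer_cast
#include <thrust/version.h>

#if THRUST_VERSION > 101700
#include <thrust/mr/allocator.h>
#include <thrust/system/cuda/memory_resource.h>
template <typename T>
using pinned_vector = thrust::host_vector<
    T, thrust::mr::stateless_resource_allocator<
           T, thrust::system::cuda::universal_host_pinned_memory_resource>>;
#else
#include <thrust/system/cuda/experimental/pinned_allocator.h>
template <typename T>
using pinned_vector =
    thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>;
#endif

int main() {
  pinned_vector<float> h(1 << 20, 1.0f);
  float* d = nullptr;
  cudaMalloc(&d, h.size() * sizeof(float));
  cudaStream_t s;
  cudaStreamCreate(&s);
  // Truly asynchronous only because the host buffer is pinned. The newer
  // allocator returns a tagged pointer from data(), hence raw_pointer_cast,
  // the same fix applied throughout this commit.
  cudaMemcpyAsync(d, thrust::raw_pointer_cast(h.data()),
                  h.size() * sizeof(float), cudaMemcpyHostToDevice, s);
  cudaStreamSynchronize(s);
  cudaStreamDestroy(s);
  cudaFree(d);
  return 0;
}
```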
6 changes: 1 addition & 5 deletions grape/cuda/utils/shared_array.h
@@ -15,9 +15,6 @@ limitations under the License.

#ifndef GRAPE_CUDA_UTILS_SHARED_ARRAY_H_
#define GRAPE_CUDA_UTILS_SHARED_ARRAY_H_
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

#include "grape/cuda/utils/cuda_utils.h"
#include "grape/cuda/utils/stream.h"
@@ -30,8 +27,7 @@ class SharedArray {

public:
using device_t = thrust::device_vector<T>;
using host_t =
thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>;
using host_t = pinned_vector<T>;

SharedArray() = default;

7 changes: 1 addition & 6 deletions grape/cuda/utils/shared_value.h
@@ -16,10 +16,6 @@ limitations under the License.
#ifndef GRAPE_CUDA_UTILS_SHARED_VALUE_H_
#define GRAPE_CUDA_UTILS_SHARED_VALUE_H_

#include <thrust/device_vector.h>
#include <thrust/host_vector.h>
#include <thrust/system/cuda/experimental/pinned_allocator.h>

#include "grape/cuda/utils/cuda_utils.h"
#include "grape/cuda/utils/stream.h"

@@ -81,8 +77,7 @@ class SharedValue {

private:
thrust::device_vector<T> d_buffer_;
thrust::host_vector<T, thrust::cuda::experimental::pinned_allocator<T>>
h_buffer_;
pinned_vector<T> h_buffer_;
};
} // namespace cuda
} // namespace grape
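Given the `d_buffer_`/`h_buffer_` pair visible above, `SharedValue` evidently keeps a single value in device memory with a host-side mirror. A self-contained sketch of that pattern (class and method names are illustrative assumptions; plain `host_vector` stands in for `pinned_vector` so the sketch compiles on its own, at the cost of the async copies degrading to synchronous ones):

```cuda
// Sketch of a device value with a host mirror, moved with small stream
// copies; the real SharedValue uses pinned_vector for the host side.
#include <thrust/device_vector.h>
#include <thrust/host_vector.h>

#include <cstdio>

template <typename T>
class MiniSharedValue {
 public:
  MiniSharedValue() : d_buffer_(1), h_buffer_(1) {}

  // Stage the value on the host, then push it to the device on a stream.
  void SetValue(const T& v, cudaStream_t s) {
    h_buffer_[0] = v;
    cudaMemcpyAsync(thrust::raw_pointer_cast(d_buffer_.data()),
                    thrust::raw_pointer_cast(h_buffer_.data()), sizeof(T),
                    cudaMemcpyHostToDevice, s);
  }

  // Pull the value back; it is valid on the host only after the sync.
  T GetValue(cudaStream_t s) {
    cudaMemcpyAsync(thrust::raw_pointer_cast(h_buffer_.data()),
                    thrust::raw_pointer_cast(d_buffer_.data()), sizeof(T),
                    cudaMemcpyDeviceToHost, s);
    cudaStreamSynchronize(s);
    return h_buffer_[0];
  }

  T* data() { return thrust::raw_pointer_cast(d_buffer_.data()); }

 private:
  thrust::device_vector<T> d_buffer_;
  thrust::host_vector<T> h_buffer_;
};

int main() {
  cudaStream_t s;
  cudaStreamCreate(&s);
  MiniSharedValue<int> v;
  v.SetValue(42, s);
  std::printf("%d\n", v.GetValue(s));  // 42
  cudaStreamDestroy(s);
  return 0;
}
```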