From c20df198ba7b25758b7f7ce6d4af3318c9c94c48 Mon Sep 17 00:00:00 2001 From: luoxiaojian Date: Fri, 23 Aug 2024 12:41:22 +0800 Subject: [PATCH] Refactor vertex map, imported pthash. (#168) --- CMakeLists.txt | 4 +- .../analytical_apps/cuda/pagerank/pagerank.h | 3 + examples/analytical_apps/flags.cc | 8 +- examples/analytical_apps/flags.h | 3 +- examples/analytical_apps/lcc/lcc_opt.h | 20 +- examples/analytical_apps/run_app.h | 83 +- examples/analytical_apps/run_app_opt.h | 270 ++++--- examples/analytical_apps/run_cuda_app.h | 126 +-- .../append_only_edgecut_fragment.h | 85 +- examples/gnn_sampler/run_sampler.cc | 1 - grape/app/mutation_context.h | 77 +- grape/communication/sync_comm.h | 64 +- grape/cuda/fragment/device_fragment.h | 5 +- grape/cuda/fragment/host_fragment.h | 62 +- grape/cuda/vertex_map/device_vertex_map.h | 32 +- grape/fragment/basic_efile_fragment_loader.h | 185 +++++ grape/fragment/basic_fragment_loader.h | 411 +++------- grape/fragment/basic_fragment_loader_base.h | 440 ++++++++++ grape/fragment/basic_fragment_mutator.h | 100 +-- grape/fragment/basic_local_fragment_loader.h | 250 ++++++ grape/fragment/basic_rb_fragment_loader.h | 228 ++++++ grape/fragment/csr_edgecut_fragment_base.h | 1 - grape/fragment/edgecut_fragment_base.h | 4 +- grape/fragment/ev_fragment_loader.h | 67 +- grape/fragment/ev_fragment_mutator.h | 1 - grape/fragment/ev_fragment_rebalance_loader.h | 432 ---------- grape/fragment/fragment_base.h | 24 +- grape/fragment/immutable_edgecut_fragment.h | 86 +- grape/fragment/loader.h | 18 +- grape/fragment/mutable_edgecut_fragment.h | 88 +- grape/fragment/rebalancer.h | 170 ++++ grape/graph/id_indexer.h | 437 +++++++--- grape/graph/immutable_csr.h | 7 + grape/graph/mutable_csr.h | 53 +- grape/types.h | 27 + grape/util.h | 34 +- grape/utils/concurrent_queue.h | 14 +- grape/utils/pthash_utils/ef_sequence_view.h | 149 ++++ grape/utils/pthash_utils/encoders_view.h | 62 ++ grape/utils/pthash_utils/ph_indexer_view.h | 81 ++ 
grape/utils/pthash_utils/single_phf_view.h | 218 +++++ grape/utils/ref_vector.h | 85 ++ grape/utils/string_view_vector.h | 127 +++ grape/vertex_map/global_vertex_map.h | 318 -------- grape/vertex_map/idxers/hashmap_idxer.h | 130 +++ grape/vertex_map/idxers/hashmap_idxer_view.h | 157 ++++ grape/vertex_map/idxers/idxer_base.h | 105 +++ grape/vertex_map/idxers/idxers.h | 114 +++ grape/vertex_map/idxers/local_idxer.h | 121 +++ grape/vertex_map/idxers/pthash_idxer.h | 186 +++++ grape/vertex_map/idxers/sorted_array_idxer.h | 198 +++++ grape/vertex_map/local_vertex_map.h | 280 ------- grape/vertex_map/partitioner.h | 292 +++++++ grape/vertex_map/vertex_map.h | 525 ++++++++++++ grape/vertex_map/vertex_map_base.h | 147 ---- misc/app_tests.sh | 49 +- misc/cuda_app_tests.sh | 10 +- misc/load_tests.cc | 178 +++++ misc/mutable_fragment_tests.cc | 271 +++++++ misc/vertex_map_tests.cc | 330 ++++++++ tests/load_tests.cc | 178 +++++ tests/mutable_fragment_tests.cc | 2 +- tests/vertex_map_tests.cc | 401 ++++++---- thirdparty/flat_hash_map/flat_hash_map.hpp | 5 + .../external_memory_builder_single_phf.hpp | 753 ++++++++++++++++++ .../internal_memory_builder_single_phf.hpp | 365 +++++++++ thirdparty/pthash/builders/search.hpp | 358 +++++++++ thirdparty/pthash/builders/util.hpp | 301 +++++++ thirdparty/pthash/encoders/bit_vector.hpp | 347 ++++++++ thirdparty/pthash/encoders/compact_vector.hpp | 306 +++++++ thirdparty/pthash/encoders/darray.hpp | 185 +++++ thirdparty/pthash/encoders/ef_sequence.hpp | 145 ++++ thirdparty/pthash/encoders/encoders.hpp | 161 ++++ thirdparty/pthash/encoders/util.hpp | 114 +++ thirdparty/pthash/essentials/essentials.hpp | 644 +++++++++++++++ thirdparty/pthash/fastmod/fastmod.h | 209 +++++ thirdparty/pthash/mm_file/mm_file.hpp | 176 ++++ thirdparty/pthash/pthash.hpp | 25 + thirdparty/pthash/single_phf.hpp | 159 ++++ thirdparty/pthash/utils/bucketers.hpp | 92 +++ thirdparty/pthash/utils/hasher.hpp | 188 +++++ thirdparty/pthash/utils/logger.hpp | 87 ++ 
thirdparty/pthash/utils/util.hpp | 57 ++ 83 files changed, 10968 insertions(+), 2343 deletions(-) create mode 100644 grape/fragment/basic_efile_fragment_loader.h create mode 100644 grape/fragment/basic_fragment_loader_base.h create mode 100644 grape/fragment/basic_local_fragment_loader.h create mode 100644 grape/fragment/basic_rb_fragment_loader.h delete mode 100644 grape/fragment/ev_fragment_rebalance_loader.h create mode 100644 grape/fragment/rebalancer.h create mode 100644 grape/utils/pthash_utils/ef_sequence_view.h create mode 100644 grape/utils/pthash_utils/encoders_view.h create mode 100644 grape/utils/pthash_utils/ph_indexer_view.h create mode 100644 grape/utils/pthash_utils/single_phf_view.h create mode 100644 grape/utils/ref_vector.h delete mode 100644 grape/vertex_map/global_vertex_map.h create mode 100644 grape/vertex_map/idxers/hashmap_idxer.h create mode 100644 grape/vertex_map/idxers/hashmap_idxer_view.h create mode 100644 grape/vertex_map/idxers/idxer_base.h create mode 100644 grape/vertex_map/idxers/idxers.h create mode 100644 grape/vertex_map/idxers/local_idxer.h create mode 100644 grape/vertex_map/idxers/pthash_idxer.h create mode 100644 grape/vertex_map/idxers/sorted_array_idxer.h delete mode 100644 grape/vertex_map/local_vertex_map.h create mode 100644 grape/vertex_map/partitioner.h create mode 100644 grape/vertex_map/vertex_map.h delete mode 100644 grape/vertex_map/vertex_map_base.h create mode 100644 misc/load_tests.cc create mode 100644 misc/mutable_fragment_tests.cc create mode 100644 misc/vertex_map_tests.cc create mode 100644 tests/load_tests.cc create mode 100644 thirdparty/pthash/builders/external_memory_builder_single_phf.hpp create mode 100644 thirdparty/pthash/builders/internal_memory_builder_single_phf.hpp create mode 100644 thirdparty/pthash/builders/search.hpp create mode 100644 thirdparty/pthash/builders/util.hpp create mode 100644 thirdparty/pthash/encoders/bit_vector.hpp create mode 100644 
thirdparty/pthash/encoders/compact_vector.hpp create mode 100644 thirdparty/pthash/encoders/darray.hpp create mode 100644 thirdparty/pthash/encoders/ef_sequence.hpp create mode 100644 thirdparty/pthash/encoders/encoders.hpp create mode 100644 thirdparty/pthash/encoders/util.hpp create mode 100644 thirdparty/pthash/essentials/essentials.hpp create mode 100644 thirdparty/pthash/fastmod/fastmod.h create mode 100644 thirdparty/pthash/mm_file/mm_file.hpp create mode 100644 thirdparty/pthash/pthash.hpp create mode 100644 thirdparty/pthash/single_phf.hpp create mode 100644 thirdparty/pthash/utils/bucketers.hpp create mode 100644 thirdparty/pthash/utils/hasher.hpp create mode 100644 thirdparty/pthash/utils/logger.hpp create mode 100644 thirdparty/pthash/utils/util.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index e2f58893..c39a06a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -79,10 +79,10 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wall") if (APPLE) set(CMAKE_MACOSX_RPATH ON) else () - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -Werror -Wl,-rpath,$ORIGIN") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -Werror -Wl,-rpath,$ORIGIN -march=native") endif () if (USE_SIMD) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2 -march=native") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx2") endif () set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -O0 -g -fprofile-arcs -ftest-coverage") set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3 -g") diff --git a/examples/analytical_apps/cuda/pagerank/pagerank.h b/examples/analytical_apps/cuda/pagerank/pagerank.h index 86a623f4..a1e7ffa2 100644 --- a/examples/analytical_apps/cuda/pagerank/pagerank.h +++ b/examples/analytical_apps/cuda/pagerank/pagerank.h @@ -18,6 +18,9 @@ limitations under the License. 
#ifdef __CUDACC__ #include "cuda/app_config.h" #include "grape/grape.h" +#include +#include +#include namespace grape { namespace cuda { diff --git a/examples/analytical_apps/flags.cc b/examples/analytical_apps/flags.cc index 17e24c48..08acc594 100644 --- a/examples/analytical_apps/flags.cc +++ b/examples/analytical_apps/flags.cc @@ -43,9 +43,13 @@ DEFINE_int32(kcore_k, 0, "k value of kcore."); DEFINE_int32(kclique_k, 0, "k value of kclique."); DEFINE_bool(opt, false, "whether to use optimization."); +DEFINE_string(partitioner_type, "map", + "partitioner type, these options can be used: " + "hash, map, segment"); +DEFINE_string(idxer_type, "hashmap", + "idxer type, these options can be used: " + "sorted_array, hashmap, pthash, local"); -DEFINE_bool(segmented_partition, true, - "whether to use segmented partitioning."); DEFINE_bool(rebalance, false, "whether to rebalance graph after loading."); DEFINE_int32(rebalance_vertex_factor, 0, "vertex factor of rebalancing."); diff --git a/examples/analytical_apps/flags.h b/examples/analytical_apps/flags.h index 95b0f3e2..03c51dd7 100644 --- a/examples/analytical_apps/flags.h +++ b/examples/analytical_apps/flags.h @@ -40,8 +40,9 @@ DECLARE_int32(kclique_k); DECLARE_int32(degree_threshold); DECLARE_bool(opt); +DECLARE_string(partitioner_type); +DECLARE_string(idxer_type); -DECLARE_bool(segmented_partition); DECLARE_bool(rebalance); DECLARE_int32(rebalance_vertex_factor); diff --git a/examples/analytical_apps/lcc/lcc_opt.h b/examples/analytical_apps/lcc/lcc_opt.h index 20c94d57..ba815a49 100644 --- a/examples/analytical_apps/lcc/lcc_opt.h +++ b/examples/analytical_apps/lcc/lcc_opt.h @@ -351,14 +351,15 @@ class LCCOpt(frag, v, - ctx.global_degree[v], tid); + channels[tid].SendMsgThroughOEdges(frag, v, + ctx.global_degree[v]); }); // Just in case we are running on single process and no messages will @@ -504,6 +505,7 @@ class LCCOpt::hash(frag.GetInnerVertexGid(v)); auto& pool = ctx.memory_pools[tid]; auto& nbr_vec = 
ctx.complete_neighbor[v]; @@ -543,8 +545,8 @@ class LCCOpt(frag, v, msg_vec, - tid); + channels[tid].SendMsgThroughOEdges(frag, v, + msg_vec); }); messages.ForceContinue(); } else if (ctx.stage == 1) { @@ -586,10 +588,10 @@ class LCCOpt( - frag, v, ctx.tricnt[v], tid); + channels[tid].SyncStateOnOuterVertex( + frag, v, ctx.tricnt[v]); } }); messages.ForceContinue(); diff --git a/examples/analytical_apps/run_app.h b/examples/analytical_apps/run_app.h index f396331a..aa586138 100644 --- a/examples/analytical_apps/run_app.h +++ b/examples/analytical_apps/run_app.h @@ -33,7 +33,6 @@ limitations under the License. #include #include #include -#include #ifdef GRANULA #include "thirdparty/atlarge-research-granula/granula.hpp" @@ -73,10 +72,7 @@ void Init() { if (FLAGS_deserialize && FLAGS_serialization_prefix.empty()) { LOG(FATAL) << "Please assign a serialization prefix."; } else if (FLAGS_efile.empty()) { - LOG(FATAL) << "Please assign input edge files."; - } else if (FLAGS_vfile.empty() && FLAGS_segmented_partition) { - LOG(FATAL) << "EFragmentLoader dosen't support Segmented Partitioner. 
" - "Please assign vertex files or use Hash Partitioner"; + LOG(FATAL) << "Please assign input edge file."; } if (!FLAGS_out_prefix.empty() && access(FLAGS_out_prefix.c_str(), 0) != 0) { @@ -173,28 +169,19 @@ void CreateAndQuery(const CommSpec& comm_spec, const std::string& out_prefix, } else if (FLAGS_serialize) { graph_spec.set_serialize(true, FLAGS_serialization_prefix); } - if (FLAGS_segmented_partition) { - using VertexMapType = - GlobalVertexMap>; - using FRAG_T = ImmutableEdgecutFragment; - std::shared_ptr fragment = - LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - using AppType = APP_T; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, args...); - } else { - graph_spec.set_rebalance(false, 0); - using FRAG_T = - ImmutableEdgecutFragment; - std::shared_ptr fragment = - LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - using AppType = APP_T; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, args...); - } + + graph_spec.partitioner_type = + grape::parse_partitioner_type_name(FLAGS_partitioner_type); + graph_spec.idxer_type = grape::parse_idxer_type_name(FLAGS_idxer_type); + + using FRAG_T = + ImmutableEdgecutFragment; + std::shared_ptr fragment = + LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); + using AppType = APP_T; + auto app = std::make_shared(); + DoQuery(fragment, app, comm_spec, spec, out_prefix, + args...); } template >; - using FRAG_T = ImmutableEdgecutFragment; - std::shared_ptr fragment = - LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - using App1Type = APP1_T; - auto app1 = std::make_shared(); - using App2Type = APP2_T; - auto app2 = std::make_shared(); - DoDualQuery( - fragment, app1, app2, comm_spec, spec, out_prefix, args...); - } else { - graph_spec.set_rebalance(false, 0); - using FRAG_T = - ImmutableEdgecutFragment; - std::shared_ptr fragment = - LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - 
using App1Type = APP1_T; - auto app1 = std::make_shared(); - using App2Type = APP2_T; - auto app2 = std::make_shared(); - DoDualQuery( - fragment, app1, app2, comm_spec, spec, out_prefix, args...); - } + + graph_spec.partitioner_type = + grape::parse_partitioner_type_name(FLAGS_partitioner_type); + graph_spec.idxer_type = grape::parse_idxer_type_name(FLAGS_idxer_type); + + using FRAG_T = + ImmutableEdgecutFragment; + std::shared_ptr fragment = + LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); + + using App1Type = APP1_T; + auto app1 = std::make_shared(); + using App2Type = APP2_T; + auto app2 = std::make_shared(); + DoDualQuery( + fragment, app1, app2, comm_spec, spec, out_prefix, args...); } template diff --git a/examples/analytical_apps/run_app_opt.h b/examples/analytical_apps/run_app_opt.h index e5c93c34..24ce04cd 100644 --- a/examples/analytical_apps/run_app_opt.h +++ b/examples/analytical_apps/run_app_opt.h @@ -66,63 +66,20 @@ void RunUndirectedPageRankOpt(const CommSpec& comm_spec, graph_spec.set_rebalance(FLAGS_rebalance, FLAGS_rebalance_vertex_factor); if (FLAGS_deserialize) { graph_spec.set_deserialize(true, FLAGS_serialization_prefix); - } else if (FLAGS_serialize) { + } + if (FLAGS_serialize) { graph_spec.set_serialize(true, FLAGS_serialization_prefix); } - if (FLAGS_segmented_partition) { - using VertexMapType = - GlobalVertexMap>; - using FRAG_T = - ImmutableEdgecutFragment; - std::shared_ptr fragment = - LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - bool push; - if (fragment->fnum() >= 8) { - uint64_t local_ivnum = fragment->GetInnerVerticesNum(); - uint64_t local_ovnum = fragment->GetOuterVerticesNum(); - uint64_t total_ivnum, total_ovnum; - MPI_Allreduce(&local_ivnum, &total_ivnum, 1, MPI_UINT64_T, MPI_SUM, - comm_spec.comm()); - MPI_Allreduce(&local_ovnum, &total_ovnum, 1, MPI_UINT64_T, MPI_SUM, - comm_spec.comm()); - - double avg_degree = static_cast(FLAGS_edge_num) / - static_cast(FLAGS_vertex_num); - double rate = - 
static_cast(total_ovnum) / static_cast(total_ivnum); - - if (rate < 0.5) { - // not to many outer vertices - push = true; - } else if (avg_degree > 60) { - // dense - push = true; - } else { - push = false; - } - } else { - push = true; - } - - if (!push) { - using AppType = PageRankOpt; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, delta, mr); - } else { - using AppType = PageRankPushOpt; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, delta, mr); - } - } else { - graph_spec.set_rebalance(false, 0); - using FRAG_T = ImmutableEdgecutFragment; - std::shared_ptr fragment = - LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); + graph_spec.partitioner_type = + parse_partitioner_type_name(FLAGS_partitioner_type); + graph_spec.idxer_type = parse_idxer_type_name(FLAGS_idxer_type); + using FRAG_T = ImmutableEdgecutFragment; + std::shared_ptr fragment = + LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); + bool push; + if (fragment->fnum() >= 8) { uint64_t local_ivnum = fragment->GetInnerVerticesNum(); uint64_t local_ovnum = fragment->GetOuterVerticesNum(); uint64_t total_ivnum, total_ovnum; @@ -131,18 +88,34 @@ void RunUndirectedPageRankOpt(const CommSpec& comm_spec, MPI_Allreduce(&local_ovnum, &total_ovnum, 1, MPI_UINT64_T, MPI_SUM, comm_spec.comm()); - if (static_cast(total_ovnum) > - static_cast(total_ivnum) * 3.2) { - using AppType = PageRank; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, delta, mr); + double avg_degree = static_cast(FLAGS_edge_num) / + static_cast(FLAGS_vertex_num); + double rate = + static_cast(total_ovnum) / static_cast(total_ivnum); + + if (rate < 0.5) { + // not to many outer vertices + push = true; + } else if (avg_degree > 60) { + // dense + push = true; } else { - using AppType = PageRankPushOpt; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, delta, mr); + 
push = false; } + } else { + push = true; + } + + if (!push) { + using AppType = PageRankOpt; + auto app = std::make_shared(); + DoQuery(fragment, app, comm_spec, spec, + out_prefix, delta, mr); + } else { + using AppType = PageRankPushOpt; + auto app = std::make_shared(); + DoQuery(fragment, app, comm_spec, spec, + out_prefix, delta, mr); } } @@ -200,9 +173,13 @@ void RunDirectedCDLP(const CommSpec& comm_spec, const std::string& out_prefix, graph_spec.set_rebalance(FLAGS_rebalance, FLAGS_rebalance_vertex_factor); if (FLAGS_deserialize) { graph_spec.set_deserialize(true, FLAGS_serialization_prefix); - } else if (FLAGS_serialize) { + } + if (FLAGS_serialize) { graph_spec.set_serialize(true, FLAGS_serialization_prefix); } + graph_spec.partitioner_type = + parse_partitioner_type_name(FLAGS_partitioner_type); + graph_spec.idxer_type = parse_idxer_type_name(FLAGS_idxer_type); using FRAG_T = ImmutableEdgecutFragment; @@ -211,7 +188,7 @@ void RunDirectedCDLP(const CommSpec& comm_spec, const std::string& out_prefix, LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); std::pair min_max_id = - get_min_max_id(*fragment->GetVertexMap()); + get_min_max_id(fragment->GetVertexMap()); if (is_int32(min_max_id.first) && is_int32(min_max_id.second)) { using AppType = CDLPOpt; auto app = std::make_shared(); @@ -233,15 +210,16 @@ void RunUndirectedCDLP(const CommSpec& comm_spec, const std::string& out_prefix, graph_spec.set_rebalance(FLAGS_rebalance, FLAGS_rebalance_vertex_factor); if (FLAGS_deserialize) { graph_spec.set_deserialize(true, FLAGS_serialization_prefix); - } else if (FLAGS_serialize) { + } + if (FLAGS_serialize) { graph_spec.set_serialize(true, FLAGS_serialization_prefix); } + graph_spec.partitioner_type = + parse_partitioner_type_name(FLAGS_partitioner_type); + graph_spec.idxer_type = parse_idxer_type_name(FLAGS_idxer_type); - using VertexMapType = - GlobalVertexMap>; - using FRAG_T = - ImmutableEdgecutFragment; + using FRAG_T = ImmutableEdgecutFragment; 
std::shared_ptr fragment = LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); @@ -249,7 +227,7 @@ void RunUndirectedCDLP(const CommSpec& comm_spec, const std::string& out_prefix, double avg_degree = static_cast(FLAGS_edge_num) / static_cast(FLAGS_vertex_num); std::pair min_max_id = - get_min_max_id(*fragment->GetVertexMap()); + get_min_max_id(fragment->GetVertexMap()); if (is_int32(min_max_id.first) && is_int32(min_max_id.second)) { if (avg_degree > 256) { using AppType = CDLPOptUDDense; @@ -287,32 +265,22 @@ void CreateAndQueryOpt(const CommSpec& comm_spec, const std::string& out_prefix, graph_spec.set_rebalance(FLAGS_rebalance, FLAGS_rebalance_vertex_factor); if (FLAGS_deserialize) { graph_spec.set_deserialize(true, FLAGS_serialization_prefix); - } else if (FLAGS_serialize) { - graph_spec.set_serialize(true, FLAGS_serialization_prefix); } - if (FLAGS_segmented_partition) { - using VertexMapType = - GlobalVertexMap>; - using FRAG_T = - ImmutableEdgecutFragment; - std::shared_ptr fragment = - LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - using AppType = APP_T; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, args...); - } else { - graph_spec.set_rebalance(false, 0); - using FRAG_T = ImmutableEdgecutFragment; - std::shared_ptr fragment = - LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - using AppType = APP_T; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, args...); + if (FLAGS_serialize) { + graph_spec.set_serialize(true, FLAGS_serialization_prefix); } + graph_spec.partitioner_type = + parse_partitioner_type_name(FLAGS_partitioner_type); + graph_spec.idxer_type = parse_idxer_type_name(FLAGS_idxer_type); + + using FRAG_T = ImmutableEdgecutFragment; + std::shared_ptr fragment = + LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); + using AppType = APP_T; + auto app = std::make_shared(); + DoQuery(fragment, app, comm_spec, spec, out_prefix, 
+ args...); } template >; - using FRAG_T = - ImmutableEdgecutFragment; - std::shared_ptr fragment = - LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - using App1Type = APP1_T; - auto app1 = std::make_shared(); - using App2Type = APP2_T; - auto app2 = std::make_shared(); - DoDualQuery( - fragment, app1, app2, comm_spec, spec, out_prefix, args...); - } else { - graph_spec.set_rebalance(false, 0); - using FRAG_T = ImmutableEdgecutFragment; - std::shared_ptr fragment = - LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - using App1Type = APP1_T; - auto app1 = std::make_shared(); - using App2Type = APP2_T; - auto app2 = std::make_shared(); - DoDualQuery( - fragment, app1, app2, comm_spec, spec, out_prefix, args...); - } + graph_spec.partitioner_type = + parse_partitioner_type_name(FLAGS_partitioner_type); + graph_spec.idxer_type = parse_idxer_type_name(FLAGS_idxer_type); + + using FRAG_T = ImmutableEdgecutFragment; + std::shared_ptr fragment = + LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); + using App1Type = APP1_T; + auto app1 = std::make_shared(); + using App2Type = APP2_T; + auto app2 = std::make_shared(); + DoDualQuery( + fragment, app1, app2, comm_spec, spec, out_prefix, args...); } void RunOpt() { @@ -384,55 +339,93 @@ void RunOpt() { } std::string name = FLAGS_application; if (name == "sssp") { - FLAGS_segmented_partition = true; FLAGS_rebalance = false; + if (FLAGS_partitioner_type == "default") { + FLAGS_partitioner_type = "segment"; + } + if (FLAGS_idxer_type == "default") { + FLAGS_idxer_type = "sorted_array"; + } CreateAndQueryOpt( comm_spec, out_prefix, spec, FLAGS_sssp_source); } else if (name == "bfs") { + FLAGS_rebalance = false; + if (FLAGS_partitioner_type == "default") { + FLAGS_partitioner_type = "segment"; + } + if (FLAGS_idxer_type == "default") { + FLAGS_idxer_type = "sorted_array"; + } if (FLAGS_directed) { - FLAGS_segmented_partition = true; - FLAGS_rebalance = false; CreateAndQueryOpt( comm_spec, 
out_prefix, spec, FLAGS_bfs_source); } else { - FLAGS_segmented_partition = true; - FLAGS_rebalance = false; CreateAndQueryOpt( comm_spec, out_prefix, spec, FLAGS_bfs_source); } } else if (name == "pagerank") { if (FLAGS_directed) { - FLAGS_segmented_partition = false; + if (FLAGS_partitioner_type == "default") { + FLAGS_partitioner_type = "hash"; + } + if (FLAGS_idxer_type == "default") { + FLAGS_idxer_type = "pthash"; + } CreateAndQueryOpt(comm_spec, out_prefix, spec, FLAGS_pr_d, FLAGS_pr_mr); } else { - FLAGS_segmented_partition = true; FLAGS_rebalance = true; FLAGS_rebalance_vertex_factor = 0; + if (FLAGS_partitioner_type == "default") { + FLAGS_partitioner_type = "segment"; + } + if (FLAGS_idxer_type == "default") { + FLAGS_idxer_type = "sorted_array"; + } RunUndirectedPageRankOpt( comm_spec, out_prefix, spec, FLAGS_pr_d, FLAGS_pr_mr); } } else if (name == "cdlp") { if (FLAGS_directed) { FLAGS_directed = false; - FLAGS_segmented_partition = false; + if (FLAGS_partitioner_type == "default") { + FLAGS_partitioner_type = "hash"; + } + if (FLAGS_idxer_type == "default") { + FLAGS_idxer_type = "pthash"; + } RunDirectedCDLP(comm_spec, out_prefix, spec); } else { - FLAGS_segmented_partition = true; FLAGS_rebalance = true; FLAGS_rebalance_vertex_factor = 0; + if (FLAGS_partitioner_type == "default") { + FLAGS_partitioner_type = "segment"; + } + if (FLAGS_idxer_type == "default") { + FLAGS_idxer_type = "sorted_array"; + } RunUndirectedCDLP(comm_spec, out_prefix, spec); } } else if (name == "wcc") { FLAGS_directed = false; - FLAGS_segmented_partition = true; FLAGS_rebalance = false; + if (FLAGS_partitioner_type == "default") { + FLAGS_partitioner_type = "segment"; + } + if (FLAGS_idxer_type == "default") { + FLAGS_idxer_type = "sorted_array"; + } CreateAndQueryOpt( comm_spec, out_prefix, spec); } else if (name == "lcc") { if (FLAGS_directed) { - FLAGS_segmented_partition = false; + if (FLAGS_partitioner_type == "default") { + FLAGS_partitioner_type = "hash"; + } + if 
(FLAGS_idxer_type == "default") { + FLAGS_idxer_type = "pthash"; + } if (FLAGS_edge_num > static_cast(std::numeric_limits::max())) { CreateAndQueryOpt( @@ -442,9 +435,14 @@ void RunOpt() { comm_spec, out_prefix, spec); } } else { - FLAGS_segmented_partition = true; FLAGS_rebalance = true; FLAGS_rebalance_vertex_factor = 0; + if (FLAGS_partitioner_type == "default") { + FLAGS_partitioner_type = "segment"; + } + if (FLAGS_idxer_type == "default") { + FLAGS_idxer_type = "sorted_array"; + } if (FLAGS_edge_num > static_cast(std::numeric_limits::max()) * 2) { CreateAndQueryOpt( diff --git a/examples/analytical_apps/run_cuda_app.h b/examples/analytical_apps/run_cuda_app.h index 284da2f8..4705aa2a 100644 --- a/examples/analytical_apps/run_cuda_app.h +++ b/examples/analytical_apps/run_cuda_app.h @@ -155,51 +155,28 @@ void CreateAndQueryWithPreprocess(const grape::CommSpec& comm_spec, } else if (FLAGS_serialize) { graph_spec.set_serialize(true, FLAGS_serialization_prefix); } - if (FLAGS_segmented_partition) { - using VERTEX_MAP_T = - GlobalVertexMap>; - using FRAG_T = grape::cuda::HostFragment; - std::shared_ptr fragment; - int dev_id = comm_spec.local_id(); - int dev_count; - - CHECK_CUDA(cudaGetDeviceCount(&dev_count)); - CHECK_LE(comm_spec.local_num(), dev_count) - << "Only found " << dev_count << " GPUs, but " << comm_spec.local_num() - << " processes are launched"; - CHECK_CUDA(cudaSetDevice(dev_id)); - fragment = LoadGraph(efile, vfile, comm_spec, graph_spec); - - auto app = std::make_shared>(); - auto pre = std::make_shared>(); - DoPreprocess, Args...>(fragment, pre, comm_spec, - dev_id, out_prefix, args...); - DoQuery, Args...>(fragment, app, comm_spec, dev_id, - out_prefix, args...); - } else { - graph_spec.set_rebalance(false, 0); - using VERTEX_MAP_T = GlobalVertexMap>; - using FRAG_T = grape::cuda::HostFragment; - std::shared_ptr fragment; - int dev_id = comm_spec.local_id(); - int dev_count; - - CHECK_CUDA(cudaGetDeviceCount(&dev_count)); - 
CHECK_LE(comm_spec.local_num(), dev_count) - << "Only found " << dev_count << " GPUs, but " << comm_spec.local_num() - << " processes are launched"; - CHECK_CUDA(cudaSetDevice(dev_id)); - fragment = LoadGraph(efile, vfile, comm_spec, graph_spec); - - auto app = std::make_shared>(); - auto pre = std::make_shared>(); - DoPreprocess, Args...>(fragment, pre, comm_spec, - dev_id, out_prefix, args...); - DoQuery, Args...>(fragment, app, comm_spec, dev_id, - out_prefix, args...); - } + graph_spec.partitioner_type = + grape::parse_partitioner_type_name(FLAGS_partitioner_type); + graph_spec.idxer_type = grape::parse_idxer_type_name(FLAGS_idxer_type); + using FRAG_T = grape::cuda::HostFragment; + std::shared_ptr fragment; + int dev_id = comm_spec.local_id(); + int dev_count; + + CHECK_CUDA(cudaGetDeviceCount(&dev_count)); + CHECK_LE(comm_spec.local_num(), dev_count) + << "Only found " << dev_count << " GPUs, but " << comm_spec.local_num() + << " processes are launched"; + CHECK_CUDA(cudaSetDevice(dev_id)); + fragment = LoadGraph(efile, vfile, comm_spec, graph_spec); + + auto app = std::make_shared>(); + auto pre = std::make_shared>(); + DoPreprocess, Args...>(fragment, pre, comm_spec, + dev_id, out_prefix, args...); + DoQuery, Args...>(fragment, app, comm_spec, dev_id, + out_prefix, args...); } template >; - using FRAG_T = grape::cuda::HostFragment; - std::shared_ptr fragment; - int dev_id = comm_spec.local_id(); - int dev_count; - - CHECK_CUDA(cudaGetDeviceCount(&dev_count)); - CHECK_LE(comm_spec.local_num(), dev_count) - << "Only found " << dev_count << " GPUs, but " << comm_spec.local_num() - << " processes are launched"; - CHECK_CUDA(cudaSetDevice(dev_id)); - fragment = LoadGraph(efile, vfile, comm_spec, graph_spec); - - auto app = std::make_shared>(); - DoQuery, Args...>(fragment, app, comm_spec, dev_id, - out_prefix, args...); - } else { - graph_spec.set_rebalance(false, 0); - using VERTEX_MAP_T = GlobalVertexMap>; - using FRAG_T = grape::cuda::HostFragment; - 
std::shared_ptr fragment; - int dev_id = comm_spec.local_id(); - int dev_count; - - CHECK_CUDA(cudaGetDeviceCount(&dev_count)); - CHECK_LE(comm_spec.local_num(), dev_count) - << "Only found " << dev_count << " GPUs, but " << comm_spec.local_num() - << " processes are launched"; - CHECK_CUDA(cudaSetDevice(dev_id)); - fragment = LoadGraph(efile, vfile, comm_spec, graph_spec); - - auto app = std::make_shared>(); - DoQuery, Args...>(fragment, app, comm_spec, dev_id, - out_prefix, args...); - } + graph_spec.partitioner_type = + grape::parse_partitioner_type_name(FLAGS_partitioner_type); + graph_spec.idxer_type = grape::parse_idxer_type_name(FLAGS_idxer_type); + + using FRAG_T = grape::cuda::HostFragment; + std::shared_ptr fragment; + int dev_id = comm_spec.local_id(); + int dev_count; + + CHECK_CUDA(cudaGetDeviceCount(&dev_count)); + CHECK_LE(comm_spec.local_num(), dev_count) + << "Only found " << dev_count << " GPUs, but " << comm_spec.local_num() + << " processes are launched"; + CHECK_CUDA(cudaSetDevice(dev_id)); + fragment = LoadGraph(efile, vfile, comm_spec, graph_spec); + + auto app = std::make_shared>(); + DoQuery, Args...>(fragment, app, comm_spec, dev_id, + out_prefix, args...); } template diff --git a/examples/gnn_sampler/append_only_edgecut_fragment.h b/examples/gnn_sampler/append_only_edgecut_fragment.h index 3a602db2..e3543114 100644 --- a/examples/gnn_sampler/append_only_edgecut_fragment.h +++ b/examples/gnn_sampler/append_only_edgecut_fragment.h @@ -26,10 +26,8 @@ limitations under the License. #include #include -#include #include #include -#include #include #include #include @@ -41,7 +39,7 @@ limitations under the License. 
#include #include #include -#include +#include #include #include "flat_hash_map/flat_hash_map.hpp" @@ -276,7 +274,6 @@ struct AppendOnlyEdgecutFragmentTraits { using sub_vertices_t = VertexVector; using fragment_adj_list_t = AdjList; using fragment_const_adj_list_t = ConstAdjList; - using vertex_map_t = GlobalVertexMap; using mirror_vertices_t = std::vector>; }; @@ -301,7 +298,7 @@ class AppendOnlyEdgecutFragment using oid_t = OID_T; using vdata_t = VDATA_T; using edata_t = EDATA_T; - using vertex_map_t = typename traits_t::vertex_map_t; + using vertex_map_t = VertexMap; using nbr_space_iter_impl = NbrSpaceIterImpl; using nbr_mapspace_iter_impl = NbrMapSpaceIterImpl; @@ -326,8 +323,8 @@ class AppendOnlyEdgecutFragment /** Constructor. * @param vm_ptr the vertex map. */ - explicit AppendOnlyEdgecutFragment(std::shared_ptr vm_ptr) - : FragmentBase(vm_ptr) {} + AppendOnlyEdgecutFragment() + : FragmentBase() {} virtual ~AppendOnlyEdgecutFragment() {} @@ -336,9 +333,11 @@ class AppendOnlyEdgecutFragment using base_t::InnerVertexGid2Lid; using base_t::IsInnerVertexGid; static std::string type_info() { return ""; } - void Init(fid_t fid, bool directed, std::vector& vertices, + void Init(const CommSpec& comm_spec, bool directed, + std::unique_ptr>&& vm_ptr, + std::vector& vertices, std::vector& edges) override { - init(fid, directed); + init(comm_spec.fid(), directed, std::move(vm_ptr)); ovnum_ = 0; oenum_ = 0; @@ -376,12 +375,12 @@ class AppendOnlyEdgecutFragment } tvnum_ = ivnum_ + ovnum_; max_old_ilid_ = ivnum_; - min_old_olid_ = id_parser_.max_local_id() - ovnum_; + min_old_olid_ = id_parser_.max_local_id() - ovnum_ + 1; this->inner_vertices_.SetRange(0, ivnum_); - this->outer_vertices_.SetRange(id_parser_.max_local_id() - ovnum_, - id_parser_.max_local_id()); - this->vertices_.SetRange(0, ivnum_, id_parser_.max_local_id() - ovnum_, - id_parser_.max_local_id()); + this->outer_vertices_.SetRange(id_parser_.max_local_id() - ovnum_ + 1, + id_parser_.max_local_id() + 1); + 
this->vertices_.SetRange(0, ivnum_, id_parser_.max_local_id() - ovnum_ + 1, + id_parser_.max_local_id() + 1); { std::vector odegree(ivnum_, 0); @@ -500,11 +499,13 @@ class AppendOnlyEdgecutFragment std::vector edges; edges.reserve(edge_messages.size()); std::vector empty_id_list; - auto& partitioner = vm_ptr_->GetPartitioner(); { edata_t e_data; oid_t src, dst, src_gid, dst_gid; fid_t src_fid, dst_fid; + std::vector> edge_list; + edge_list.reserve(edge_messages.size()); + std::vector local_vertices_to_add; auto line_parser_ptr = std::make_shared>(); for (auto& msg : edge_messages) { @@ -517,12 +518,26 @@ class AppendOnlyEdgecutFragment LOG(ERROR) << e.what(); continue; } - src_fid = partitioner.GetPartitionId(src); - dst_fid = partitioner.GetPartitionId(dst); - vm_ptr_->AddVertex(src, src_gid); - vm_ptr_->AddVertex(dst, dst_gid); - if (src_fid == fid_ || dst_fid == fid_) { - edges.emplace_back(src_gid, dst_gid, e_data); + src_fid = vm_ptr_->GetFragmentId(src); + dst_fid = vm_ptr_->GetFragmentId(dst); + if (src_fid == fid_) { + if (!vm_ptr_->GetGid(src, src_gid)) { + local_vertices_to_add.push_back(src); + } + edge_list.emplace_back(src, dst, e_data); + } else if (dst_fid == fid_) { + if (!vm_ptr_->GetGid(dst, dst_gid)) { + local_vertices_to_add.push_back(dst); + } + edge_list.emplace_back(src, dst, e_data); + } + } + + vm_ptr_->ExtendVertices(comm_spec, std::move(local_vertices_to_add)); + for (auto& e : edge_list) { + if (vm_ptr_->GetGid(e.src, src_gid) && + vm_ptr_->GetGid(e.dst, dst_gid)) { + edges.emplace_back(src_gid, dst_gid, e.edata); } } } @@ -573,10 +588,11 @@ class AppendOnlyEdgecutFragment } } this->inner_vertices_.SetRange(0, ivnum_); - this->outer_vertices_.SetRange(id_parser_.max_local_id() - ovnum_, - id_parser_.max_local_id()); - this->vertices_.SetRange(0, ivnum_, id_parser_.max_local_id() - ovnum_, - id_parser_.max_local_id()); + this->outer_vertices_.SetRange(id_parser_.max_local_id() - ovnum_ + 1, + id_parser_.max_local_id() + 1); + 
this->vertices_.SetRange(0, ivnum_, + id_parser_.max_local_id() - ovnum_ + 1, + id_parser_.max_local_id() + 1); tvnum_ = ivnum_ + ovnum_; ovgid_.resize(ovnum_); memcpy(&ovgid_[old_ovnum], &ov_to_extend[0], @@ -605,7 +621,7 @@ class AppendOnlyEdgecutFragment InArchive ia; vid_t xivnum = max_old_ilid_; - vid_t xovnum = id_parser_.max_local_id() - min_old_olid_; + vid_t xovnum = id_parser_.max_local_id() - min_old_olid_ + 1; ia << xivnum << xovnum << oenum_; io_adaptor->WriteArchive(ia); @@ -646,10 +662,13 @@ class AppendOnlyEdgecutFragment } template - void Deserialize(const std::string prefix, const fid_t fid) { + void Deserialize(const CommSpec& comm_spec, + std::unique_ptr>&& vm_ptr, + const std::string prefix) { + vm_ptr_ = std::move(vm_ptr); char fbuf[1024]; snprintf(fbuf, sizeof(fbuf), kSerializationFilenameFormat, prefix.c_str(), - fid); + comm_spec.fid()); VLOG(1) << "Deserialize from " << fbuf; auto io_adaptor = std::unique_ptr(new IOADAPTOR_T(std::string(fbuf))); @@ -700,15 +719,15 @@ class AppendOnlyEdgecutFragment io_adaptor->Close(); max_old_ilid_ = ivnum_; - min_old_olid_ = id_parser_.max_local_id() - ovnum_; + min_old_olid_ = id_parser_.max_local_id() - ovnum_ + 1; extra_oenum_ = 0; extra_oe_.clear(); extra_oe_.resize(ivnum_, -1); this->inner_vertices_.SetRange(0, ivnum_); - this->outer_vertices_.SetRange(id_parser_.max_local_id() - ovnum_, - id_parser_.max_local_id()); - this->vertices_.SetRange(0, ivnum_, id_parser_.max_local_id() - ovnum_, - id_parser_.max_local_id()); + this->outer_vertices_.SetRange(id_parser_.max_local_id() - ovnum_ + 1, + id_parser_.max_local_id() + 1); + this->vertices_.SetRange(0, ivnum_, id_parser_.max_local_id() - ovnum_ + 1, + id_parser_.max_local_id() + 1); initOuterVerticesOfFragment(); } diff --git a/examples/gnn_sampler/run_sampler.cc b/examples/gnn_sampler/run_sampler.cc index 6e5a44ea..fa3bb02c 100644 --- a/examples/gnn_sampler/run_sampler.cc +++ b/examples/gnn_sampler/run_sampler.cc @@ -24,7 +24,6 @@ limitations 
under the License. #include #include -#include #include #include "append_only_edgecut_fragment.h" diff --git a/grape/app/mutation_context.h b/grape/app/mutation_context.h index 783b58f6..0692633f 100644 --- a/grape/app/mutation_context.h +++ b/grape/app/mutation_context.h @@ -66,20 +66,29 @@ class MutationContext : public ContextBase { void add_vertex(const oid_t& id, const vdata_t& data) { fid_t fid = partitioner_.GetPartitionId(id); - id_to_add_[fid].push_back(id); - vdata_to_add_[fid].push_back(data); + if (fid == fragment_.fnum()) { + LOG(ERROR) << "add vertex - " << id << " failed, unknwon partition id"; + } else { + id_to_add_[fid].push_back(id); + vdata_to_add_[fid].push_back(data); + } } void add_edge(const oid_t& src, const oid_t& dst, const edata_t& data) { fid_t src_fid = partitioner_.GetPartitionId(src); fid_t dst_fid = partitioner_.GetPartitionId(dst); - esrc_to_add_[src_fid].push_back(src); - edst_to_add_[src_fid].push_back(dst); - edata_to_add_[src_fid].push_back(data); - if (src_fid != dst_fid) { - esrc_to_add_[dst_fid].push_back(src); - edst_to_add_[dst_fid].push_back(dst); - edata_to_add_[dst_fid].push_back(data); + if (src_fid == fragment_.fnum() || dst_fid == fragment_.fnum()) { + LOG(ERROR) << "add edge - " << src << " -> " << dst + << " failed, unknwon partition id"; + } else { + esrc_to_add_[src_fid].push_back(src); + edst_to_add_[src_fid].push_back(dst); + edata_to_add_[src_fid].push_back(data); + if (src_fid != dst_fid) { + esrc_to_add_[dst_fid].push_back(src); + edst_to_add_[dst_fid].push_back(dst); + edata_to_add_[dst_fid].push_back(data); + } } } @@ -95,8 +104,13 @@ class MutationContext : public ContextBase { parsed_vertices_to_update_.emplace_back(gid, data); } else { fid_t fid = partitioner_.GetPartitionId(id); - id_to_update_[fid].push_back(id); - vdata_to_update_[fid].push_back(data); + if (fid == fragment_.fnum()) { + LOG(ERROR) << "update vertex - " << id + << " failed, unknwon partition id"; + } else { + 
id_to_update_[fid].push_back(id); + vdata_to_update_[fid].push_back(data); + } } } @@ -107,13 +121,18 @@ class MutationContext : public ContextBase { void update_edge(const oid_t& src, const oid_t& dst, const edata_t& data) { fid_t src_fid = partitioner_.GetPartitionId(src); fid_t dst_fid = partitioner_.GetPartitionId(dst); - esrc_to_update_[src_fid].push_back(src); - edst_to_update_[src_fid].push_back(dst); - edata_to_update_[src_fid].push_back(data); - if (src_fid != dst_fid) { - esrc_to_update_[dst_fid].push_back(src); - edst_to_update_[dst_fid].push_back(dst); - edata_to_update_[dst_fid].push_back(data); + if (src_fid == fragment_.fnum() || dst_fid == fragment_.fnum()) { + LOG(ERROR) << "update edge - " << src << " -> " << dst + << " failed, unknwon partition id"; + } else { + esrc_to_update_[src_fid].push_back(src); + edst_to_update_[src_fid].push_back(dst); + edata_to_update_[src_fid].push_back(data); + if (src_fid != dst_fid) { + esrc_to_update_[dst_fid].push_back(src); + edst_to_update_[dst_fid].push_back(dst); + edata_to_update_[dst_fid].push_back(data); + } } } @@ -130,7 +149,12 @@ class MutationContext : public ContextBase { parsed_vid_to_remove_.push_back(gid); } else { fid_t fid = partitioner_.GetPartitionId(id); - id_to_remove_[fid].push_back(id); + if (fid == fragment_.fnum()) { + LOG(ERROR) << "remove vertex - " << id + << " failed, unknwon partition id"; + } else { + id_to_remove_[fid].push_back(id); + } } } @@ -141,11 +165,16 @@ class MutationContext : public ContextBase { void remove_edge(const oid_t& src, const oid_t& dst) { fid_t src_fid = partitioner_.GetPartitionId(src); fid_t dst_fid = partitioner_.GetPartitionId(dst); - esrc_to_remove_[src_fid].push_back(src); - edst_to_remove_[src_fid].push_back(dst); - if (src_fid != dst_fid) { - esrc_to_remove_[dst_fid].push_back(src); - edst_to_remove_[dst_fid].push_back(dst); + if (src_fid == fragment_.fnum() || dst_fid == fragment_.fnum()) { + LOG(ERROR) << "remove edge - " << src << " -> " << dst + 
<< " failed, unknwon partition id"; + } else { + esrc_to_remove_[src_fid].push_back(src); + edst_to_remove_[src_fid].push_back(dst); + if (src_fid != dst_fid) { + esrc_to_remove_[dst_fid].push_back(src); + edst_to_remove_[dst_fid].push_back(dst); + } } } diff --git a/grape/communication/sync_comm.h b/grape/communication/sync_comm.h index 2cf356dc..2417fb29 100644 --- a/grape/communication/sync_comm.h +++ b/grape/communication/sync_comm.h @@ -391,6 +391,68 @@ struct CommImpl, } }; +template +struct CommImpl, + typename std::enable_if::value>::type> { + static void send(const Array& vec, int dst_worker_id, int tag, + MPI_Comm comm) { + int64_t len = vec.size(); + CommImpl::send(len, dst_worker_id, tag, comm); + if (len > 0) { + send_buffer(vec.data(), vec.size(), dst_worker_id, tag, comm); + } + } + + static void send_partial(const Array& vec, size_t from, size_t to, + int dst_worker_id, int tag, MPI_Comm comm) { + int64_t len = to - from; + CommImpl::send(len, dst_worker_id, tag, comm); + if (len > 0) { + send_buffer(vec.data() + from, len, dst_worker_id, tag, comm); + } + } + + static void recv(Array& vec, int src_worker_id, int tag, + MPI_Comm comm) { + int64_t len; + CommImpl::recv(len, src_worker_id, tag, comm); + vec.resize(len); + if (len > 0) { + recv_buffer(vec.data(), vec.size(), src_worker_id, tag, comm); + } + } + + static void recv_at(Array& vec, size_t offset, int src_worker_id, + int tag, MPI_Comm comm) { + int64_t len; + CommImpl::recv(len, src_worker_id, tag, comm); + if (offset + len > vec.size()) { + vec.resize(offset + len); + } + if (len > 0) { + recv_buffer(vec.data() + offset, len, src_worker_id, tag, comm); + } + } + + template + static void multiple_send(const Array& vec, + const ITER_T& worker_id_begin, + const ITER_T& worker_id_end, int tag, + MPI_Comm comm) { + for (ITER_T iter = worker_id_begin; iter != worker_id_end; ++iter) { + int dst_worker_id = *iter; + send(vec, dst_worker_id, tag, comm); + } + } + + static void bcast(Array& vec, 
int root, MPI_Comm comm) { + int64_t len = vec.size(); + bcast_small_buffer(&len, 1, root, comm); + vec.resize(len); + bcast_buffer(vec.data(), len, root, comm); + } +}; + template <> struct CommImpl { static void send(const InArchive& arc, int dst_worker_id, int tag, @@ -751,7 +813,7 @@ typename std::enable_if::value>::type FlatAllGather( global.data(), counts.data(), displs.data(), MPI_CHAR, comm); } else { std::vector reqs; - std::vector offsets; + std::vector offsets(worker_num); int64_t sum = 0; for (int i = 0; i < worker_num; ++i) { offsets[i] = sum; diff --git a/grape/cuda/fragment/device_fragment.h b/grape/cuda/fragment/device_fragment.h index 48765d8c..4c7d42bb 100644 --- a/grape/cuda/fragment/device_fragment.h +++ b/grape/cuda/fragment/device_fragment.h @@ -27,7 +27,7 @@ limitations under the License. namespace grape { namespace cuda { template + grape::LoadStrategy _load_strategy> class HostFragment; namespace dev { @@ -445,8 +445,7 @@ class DeviceFragment { ArrayView> mirrors_of_frag_; template + typename _EDATA_T, grape::LoadStrategy __load_strategy> friend class grape::cuda::HostFragment; }; diff --git a/grape/cuda/fragment/host_fragment.h b/grape/cuda/fragment/host_fragment.h index 0127d838..dbb4ec8b 100644 --- a/grape/cuda/fragment/host_fragment.h +++ b/grape/cuda/fragment/host_fragment.h @@ -43,7 +43,6 @@ limitations under the License. 
#include "grape/types.h" #include "grape/util.h" #include "grape/utils/vertex_array.h" -#include "grape/vertex_map/global_vertex_map.h" namespace grape { namespace cuda { @@ -65,14 +64,12 @@ inline void CalculateOffsetWithPrefixSum(const Stream& stream, } template > -class HostFragment - : public ImmutableEdgecutFragment { + grape::LoadStrategy _load_strategy = grape::LoadStrategy::kOnlyOut> +class HostFragment : public ImmutableEdgecutFragment { public: - using base_t = ImmutableEdgecutFragment; + using base_t = + ImmutableEdgecutFragment; using internal_vertex_t = typename base_t::internal_vertex_t; using edge_t = typename base_t::edge_t; using nbr_t = typename base_t::nbr_t; @@ -86,8 +83,7 @@ class HostFragment using edata_t = EDATA_T; using vertex_range_t = typename base_t::vertex_range_t; - using vertex_map_t = typename base_t::vertex_map_t; - using dev_vertex_map_t = cuda::DeviceVertexMap; + using dev_vertex_map_t = cuda::DeviceVertexMap>; using inner_vertices_t = typename base_t::inner_vertices_t; using outer_vertices_t = typename base_t::outer_vertices_t; using device_t = @@ -99,15 +95,14 @@ class HostFragment static constexpr grape::LoadStrategy load_strategy = _load_strategy; - HostFragment() = default; + HostFragment() : FragmentBase() {} - explicit HostFragment(std::shared_ptr vm_ptr) - : FragmentBase(vm_ptr) {} - - void Init(fid_t fid, bool directed, std::vector& vertices, + void Init(const CommSpec& comm_spec, bool directed, + std::unique_ptr>&& vm_ptr, + std::vector& vertices, std::vector& edges) { - base_t::Init(fid, directed, vertices, edges); - __allocate_device_fragment__(); + base_t::Init(comm_spec, directed, std::move(vm_ptr), vertices, edges); + __allocate_device_fragment__(comm_spec.local_id()); } template @@ -116,9 +111,12 @@ class HostFragment } template - void Deserialize(const std::string& prefix, const fid_t fid) { - base_t::template Deserialize(prefix, fid); - __allocate_device_fragment__(); + void Deserialize(const CommSpec& comm_spec, 
+ std::unique_ptr>&& vm_ptr, + const std::string& prefix) { + base_t::template Deserialize(comm_spec, std::move(vm_ptr), + prefix); + __allocate_device_fragment__(comm_spec.local_id()); } void PrepareToRunApp(const CommSpec& comm_spec, PrepareConf conf) { @@ -135,7 +133,6 @@ class HostFragment } if (conf.need_split_edges || conf.need_split_edges_by_fragment) { - auto& comm_spec = vm_ptr_->GetCommSpec(); auto& ie = ie_.get_edges(); auto& ieoffset = ie_.get_offsets(); auto& oe = oe_.get_edges(); @@ -161,7 +158,7 @@ class HostFragment stream.cuda_stream())); auto prefix_sum = compute_prefix_sum(ieoffset); - ArrayView d_prefix_sum(prefix_sum); + ArrayView d_prefix_sum(prefix_sum.data(), prefix_sum.size()); CalculateOffsetWithPrefixSum( stream, d_prefix_sum, thrust::raw_pointer_cast(d_ie_.data()), @@ -176,7 +173,7 @@ class HostFragment stream.cuda_stream())); auto prefix_sum = compute_prefix_sum(oeoffset); - ArrayView d_prefix_sum(prefix_sum); + ArrayView d_prefix_sum(prefix_sum.data(), prefix_sum.size()); CalculateOffsetWithPrefixSum( stream, d_prefix_sum, thrust::raw_pointer_cast(d_oe_.data()), @@ -211,7 +208,7 @@ class HostFragment } if (conf.need_build_device_vm) { - d_vm_ptr_->Init(stream); + d_vm_ptr_->Init(stream, comm_spec, vm_ptr_); } stream.Sync(); } @@ -321,18 +318,17 @@ class HostFragment return dev_frag; } - void __allocate_device_fragment__() { - auto& comm_spec = vm_ptr_->GetCommSpec(); + void __allocate_device_fragment__(int local_id) { auto& ie = ie_.get_edges(); auto& ieoffset = ie_.get_offsets(); auto& oe = oe_.get_edges(); auto& oeoffset = oe_.get_offsets(); - int dev_id = comm_spec.local_id(); + int dev_id = local_id; CHECK_CUDA(cudaSetDevice(dev_id)); Stream stream; - d_vm_ptr_ = std::make_shared(vm_ptr_); + d_vm_ptr_ = std::make_shared(); auto offset_size = ivnum_ + ovnum_ + 1; auto compute_prefix_sum = [offset_size]( @@ -354,7 +350,7 @@ class HostFragment cudaMemcpyHostToDevice, stream.cuda_stream())); auto prefix_sum = 
compute_prefix_sum(ieoffset); - ArrayView d_prefix_sum(prefix_sum); + ArrayView d_prefix_sum(prefix_sum.data(), prefix_sum.size()); CalculateOffsetWithPrefixSum( stream, d_prefix_sum, thrust::raw_pointer_cast(d_ie_.data()), @@ -370,7 +366,7 @@ class HostFragment cudaMemcpyHostToDevice, stream.cuda_stream())); auto prefix_sum = compute_prefix_sum(oeoffset); - ArrayView d_prefix_sum(prefix_sum); + ArrayView d_prefix_sum(prefix_sum.data(), prefix_sum.size()); CalculateOffsetWithPrefixSum( stream, d_prefix_sum, thrust::raw_pointer_cast(d_oe_.data()), @@ -414,7 +410,6 @@ class HostFragment [] __device__(VID_T * gids, VID_T * lids, VID_T size, CUDASTL::HashMap * ovg2l) { auto tid = TID_1D; - gids = thrust::raw_pointer_cast(gids); auto nthreads = TOTAL_THREADS_1D; for (VID_T idx = 0 + tid; idx < size; idx += nthreads) { @@ -424,8 +419,7 @@ class HostFragment (*ovg2l)[gid] = lid; } }, - thrust::raw_pointer_cast(gids.data()), - thrust::raw_pointer_cast(lids.data()), size, d_ovg2l_.get()); + gids.data(), lids.data(), size, d_ovg2l_.get()); } d_mirrors_of_frag_holder_.resize(fnum_); @@ -635,7 +629,7 @@ class HostFragment thrust::device_vector& d_fid_list, thrust::device_vector& d_fid_list_offset) { pinned_vector prefix_sum(ivnum_ + 1, 0); - ArrayView d_prefix_sum(prefix_sum); + ArrayView d_prefix_sum(prefix_sum.data(), prefix_sum.size()); for (VID_T i = 0; i < ivnum_; ++i) { prefix_sum[i + 1] = diff --git a/grape/cuda/vertex_map/device_vertex_map.h b/grape/cuda/vertex_map/device_vertex_map.h index dbb7e03d..c71d4767 100644 --- a/grape/cuda/vertex_map/device_vertex_map.h +++ b/grape/cuda/vertex_map/device_vertex_map.h @@ -23,7 +23,6 @@ limitations under the License. 
#include "grape/cuda/utils/launcher.h" #include "grape/cuda/utils/stream.h" #include "grape/fragment/id_parser.h" -#include "grape/vertex_map/global_vertex_map.h" namespace grape { namespace cuda { @@ -98,23 +97,22 @@ class DeviceVertexMap { using VID_T = typename HOST_VM_T::vid_t; public: - explicit DeviceVertexMap(std::shared_ptr vm_ptr) - : vm_ptr_(vm_ptr) {} + DeviceVertexMap() {} - void Init(const Stream& stream) { - auto& comm_spec = vm_ptr_->GetCommSpec(); - fid_t fnum = comm_spec.fnum(); + void Init(const Stream& stream, const CommSpec& comm_spec, + std::unique_ptr& vm_ptr) { + fnum_ = comm_spec.fnum(); int dev_id = comm_spec.local_id(); CHECK_CUDA(cudaSetDevice(dev_id)); - id_parser_.init(fnum); - d_o2l_.resize(fnum); - d_l2o_.resize(fnum); - d_l2o_ptr_.resize(fnum); + id_parser_.init(fnum_); + d_o2l_.resize(fnum_); + d_l2o_.resize(fnum_); + d_l2o_ptr_.resize(fnum_); - for (fid_t fid = 0; fid < fnum; fid++) { - auto ivnum = vm_ptr_->GetInnerVertexSize(fid); + for (fid_t fid = 0; fid < fnum_; fid++) { + auto ivnum = vm_ptr->GetInnerVertexSize(fid); // TODO(liang): replace this d_o2l_[fid] = CUDASTL::CreateHashMap>( @@ -124,7 +122,7 @@ class DeviceVertexMap { for (size_t lid = 0; lid < ivnum; lid++) { OID_T oid; - CHECK(vm_ptr_->GetOid(fid, lid, oid)); + CHECK(vm_ptr->GetOid(fid, lid, oid)); oids[lid] = oid; } @@ -141,17 +139,16 @@ class DeviceVertexMap { (*o2l)[oid] = lid; } }, - thrust::raw_pointer_cast(oids.data()), ivnum, d_o2l_[fid]); + oids.data(), ivnum, d_o2l_[fid]); d_l2o_[fid].assign(oids.begin(), oids.end()); d_l2o_ptr_[fid] = ArrayView(d_l2o_[fid]); } } dev::DeviceVertexMap DeviceObject() { - auto& comm_spec = vm_ptr_->GetCommSpec(); dev::DeviceVertexMap dev_vm; - dev_vm.fnum_ = comm_spec.fnum(); + dev_vm.fnum_ = fnum_; dev_vm.id_parser_ = id_parser_; // if device vm is built @@ -163,7 +160,6 @@ class DeviceVertexMap { } private: - std::shared_ptr vm_ptr_; IdParser id_parser_; // l2o for per device thrust::device_vector< @@ -171,6 +167,8 @@ class 
DeviceVertexMap { d_o2l_; std::vector> d_l2o_; thrust::device_vector> d_l2o_ptr_; + + fid_t fnum_; }; } // namespace cuda diff --git a/grape/fragment/basic_efile_fragment_loader.h b/grape/fragment/basic_efile_fragment_loader.h new file mode 100644 index 00000000..8f26088f --- /dev/null +++ b/grape/fragment/basic_efile_fragment_loader.h @@ -0,0 +1,185 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef GRAPE_FRAGMENT_BASIC_EFILE_FRAGMENT_LOADER_H_ +#define GRAPE_FRAGMENT_BASIC_EFILE_FRAGMENT_LOADER_H_ + +#include "grape/communication/shuffle.h" +#include "grape/fragment/basic_fragment_loader_base.h" +#include "grape/fragment/rebalancer.h" +#include "grape/graph/edge.h" +#include "grape/graph/vertex.h" +#include "grape/vertex_map/vertex_map.h" + +namespace grape { + +template +class BasicEFileFragmentLoader : public BasicFragmentLoaderBase { + using fragment_t = FRAG_T; + using oid_t = typename fragment_t::oid_t; + using internal_oid_t = typename InternalOID::type; + using vid_t = typename fragment_t::vid_t; + using vdata_t = typename fragment_t::vdata_t; + using edata_t = typename fragment_t::edata_t; + + public: + explicit BasicEFileFragmentLoader(const CommSpec& comm_spec, + const LoadGraphSpec& spec) + : BasicFragmentLoaderBase(comm_spec, spec) { + if (spec_.partitioner_type != PartitionerType::kHashPartitioner) { + LOG(ERROR) << "Only hash partitioner is supported in " + "BasicEFileFragmentLoader"; + 
spec_.partitioner_type = PartitionerType::kHashPartitioner; + } + if (spec_.rebalance) { + LOG(ERROR) << "Rebalance is not supported in BasicEFileFragmentLoader"; + spec_.rebalance = false; + } + partitioner_ = std::unique_ptr>( + new HashPartitioner(comm_spec_.fnum())); + edges_to_frag_.resize(comm_spec_.fnum()); + for (fid_t fid = 0; fid < comm_spec_.fnum(); ++fid) { + int worker_id = comm_spec_.FragToWorker(fid); + edges_to_frag_[fid].Init(comm_spec_.comm(), edge_tag, 4096000); + edges_to_frag_[fid].SetDestination(worker_id, fid); + if (worker_id == comm_spec_.worker_id()) { + edges_to_frag_[fid].DisableComm(); + } + } + + edge_recv_thread_ = + std::thread(&BasicEFileFragmentLoader::edgeRecvRoutine, this); + recv_thread_running_ = true; + } + + ~BasicEFileFragmentLoader() { + if (recv_thread_running_) { + for (auto& ea : edges_to_frag_) { + ea.Flush(); + } + edge_recv_thread_.join(); + } + } + + void AddVertex(const oid_t& id, const vdata_t& data) override {} + + void ConstructVertices() override {} + + void AddEdge(const oid_t& src, const oid_t& dst, + const edata_t& data) override { + internal_oid_t internal_src(src); + internal_oid_t internal_dst(dst); + fid_t src_fid = partitioner_->GetPartitionId(internal_src); + fid_t dst_fid = partitioner_->GetPartitionId(internal_dst); + if (src_fid == comm_spec_.fnum() || dst_fid == comm_spec_.fnum()) { + LOG(ERROR) << "Unknown partition id for edge " << src << " -> " << dst; + } else { + edges_to_frag_[src_fid].Emplace(internal_src, internal_dst, data); + if (src_fid != dst_fid) { + edges_to_frag_[dst_fid].Emplace(internal_src, internal_dst, data); + } + } + } + + void ConstructFragment(std::shared_ptr& fragment) override { + for (auto& ea : edges_to_frag_) { + ea.Flush(); + } + + edge_recv_thread_.join(); + recv_thread_running_ = false; + + MPI_Barrier(comm_spec_.comm()); + got_edges_.emplace_back( + std::move(edges_to_frag_[comm_spec_.fid()].buffers())); + edges_to_frag_[comm_spec_.fid()].Clear(); + + 
std::unique_ptr> vm_ptr( + new VertexMap()); + { + VertexMapBuilder builder( + comm_spec_.fid(), comm_spec_.fnum(), std::move(partitioner_), + spec_.idxer_type); + for (auto& buffers : got_edges_) { + foreach_helper( + buffers, + [&builder](const internal_oid_t& src, const internal_oid_t& dst) { + builder.add_vertex(src); + builder.add_vertex(dst); + }, + make_index_sequence<2>{}); + } + builder.finish(comm_spec_, *vm_ptr); + } + + std::vector> processed_edges; + for (auto& buffers : got_edges_) { + foreach_rval(buffers, [&processed_edges, &vm_ptr](internal_oid_t&& src, + internal_oid_t&& dst, + edata_t&& data) { + vid_t src_gid, dst_gid; + if (vm_ptr->GetGid(oid_t(src), src_gid) && + vm_ptr->GetGid(oid_t(dst), dst_gid)) { + processed_edges.emplace_back(src_gid, dst_gid, std::move(data)); + } + }); + } + + fragment = std::make_shared(); + std::vector> fake_vertices; + fragment->Init(comm_spec_, spec_.directed, std::move(vm_ptr), fake_vertices, + processed_edges); + + this->InitOuterVertexData(fragment); + } + + private: + void edgeRecvRoutine() { + ShuffleIn data_in; + data_in.Init(comm_spec_.fnum(), comm_spec_.comm(), edge_tag); + fid_t dst_fid; + int src_worker_id; + while (!data_in.Finished()) { + src_worker_id = data_in.Recv(dst_fid); + if (src_worker_id == -1) { + break; + } + CHECK_EQ(dst_fid, comm_spec_.fid()); + got_edges_.emplace_back(std::move(data_in.buffers())); + data_in.Clear(); + } + } + + std::unique_ptr> partitioner_; + + std::vector> + edges_to_frag_; + + std::thread edge_recv_thread_; + bool recv_thread_running_; + + std::vector> + got_edges_; + + using BasicFragmentLoaderBase::comm_spec_; + using BasicFragmentLoaderBase::spec_; + using BasicFragmentLoaderBase::id_parser_; + + using BasicFragmentLoaderBase::edge_tag; +}; + +}; // namespace grape + +#endif // GRAPE_FRAGMENT_BASIC_EFILE_FRAGMENT_LOADER_H_ diff --git a/grape/fragment/basic_fragment_loader.h b/grape/fragment/basic_fragment_loader.h index 28c6a73b..f10ca31a 100644 --- 
a/grape/fragment/basic_fragment_loader.h +++ b/grape/fragment/basic_fragment_loader.h @@ -16,71 +16,17 @@ limitations under the License. #ifndef GRAPE_FRAGMENT_BASIC_FRAGMENT_LOADER_H_ #define GRAPE_FRAGMENT_BASIC_FRAGMENT_LOADER_H_ -#include - -#include -#include -#include -#include -#include -#include -#include - #include "grape/communication/shuffle.h" -#include "grape/config.h" +#include "grape/fragment/basic_fragment_loader_base.h" +#include "grape/fragment/rebalancer.h" #include "grape/graph/edge.h" #include "grape/graph/vertex.h" -#include "grape/util.h" -#include "grape/utils/concurrent_queue.h" -#include "grape/utils/vertex_array.h" -#include "grape/worker/comm_spec.h" +#include "grape/vertex_map/vertex_map.h" namespace grape { -/** - * @brief LoadGraphSpec determines the specification to load a graph. - * - */ -struct LoadGraphSpec { - bool directed; - bool rebalance; - int rebalance_vertex_factor; - - bool serialize; - std::string serialization_prefix; - - bool deserialize; - std::string deserialization_prefix; - - void set_directed(bool val = true) { directed = val; } - void set_rebalance(bool flag, int weight) { - rebalance = flag; - rebalance_vertex_factor = weight; - } - - void set_serialize(bool flag, const std::string& prefix) { - serialize = flag; - serialization_prefix = prefix; - } - - void set_deserialize(bool flag, const std::string& prefix) { - deserialize = flag; - deserialization_prefix = prefix; - } -}; - -inline LoadGraphSpec DefaultLoadGraphSpec() { - LoadGraphSpec spec; - spec.directed = true; - spec.rebalance = true; - spec.rebalance_vertex_factor = 0; - spec.serialize = false; - spec.deserialize = false; - return spec; -} - -template -class BasicFragmentLoader { +template +class BasicFragmentLoader : public BasicFragmentLoaderBase { using fragment_t = FRAG_T; using oid_t = typename fragment_t::oid_t; using internal_oid_t = typename InternalOID::type; @@ -88,216 +34,151 @@ class BasicFragmentLoader { using vdata_t = typename 
fragment_t::vdata_t; using edata_t = typename fragment_t::edata_t; - using vertex_map_t = typename fragment_t::vertex_map_t; - using partitioner_t = typename vertex_map_t::partitioner_t; - - static constexpr LoadStrategy load_strategy = fragment_t::load_strategy; - public: - explicit BasicFragmentLoader(const CommSpec& comm_spec) - : comm_spec_(comm_spec) { - comm_spec_.Dup(); - vm_ptr_ = std::make_shared(comm_spec_); - vertices_to_frag_.resize(comm_spec_.fnum()); - edges_to_frag_.resize(comm_spec_.fnum()); - for (fid_t fid = 0; fid < comm_spec_.fnum(); ++fid) { - int worker_id = comm_spec_.FragToWorker(fid); - vertices_to_frag_[fid].Init(comm_spec_.comm(), vertex_tag, 4096000); - vertices_to_frag_[fid].SetDestination(worker_id, fid); - edges_to_frag_[fid].Init(comm_spec_.comm(), edge_tag, 4096000); - edges_to_frag_[fid].SetDestination(worker_id, fid); - if (worker_id == comm_spec_.worker_id()) { - vertices_to_frag_[fid].DisableComm(); - edges_to_frag_[fid].DisableComm(); - } + explicit BasicFragmentLoader(const CommSpec& comm_spec, + const LoadGraphSpec& spec) + : BasicFragmentLoaderBase(comm_spec, spec) { + if (spec_.idxer_type == IdxerType::kLocalIdxer) { + LOG(ERROR) << "Global vertex map is required in BasicFragmentLoader"; + spec_.idxer_type = IdxerType::kHashMapIdxer; + } + if (spec_.rebalance) { + LOG(ERROR) << "Rebalance is not supported in BasicFragmentLoader"; + spec_.rebalance = false; } - recv_thread_running_ = false; } - ~BasicFragmentLoader() { Stop(); } - - void SetPartitioner(const partitioner_t& partitioner) { - vm_ptr_->SetPartitioner(partitioner); - } - - void SetPartitioner(partitioner_t&& partitioner) { - vm_ptr_->SetPartitioner(std::move(partitioner)); - } - - void Start() { - vertex_recv_thread_ = - std::thread(&BasicFragmentLoader::vertexRecvRoutine, this); - edge_recv_thread_ = - std::thread(&BasicFragmentLoader::edgeRecvRoutine, this); - recv_thread_running_ = true; - } - - void Stop() { + ~BasicFragmentLoader() { if 
(recv_thread_running_) { - for (auto& va : vertices_to_frag_) { - va.Flush(); - } for (auto& ea : edges_to_frag_) { ea.Flush(); } - vertex_recv_thread_.join(); edge_recv_thread_.join(); - recv_thread_running_ = false; } } - void AddVertex(const oid_t& id, const vdata_t& data) { - internal_oid_t internal_id(id); - auto& partitioner = vm_ptr_->GetPartitioner(); - fid_t fid = partitioner.GetPartitionId(internal_id); - vertices_to_frag_[fid].Emplace(internal_id, data); - } - - void AddEdge(const oid_t& src, const oid_t& dst, const edata_t& data) { - internal_oid_t internal_src(src); - internal_oid_t internal_dst(dst); - auto& partitioner = vm_ptr_->GetPartitioner(); - fid_t src_fid = partitioner.GetPartitionId(internal_src); - fid_t dst_fid = partitioner.GetPartitionId(internal_dst); - edges_to_frag_[src_fid].Emplace(internal_src, internal_dst, data); - if (src_fid != dst_fid) { - edges_to_frag_[dst_fid].Emplace(internal_src, internal_dst, data); - } + void AddVertex(const oid_t& id, const vdata_t& data) override { + vertices_.emplace_back(id); + vdata_.emplace_back(data); } - bool SerializeFragment(std::shared_ptr& fragment, - const std::string& serialization_prefix) { - std::string type_prefix = fragment_t::type_info(); - std::string typed_prefix = serialization_prefix + "/" + type_prefix; - char serial_file[1024]; - snprintf(serial_file, sizeof(serial_file), "%s/%s", typed_prefix.c_str(), - kSerializationVertexMapFilename); - vm_ptr_->template Serialize(typed_prefix); - fragment->template Serialize(typed_prefix); - - return true; - } + void ConstructVertices() override { + fid_t fid = comm_spec_.fid(); + fid_t fnum = comm_spec_.fnum(); + std::unique_ptr> partitioner(nullptr); + if (spec_.partitioner_type == PartitionerType::kHashPartitioner) { + partitioner = std::unique_ptr>( + new HashPartitioner(fnum)); + } else if (spec_.partitioner_type == PartitionerType::kMapPartitioner) { + std::vector all_vertices; + sync_comm::FlatAllGather(vertices_, all_vertices, 
comm_spec_.comm()); + DistinctSort(all_vertices); - bool existSerializationFile(const std::string& prefix) { - char vm_fbuf[1024], frag_fbuf[1024]; - snprintf(vm_fbuf, sizeof(vm_fbuf), "%s/%s", prefix.c_str(), - kSerializationVertexMapFilename); - snprintf(frag_fbuf, sizeof(frag_fbuf), kSerializationFilenameFormat, - prefix.c_str(), comm_spec_.fid()); - std::string vm_path = vm_fbuf; - std::string frag_path = frag_fbuf; - return exists_file(vm_path) && exists_file(frag_path); - } + partitioner = std::unique_ptr>( + new MapPartitioner(fnum, all_vertices)); + } else if (spec_.partitioner_type == + PartitionerType::kSegmentedPartitioner) { + std::vector all_vertices; + sync_comm::FlatAllGather(vertices_, all_vertices, comm_spec_.comm()); + DistinctSort(all_vertices); - bool DeserializeFragment(std::shared_ptr& fragment, - const std::string& deserialization_prefix) { - std::string type_prefix = fragment_t::type_info(); - std::string typed_prefix = deserialization_prefix + "/" + type_prefix; - if (!existSerializationFile(typed_prefix)) { - return false; - } - auto io_adaptor = - std::unique_ptr(new IOADAPTOR_T(typed_prefix)); - if (io_adaptor->IsExist()) { - vm_ptr_->template Deserialize(typed_prefix, - comm_spec_.fid()); - fragment = std::shared_ptr(new fragment_t(vm_ptr_)); - fragment->template Deserialize(typed_prefix, - comm_spec_.fid()); - return true; + partitioner = std::unique_ptr>( + new SegmentedPartitioner(fnum, all_vertices)); } else { - return false; + LOG(FATAL) << "Unsupported partitioner type"; + } + std::vector> local_vertices_id; + std::vector> local_vertices_data; + this->ShuffleVertexData(vertices_, vdata_, local_vertices_id, + local_vertices_data, *partitioner); + std::vector sorted_vertices; + for (auto& buf : local_vertices_id) { + sorted_vertices.insert(sorted_vertices.end(), buf.begin(), buf.end()); + } + std::sort(sorted_vertices.begin(), sorted_vertices.end()); + + VertexMapBuilder builder(fid, fnum, std::move(partitioner), + 
spec_.idxer_type); + for (auto& v : sorted_vertices) { + builder.add_vertex(v); + } + vertex_map_ = + std::unique_ptr>(new VertexMap()); + builder.finish(comm_spec_, *vertex_map_); + + for (size_t buf_i = 0; buf_i < local_vertices_id.size(); ++buf_i) { + std::vector& local_vertices = local_vertices_id[buf_i]; + std::vector& local_vdata = local_vertices_data[buf_i]; + size_t local_vertices_num = local_vertices.size(); + for (size_t i = 0; i < local_vertices_num; ++i) { + vid_t gid; + if (vertex_map_->GetGid(local_vertices[i], gid)) { + processed_vertices_.emplace_back(gid, std::move(local_vdata[i])); + } + } + } + + edges_to_frag_.resize(fnum); + for (fid_t fid = 0; fid < fnum; ++fid) { + int worker_id = comm_spec_.FragToWorker(fid); + edges_to_frag_[fid].Init(comm_spec_.comm(), edge_tag, 4096000); + edges_to_frag_[fid].SetDestination(worker_id, fid); + if (worker_id == comm_spec_.worker_id()) { + edges_to_frag_[fid].DisableComm(); + } } + edge_recv_thread_ = + std::thread(&BasicFragmentLoader::edgeRecvRoutine, this); + recv_thread_running_ = true; } - void ConstructFragment(std::shared_ptr& fragment, bool directed) { - for (auto& va : vertices_to_frag_) { - va.Flush(); + void AddEdge(const oid_t& src, const oid_t& dst, + const edata_t& data) override { + vid_t src_gid, dst_gid; + if (vertex_map_->GetGid(src, src_gid) && + vertex_map_->GetGid(dst, dst_gid)) { + fid_t src_fid = id_parser_.get_fragment_id(src_gid); + fid_t dst_fid = id_parser_.get_fragment_id(dst_gid); + edges_to_frag_[src_fid].Emplace(src_gid, dst_gid, data); + if (src_fid != dst_fid) { + edges_to_frag_[dst_fid].Emplace(src_gid, dst_gid, data); + } } + } + + void ConstructFragment(std::shared_ptr& fragment) override { for (auto& ea : edges_to_frag_) { ea.Flush(); } - vertex_recv_thread_.join(); + edge_recv_thread_.join(); recv_thread_running_ = false; MPI_Barrier(comm_spec_.comm()); - got_vertices_.emplace_back( - std::move(vertices_to_frag_[comm_spec_.fid()].buffers())); - 
vertices_to_frag_[comm_spec_.fid()].Clear(); got_edges_.emplace_back( std::move(edges_to_frag_[comm_spec_.fid()].buffers())); edges_to_frag_[comm_spec_.fid()].Clear(); - vm_ptr_->Init(); - auto builder = vm_ptr_->GetLocalBuilder(); - for (auto& buffers : got_vertices_) { - foreach_helper( - buffers, - [&builder](const internal_oid_t& id) { builder.add_vertex(id); }, - make_index_sequence<1>{}); - } - for (auto& buffers : got_edges_) { - foreach_helper( - buffers, - [&builder](const internal_oid_t& src, const internal_oid_t& dst) { - builder.add_vertex(src); - builder.add_vertex(dst); - }, - make_index_sequence<2>{}); - } - builder.finish(*vm_ptr_); - - processed_vertices_.clear(); - if (!std::is_same::value) { - for (auto& buffers : got_vertices_) { - foreach_rval(buffers, [this](internal_oid_t&& id, vdata_t&& data) { - vid_t gid; - CHECK(vm_ptr_->_GetGid(id, gid)); - processed_vertices_.emplace_back(gid, std::move(data)); - }); - } - } - got_vertices_.clear(); - + std::vector> processed_edges; for (auto& buffers : got_edges_) { - foreach_rval(buffers, [this](internal_oid_t&& src, internal_oid_t&& dst, - edata_t&& data) { - vid_t src_gid, dst_gid; - CHECK(vm_ptr_->_GetGid(src, src_gid)); - CHECK(vm_ptr_->_GetGid(dst, dst_gid)); - processed_edges_.emplace_back(src_gid, dst_gid, std::move(data)); + foreach_rval(buffers, [&processed_edges](vid_t&& src, vid_t&& dst, + edata_t&& data) { + processed_edges.emplace_back(src, dst, std::move(data)); }); } - fragment = std::shared_ptr(new fragment_t(vm_ptr_)); - fragment->Init(comm_spec_.fid(), directed, processed_vertices_, - processed_edges_); + fragment = std::make_shared(); + fragment->Init(comm_spec_, spec_.directed, std::move(vertex_map_), + processed_vertices_, processed_edges); - if (!std::is_same::value) { - initOuterVertexData(fragment); - } - } - - void vertexRecvRoutine() { - ShuffleIn data_in; - data_in.Init(comm_spec_.fnum(), comm_spec_.comm(), vertex_tag); - fid_t dst_fid; - int src_worker_id; - while 
(!data_in.Finished()) { - src_worker_id = data_in.Recv(dst_fid); - if (src_worker_id == -1) { - break; - } - got_vertices_.emplace_back(std::move(data_in.buffers())); - data_in.Clear(); - } + this->InitOuterVertexData(fragment); } + private: void edgeRecvRoutine() { - ShuffleIn data_in; + ShuffleIn data_in; data_in.Init(comm_spec_.fnum(), comm_spec_.comm(), edge_tag); fid_t dst_fid; int src_worker_id; @@ -306,75 +187,37 @@ class BasicFragmentLoader { if (src_worker_id == -1) { break; } - CHECK_EQ(dst_fid, comm_spec_.fid()); - got_edges_.emplace_back(std::move(data_in.buffers())); - data_in.Clear(); - } - } - - void initOuterVertexData(std::shared_ptr fragment) { - int worker_num = comm_spec_.worker_num(); - - std::vector> request_gid_lists(worker_num); - auto& outer_vertices = fragment->OuterVertices(); - for (auto& v : outer_vertices) { - fid_t fid = fragment->GetFragId(v); - request_gid_lists[comm_spec_.FragToWorker(fid)].emplace_back( - fragment->GetOuterVertexGid(v)); - } - std::vector> requested_gid_lists(worker_num); - sync_comm::AllToAll(request_gid_lists, requested_gid_lists, - comm_spec_.comm()); - std::vector> response_vdata_lists(worker_num); - for (int i = 0; i < worker_num; ++i) { - auto& id_vec = requested_gid_lists[i]; - auto& data_vec = response_vdata_lists[i]; - data_vec.reserve(id_vec.size()); - for (auto id : id_vec) { - typename fragment_t::vertex_t v; - CHECK(fragment->InnerVertexGid2Vertex(id, v)); - data_vec.emplace_back(fragment->GetData(v)); - } - } - std::vector> responsed_vdata_lists(worker_num); - sync_comm::AllToAll(response_vdata_lists, responsed_vdata_lists, - comm_spec_.comm()); - for (int i = 0; i < worker_num; ++i) { - auto& id_vec = request_gid_lists[i]; - auto& data_vec = responsed_vdata_lists[i]; - CHECK_EQ(id_vec.size(), data_vec.size()); - size_t num = id_vec.size(); - for (size_t k = 0; k < num; ++k) { - typename fragment_t::vertex_t v; - CHECK(fragment->OuterVertexGid2Vertex(id_vec[k], v)); - fragment->SetData(v, 
data_vec[k]); + if (dst_fid == comm_spec_.fid()) { + got_edges_.emplace_back(std::move(data_in.buffers())); + data_in.Clear(); } } } - private: - CommSpec comm_spec_; - std::shared_ptr vm_ptr_; + std::vector vertices_; + std::vector vdata_; + + std::vector> processed_vertices_; - std::vector> vertices_to_frag_; - std::vector> - edges_to_frag_; + std::unique_ptr> vertex_map_; - std::thread vertex_recv_thread_; + std::vector> edges_to_frag_; std::thread edge_recv_thread_; bool recv_thread_running_; - std::vector> got_vertices_; - std::vector> - got_edges_; + std::vector> got_edges_; - std::vector> processed_vertices_; - std::vector> processed_edges_; + std::vector src_gid_list_; + std::vector dst_gid_list_; + std::vector edata_; + + using BasicFragmentLoaderBase::comm_spec_; + using BasicFragmentLoaderBase::spec_; + using BasicFragmentLoaderBase::id_parser_; - static constexpr int vertex_tag = 5; - static constexpr int edge_tag = 6; + using BasicFragmentLoaderBase::edge_tag; }; -} // namespace grape +}; // namespace grape #endif // GRAPE_FRAGMENT_BASIC_FRAGMENT_LOADER_H_ diff --git a/grape/fragment/basic_fragment_loader_base.h b/grape/fragment/basic_fragment_loader_base.h new file mode 100644 index 00000000..44c1bdbb --- /dev/null +++ b/grape/fragment/basic_fragment_loader_base.h @@ -0,0 +1,440 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef GRAPE_FRAGMENT_BASIC_FRAGMENT_LOADER_BASE_H_ +#define GRAPE_FRAGMENT_BASIC_FRAGMENT_LOADER_BASE_H_ + +#include "grape/util.h" +#include "grape/vertex_map/idxers/idxers.h" +#include "grape/vertex_map/partitioner.h" +#include "grape/vertex_map/vertex_map.h" + +namespace grape { + +/** + * @brief LoadGraphSpec determines the specification to load a graph. + * + */ +struct LoadGraphSpec { + bool directed; + bool rebalance; + int rebalance_vertex_factor; + + bool serialize; + std::string serialization_prefix; + + bool deserialize; + std::string deserialization_prefix; + + PartitionerType partitioner_type; + IdxerType idxer_type; + + void set_directed(bool val = true) { directed = val; } + void set_rebalance(bool flag, int weight) { + rebalance = flag; + rebalance_vertex_factor = weight; + } + + void set_serialize(bool flag, const std::string& prefix) { + serialize = flag; + serialization_prefix = prefix; + } + + void set_deserialize(bool flag, const std::string& prefix) { + deserialize = flag; + deserialization_prefix = prefix; + } + + std::string to_string() const { + std::string ret; + ret += (directed ? 
"directed-" : "undirected-"); + if (rebalance) { + ret += "rebalance-" + std::to_string(rebalance_vertex_factor) + "-"; + } else { + ret += "no-rebalance-"; + } + if (partitioner_type == PartitionerType::kHashPartitioner) { + ret += "hash-partitioner-"; + } else if (partitioner_type == PartitionerType::kMapPartitioner) { + ret += "map-partitioner-"; + } else if (partitioner_type == PartitionerType::kSegmentedPartitioner) { + ret += "segmented-partitioner-"; + } else { + LOG(FATAL) << "Unknown partitioner type"; + } + if (idxer_type == IdxerType::kHashMapIdxer) { + ret += "hashmap-idxer"; + } else if (idxer_type == IdxerType::kSortedArrayIdxer) { + ret += "sorted-array-idxer"; + } else if (idxer_type == IdxerType::kLocalIdxer) { + ret += "local-idxer"; + } else if (idxer_type == IdxerType::kPTHashIdxer) { + ret += "pthash-idxer"; + } else if (idxer_type == IdxerType::kHashMapIdxerView) { + ret += "hashmap-idxer-view"; + } else { + LOG(FATAL) << "Unknown idxer type"; + } + return ret; + } +}; + +inline LoadGraphSpec DefaultLoadGraphSpec() { + LoadGraphSpec spec; + spec.directed = true; + spec.rebalance = true; + spec.rebalance_vertex_factor = 0; + spec.serialize = false; + spec.deserialize = false; + spec.partitioner_type = PartitionerType::kHashPartitioner; + spec.idxer_type = IdxerType::kHashMapIdxer; + return spec; +} + +inline size_t hash_strings(const std::vector& strs) { + std::hash hash_fn; + size_t combinedHash = 0; + for (auto& str : strs) { + combinedHash ^= + hash_fn(str) + 0x9e3779b9 + (combinedHash << 6) + (combinedHash >> 2); + } + return combinedHash; +} + +inline std::string to_hex_string(size_t hash) { + std::stringstream ss; + ss << std::hex << std::uppercase << hash; + return ss.str(); +} + +template +std::string sigfile_content(const std::string& efile, const std::string& vfile, + const LoadGraphSpec& spec) { + std::string spec_info = spec.to_string(); + std::string frag_type_name = FRAG_T::type_info(); + std::string desc = "efile: " + efile + 
"\n"; + desc += "vfile: " + vfile + "\n"; + desc += "spec: " + spec_info + "\n"; + desc += "frag_type: " + frag_type_name + "\n"; + return desc; +} + +template +bool find_serialization(const std::string& efile, const std::string& vfile, + const std::string& serialization_prefix, + const LoadGraphSpec& spec, fid_t fnum, + std::string& prefix_out) { + std::string spec_info = spec.to_string(); + std::string frag_type_name = FRAG_T::type_info(); + size_t hash_value = hash_strings({efile, vfile, spec_info, frag_type_name}); + std::string desc = sigfile_content(efile, vfile, spec); + + while (true) { + std::string typed_prefix = serialization_prefix + "/" + + to_hex_string(hash_value) + "/" + "part_" + + std::to_string(fnum); + std::string sigfile_name = typed_prefix + "/sig"; + if (exists_file(sigfile_name)) { + std::string sigfile_content; + std::ifstream sigfile(sigfile_name); + if (!sigfile.is_open()) { + LOG(ERROR) << "Failed to open signature file: " << sigfile_name; + return false; + } + std::string line; + while (std::getline(sigfile, line)) { + sigfile_content += (line + "\n"); + } + if (sigfile_content == desc) { + prefix_out = typed_prefix; + return true; + } + } else { + prefix_out = typed_prefix; + return false; + } + ++hash_value; + } +} + +template +bool SerializeFragment(std::shared_ptr& fragment, + const CommSpec& comm_spec, const std::string& efile, + const std::string& vfile, const LoadGraphSpec& spec) { + std::string typed_prefix; + bool exist = + find_serialization(efile, vfile, spec.serialization_prefix, spec, + comm_spec.fnum(), typed_prefix); + if (exist) { + LOG(ERROR) << "Serialization exists: " << typed_prefix; + return false; + } + + if (!create_directories(typed_prefix)) { + LOG(ERROR) << "Failed to create directory: " << typed_prefix << ", " + << std::strerror(errno); + return false; + } + + char serial_file[1024]; + snprintf(serial_file, sizeof(serial_file), "%s/%s", typed_prefix.c_str(), + kSerializationVertexMapFilename); + 
fragment->GetVertexMap().template Serialize(typed_prefix, + comm_spec); + fragment->template Serialize(typed_prefix); + + MPI_Barrier(comm_spec.comm()); + if (comm_spec.worker_id() == 0) { + std::string sigfile_name = typed_prefix + "/sig"; + std::ofstream sigfile(sigfile_name); + if (!sigfile.is_open()) { + LOG(ERROR) << "Failed to open signature file: " << sigfile_name; + return false; + } + sigfile << sigfile_content(efile, vfile, spec); + } + + return true; +} + +template +bool DeserializeFragment(std::shared_ptr& fragment, + const CommSpec& comm_spec, const std::string& efile, + const std::string& vfile, const LoadGraphSpec& spec) { + std::string typed_prefix; + bool exist = + find_serialization(efile, vfile, spec.deserialization_prefix, + spec, comm_spec.fnum(), typed_prefix); + if (!exist) { + LOG(ERROR) << "Serialization not exists: " << typed_prefix; + return false; + } + + auto io_adaptor = std::unique_ptr(new IOADAPTOR_T(typed_prefix)); + if (io_adaptor->IsExist()) { + std::unique_ptr> + vm_ptr(new VertexMap()); + vm_ptr->template Deserialize(typed_prefix, comm_spec); + fragment = std::shared_ptr(new FRAG_T()); + fragment->template Deserialize(comm_spec, std::move(vm_ptr), + typed_prefix); + return true; + } else { + return false; + } +} + +template +class BasicFragmentLoaderBase { + using fragment_t = FRAG_T; + using oid_t = typename fragment_t::oid_t; + using vid_t = typename fragment_t::vid_t; + using vdata_t = typename fragment_t::vdata_t; + using edata_t = typename fragment_t::edata_t; + using internal_oid_t = typename InternalOID::type; + + public: + BasicFragmentLoaderBase(const CommSpec& comm_spec, const LoadGraphSpec& spec) + : comm_spec_(comm_spec), spec_(spec) { + comm_spec_.Dup(); + id_parser_.init(comm_spec_.fnum()); + } + virtual ~BasicFragmentLoaderBase() {} + + virtual void AddVertex(const oid_t& id, const vdata_t& data) = 0; + virtual void ConstructVertices() = 0; + virtual void AddEdge(const oid_t& src, const oid_t& dst, + const 
edata_t& data) = 0; + virtual void ConstructFragment(std::shared_ptr& fragment) = 0; + + protected: + void InitOuterVertexData(std::shared_ptr fragment) { + int worker_num = comm_spec_.worker_num(); + + std::vector> request_gid_lists(worker_num); + auto& outer_vertices = fragment->OuterVertices(); + for (auto& v : outer_vertices) { + fid_t fid = fragment->GetFragId(v); + request_gid_lists[comm_spec_.FragToWorker(fid)].emplace_back( + fragment->GetOuterVertexGid(v)); + } + std::vector> requested_gid_lists(worker_num); + sync_comm::AllToAll(request_gid_lists, requested_gid_lists, + comm_spec_.comm()); + std::vector> response_vdata_lists(worker_num); + for (int i = 0; i < worker_num; ++i) { + auto& id_vec = requested_gid_lists[i]; + auto& data_vec = response_vdata_lists[i]; + data_vec.reserve(id_vec.size()); + for (auto id : id_vec) { + typename fragment_t::vertex_t v; + CHECK(fragment->InnerVertexGid2Vertex(id, v)); + data_vec.emplace_back(fragment->GetData(v)); + } + } + std::vector> responsed_vdata_lists(worker_num); + sync_comm::AllToAll(response_vdata_lists, responsed_vdata_lists, + comm_spec_.comm()); + for (int i = 0; i < worker_num; ++i) { + auto& id_vec = request_gid_lists[i]; + auto& data_vec = responsed_vdata_lists[i]; + CHECK_EQ(id_vec.size(), data_vec.size()); + size_t num = id_vec.size(); + for (size_t k = 0; k < num; ++k) { + typename fragment_t::vertex_t v; + CHECK(fragment->OuterVertexGid2Vertex(id_vec[k], v)); + fragment->SetData(v, data_vec[k]); + } + } + } + + void ShuffleVertex(const std::vector& added_vertices_id, + std::vector>& local_vertices_id, + const IPartitioner& partitioner) { + fid_t fnum = comm_spec_.fnum(); + fid_t fid = comm_spec_.fid(); + std::vector> partitioned_vertices_out(fnum); + size_t added_vertices = added_vertices_id.size(); + for (size_t i = 0; i < added_vertices; ++i) { + fid_t dst_fid = partitioner.GetPartitionId(added_vertices_id[i]); + if (dst_fid == fnum) { + LOG(ERROR) << "Unknown partition id for vertex " + << 
added_vertices_id[i]; + } else { + partitioned_vertices_out[dst_fid].emplace_back( + std::move(added_vertices_id[i])); + } + } + + local_vertices_id.emplace_back(std::move(partitioned_vertices_out[fid])); + + std::thread send_thread([&]() { + int dst_worker_id = + (comm_spec_.worker_id() + 1) % comm_spec_.worker_num(); + while (dst_worker_id != comm_spec_.worker_id()) { + for (fid_t fid = 0; fid < fnum; ++fid) { + if (comm_spec_.FragToWorker(fid) != dst_worker_id) { + continue; + } + sync_comm::Send(partitioned_vertices_out[fid], dst_worker_id, + vertex_tag, comm_spec_.comm()); + } + dst_worker_id = (dst_worker_id + 1) % comm_spec_.worker_num(); + } + }); + std::thread recv_thread([&]() { + int src_worker_id = + (comm_spec_.worker_id() + comm_spec_.worker_num() - 1) % + comm_spec_.worker_num(); + while (src_worker_id != comm_spec_.worker_id()) { + for (fid_t fid = 0; fid < fnum; ++fid) { + if (comm_spec_.FragToWorker(fid) != comm_spec_.worker_id()) { + continue; + } + std::vector recv_vertices; + sync_comm::Recv(recv_vertices, src_worker_id, vertex_tag, + comm_spec_.comm()); + local_vertices_id.emplace_back(std::move(recv_vertices)); + } + src_worker_id = (src_worker_id + comm_spec_.worker_num() - 1) % + comm_spec_.worker_num(); + } + }); + + recv_thread.join(); + send_thread.join(); + } + + void ShuffleVertexData(const std::vector& added_vertices_id, + const std::vector& added_vertices_data, + std::vector>& local_vertices_id, + std::vector>& local_vertices_data, + const IPartitioner& partitioner) { + fid_t fnum = comm_spec_.fnum(); + fid_t fid = comm_spec_.fid(); + std::vector> partitioned_vertices_out(fnum); + std::vector> partitioned_vdata_out(fnum); + size_t added_vertices = added_vertices_id.size(); + for (size_t i = 0; i < added_vertices; ++i) { + fid_t dst_fid = partitioner.GetPartitionId(added_vertices_id[i]); + if (dst_fid == fnum) { + LOG(ERROR) << "Unknown partition id for vertex " + << added_vertices_id[i]; + } else { + 
partitioned_vertices_out[dst_fid].emplace_back( + std::move(added_vertices_id[i])); + partitioned_vdata_out[dst_fid].emplace_back( + std::move(added_vertices_data[i])); + } + } + + local_vertices_id.emplace_back(std::move(partitioned_vertices_out[fid])); + local_vertices_data.emplace_back(std::move(partitioned_vdata_out[fid])); + + std::thread send_thread([&]() { + int dst_worker_id = + (comm_spec_.worker_id() + 1) % comm_spec_.worker_num(); + while (dst_worker_id != comm_spec_.worker_id()) { + for (fid_t fid = 0; fid < fnum; ++fid) { + if (comm_spec_.FragToWorker(fid) != dst_worker_id) { + continue; + } + sync_comm::Send(partitioned_vertices_out[fid], dst_worker_id, + vertex_tag, comm_spec_.comm()); + sync_comm::Send(partitioned_vdata_out[fid], dst_worker_id, vertex_tag, + comm_spec_.comm()); + } + dst_worker_id = (dst_worker_id + 1) % comm_spec_.worker_num(); + } + }); + std::thread recv_thread([&]() { + int src_worker_id = + (comm_spec_.worker_id() + comm_spec_.worker_num() - 1) % + comm_spec_.worker_num(); + while (src_worker_id != comm_spec_.worker_id()) { + for (fid_t fid = 0; fid < fnum; ++fid) { + if (comm_spec_.FragToWorker(fid) != comm_spec_.worker_id()) { + continue; + } + std::vector recv_vertices; + std::vector recv_vdata; + sync_comm::Recv(recv_vertices, src_worker_id, vertex_tag, + comm_spec_.comm()); + sync_comm::Recv(recv_vdata, src_worker_id, vertex_tag, + comm_spec_.comm()); + local_vertices_id.emplace_back(std::move(recv_vertices)); + local_vertices_data.emplace_back(std::move(recv_vdata)); + } + + src_worker_id = (src_worker_id + comm_spec_.worker_num() - 1) % + comm_spec_.worker_num(); + } + }); + + recv_thread.join(); + send_thread.join(); + } + + CommSpec comm_spec_; + LoadGraphSpec spec_; + IdParser id_parser_; + + static constexpr int vertex_tag = 5; + static constexpr int edge_tag = 6; +}; + +} // namespace grape + +#endif // GRAPE_FRAGMENT_BASIC_FRAGMENT_LOADER_BASE_H_ diff --git a/grape/fragment/basic_fragment_mutator.h 
b/grape/fragment/basic_fragment_mutator.h index 8307a460..37fbf24e 100644 --- a/grape/fragment/basic_fragment_mutator.h +++ b/grape/fragment/basic_fragment_mutator.h @@ -22,6 +22,7 @@ limitations under the License. #include #include #include +#include #include namespace grape { @@ -41,7 +42,6 @@ struct Mutation { template class BasicFragmentMutator { using fragment_t = FRAG_T; - using vertex_map_t = typename FRAG_T::vertex_map_t; using oid_t = typename FRAG_T::oid_t; using internal_oid_t = typename InternalOID::type; using vid_t = typename FRAG_T::vid_t; @@ -49,27 +49,18 @@ class BasicFragmentMutator { using edata_t = typename FRAG_T::edata_t; using mutation_t = Mutation; static constexpr LoadStrategy load_strategy = FRAG_T::load_strategy; - using partitioner_t = typename vertex_map_t::partitioner_t; public: explicit BasicFragmentMutator(const CommSpec& comm_spec, std::shared_ptr fragment) : comm_spec_(comm_spec), fragment_(fragment), - vm_ptr_(fragment->GetVertexMap()) { + vm_(fragment->GetVertexMap()) { comm_spec_.Dup(); } ~BasicFragmentMutator() = default; - void SetPartitioner(const partitioner_t& partitioner) { - vm_ptr_->SetPartitioner(partitioner); - } - - void SetPartitioner(partitioner_t&& partitioner) { - vm_ptr_->SetPartitioner(std::move(partitioner)); - } - void AddVerticesToRemove(const std::vector& id_vec) { if (parsed_vertices_to_remove_.empty()) { parsed_vertices_to_remove_ = id_vec; @@ -132,6 +123,7 @@ class BasicFragmentMutator { shuf.Flush(); } recv_thread_.join(); + MPI_Barrier(comm_spec_.comm()); got_vertices_to_add_.emplace_back( std::move(vertices_to_add_[comm_spec_.fid()].buffers())); got_vertices_to_remove_.emplace_back( @@ -150,8 +142,8 @@ class BasicFragmentMutator { foreach_rval(buffers, [this](internal_oid_t&& src, internal_oid_t&& dst, edata_t&& data) { vid_t src_gid, dst_gid; - if (vm_ptr_->_GetGid(src, src_gid) && - vm_ptr_->_GetGid(dst, dst_gid)) { + if (vm_.GetGid(oid_t(src), src_gid) && + vm_.GetGid(oid_t(dst), dst_gid)) { 
mutation_.edges_to_update.emplace_back(src_gid, dst_gid, std::move(data)); } @@ -161,20 +153,21 @@ class BasicFragmentMutator { got_edges_to_update_.clear(); for (auto& buffers : got_edges_to_remove_) { - foreach(buffers, [this](const internal_oid_t& src, - const internal_oid_t& dst) { - vid_t src_gid, dst_gid; - if (vm_ptr_->_GetGid(src, src_gid) && vm_ptr_->_GetGid(dst, dst_gid)) { - mutation_.edges_to_remove.emplace_back(src_gid, dst_gid); - } - }); + foreach(buffers, + [this](const internal_oid_t& src, const internal_oid_t& dst) { + vid_t src_gid, dst_gid; + if (vm_.GetGid(oid_t(src), src_gid) && + vm_.GetGid(oid_t(dst), dst_gid)) { + mutation_.edges_to_remove.emplace_back(src_gid, dst_gid); + } + }); } got_edges_to_remove_.clear(); for (auto& buffers : got_vertices_to_remove_) { foreach(buffers, [this](const internal_oid_t& id) { vid_t gid; - if (vm_ptr_->_GetGid(id, gid)) { + if (vm_.GetGid(oid_t(id), gid)) { parsed_vertices_to_remove_.emplace_back(gid); } }); @@ -185,7 +178,7 @@ class BasicFragmentMutator { for (auto& buffers : got_vertices_to_update_) { foreach_rval(buffers, [this](internal_oid_t&& id, vdata_t&& data) { vid_t gid; - if (vm_ptr_->_GetGid(id, gid)) { + if (vm_.GetGid(oid_t(id), gid)) { parsed_vertices_to_update_.emplace_back(gid, std::move(data)); } }); @@ -193,33 +186,33 @@ class BasicFragmentMutator { } got_vertices_to_update_.clear(); - auto builder = vm_ptr_->GetLocalBuilder(); + std::vector local_vertices_to_add; + for (auto& buffers : got_vertices_to_add_) { - foreach_rval(buffers, - [this, &builder](internal_oid_t&& id, vdata_t&& data) { - vid_t gid; - builder.add_local_vertex(id, gid); - parsed_vertices_to_add_.emplace_back(gid, std::move(data)); - }); + foreach(buffers, [&local_vertices_to_add](const internal_oid_t& id, + const vdata_t& data) { + local_vertices_to_add.emplace_back(oid_t(id)); + }); } - got_vertices_to_add_.clear(); - for (auto& buffers : got_edges_to_add_) { - foreach_helper( - buffers, - [&builder](const 
internal_oid_t& src, const internal_oid_t& dst) { - builder.add_vertex(src); - builder.add_vertex(dst); - }, - make_index_sequence<2>{}); + vm_.ExtendVertices(comm_spec_, std::move(local_vertices_to_add)); + + for (auto& buffers : got_vertices_to_add_) { + foreach_rval(buffers, [this](internal_oid_t&& id, vdata_t&& data) { + vid_t gid; + if (vm_.GetGid(oid_t(id), gid)) { + parsed_vertices_to_add_.emplace_back(gid, std::move(data)); + } + }); } - builder.finish(*vm_ptr_); + got_vertices_to_add_.clear(); for (auto& buffers : got_edges_to_add_) { foreach_rval(buffers, [this](internal_oid_t&& src, internal_oid_t&& dst, edata_t&& data) { vid_t src_gid, dst_gid; - if (vm_ptr_->_GetGid(src, src_gid) && vm_ptr_->_GetGid(dst, dst_gid)) { + if (vm_.GetGid(oid_t(src), src_gid) && + vm_.GetGid(oid_t(dst), dst_gid)) { mutation_.edges_to_add.emplace_back(src_gid, dst_gid, std::move(data)); } @@ -276,8 +269,7 @@ class BasicFragmentMutator { } void AddVertex(const internal_oid_t& id, const vdata_t& data) { - auto& partitioner = vm_ptr_->GetPartitioner(); - fid_t fid = partitioner.GetPartitionId(id); + fid_t fid = vm_.GetFragmentId(oid_t(id)); vertices_to_add_[fid].Emplace(id, data); } @@ -294,9 +286,8 @@ class BasicFragmentMutator { void AddEdge(const internal_oid_t& src, const internal_oid_t& dst, const edata_t& data) { - auto& partitioner = vm_ptr_->GetPartitioner(); - fid_t src_fid = partitioner.GetPartitionId(src); - fid_t dst_fid = partitioner.GetPartitionId(dst); + fid_t src_fid = vm_.GetFragmentId(oid_t(src)); + fid_t dst_fid = vm_.GetFragmentId(oid_t(dst)); edges_to_add_[src_fid].Emplace(src, dst, data); if (src_fid != dst_fid) { edges_to_add_[dst_fid].Emplace(src, dst, data); @@ -318,8 +309,7 @@ class BasicFragmentMutator { } void RemoveVertex(const oid_t& id) { - auto& partitioner = vm_ptr_->GetPartitioner(); - fid_t fid = partitioner.GetPartitionId(id); + fid_t fid = vm_.GetFragmentId(id); vertices_to_remove_[fid].Emplace(id); } @@ -332,9 +322,8 @@ class 
BasicFragmentMutator { } void RemoveEdge(const oid_t& src, const oid_t& dst) { - auto& partitioner = vm_ptr_->GetPartitioner(); - fid_t src_fid = partitioner.GetPartitionId(src); - fid_t dst_fid = partitioner.GetPartitionId(dst); + fid_t src_fid = vm_.GetFragmentId(src); + fid_t dst_fid = vm_.GetFragmentId(dst); edges_to_remove_[src_fid].Emplace(src, dst); if (src_fid != dst_fid) { edges_to_remove_[dst_fid].Emplace(src, dst); @@ -359,8 +348,7 @@ class BasicFragmentMutator { template typename std::enable_if::value>::type UpdateVertex(const oid_t& id, const vdata_t& data) { - auto& partitioner = vm_ptr_->GetPartitioner(); - fid_t fid = partitioner.GetPartitionId(id); + fid_t fid = vm_.GetFragmentId(id); vertices_to_update_[fid].Emplace(id, data); } @@ -384,9 +372,8 @@ class BasicFragmentMutator { } void UpdateEdge(const oid_t& src, const oid_t& dst, const edata_t& data) { - auto& partitioner = vm_ptr_->GetPartitioner(); - fid_t src_fid = partitioner.GetPartitionId(src); - fid_t dst_fid = partitioner.GetPartitionId(dst); + fid_t src_fid = vm_.GetFragmentId(src); + fid_t dst_fid = vm_.GetFragmentId(dst); edges_to_update_[src_fid].Emplace(src, dst, data); if (src_fid != dst_fid) { edges_to_update_[dst_fid].Emplace(src, dst, data); @@ -489,8 +476,8 @@ class BasicFragmentMutator { } CommSpec comm_spec_; + std::shared_ptr fragment_; - std::shared_ptr vm_ptr_; std::thread recv_thread_; @@ -525,6 +512,7 @@ class BasicFragmentMutator { static constexpr int eu_tag = 6; mutation_t mutation_; + VertexMap& vm_; }; } // namespace grape diff --git a/grape/fragment/basic_local_fragment_loader.h b/grape/fragment/basic_local_fragment_loader.h new file mode 100644 index 00000000..d50c8e37 --- /dev/null +++ b/grape/fragment/basic_local_fragment_loader.h @@ -0,0 +1,250 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef GRAPE_FRAGMENT_BASIC_LOCAL_FRAGMENT_LOADER_H_ +#define GRAPE_FRAGMENT_BASIC_LOCAL_FRAGMENT_LOADER_H_ + +namespace grape { + +template +class BasicLocalFragmentLoader : public BasicFragmentLoaderBase { + using fragment_t = FRAG_T; + using oid_t = typename fragment_t::oid_t; + using internal_oid_t = typename InternalOID::type; + using vid_t = typename fragment_t::vid_t; + using vdata_t = typename fragment_t::vdata_t; + using edata_t = typename fragment_t::edata_t; + + public: + explicit BasicLocalFragmentLoader(const CommSpec& comm_spec, + const LoadGraphSpec& spec) + : BasicFragmentLoaderBase(comm_spec, spec) { + if (spec_.idxer_type != IdxerType::kLocalIdxer) { + LOG(ERROR) << "Local vertex map is required in BasicLocalFragmentLoader"; + spec_.idxer_type = IdxerType::kLocalIdxer; + } + if (spec_.rebalance) { + LOG(ERROR) << "Rebalance is not supported in BasicLocalFragmentLoader"; + spec_.rebalance = false; + } + if (spec_.partitioner_type != PartitionerType::kHashPartitioner) { + LOG(ERROR) << "Only hash partitioner is supported in " + "BasicLocalFragmentLoader"; + spec_.partitioner_type = PartitionerType::kHashPartitioner; + } + partitioner_ = std::unique_ptr>( + new HashPartitioner(comm_spec_.fnum())); + + vertices_to_frag_.resize(comm_spec_.fnum()); + edges_to_frag_.resize(comm_spec_.fnum()); + for (fid_t fid = 0; fid < comm_spec_.fnum(); ++fid) { + int worker_id = comm_spec_.FragToWorker(fid); + vertices_to_frag_[fid].Init(comm_spec_.comm(), vertex_tag, 4096000); + vertices_to_frag_[fid].SetDestination(worker_id, fid); + 
edges_to_frag_[fid].Init(comm_spec_.comm(), edge_tag, 4096000); + edges_to_frag_[fid].SetDestination(worker_id, fid); + if (worker_id == comm_spec_.worker_id()) { + vertices_to_frag_[fid].DisableComm(); + edges_to_frag_[fid].DisableComm(); + } + } + + vertex_recv_thread_ = + std::thread(&BasicLocalFragmentLoader::vertexRecvRoutine, this); + vertex_recv_thread_running_ = true; + } + + ~BasicLocalFragmentLoader() { + if (vertex_recv_thread_running_) { + for (auto& va : vertices_to_frag_) { + va.Flush(); + } + vertex_recv_thread_.join(); + } + if (edge_recv_thread_running_) { + for (auto& ea : edges_to_frag_) { + ea.Flush(); + } + edge_recv_thread_.join(); + } + } + + void AddVertex(const oid_t& id, const vdata_t& data) override { + internal_oid_t internal_id(id); + fid_t fid = partitioner_->GetPartitionId(internal_id); + if (fid == comm_spec_.fnum()) { + LOG(ERROR) << "Unknown partition id for vertex " << id; + } else { + vertices_to_frag_[fid].Emplace(internal_id, data); + } + } + + void ConstructVertices() override { + for (auto& va : vertices_to_frag_) { + va.Flush(); + } + vertex_recv_thread_.join(); + vertex_recv_thread_running_ = false; + + got_vertices_.emplace_back( + std::move(vertices_to_frag_[comm_spec_.fid()].buffers())); + vertices_to_frag_[comm_spec_.fid()].Clear(); + + edge_recv_thread_ = + std::thread(&BasicLocalFragmentLoader::edgeRecvRoutine, this); + edge_recv_thread_running_ = true; + } + + void AddEdge(const oid_t& src, const oid_t& dst, + const edata_t& data) override { + internal_oid_t internal_src(src); + internal_oid_t internal_dst(dst); + fid_t src_fid = partitioner_->GetPartitionId(internal_src); + fid_t dst_fid = partitioner_->GetPartitionId(internal_dst); + if (src_fid == comm_spec_.fnum() || dst_fid == comm_spec_.fnum()) { + LOG(ERROR) << "Unknown partition id for edge " << src << " -> " << dst; + } else { + edges_to_frag_[src_fid].Emplace(internal_src, internal_dst, data); + if (src_fid != dst_fid) { + 
edges_to_frag_[dst_fid].Emplace(internal_src, internal_dst, data); + } + } + } + + void ConstructFragment(std::shared_ptr& fragment) override { + for (auto& ea : edges_to_frag_) { + ea.Flush(); + } + edge_recv_thread_.join(); + edge_recv_thread_running_ = false; + + MPI_Barrier(comm_spec_.comm()); + got_edges_.emplace_back( + std::move(edges_to_frag_[comm_spec_.fid()].buffers())); + edges_to_frag_[comm_spec_.fid()].Clear(); + + std::unique_ptr> vm_ptr( + new VertexMap()); + { + VertexMapBuilder builder( + comm_spec_.fid(), comm_spec_.fnum(), std::move(partitioner_), + spec_.idxer_type); + for (auto& buffers : got_vertices_) { + foreach_helper( + buffers, + [&builder](const internal_oid_t& id) { builder.add_vertex(id); }, + make_index_sequence<1>{}); + } + for (auto& buffers : got_edges_) { + foreach_helper( + buffers, + [&builder](const internal_oid_t& src, const internal_oid_t& dst) { + builder.add_vertex(src); + builder.add_vertex(dst); + }, + make_index_sequence<2>{}); + } + builder.finish(comm_spec_, *vm_ptr); + } + + std::vector> processed_vertices; + for (auto& buffers : got_vertices_) { + foreach_rval(buffers, [&vm_ptr, &processed_vertices](internal_oid_t&& id, + vdata_t&& data) { + vid_t gid; + CHECK(vm_ptr->GetGid(oid_t(id), gid)); + processed_vertices.emplace_back(gid, std::move(data)); + }); + } + + std::vector> processed_edges; + for (auto& buffers : got_edges_) { + foreach_rval(buffers, [&vm_ptr, &processed_edges](internal_oid_t&& src, + internal_oid_t&& dst, + edata_t&& data) { + vid_t src_gid, dst_gid; + CHECK(vm_ptr->GetGid(oid_t(src), src_gid)); + CHECK(vm_ptr->GetGid(oid_t(dst), dst_gid)); + processed_edges.emplace_back(src_gid, dst_gid, std::move(data)); + }); + } + + fragment = std::shared_ptr(new fragment_t()); + fragment->Init(comm_spec_, spec_.directed, std::move(vm_ptr), + processed_vertices, processed_edges); + + if (!std::is_same::value) { + this->InitOuterVertexData(fragment); + } + } + + private: + void vertexRecvRoutine() { + ShuffleIn 
data_in; + data_in.Init(comm_spec_.fnum(), comm_spec_.comm(), vertex_tag); + fid_t dst_fid; + int src_worker_id; + while (!data_in.Finished()) { + src_worker_id = data_in.Recv(dst_fid); + if (src_worker_id == -1) { + break; + } + got_vertices_.emplace_back(std::move(data_in.buffers())); + data_in.Clear(); + } + } + + void edgeRecvRoutine() { + ShuffleIn data_in; + data_in.Init(comm_spec_.fnum(), comm_spec_.comm(), edge_tag); + fid_t dst_fid; + int src_worker_id; + while (!data_in.Finished()) { + src_worker_id = data_in.Recv(dst_fid); + if (src_worker_id == -1) { + break; + } + CHECK_EQ(dst_fid, comm_spec_.fid()); + got_edges_.emplace_back(std::move(data_in.buffers())); + data_in.Clear(); + } + } + + std::unique_ptr> partitioner_; + + std::vector> vertices_to_frag_; + std::vector> + edges_to_frag_; + + std::thread vertex_recv_thread_; + bool vertex_recv_thread_running_; + std::thread edge_recv_thread_; + bool edge_recv_thread_running_; + + std::vector> got_vertices_; + std::vector> + got_edges_; + + using BasicFragmentLoaderBase::comm_spec_; + using BasicFragmentLoaderBase::spec_; + using BasicFragmentLoaderBase::id_parser_; + + using BasicFragmentLoaderBase::vertex_tag; + using BasicFragmentLoaderBase::edge_tag; +}; + +} // namespace grape + +#endif // GRAPE_FRAGMENT_BASIC_LOCAL_FRAGMENT_LOADER_H_ diff --git a/grape/fragment/basic_rb_fragment_loader.h b/grape/fragment/basic_rb_fragment_loader.h new file mode 100644 index 00000000..92f1b952 --- /dev/null +++ b/grape/fragment/basic_rb_fragment_loader.h @@ -0,0 +1,228 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef GRAPE_FRAGMENT_BASIC_RB_FRAGMENT_LOADER_H_ +#define GRAPE_FRAGMENT_BASIC_RB_FRAGMENT_LOADER_H_ + +#include "grape/fragment/basic_fragment_loader_base.h" + +namespace grape { + +template +class BasicRbFragmentLoader : public BasicFragmentLoaderBase { + using fragment_t = FRAG_T; + using oid_t = typename fragment_t::oid_t; + using internal_oid_t = typename InternalOID::type; + using vid_t = typename fragment_t::vid_t; + using vdata_t = typename fragment_t::vdata_t; + using edata_t = typename fragment_t::edata_t; + + public: + explicit BasicRbFragmentLoader(const CommSpec& comm_spec, + const LoadGraphSpec& spec) + : BasicFragmentLoaderBase(comm_spec, spec) { + if (spec_.idxer_type == IdxerType::kLocalIdxer) { + LOG(ERROR) << "Global vertex map is required in BasicRbFragmentLoader"; + spec_.idxer_type = IdxerType::kHashMapIdxer; + } + if (spec_.partitioner_type == PartitionerType::kHashPartitioner) { + LOG(ERROR) + << "Hash partitioner is not supported in BasicRbFragmentLoader"; + spec_.partitioner_type = PartitionerType::kMapPartitioner; + } + } + + ~BasicRbFragmentLoader() {} + + void AddVertex(const oid_t& id, const vdata_t& data) override { + vertices_.emplace_back(id); + vdata_.emplace_back(data); + } + + void ConstructVertices() override { + fid_t fid = comm_spec_.fid(); + fid_t fnum = comm_spec_.fnum(); + std::unique_ptr> partitioner(nullptr); + if (spec_.partitioner_type == PartitionerType::kHashPartitioner) { + partitioner = std::unique_ptr>( + new HashPartitioner(fnum)); + } else if (spec_.partitioner_type == PartitionerType::kMapPartitioner) { + 
std::vector all_vertices; + sync_comm::FlatAllGather(vertices_, all_vertices, comm_spec_.comm()); + DistinctSort(all_vertices); + + partitioner = std::unique_ptr>( + new MapPartitioner(fnum, all_vertices)); + } else if (spec_.partitioner_type == + PartitionerType::kSegmentedPartitioner) { + std::vector all_vertices; + sync_comm::FlatAllGather(vertices_, all_vertices, comm_spec_.comm()); + DistinctSort(all_vertices); + + partitioner = std::unique_ptr>( + new SegmentedPartitioner(fnum, all_vertices)); + } else { + LOG(FATAL) << "Unsupported partitioner type"; + } + + std::vector> local_vertices_id; + this->ShuffleVertex(vertices_, local_vertices_id, *partitioner); + + std::vector sorted_vertices; + for (auto& buf : local_vertices_id) { + sorted_vertices.insert(sorted_vertices.end(), buf.begin(), buf.end()); + } + std::sort(sorted_vertices.begin(), sorted_vertices.end()); + + VertexMapBuilder builder(fid, fnum, std::move(partitioner), + spec_.idxer_type); + for (auto& v : sorted_vertices) { + builder.add_vertex(v); + } + vertex_map_ = + std::unique_ptr>(new VertexMap()); + builder.finish(comm_spec_, *vertex_map_); + } + + void AddEdge(const oid_t& src, const oid_t& dst, + const edata_t& data) override { + edges_src_.emplace_back(src); + edges_dst_.emplace_back(dst); + edges_data_.emplace_back(data); + } + + void ConstructFragment(std::shared_ptr& fragment) override { + if (spec_.rebalance) { + Rebalancer rebalancer(spec_.rebalance_vertex_factor, + std::move(vertex_map_)); + for (auto& v : edges_src_) { + rebalancer.inc_degree(v); + } + if (!spec_.directed) { + for (auto& v : edges_dst_) { + rebalancer.inc_degree(v); + } + } + + vertex_map_ = std::unique_ptr>( + new VertexMap()); + rebalancer.finish(comm_spec_, *vertex_map_); + } + + fid_t fnum = comm_spec_.fnum(); + std::vector> edges_to_frag(fnum); + for (fid_t i = 0; i < fnum; ++i) { + int worker_id = comm_spec_.FragToWorker(i); + edges_to_frag[i].Init(comm_spec_.comm(), edge_tag, 4096000); + 
edges_to_frag[i].SetDestination(worker_id, i); + if (worker_id == comm_spec_.worker_id()) { + edges_to_frag[i].DisableComm(); + } + } + std::vector> got_edges; + std::thread edge_recv_thread([&, this]() { + ShuffleIn data_in; + data_in.Init(comm_spec_.fnum(), comm_spec_.comm(), edge_tag); + fid_t dst_fid; + int src_worker_id; + while (!data_in.Finished()) { + src_worker_id = data_in.Recv(dst_fid); + if (src_worker_id == -1) { + break; + } + if (dst_fid == comm_spec_.fid()) { + got_edges.emplace_back(std::move(data_in.buffers())); + data_in.Clear(); + } + } + }); + + size_t added_edges = edges_src_.size(); + for (size_t i = 0; i < added_edges; ++i) { + vid_t src_gid, dst_gid; + if (vertex_map_->GetGid(edges_src_[i], src_gid) && + vertex_map_->GetGid(edges_dst_[i], dst_gid)) { + fid_t src_fid = id_parser_.get_fragment_id(src_gid); + fid_t dst_fid = id_parser_.get_fragment_id(dst_gid); + edges_to_frag[src_fid].Emplace(src_gid, dst_gid, edges_data_[i]); + if (src_fid != dst_fid) { + edges_to_frag[dst_fid].Emplace(src_gid, dst_gid, edges_data_[i]); + } + } + } + + for (auto& ea : edges_to_frag) { + ea.Flush(); + } + edge_recv_thread.join(); + + MPI_Barrier(comm_spec_.comm()); + got_edges.emplace_back( + std::move(edges_to_frag[comm_spec_.fid()].buffers())); + edges_to_frag[comm_spec_.fid()].Clear(); + + std::vector> processed_edges; + for (auto& buffers : got_edges) { + foreach_rval(buffers, [&processed_edges](vid_t&& src, vid_t&& dst, + edata_t&& data) { + processed_edges.emplace_back(src, dst, std::move(data)); + }); + } + + std::vector> local_vertices_id; + std::vector> local_vertices_data; + this->ShuffleVertexData(vertices_, vdata_, local_vertices_id, + local_vertices_data, vertex_map_->GetPartitioner()); + size_t buf_num = local_vertices_id.size(); + std::vector> processed_vertices; + for (size_t buf_i = 0; buf_i < buf_num; ++buf_i) { + std::vector& local_vertices = local_vertices_id[buf_i]; + std::vector& local_vdata = local_vertices_data[buf_i]; + size_t 
local_vertices_num = local_vertices.size(); + for (size_t i = 0; i < local_vertices_num; ++i) { + vid_t gid; + if (vertex_map_->GetGid(local_vertices[i], gid)) { + processed_vertices.emplace_back(gid, std::move(local_vdata[i])); + } + } + } + + fragment = std::make_shared(); + fragment->Init(comm_spec_, spec_.directed, std::move(vertex_map_), + processed_vertices, processed_edges); + + this->InitOuterVertexData(fragment); + } + + private: + std::vector vertices_; + std::vector vdata_; + + std::vector edges_src_; + std::vector edges_dst_; + std::vector edges_data_; + + std::unique_ptr> vertex_map_; + + using BasicFragmentLoaderBase::comm_spec_; + using BasicFragmentLoaderBase::spec_; + using BasicFragmentLoaderBase::id_parser_; + + using BasicFragmentLoaderBase::edge_tag; +}; + +} // namespace grape + +#endif // GRAPE_FRAGMENT_BASIC_RB_FRAGMENT_LOADER_H_ diff --git a/grape/fragment/csr_edgecut_fragment_base.h b/grape/fragment/csr_edgecut_fragment_base.h index 42091eee..a84ff96d 100644 --- a/grape/fragment/csr_edgecut_fragment_base.h +++ b/grape/fragment/csr_edgecut_fragment_base.h @@ -24,7 +24,6 @@ limitations under the License. #include "grape/graph/adj_list.h" #include "grape/graph/immutable_csr.h" #include "grape/util.h" -#include "grape/vertex_map/global_vertex_map.h" namespace grape { diff --git a/grape/fragment/edgecut_fragment_base.h b/grape/fragment/edgecut_fragment_base.h index 7ad03acd..55521f17 100644 --- a/grape/fragment/edgecut_fragment_base.h +++ b/grape/fragment/edgecut_fragment_base.h @@ -137,7 +137,7 @@ class EdgecutFragmentBase * @return The original ID. */ OID_T GetInnerVertexId(vertex_t v) const { - OID_T oid; + OID_T oid{}; vm_ptr_->GetOid(GetInnerVertexGid(v), oid); return oid; } @@ -150,7 +150,7 @@ class EdgecutFragmentBase * @return The original ID. 
*/ OID_T GetOuterVertexId(vertex_t v) const { - OID_T oid; + OID_T oid{}; vm_ptr_->GetOid(GetOuterVertexGid(v), oid); return oid; } diff --git a/grape/fragment/ev_fragment_loader.h b/grape/fragment/ev_fragment_loader.h index 1b796d64..89bc1d53 100644 --- a/grape/fragment/ev_fragment_loader.h +++ b/grape/fragment/ev_fragment_loader.h @@ -23,8 +23,10 @@ limitations under the License. #include #include +#include "grape/fragment/basic_efile_fragment_loader.h" #include "grape/fragment/basic_fragment_loader.h" -#include "grape/fragment/partitioner.h" +#include "grape/fragment/basic_local_fragment_loader.h" +#include "grape/fragment/basic_rb_fragment_loader.h" #include "grape/io/line_parser_base.h" #include "grape/io/local_io_adaptor.h" #include "grape/io/tsv_line_parser.h" @@ -51,8 +53,6 @@ class EVFragmentLoader { using vdata_t = typename fragment_t::vdata_t; using edata_t = typename fragment_t::edata_t; - using vertex_map_t = typename fragment_t::vertex_map_t; - using partitioner_t = typename vertex_map_t::partitioner_t; using io_adaptor_t = IOADAPTOR_T; using line_parser_t = LINE_PARSER_T; @@ -64,7 +64,7 @@ class EVFragmentLoader { public: explicit EVFragmentLoader(const CommSpec& comm_spec) - : comm_spec_(comm_spec), basic_fragment_loader_(comm_spec) {} + : comm_spec_(comm_spec), basic_fragment_loader_(nullptr) {} ~EVFragmentLoader() = default; @@ -72,10 +72,9 @@ class EVFragmentLoader { const std::string& vfile, const LoadGraphSpec& spec) { std::shared_ptr fragment(nullptr); - CHECK(!spec.rebalance); - if (spec.deserialize && (!spec.serialize)) { - bool deserialized = basic_fragment_loader_.DeserializeFragment( - fragment, spec.deserialization_prefix); + if (spec.deserialize) { + bool deserialized = DeserializeFragment( + fragment, comm_spec_, efile, vfile, spec); int flag = 0; int sum = 0; if (!deserialized) { @@ -93,10 +92,32 @@ class EVFragmentLoader { } } - std::vector id_list; - std::vector vdata_list; + if (vfile.empty()) { + basic_fragment_loader_ = + 
std::unique_ptr>( + new BasicEFileFragmentLoader(comm_spec_, spec)); + } else { + if (spec.idxer_type != IdxerType::kLocalIdxer) { + if (spec.rebalance) { + basic_fragment_loader_ = + std::unique_ptr>( + new BasicRbFragmentLoader(comm_spec_, spec)); + } else { + basic_fragment_loader_ = + std::unique_ptr>( + new BasicFragmentLoader(comm_spec_, spec)); + } + } else { + basic_fragment_loader_ = + std::unique_ptr>( + new BasicLocalFragmentLoader(comm_spec_, spec)); + } + } + if (!vfile.empty()) { auto io_adaptor = std::unique_ptr(new IOADAPTOR_T(vfile)); + io_adaptor->SetPartialRead(comm_spec_.worker_id(), + comm_spec_.worker_num()); io_adaptor->Open(); std::string line; vdata_t v_data; @@ -116,24 +137,12 @@ class EVFragmentLoader { VLOG(1) << e.what(); continue; } - id_list.push_back(vertex_id); - vdata_list.push_back(v_data); + basic_fragment_loader_->AddVertex(vertex_id, v_data); } io_adaptor->Close(); } - partitioner_t partitioner(comm_spec_.fnum(), id_list); - - basic_fragment_loader_.SetPartitioner(std::move(partitioner)); - - basic_fragment_loader_.Start(); - - { - size_t vnum = id_list.size(); - for (size_t i = 0; i < vnum; ++i) { - basic_fragment_loader_.AddVertex(id_list[i], vdata_list[i]); - } - } + basic_fragment_loader_->ConstructVertices(); { auto io_adaptor = @@ -162,7 +171,7 @@ class EVFragmentLoader { continue; } - basic_fragment_loader_.AddEdge(src, dst, e_data); + basic_fragment_loader_->AddEdge(src, dst, e_data); } io_adaptor->Close(); } @@ -170,11 +179,11 @@ class EVFragmentLoader { VLOG(1) << "[worker-" << comm_spec_.worker_id() << "] finished add vertices and edges"; - basic_fragment_loader_.ConstructFragment(fragment, spec.directed); + basic_fragment_loader_->ConstructFragment(fragment); if (spec.serialize) { - bool serialized = basic_fragment_loader_.SerializeFragment( - fragment, spec.serialization_prefix); + bool serialized = SerializeFragment( + fragment, comm_spec_, efile, vfile, spec); if (!serialized) { VLOG(2) << "[worker-" << 
comm_spec_.worker_id() << "] Serialization failed."; @@ -187,7 +196,7 @@ class EVFragmentLoader { private: CommSpec comm_spec_; - BasicFragmentLoader basic_fragment_loader_; + std::unique_ptr> basic_fragment_loader_; line_parser_t line_parser_; }; diff --git a/grape/fragment/ev_fragment_mutator.h b/grape/fragment/ev_fragment_mutator.h index 4b9bce35..ce658782 100644 --- a/grape/fragment/ev_fragment_mutator.h +++ b/grape/fragment/ev_fragment_mutator.h @@ -17,7 +17,6 @@ limitations under the License. #define GRAPE_FRAGMENT_EV_FRAGMENT_MUTATOR_H_ #include -#include #include namespace grape { diff --git a/grape/fragment/ev_fragment_rebalance_loader.h b/grape/fragment/ev_fragment_rebalance_loader.h deleted file mode 100644 index 814ba4f6..00000000 --- a/grape/fragment/ev_fragment_rebalance_loader.h +++ /dev/null @@ -1,432 +0,0 @@ -/** Copyright 2020 Alibaba Group Holding Limited. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#ifndef GRAPE_FRAGMENT_EV_FRAGMENT_REBALANCE_LOADER_H_ -#define GRAPE_FRAGMENT_EV_FRAGMENT_REBALANCE_LOADER_H_ - -#include - -#include -#include -#include -#include - -#include "grape/fragment/basic_fragment_loader.h" -#include "grape/fragment/partitioner.h" -#include "grape/io/line_parser_base.h" -#include "grape/io/local_io_adaptor.h" -#include "grape/io/tsv_line_parser.h" -#include "grape/worker/comm_spec.h" - -namespace grape { - -/** - * @brief EVFragmentLoader is a loader to load fragments from separated - * efile and vfile. 
- * - * @tparam FRAG_T Fragment type. - * @tparam IOADAPTOR_T IOAdaptor type. - * @tparam LINE_PARSER_T LineParser type. - */ -template > -class EVFragmentRebalanceLoader { - using fragment_t = FRAG_T; - using oid_t = typename fragment_t::oid_t; - using vid_t = typename fragment_t::vid_t; - using vdata_t = typename fragment_t::vdata_t; - using edata_t = typename fragment_t::edata_t; - - using vertex_map_t = typename fragment_t::vertex_map_t; - using partitioner_t = typename vertex_map_t::partitioner_t; - using line_parser_t = LINE_PARSER_T; - - static constexpr LoadStrategy load_strategy = fragment_t::load_strategy; - - static_assert(std::is_base_of, - LINE_PARSER_T>::value, - "LineParser type is invalid"); - - public: - explicit EVFragmentRebalanceLoader(const CommSpec& comm_spec) - : comm_spec_(comm_spec) {} - - ~EVFragmentRebalanceLoader() = default; - - std::shared_ptr LoadFragment(const std::string& efile, - const std::string& vfile, - const LoadGraphSpec& spec) { - std::shared_ptr fragment(nullptr); - if (spec.deserialize && (!spec.serialize)) { - bool deserialized = deserializeFragment(fragment, spec); - int flag = 0; - int sum = 0; - if (!deserialized) { - flag = 1; - } - MPI_Allreduce(&flag, &sum, 1, MPI_INT, MPI_SUM, comm_spec_.comm()); - if (sum != 0) { - fragment.reset(); - if (comm_spec_.worker_id() == 0) { - VLOG(2) << "Deserialization failed, start loading graph from " - "efile and vfile."; - } - } else { - return fragment; - } - } - - std::vector id_list; - std::vector vdata_list; - - CHECK(!vfile.empty()); - { - auto io_adaptor = std::unique_ptr(new IOADAPTOR_T(vfile)); - io_adaptor->Open(); - std::string line; - vdata_t v_data; - oid_t vertex_id; - size_t line_no = 0; - while (io_adaptor->ReadLine(line)) { - ++line_no; - if (line_no % 1000000 == 0) { - VLOG(10) << "[worker-" << comm_spec_.worker_id() << "][vfile] " - << line_no; - } - if (line.empty() || line[0] == '#') - continue; - try { - line_parser_.LineParserForVFile(line, vertex_id, 
v_data); - } catch (std::exception& e) { - VLOG(1) << e.what(); - continue; - } - id_list.push_back(vertex_id); - vdata_list.push_back(v_data); - } - io_adaptor->Close(); - } - - fid_t fnum = comm_spec_.fnum(); - partitioner_t partitioner(fnum, id_list); - - std::shared_ptr vm_ptr = - std::make_shared(comm_spec_); - vm_ptr->SetPartitioner(partitioner); - vm_ptr->Init(); - auto builder = vm_ptr->GetLocalBuilder(); - - for (auto id : id_list) { - builder.add_vertex(id); - } - builder.finish(*vm_ptr); - - std::vector src_list, dst_list; - std::vector edata_list; - { - auto io_adaptor = - std::unique_ptr(new IOADAPTOR_T(std::string(efile))); - io_adaptor->SetPartialRead(comm_spec_.worker_id(), - comm_spec_.worker_num()); - io_adaptor->Open(); - std::string line; - edata_t e_data; - oid_t src, dst; - vid_t src_gid, dst_gid; - - size_t lineNo = 0; - while (io_adaptor->ReadLine(line)) { - ++lineNo; - if (lineNo % 1000000 == 0) { - VLOG(10) << "[worker-" << comm_spec_.worker_id() << "][efile] " - << lineNo; - } - if (line.empty() || line[0] == '#') - continue; - - try { - line_parser_.LineParserForEFile(line, src, dst, e_data); - } catch (std::exception& e) { - VLOG(1) << e.what(); - continue; - } - - CHECK(vm_ptr->GetGid(src, src_gid)); - CHECK(vm_ptr->GetGid(dst, dst_gid)); - - src_list.push_back(src_gid); - dst_list.push_back(dst_gid); - edata_list.push_back(e_data); - } - io_adaptor->Close(); - } - - std::vector> degree_lists(fnum); - std::vector> gid_map(fnum); - for (fid_t i = 0; i < fnum; ++i) { - degree_lists[i].resize(vm_ptr->GetInnerVertexSize(i), 0); - gid_map[i].resize(vm_ptr->GetInnerVertexSize(i)); - } - - for (auto v : src_list) { - fid_t fid = vm_ptr->GetFidFromGid(v); - vid_t lid = vm_ptr->GetLidFromGid(v); - ++degree_lists[fid][lid]; - } - if (!spec.directed) { - for (auto v : dst_list) { - fid_t fid = vm_ptr->GetFidFromGid(v); - vid_t lid = vm_ptr->GetLidFromGid(v); - ++degree_lists[fid][lid]; - } - } - - for (fid_t i = 0; i < fnum; ++i) { - 
CHECK_LT(degree_lists[i].size(), - static_cast(std::numeric_limits::max())); - MPI_Allreduce(MPI_IN_PLACE, degree_lists[i].data(), - degree_lists[i].size(), MPI_INT, MPI_SUM, - comm_spec_.comm()); - } - - size_t total_edge_num = 0; - size_t total_vertex_num = 0; - for (auto& vec : degree_lists) { - total_vertex_num += vec.size(); - for (auto d : vec) { - total_edge_num += d; - } - } - - size_t total_score = - total_edge_num + total_vertex_num * spec.rebalance_vertex_factor; - std::vector scores_before(fnum, 0), scores_after(fnum, 0); - std::vector enum_before(fnum, 0), enum_after(fnum, 0); - - fid_t mapped_fid = 0; - vid_t mapped_lid = 0; - size_t cur_score = 0; - size_t expected_score = (total_score + fnum - 1) / fnum; - vid_t cur_num = 0; - std::vector vnum_list; - for (fid_t i = 0; i < fnum; ++i) { - vid_t vn = degree_lists[i].size(); - for (vid_t j = 0; j < vn; ++j) { - size_t v_score = spec.rebalance_vertex_factor + degree_lists[i][j]; - cur_score += v_score; - scores_before[i] += v_score; - enum_before[i] += degree_lists[i][j]; - scores_after[mapped_fid] += v_score; - enum_after[mapped_fid] += degree_lists[i][j]; - gid_map[i][j] = vm_ptr->Lid2Gid(mapped_fid, mapped_lid); - ++cur_num; - if (cur_score >= expected_score) { - ++mapped_fid; - mapped_lid = 0; - cur_score = 0; - vnum_list.push_back(cur_num); - cur_num = 0; - } else { - ++mapped_lid; - } - } - } - if (mapped_fid == fnum) { - CHECK_EQ(mapped_lid, 0); - } else { - CHECK_EQ(mapped_fid, fnum - 1); - vnum_list.push_back(cur_num); - } - - if (comm_spec_.worker_id() == 0) { - LOG(INFO) << "Total score = " << total_score; - for (fid_t i = 0; i < fnum; ++i) { - LOG(INFO) << "[frag-" << i - << "]: vertex_num: " << degree_lists[i].size() << " -> " - << vnum_list[i] << ", edge_num: " << enum_before[i] << " -> " - << enum_after[i] << ", score: " << scores_before[i] << " ->" - << scores_after[i]; - } - } - - for (auto& v : src_list) { - fid_t fid = vm_ptr->GetFidFromGid(v); - vid_t lid = vm_ptr->GetLidFromGid(v); 
- v = gid_map[fid][lid]; - } - for (auto& v : dst_list) { - fid_t fid = vm_ptr->GetFidFromGid(v); - vid_t lid = vm_ptr->GetLidFromGid(v); - v = gid_map[fid][lid]; - } - - vm_ptr->UpdateToBalance(vnum_list, gid_map); - - std::vector> edges_to_frag(fnum); - for (fid_t i = 0; i < fnum; ++i) { - int worker_id = comm_spec_.FragToWorker(i); - edges_to_frag[i].Init(comm_spec_.comm(), edge_tag, 4096000); - edges_to_frag[i].SetDestination(worker_id, i); - if (comm_spec_.worker_id() == worker_id) { - edges_to_frag[i].DisableComm(); - } - } - - std::vector> processed_vertices; - std::vector> processed_edges; - - std::thread edge_recv_thread([&]() { - ShuffleIn data_in; - data_in.Init(comm_spec_.fnum(), comm_spec_.comm(), edge_tag); - fid_t dst_fid; - int src_worker_id; - while (!data_in.Finished()) { - src_worker_id = data_in.Recv(dst_fid); - if (src_worker_id == -1) { - break; - } - CHECK_EQ(dst_fid, comm_spec_.fid()); - auto& buffers = data_in.buffers(); - foreach_rval(buffers, [&](vid_t&& src, vid_t&& dst, edata_t&& data) { - processed_edges.emplace_back(src, dst, std::move(data)); - }); - data_in.Clear(); - } - }); - - size_t local_enum = src_list.size(); - for (size_t i = 0; i < local_enum; ++i) { - fid_t src_fid = vm_ptr->GetFidFromGid(src_list[i]); - fid_t dst_fid = vm_ptr->GetFidFromGid(dst_list[i]); - edges_to_frag[src_fid].Emplace(src_list[i], dst_list[i], edata_list[i]); - if (src_fid != dst_fid) { - edges_to_frag[dst_fid].Emplace(src_list[i], dst_list[i], edata_list[i]); - } - } - - for (auto& ea : edges_to_frag) { - ea.Flush(); - } - - edge_recv_thread.join(); - { - auto& buffers = edges_to_frag[comm_spec_.fid()].buffers(); - foreach_rval(buffers, [&](vid_t&& src, vid_t&& dst, edata_t&& data) { - processed_edges.emplace_back(src, dst, std::move(data)); - }); - } - - size_t vertex_num = id_list.size(); - if (!std::is_same::value) { - for (size_t i = 0; i < vertex_num; ++i) { - vid_t gid; - CHECK(vm_ptr->GetGid(id_list[i], gid)); - fid_t fid = 
vm_ptr->GetFidFromGid(gid); - if (fid == comm_spec_.fid()) { - processed_vertices.emplace_back(gid, vdata_list[i]); - } - } - } - - fragment = std::shared_ptr(new fragment_t(vm_ptr)); - fragment->Init(comm_spec_.fid(), spec.directed, processed_vertices, - processed_edges); - - if (!std::is_same::value) { - for (size_t i = 0; i < vertex_num; ++i) { - typename fragment_t::vertex_t v; - if (fragment->GetVertex(id_list[i], v)) { - if (fragment->IsOuterVertex(v)) { - fragment->SetData(v, vdata_list[i]); - } - } - } - } - - if (spec.serialize) { - bool serialized = serializeFragment(fragment, vm_ptr, spec); - if (!serialized) { - VLOG(2) << "[worker-" << comm_spec_.worker_id() - << "] Serialization failed."; - } - } - - return fragment; - } - - private: - bool existSerializationFile(const std::string& prefix) { - char vm_fbuf[1024], frag_fbuf[1024]; - snprintf(vm_fbuf, sizeof(vm_fbuf), "%s/%s", prefix.c_str(), - kSerializationVertexMapFilename); - snprintf(frag_fbuf, sizeof(frag_fbuf), kSerializationFilenameFormat, - prefix.c_str(), comm_spec_.fid()); - std::string vm_path = vm_fbuf; - std::string frag_path = frag_fbuf; - return exists_file(vm_path) && exists_file(frag_path); - } - - bool deserializeFragment(std::shared_ptr& fragment, - const LoadGraphSpec& spec) { - std::string type_prefix = fragment_t::type_info(); - CHECK(spec.rebalance); - type_prefix += ("_rb_" + std::to_string(spec.rebalance_vertex_factor)); - std::string typed_prefix = spec.deserialization_prefix + "/" + type_prefix; - LOG(INFO) << "typed_prefix = " << typed_prefix; - if (!existSerializationFile(typed_prefix)) { - return false; - } - auto io_adaptor = - std::unique_ptr(new IOADAPTOR_T(typed_prefix)); - if (io_adaptor->IsExist()) { - std::shared_ptr vm_ptr = - std::make_shared(comm_spec_); - vm_ptr->template Deserialize(typed_prefix, comm_spec_.fid()); - fragment = std::shared_ptr(new fragment_t(vm_ptr)); - fragment->template Deserialize(typed_prefix, - comm_spec_.fid()); - return true; - } else { 
- return false; - } - } - - bool serializeFragment(std::shared_ptr fragment, - std::shared_ptr vm_ptr, - const LoadGraphSpec& spec) { - std::string type_prefix = fragment_t::type_info(); - CHECK(spec.rebalance); - type_prefix += ("_rb_" + std::to_string(spec.rebalance_vertex_factor)); - std::string typed_prefix = spec.serialization_prefix + "/" + type_prefix; - char serial_file[1024]; - snprintf(serial_file, sizeof(serial_file), "%s/%s", typed_prefix.c_str(), - kSerializationVertexMapFilename); - vm_ptr->template Serialize(typed_prefix); - fragment->template Serialize(typed_prefix); - - return true; - } - - static constexpr int edge_tag = 6; - - CommSpec comm_spec_; - line_parser_t line_parser_; -}; - -} // namespace grape - -#endif // GRAPE_FRAGMENT_EV_FRAGMENT_REBALANCE_LOADER_H_ diff --git a/grape/fragment/fragment_base.h b/grape/fragment/fragment_base.h index 8fca3a08..f2fbd3dc 100644 --- a/grape/fragment/fragment_base.h +++ b/grape/fragment/fragment_base.h @@ -22,6 +22,7 @@ limitations under the License. 
#include "grape/graph/adj_list.h" #include "grape/graph/edge.h" #include "grape/graph/vertex.h" +#include "grape/vertex_map/vertex_map.h" #include "grape/worker/comm_spec.h" namespace grape { @@ -51,25 +52,23 @@ template class FragmentBase { public: - using vertex_map_t = typename TRAITS_T::vertex_map_t; - using fragment_adj_list_t = typename TRAITS_T::fragment_adj_list_t; using fragment_const_adj_list_t = typename TRAITS_T::fragment_const_adj_list_t; FragmentBase() : vm_ptr_(nullptr) {} + virtual ~FragmentBase() {} - explicit FragmentBase(std::shared_ptr vm_ptr) - : vm_ptr_(vm_ptr) {} - - std::shared_ptr GetVertexMap() { return vm_ptr_; } - const std::shared_ptr GetVertexMap() const { return vm_ptr_; } + VertexMap& GetVertexMap() { return *vm_ptr_; } + const VertexMap& GetVertexMap() const { return *vm_ptr_; } protected: - void init(fid_t fid, bool directed) { + void init(fid_t fid, bool directed, + std::unique_ptr>&& vm_ptr) { fid_ = fid; directed_ = directed; - fnum_ = vm_ptr_->GetFragmentNum(); + fnum_ = vm_ptr->GetFragmentNum(); + vm_ptr_ = std::move(vm_ptr); id_parser_.init(fnum_); ivnum_ = vm_ptr_->GetInnerVertexSize(fid); } @@ -82,7 +81,8 @@ class FragmentBase { * @param vertices A set of vertices. * @param edges A set of edges. */ - virtual void Init(fid_t fid, bool directed, + virtual void Init(const CommSpec& comm_spec, bool directed, + std::unique_ptr>&& vm_ptr, std::vector>& vertices, std::vector>& edges) = 0; @@ -170,7 +170,7 @@ class FragmentBase { * @return Its original ID. 
*/ OID_T GetId(const Vertex& v) const { - OID_T oid; + OID_T oid{}; vm_ptr_->GetOid(Vertex2Gid(v), oid); return oid; } @@ -326,7 +326,7 @@ class FragmentBase { VID_T ivnum_; vertices_t vertices_; - std::shared_ptr vm_ptr_; + std::unique_ptr> vm_ptr_; IdParser id_parser_; }; diff --git a/grape/fragment/immutable_edgecut_fragment.h b/grape/fragment/immutable_edgecut_fragment.h index f955abb3..aecbc9d5 100644 --- a/grape/fragment/immutable_edgecut_fragment.h +++ b/grape/fragment/immutable_edgecut_fragment.h @@ -42,15 +42,13 @@ limitations under the License. #include "grape/types.h" #include "grape/util.h" #include "grape/utils/vertex_array.h" -#include "grape/vertex_map/global_vertex_map.h" #include "grape/worker/comm_spec.h" namespace grape { class CommSpec; class OutArchive; -template +template struct ImmutableEdgecutFragmentTraits { using inner_vertices_t = VertexRange; using outer_vertices_t = VertexRange; @@ -63,7 +61,6 @@ struct ImmutableEdgecutFragmentTraits { using csr_t = ImmutableCSR>; using csr_builder_t = ImmutableCSRBuild>; using mirror_vertices_t = std::vector>; - using vertex_map_t = VERTEX_MAP_T; }; /** @@ -114,16 +111,14 @@ struct ImmutableEdgecutFragmentTraits { * */ template > + LoadStrategy _load_strategy = LoadStrategy::kOnlyOut> class ImmutableEdgecutFragment : public CSREdgecutFragmentBase< OID_T, VID_T, VDATA_T, EDATA_T, - ImmutableEdgecutFragmentTraits> { + ImmutableEdgecutFragmentTraits> { public: - using traits_t = ImmutableEdgecutFragmentTraits; + using traits_t = + ImmutableEdgecutFragmentTraits; using base_t = CSREdgecutFragmentBase; using internal_vertex_t = internal::Vertex; @@ -134,7 +129,6 @@ class ImmutableEdgecutFragment using oid_t = OID_T; using vdata_t = VDATA_T; using edata_t = EDATA_T; - using vertex_map_t = typename traits_t::vertex_map_t; using IsEdgeCut = std::true_type; using IsVertexCut = std::false_type; @@ -155,10 +149,8 @@ class ImmutableEdgecutFragment template using vertex_array_t = VertexArray; - 
ImmutableEdgecutFragment() {} - - explicit ImmutableEdgecutFragment(std::shared_ptr vm_ptr) - : FragmentBase(vm_ptr) {} + ImmutableEdgecutFragment() + : FragmentBase() {} virtual ~ImmutableEdgecutFragment() = default; @@ -167,41 +159,65 @@ class ImmutableEdgecutFragment using base_t::IsInnerVertexGid; static std::string type_info() { - std::string ret = ""; + std::string ret = "ImmutableEdgecutFragment<"; + if (std::is_same::value) { + ret += "int64_t, "; + } else if (std::is_same::value) { + ret += "int32_t, "; + } else if (std::is_same::value) { + ret += "std::string, "; + } else { + LOG(FATAL) << "OID_T type not supported..."; + } + + if (std::is_same::value) { + ret += "uint64_t, "; + } else if (std::is_same::value) { + ret += "uint32_t, "; + } else { + LOG(FATAL) << "VID_T type not supported..."; + } + + if (std::is_same::value) { + ret += "empty, "; + } else if (std::is_same::value) { + ret += "double, "; + } else if (std::is_same::value) { + ret += "float, "; + } else { + LOG(FATAL) << "Vertex data type not supported..."; + } + if (std::is_same::value) { - ret += "empty"; + ret += "empty, "; } else if (std::is_same::value) { - ret += "double"; + ret += "double, "; } else if (std::is_same::value) { - ret += "float"; + ret += "float, "; } else { LOG(FATAL) << "Edge data type not supported..."; } if (_load_strategy == LoadStrategy::kOnlyOut) { - ret += "_out"; + ret += "out"; } else if (_load_strategy == LoadStrategy::kOnlyIn) { - ret += "_in"; + ret += "in"; } else if (_load_strategy == LoadStrategy::kBothOutIn) { - ret += "_both"; + ret += "both"; } else { LOG(FATAL) << "Invalid load strategy..."; } - using partitioner_t = typename VERTEX_MAP_T::partitioner_t; - if (std::is_same>::value) { - ret += "_hash"; - } else if (std::is_same>::value) { - ret += "_seg"; - } + ret += ">"; return ret; } - void Init(fid_t fid, bool directed, std::vector& vertices, + void Init(const CommSpec& comm_spec, bool directed, + std::unique_ptr>&& vm_ptr, + std::vector& vertices, 
std::vector& edges) override { - init(fid, directed); + init(comm_spec.fid(), directed, std::move(vm_ptr)); static constexpr VID_T invalid_vid = std::numeric_limits::max(); { @@ -365,10 +381,13 @@ class ImmutableEdgecutFragment } template - void Deserialize(const std::string& prefix, const fid_t fid) { + void Deserialize(const CommSpec& comm_spec, + std::unique_ptr>&& vm_ptr, + const std::string& prefix) { + vm_ptr_ = std::move(vm_ptr); char fbuf[1024]; snprintf(fbuf, sizeof(fbuf), kSerializationFilenameFormat, prefix.c_str(), - fid); + comm_spec.fid()); auto io_adaptor = std::unique_ptr(new IOADAPTOR_T(std::string(fbuf))); io_adaptor->Open(); @@ -722,6 +741,7 @@ class ImmutableEdgecutFragment using base_t::fid_; using base_t::fnum_; using base_t::id_parser_; + using base_t::vm_ptr_; ska::flat_hash_map, std::equal_to, Allocator>> diff --git a/grape/fragment/loader.h b/grape/fragment/loader.h index 167e9a0a..d2abe7af 100644 --- a/grape/fragment/loader.h +++ b/grape/fragment/loader.h @@ -21,8 +21,6 @@ limitations under the License. 
#include "grape/fragment/ev_fragment_loader.h" #include "grape/fragment/ev_fragment_mutator.h" -#include "grape/fragment/ev_fragment_rebalance_loader.h" -#include "grape/fragment/partitioner.h" #include "grape/io/local_io_adaptor.h" namespace grape { @@ -48,19 +46,9 @@ static std::shared_ptr LoadGraph( const std::string& efile, const std::string& vfile, const CommSpec& comm_spec, const LoadGraphSpec& spec = DefaultLoadGraphSpec()) { - if (spec.rebalance) { - std::unique_ptr< - EVFragmentRebalanceLoader> - loader( - new EVFragmentRebalanceLoader( - comm_spec)); - return loader->LoadFragment(efile, vfile, spec); - } else { - std::unique_ptr> - loader(new EVFragmentLoader( - comm_spec)); - return loader->LoadFragment(efile, vfile, spec); - } + std::unique_ptr> loader( + new EVFragmentLoader(comm_spec)); + return loader->LoadFragment(efile, vfile, spec); } template +template struct MutableEdgecutFragmentTraits { using inner_vertices_t = VertexRange; using outer_vertices_t = VertexRange; @@ -46,22 +44,17 @@ struct MutableEdgecutFragmentTraits { using csr_t = DeMutableCSR>; using csr_builder_t = DeMutableCSRBuilder>; - using vertex_map_t = VERTEX_MAP_T; using mirror_vertices_t = std::vector>; }; template >> + LoadStrategy _load_strategy = LoadStrategy::kOnlyOut> class MutableEdgecutFragment : public CSREdgecutFragmentBase< OID_T, VID_T, VDATA_T, EDATA_T, - MutableEdgecutFragmentTraits> { + MutableEdgecutFragmentTraits> { public: - using traits_t = MutableEdgecutFragmentTraits; + using traits_t = MutableEdgecutFragmentTraits; using base_t = CSREdgecutFragmentBase; using internal_vertex_t = internal::Vertex; @@ -73,8 +66,6 @@ class MutableEdgecutFragment using vdata_t = VDATA_T; using edata_t = EDATA_T; - using vertex_map_t = typename traits_t::vertex_map_t; - using IsEdgeCut = std::true_type; using IsVertexCut = std::false_type; @@ -96,19 +87,73 @@ class MutableEdgecutFragment template using vertex_array_t = VertexArray; - explicit MutableEdgecutFragment(std::shared_ptr 
vm_ptr) - : FragmentBase(vm_ptr) {} + MutableEdgecutFragment() + : FragmentBase() {} virtual ~MutableEdgecutFragment() = default; using base_t::buildCSR; using base_t::init; using base_t::IsInnerVertexGid; - static std::string type_info() { return ""; } + static std::string type_info() { + std::string ret = "MutableEdgecutFragment<"; + if (std::is_same::value) { + ret += "int64_t, "; + } else if (std::is_same::value) { + ret += "int32_t, "; + } else if (std::is_same::value) { + ret += "std::string, "; + } else { + LOG(FATAL) << "OID_T type not supported..."; + } + + if (std::is_same::value) { + ret += "uint64_t, "; + } else if (std::is_same::value) { + ret += "uint32_t, "; + } else { + LOG(FATAL) << "VID_T type not supported..."; + } + + if (std::is_same::value) { + ret += "empty, "; + } else if (std::is_same::value) { + ret += "double, "; + } else if (std::is_same::value) { + ret += "float, "; + } else { + LOG(FATAL) << "Vertex data type not supported..."; + } + + if (std::is_same::value) { + ret += "empty, "; + } else if (std::is_same::value) { + ret += "double, "; + } else if (std::is_same::value) { + ret += "float, "; + } else { + LOG(FATAL) << "Edge data type not supported..."; + } + + if (_load_strategy == LoadStrategy::kOnlyOut) { + ret += "out"; + } else if (_load_strategy == LoadStrategy::kOnlyIn) { + ret += "in"; + } else if (_load_strategy == LoadStrategy::kBothOutIn) { + ret += "both"; + } else { + LOG(FATAL) << "Invalid load strategy..."; + } + + ret += ">"; + return ret; + } - void Init(fid_t fid, bool directed, std::vector& vertices, + void Init(const CommSpec& comm_spec, bool directed, + std::unique_ptr>&& vm_ptr, + std::vector& vertices, std::vector& edges) override { - init(fid, directed); + init(comm_spec.fid(), directed, std::move(vm_ptr)); ovnum_ = 0; static constexpr VID_T invalid_vid = std::numeric_limits::max(); @@ -387,10 +432,13 @@ class MutableEdgecutFragment } template - void Deserialize(const std::string& prefix, const fid_t fid) { + 
void Deserialize(const CommSpec& comm_spec, + std::unique_ptr>&& vm_ptr, + const std::string& prefix) { + vm_ptr_ = std::move(vm_ptr); char fbuf[1024]; snprintf(fbuf, sizeof(fbuf), kSerializationFilenameFormat, prefix.c_str(), - fid); + comm_spec.fid()); auto io_adaptor = std::unique_ptr(new IOADAPTOR_T(std::string(fbuf))); io_adaptor->Open(); diff --git a/grape/fragment/rebalancer.h b/grape/fragment/rebalancer.h new file mode 100644 index 00000000..103cfc65 --- /dev/null +++ b/grape/fragment/rebalancer.h @@ -0,0 +1,170 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef GRAPE_FRAGMENT_REBALANCER_H_ +#define GRAPE_FRAGMENT_REBALANCER_H_ + +#include + +#include "grape/types.h" +#include "grape/vertex_map/vertex_map.h" + +namespace grape { + +template +class Rebalancer { + using internal_oid_t = typename InternalOID::type; + using vid_t = VID_T; + + public: + Rebalancer(int vertex_factor, + std::unique_ptr>&& vertex_map) + : vertex_factor_(vertex_factor), vertex_map_(std::move(vertex_map)) { + fid_t fnum = vertex_map_->GetFragmentNum(); + id_parser_.init(fnum); + degree_.resize(fnum); + for (fid_t i = 0; i < fnum; ++i) { + degree_[i].resize(vertex_map_->GetInnerVertexSize(i), 0); + } + } + + void inc_degree(const OID_T& oid) { + VID_T gid; + if (vertex_map_->GetGid(oid, gid)) { + fid_t fid = id_parser_.get_fragment_id(gid); + vid_t lid = id_parser_.get_local_id(gid); + ++degree_[fid][lid]; + } + } + + void finish(const CommSpec& comm_spec, + VertexMap& new_vertex_map) { + fid_t fnum = vertex_map_->GetFragmentNum(); + fid_t self_fid = comm_spec.fid(); + for (auto& deg : degree_) { + MPI_Allreduce(MPI_IN_PLACE, deg.data(), deg.size(), MPI_INT, MPI_SUM, + comm_spec.comm()); + } + size_t total_score = 0; + std::vector frag_scores_before, frag_scores_after; + for (auto& vec : degree_) { + size_t cur_score = vec.size() * vertex_factor_; + for (auto deg : vec) { + cur_score += deg; + } + + frag_scores_before.push_back(cur_score); + total_score += cur_score; + } + size_t expected_score = (total_score + fnum - 1) / fnum; + std::vector native_oids; + std::unique_ptr> new_partitioner(nullptr); + if (vertex_map_->GetPartitioner().type() == + PartitionerType::kMapPartitioner) { + fid_t cur_fid = 0; + size_t cur_score = 0; + + new_partitioner = std::unique_ptr>( + new MapPartitioner(fnum)); + frag_scores_after.resize(fnum, 0); + for (fid_t i = 0; i < fnum; ++i) { + vid_t vnum = vertex_map_->GetInnerVertexSize(i); + for (vid_t j = 0; j < vnum; ++j) { + OID_T cur_oid; + CHECK(vertex_map_->GetOid(i, j, cur_oid)); + 
new_partitioner->SetPartitionId(internal_oid_t(cur_oid), cur_fid); + if (cur_fid == self_fid) { + native_oids.push_back(cur_oid); + } + + size_t v_score = degree_[i][j] + vertex_factor_; + frag_scores_after[cur_fid] += v_score; + + cur_score += v_score; + if (cur_score > expected_score && cur_fid < (fnum - 1)) { + ++cur_fid; + cur_score = 0; + } + } + } + CHECK_LE(cur_fid, fnum); + } else if (vertex_map_->GetPartitioner().type() == + PartitionerType::kSegmentedPartitioner) { + size_t cur_score = 0; + fid_t cur_fid = 0; + bool is_boundary = false; + std::vector boundaries; + frag_scores_after.resize(fnum, 0); + for (fid_t i = 0; i < fnum; ++i) { + std::vector> frag_vertices; + vid_t vnum = vertex_map_->GetInnerVertexSize(i); + frag_vertices.reserve(vnum); + for (vid_t j = 0; j < vnum; ++j) { + OID_T cur_oid; + CHECK(vertex_map_->GetOid(i, j, cur_oid)); + frag_vertices.emplace_back(cur_oid, j); + } + std::sort( + frag_vertices.begin(), frag_vertices.end(), + [](const std::pair& a, + const std::pair& b) { return a.first < b.first; }); + + for (auto& pair : frag_vertices) { + if (is_boundary) { + boundaries.push_back(pair.first); + is_boundary = false; + } + if (cur_fid == self_fid) { + native_oids.push_back(pair.first); + } + frag_scores_after[cur_fid] += + (degree_[i][pair.second] + vertex_factor_); + cur_score += (degree_[i][pair.second] + vertex_factor_); + if (cur_score >= expected_score && cur_fid < (fnum - 1)) { + is_boundary = true; + ++cur_fid; + cur_score = 0; + } + } + } + CHECK_EQ(boundaries.size(), fnum - 1); + new_partitioner = std::unique_ptr>( + new SegmentedPartitioner(boundaries)); + } else { + LOG(FATAL) << "Unsupported partitioner type - " + << static_cast(vertex_map_->GetPartitioner().type()); + } + IdxerType idxer_type = vertex_map_->idxer_type(); + CHECK(idxer_type != IdxerType::kLocalIdxer) + << "Rebalancer only supports global vertex map"; + VertexMapBuilder builder( + self_fid, fnum, std::move(new_partitioner), idxer_type); + for (auto& oid : 
native_oids) { + builder.add_vertex(oid); + } + builder.finish(comm_spec, new_vertex_map); + } + + private: + int vertex_factor_; + std::unique_ptr> vertex_map_; + IdParser id_parser_; + + std::vector> degree_; +}; + +} // namespace grape + +#endif // GRAPE_FRAGMENT_REBALANCER_H_ diff --git a/grape/graph/id_indexer.h b/grape/graph/id_indexer.h index 0be9e7f8..923e8fdf 100644 --- a/grape/graph/id_indexer.h +++ b/grape/graph/id_indexer.h @@ -21,8 +21,11 @@ limitations under the License. #include #include "flat_hash_map/flat_hash_map.hpp" +#include "grape/communication/sync_comm.h" #include "grape/config.h" +#include "grape/io/io_adaptor_base.h" #include "grape/types.h" +#include "grape/utils/ref_vector.h" #include "grape/utils/string_view_vector.h" namespace grape { @@ -32,118 +35,249 @@ namespace id_indexer_impl { static constexpr int8_t min_lookups = 4; static constexpr double max_load_factor = 0.5f; -inline int8_t log2(size_t value) { - static constexpr int8_t table[64] = { - 63, 0, 58, 1, 59, 47, 53, 2, 60, 39, 48, 27, 54, 33, 42, 3, - 61, 51, 37, 40, 49, 18, 28, 20, 55, 30, 34, 11, 43, 14, 22, 4, - 62, 57, 46, 52, 38, 26, 32, 41, 50, 36, 17, 19, 29, 10, 13, 21, - 56, 45, 25, 31, 35, 16, 9, 12, 44, 24, 15, 8, 23, 7, 6, 5}; - value |= value >> 1; - value |= value >> 2; - value |= value >> 4; - value |= value >> 8; - value |= value >> 16; - value |= value >> 32; - return table[((value - (value >> 1)) * 0x07EDD5E59A4E28C2) >> 58]; +template +size_t vec_dump_bytes(T const& vec) { + return vec.size() * sizeof(vec.front()) + sizeof(typename T::size_type); } template struct KeyBuffer { - using type = std::vector>; + public: + KeyBuffer() = default; + ~KeyBuffer() = default; + + const T& get(size_t idx) const { return inner_[idx]; } + void set(size_t idx, const T& val) { inner_[idx] = val; } + + void push_back(const T& val) { inner_.push_back(val); } + + size_t size() const { return inner_.size(); } + + std::vector>& buffer() { return inner_; } + const std::vector>& 
buffer() const { return inner_; } template - static void serialize(std::unique_ptr& writer, type& buffer) { - size_t size = buffer.size(); + void serialize(std::unique_ptr& writer) const { + size_t size = inner_.size(); CHECK(writer->Write(&size, sizeof(size_t))); if (size > 0) { - CHECK(writer->Write(buffer.data(), size * sizeof(T))); + CHECK(writer->Write(const_cast(inner_.data()), size * sizeof(T))); } } + void serialize_to_mem(std::vector& buf) const { + encode_vec(inner_, buf); + } + template - static void deserialize(std::unique_ptr& reader, type& buffer) { + void deserialize(std::unique_ptr& reader) { size_t size; CHECK(reader->Read(&size, sizeof(size_t))); if (size > 0) { - buffer.resize(size); - CHECK(reader->Read(buffer.data(), size * sizeof(T))); + inner_.resize(size); + CHECK(reader->Read(inner_.data(), size * sizeof(T))); } } - static void SendTo(const type& buffer, int dst_worker_id, int tag, - MPI_Comm comm) { - sync_comm::Send(buffer, dst_worker_id, tag, comm); + void swap(KeyBuffer& rhs) { inner_.swap(rhs.inner_); } + + void clear() { inner_.clear(); } + + template + void load(Loader& loader) { + loader.load_vec(inner_); } - static void RecvFrom(type& buffer, int src_worker_id, int tag, - MPI_Comm comm) { - sync_comm::Recv(buffer, src_worker_id, tag, comm); + template + void dump(Dumper& dumper) const { + dumper.dump_vec(inner_); } + + size_t dump_size() const { return vec_dump_bytes(inner_); } + + private: + std::vector> inner_; }; template <> struct KeyBuffer { - using type = StringViewVector; + KeyBuffer() = default; + ~KeyBuffer() = default; + + nonstd::string_view get(size_t idx) const { return inner_[idx]; } + + void push_back(const nonstd::string_view& val) { inner_.push_back(val); } + + size_t size() const { return inner_.size(); } + + StringViewVector& buffer() { return inner_; } + const StringViewVector& buffer() const { return inner_; } template - static void serialize(std::unique_ptr& writer, type& buffer) { - size_t content_buffer_size 
= buffer.content_buffer().size(); - CHECK(writer->Write(&content_buffer_size, sizeof(size_t))); - if (content_buffer_size > 0) { - CHECK(writer->Write(buffer.content_buffer().data(), - content_buffer_size * sizeof(char))); - } - size_t offset_buffer_size = buffer.offset_buffer().size(); - CHECK(writer->Write(&offset_buffer_size, sizeof(size_t))); - if (offset_buffer_size > 0) { - CHECK(writer->Write(buffer.offset_buffer().data(), - offset_buffer_size * sizeof(size_t))); - } + void serialize(std::unique_ptr& writer) const { + inner_.serialize(writer); + } + + void serialize_to_mem(std::vector& buf) const { + inner_.serialize_to_mem(buf); } template - static void deserialize(std::unique_ptr& reader, type& buffer) { - size_t content_buffer_size; - CHECK(reader->Read(&content_buffer_size, sizeof(size_t))); - if (content_buffer_size > 0) { - buffer.content_buffer().resize(content_buffer_size); - CHECK(reader->Read(buffer.content_buffer().data(), - content_buffer_size * sizeof(char))); - } - size_t offset_buffer_size; - CHECK(reader->Read(&offset_buffer_size, sizeof(size_t))); - if (offset_buffer_size > 0) { - buffer.offset_buffer().resize(offset_buffer_size); - CHECK(reader->Read(buffer.offset_buffer().data(), - offset_buffer_size * sizeof(size_t))); - } + void deserialize(std::unique_ptr& reader) { + inner_.deserialize(reader); + } + + void swap(KeyBuffer& rhs) { inner_.swap(rhs.inner_); } + + void clear() { inner_.clear(); } + + template + void load(Loader& loader) { + loader.load_vec(inner_.content_buffer()); + loader.load_vec(inner_.offset_buffer()); } - static void SendTo(const type& buffer, int dst_worker_id, int tag, - MPI_Comm comm) { - sync_comm::Send(buffer, dst_worker_id, tag, comm); + template + void dump(Dumper& dumper) const { + dumper.dump_vec(inner_.content_buffer()); + dumper.dump_vec(inner_.offset_buffer()); } - static void RecvFrom(type& buffer, int src_worker_id, int tag, - MPI_Comm comm) { - sync_comm::Recv(buffer, src_worker_id, tag, comm); + 
size_t dump_size() const { + return vec_dump_bytes(inner_.content_buffer()) + + vec_dump_bytes(inner_.offset_buffer()); } + + private: + StringViewVector inner_; +}; + +#if __cplusplus >= 201703L +template <> +struct KeyBuffer { + KeyBuffer() = default; + ~KeyBuffer() = default; + + std::string_view get(size_t idx) const { + std::string_view view(inner_[idx].data(), inner_[idx].size()); + return view; + } + + void push_back(const std::string_view& val) { + nonstd::string_view view(val.data(), val.size()); + inner_.push_back(view); + } + + size_t size() const { return inner_.size(); } + + StringViewVector& buffer() { return inner_; } + const StringViewVector& buffer() const { return inner_; } + + void swap(KeyBuffer& rhs) { inner_.swap(rhs.inner_); } + + void clear() { inner_.clear(); } + + template + void load(Loader& loader) { + loader.load_vec(inner_.content_buffer()); + loader.load_vec(inner_.offset_buffer()); + } + + template + void dump(Dumper& dumper) const { + dumper.dump_vec(inner_.content_buffer()); + dumper.dump_vec(inner_.offset_buffer()); + } + + size_t dump_size() { + return vec_dump_bytes(inner_.content_buffer()) + + vec_dump_bytes(inner_.offset_buffer()); + } + + private: + StringViewVector inner_; +}; +#endif + +template +struct KeyBufferView { + public: + KeyBufferView() {} + + size_t init(const void* buffer, size_t size) { + return inner_.init(buffer, size); + } + + T get(size_t idx) const { return inner_.get(idx); } + + size_t size() const { return inner_.size(); } + + template + void load(Loader& loader) { + inner_.load(loader); + } + + private: + ref_vector inner_; }; } // namespace id_indexer_impl +namespace sync_comm { + +template +struct CommImpl> { + static void send(const id_indexer_impl::KeyBuffer& buf, int dst_worker_id, + int tag, MPI_Comm comm) { + Send(buf.buffer(), dst_worker_id, tag, comm); + } + + static void recv(id_indexer_impl::KeyBuffer& buf, int src_worker_id, + int tag, MPI_Comm comm) { + Recv(buf.buffer(), src_worker_id, 
tag, comm); + } +}; + +template <> +struct CommImpl> { + static void send(const id_indexer_impl::KeyBuffer& buf, + int dst_worker_id, int tag, MPI_Comm comm) { + Send(buf.buffer(), dst_worker_id, tag, comm); + } + + static void recv(id_indexer_impl::KeyBuffer& buf, + int src_worker_id, int tag, MPI_Comm comm) { + Recv(buf.buffer(), src_worker_id, tag, comm); + } +}; + +} // namespace sync_comm + template class IdIndexer { public: - using key_buffer_t = typename id_indexer_impl::KeyBuffer::type; + using key_buffer_t = typename id_indexer_impl::KeyBuffer; using ind_buffer_t = std::vector>; using dist_buffer_t = std::vector>; IdIndexer() : hasher_() { reset_to_empty_state(); } - ~IdIndexer() {} + IdIndexer(IdIndexer&& rhs) { swap(rhs); } + ~IdIndexer() = default; + + IdIndexer& operator=(IdIndexer&& rhs) { + swap(rhs); + return *this; + } size_t entry_num() const { return distances_.size(); } + size_t memory_usage() const { + size_t ret = keys_.dump_size(); + ret += indices_.size() * sizeof(INDEX_T); + ret += distances_.size() * sizeof(int8_t); + return ret; + } + bool add(const KEY_T& oid, INDEX_T& lid) { size_t index = hash_policy_.index_for_hash(hasher_(oid), num_slots_minus_one_); @@ -152,7 +286,7 @@ class IdIndexer { for (; distances_[index] >= distance_from_desired; ++index, ++distance_from_desired) { INDEX_T cur_lid = indices_[index]; - if (keys_[cur_lid] == oid) { + if (keys_.get(cur_lid) == oid) { lid = cur_lid; return false; } @@ -174,7 +308,7 @@ class IdIndexer { for (; distances_[index] >= distance_from_desired; ++index, ++distance_from_desired) { INDEX_T cur_lid = indices_[index]; - if (keys_[cur_lid] == oid) { + if (keys_.get(cur_lid) == oid) { lid = cur_lid; return false; } @@ -196,7 +330,7 @@ class IdIndexer { for (; distances_[index] >= distance_from_desired; ++index, ++distance_from_desired) { INDEX_T cur_lid = indices_[index]; - if (keys_[cur_lid] == oid) { + if (keys_.get(cur_lid) == oid) { lid = cur_lid; return false; } @@ -218,7 +352,7 @@ class 
IdIndexer { for (; distances_[index] >= distance_from_desired; ++index, ++distance_from_desired) { INDEX_T cur_lid = indices_[index]; - if (keys_[cur_lid] == oid) { + if (keys_.get(cur_lid) == oid) { lid = cur_lid; return false; } @@ -239,7 +373,7 @@ class IdIndexer { int8_t distance_from_desired = 0; for (; distances_[index] >= distance_from_desired; ++index, ++distance_from_desired) { - if (keys_[indices_[index]] == oid) { + if (keys_.get(indices_[index]) == oid) { return; } } @@ -258,7 +392,7 @@ class IdIndexer { int8_t distance_from_desired = 0; for (; distances_[index] >= distance_from_desired; ++index, ++distance_from_desired) { - if (keys_[indices_[index]] == oid) { + if (keys_.get(indices_[index]) == oid) { return; } } @@ -282,7 +416,7 @@ class IdIndexer { if (lid >= num_elements_) { return false; } - oid = keys_[lid]; + oid = keys_.get(lid); return true; } @@ -292,7 +426,7 @@ class IdIndexer { for (int8_t distance = 0; distances_[index] >= distance; ++distance, ++index) { INDEX_T ret = indices_[index]; - if (keys_[ret] == oid) { + if (keys_.get(ret) == oid) { lid = ret; return true; } @@ -305,7 +439,7 @@ class IdIndexer { for (int8_t distance = 0; distances_[index] >= distance; ++distance, ++index) { INDEX_T ret = indices_[index]; - if (keys_[ret] == oid) { + if (keys_.get(ret) == oid) { lid = ret; return true; } @@ -332,49 +466,73 @@ class IdIndexer { template void Serialize(std::unique_ptr& writer) { - id_indexer_impl::KeyBuffer::serialize(writer, keys_); - InArchive arc; - arc << hash_policy_.get_mod_function_index() << max_lookups_ - << num_elements_ << num_slots_minus_one_ << indices_.size() - << distances_.size(); - CHECK(writer->WriteArchive(arc)); - arc.Clear(); - + keys_.serialize(writer); + + size_t mod_function_index = hash_policy_.get_mod_function_index(); + int8_t max_lookups_val = max_lookups_; + size_t num_elements_val = num_elements_; + size_t num_slots_minus_one_val = num_slots_minus_one_; + CHECK(writer->Write(&mod_function_index, 
sizeof(size_t))); + CHECK(writer->Write(&max_lookups_val, sizeof(int8_t))); + CHECK(writer->Write(&num_elements_val, sizeof(size_t))); + CHECK(writer->Write(&num_slots_minus_one_val, sizeof(size_t))); + + size_t indices_size = indices_.size(); + CHECK(writer->Write(&indices_size, sizeof(size_t))); if (indices_.size() > 0) { - CHECK(writer->Write(indices_.data(), indices_.size() * sizeof(INDEX_T))); + CHECK(writer->Write(const_cast(indices_.data()), + indices_size * sizeof(INDEX_T))); } + size_t distances_size = distances_.size(); + CHECK(writer->Write(&distances_size, sizeof(size_t))); if (distances_.size() > 0) { - CHECK( - writer->Write(distances_.data(), distances_.size() * sizeof(int8_t))); + CHECK(writer->Write(const_cast(distances_.data()), + distances_size * sizeof(int8_t))); } } template void Deserialize(std::unique_ptr& reader) { - id_indexer_impl::KeyBuffer::deserialize(reader, keys_); - OutArchive arc; - CHECK(reader->ReadArchive(arc)); - size_t mod_function_index; - size_t indices_size, distances_size; - arc >> mod_function_index >> max_lookups_ >> num_elements_ >> - num_slots_minus_one_ >> indices_size >> distances_size; - arc.Clear(); + keys_.deserialize(reader); + size_t mod_function_index; + CHECK(reader->Read(&mod_function_index, sizeof(size_t))); hash_policy_.set_mod_function_by_index(mod_function_index); + CHECK(reader->Read(&max_lookups_, sizeof(int8_t))); + CHECK(reader->Read(&num_elements_, sizeof(size_t))); + CHECK(reader->Read(&num_slots_minus_one_, sizeof(size_t))); + + size_t indices_size; + CHECK(reader->Read(&indices_size, sizeof(size_t))); indices_.resize(indices_size); - distances_.resize(distances_size); if (indices_size > 0) { CHECK(reader->Read(indices_.data(), indices_.size() * sizeof(INDEX_T))); } + + size_t distances_size; + CHECK(reader->Read(&distances_size, sizeof(size_t))); + distances_.resize(distances_size); if (distances_size > 0) { CHECK( reader->Read(distances_.data(), distances_.size() * sizeof(int8_t))); } } + void 
serialize_to_mem(std::vector& buf) const { + keys_.serialize_to_mem(buf); + size_t mod_function_index = hash_policy_.get_mod_function_index(); + encode_val(mod_function_index, buf); + encode_val(max_lookups_, buf); + encode_val(num_elements_, buf); + encode_val(num_slots_minus_one_, buf); + + encode_vec(indices_, buf); + encode_vec(distances_, buf); + } + private: void emplace(INDEX_T lid) { - KEY_T key = keys_[lid]; + KEY_T key = keys_.get(lid); size_t index = hash_policy_.index_for_hash(hasher_(key), num_slots_minus_one_); int8_t distance_from_desired = 0; @@ -484,7 +642,7 @@ class IdIndexer { } static int8_t compute_max_lookups(size_t num_buckets) { - int8_t desired = id_indexer_impl::log2(num_buckets); + int8_t desired = ska::detailv3::log2(num_buckets); return std::max(id_indexer_impl::min_lookups, desired); } @@ -503,6 +661,91 @@ class IdIndexer { std::hash hasher_; }; +template +class IdIndexerView { + public: + IdIndexerView() : hasher_() {} + ~IdIndexerView() = default; + + void Init(const void* data, size_t size) { + const char* ptr = reinterpret_cast(data); + size_t cur = keys_.init(ptr, size); + ptr += cur; + + size_t mod_function_index; + ptr = decode_val(mod_function_index, ptr); + hash_policy_.set_mod_function_by_index(mod_function_index); + + ptr = decode_val(max_lookups_, ptr); + ptr = decode_val(num_elements_, ptr); + ptr = decode_val(num_slots_minus_one_, ptr); + + size_t used_size = ptr - reinterpret_cast(data); + size -= used_size; + + cur = indices_.init(ptr, size); + ptr += cur; + size -= cur; + + distances_.init(ptr, size); + } + + size_t entry_num() const { return distances_.size(); } + + size_t bucket_count() const { + return num_slots_minus_one_ ? 
num_slots_minus_one_ + 1 : 0; + } + + size_t size() const { return num_elements_; } + + bool get_key(INDEX_T lid, KEY_T& oid) const { + if (lid >= num_elements_) { + return false; + } + oid = keys_.get(lid); + return true; + } + + bool get_index(const KEY_T& oid, INDEX_T& lid) const { + size_t index = + hash_policy_.index_for_hash(hasher_(oid), num_slots_minus_one_); + for (int8_t distance = 0; distances_.get(index) >= distance; + ++distance, ++index) { + INDEX_T ret = indices_.get(index); + if (keys_.get(ret) == oid) { + lid = ret; + return true; + } + } + return false; + } + + bool _get_index(const KEY_T& oid, size_t hash, INDEX_T& lid) const { + size_t index = hash_policy_.index_for_hash(hash, num_slots_minus_one_); + for (int8_t distance = 0; distances_.get(index) >= distance; + ++distance, ++index) { + INDEX_T ret = indices_.get(index); + if (keys_.get(ret) == oid) { + lid = ret; + return true; + } + } + return false; + } + + private: + typename id_indexer_impl::KeyBufferView keys_; + ref_vector indices_; + ref_vector distances_; + + ska::ska::prime_number_hash_policy hash_policy_; + int8_t max_lookups_ = id_indexer_impl::min_lookups - 1; + size_t num_elements_ = 0; + size_t num_slots_minus_one_ = 0; + + std::hash hasher_; +}; + namespace sync_comm { template @@ -513,8 +756,7 @@ struct CommImpl> { arc << indexer.hash_policy_.get_mod_function_index() << indexer.max_lookups_ << indexer.num_elements_ << indexer.num_slots_minus_one_; Send(arc, dst_worker_id, tag, comm); - id_indexer_impl::KeyBuffer::SendTo(indexer.keys_, dst_worker_id, tag, - comm); + Send(indexer.keys_, dst_worker_id, tag, comm); Send(indexer.indices_, dst_worker_id, tag, comm); Send(indexer.distances_, dst_worker_id, tag, comm); } @@ -527,8 +769,7 @@ struct CommImpl> { arc >> mod_function_index >> indexer.max_lookups_ >> indexer.num_elements_ >> indexer.num_slots_minus_one_; indexer.hash_policy_.set_mod_function_by_index(mod_function_index); - id_indexer_impl::KeyBuffer::RecvFrom(indexer.keys_, 
src_worker_id, - tag, comm); + Recv(indexer.keys_, src_worker_id, tag, comm); Recv(indexer.indices_, src_worker_id, tag, comm); Recv(indexer.distances_, src_worker_id, tag, comm); } diff --git a/grape/graph/immutable_csr.h b/grape/graph/immutable_csr.h index b0e73932..3a18a202 100644 --- a/grape/graph/immutable_csr.h +++ b/grape/graph/immutable_csr.h @@ -184,6 +184,13 @@ class ImmutableCSR { Array>& get_edges_mut() { return edges_; } Array>& get_offsets_mut() { return offsets_; } + size_t memory_usage() const { + size_t mem = sizeof(*this); + mem += offsets_.size() * sizeof(nbr_t*); + mem += edges_.size() * sizeof(nbr_t); + return mem; + } + template void Serialize(std::unique_ptr& writer) { vid_t vnum = vertex_num(); diff --git a/grape/graph/mutable_csr.h b/grape/graph/mutable_csr.h index 77c6c06c..ba952c8c 100644 --- a/grape/graph/mutable_csr.h +++ b/grape/graph/mutable_csr.h @@ -552,20 +552,24 @@ class MutableCSR> { if (std::is_pod::value) { for (vid_t i = 0; i < vnum; ++i) { - CHECK(writer->Write(adj_lists_[i].begin, - adj_lists_[i].degree() * sizeof(nbr_t))); + if (degree[i] > 0) { + CHECK(writer->Write(adj_lists_[i].begin, + adj_lists_[i].degree() * sizeof(nbr_t))); + } } } else { - for (vid_t i = 0; i < vnum; ++i) { - auto ptr = adj_lists_[i].begin; - auto end = adj_lists_[i].end; - while (ptr != end) { - ia << *ptr; - ++ptr; + if (edge_num > 0) { + for (vid_t i = 0; i < vnum; ++i) { + auto ptr = adj_lists_[i].begin; + auto end = adj_lists_[i].end; + while (ptr != end) { + ia << *ptr; + ++ptr; + } } + CHECK(writer->WriteArchive(ia)); + ia.Clear(); } - CHECK(writer->WriteArchive(ia)); - ia.Clear(); } } @@ -600,23 +604,30 @@ class MutableCSR> { adj_lists_[i].end = ptr + degree_list[i]; ptr += capacity_[i]; } - prev_[0] = sentinel; - next_[vnum - 1] = sentinel; + if (vnum > 0) { + prev_[0] = sentinel; + next_[vnum - 1] = sentinel; + } + if (std::is_pod::value) { for (vid_t i = 0; i < vnum; ++i) { - CHECK(reader->Read(ptr, sizeof(nbr_t) * degree_list[i])); + if 
(degree_list[i] > 0) { + CHECK(reader->Read(ptr, sizeof(nbr_t) * degree_list[i])); + } } } else { - CHECK(reader->ReadArchive(oa)); - for (vid_t i = 0; i < vnum; ++i) { - nbr_t* begin = adj_lists_[i].begin; - nbr_t* end = adj_lists_[i].end; - while (begin != end) { - oa >> *begin; - ++begin; + if (edge_num > 0) { + CHECK(reader->ReadArchive(oa)); + for (vid_t i = 0; i < vnum; ++i) { + nbr_t* begin = adj_lists_[i].begin; + nbr_t* end = adj_lists_[i].end; + while (begin != end) { + oa >> *begin; + ++begin; + } } + oa.Clear(); } - oa.Clear(); } buffers_.emplace_back(std::move(buffer)); } diff --git a/grape/types.h b/grape/types.h index 4035a1b7..34138683 100644 --- a/grape/types.h +++ b/grape/types.h @@ -20,6 +20,8 @@ limitations under the License. #include #include +#include "pthash/utils/hasher.hpp" + // Use the same setting with apache-arrow to avoid possible conflicts #define nssv_CONFIG_SELECT_STRING_VIEW nssv_STRING_VIEW_NONSTD #include "string_view/string_view.hpp" @@ -138,6 +140,31 @@ struct InternalOID { static std::string FromInternal(const type& val) { return std::string(val); } }; +struct murmurhasher { + typedef pthash::hash64 hash_type; + + // specialization for std::string + static inline hash_type hash(std::string const& val, uint64_t seed) { + return pthash::MurmurHash2_64(val.data(), val.size(), seed); + } + + // specialization for uint64_t + static inline hash_type hash(uint64_t val, uint64_t seed) { + return pthash::MurmurHash2_64(reinterpret_cast(&val), + sizeof(val), seed); + } + + static inline hash_type hash(const nonstd::string_view& val, uint64_t seed) { + return pthash::MurmurHash2_64(val.data(), val.size(), seed); + } + +#if __cplusplus >= 201703L + static inline hash_type hash(std::string_view const& val, uint64_t seed) { + return pthash::MurmurHash2_64(val.data(), val.size(), seed); + } +#endif +}; + #ifdef __cpp_lib_is_invocable template using result_of_t = std::invoke_result_t; diff --git a/grape/util.h b/grape/util.h index 
55798af5..5145f1ea 100644 --- a/grape/util.h +++ b/grape/util.h @@ -22,6 +22,7 @@ limitations under the License. #endif #endif +#include #include #include #include @@ -29,6 +30,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -140,11 +142,41 @@ struct IdenticalHasher { static uint64_t hash(uint64_t x) { return x; } }; -static inline bool exists_file(const std::string& name) { +inline bool exists_file(const std::string& name) { struct stat buffer; return (stat(name.c_str(), &buffer) == 0); } +inline std::string get_absolute_path(const std::string& path) { + char abs_path[PATH_MAX]; + if (realpath(path.c_str(), abs_path) == nullptr) { + LOG(ERROR) << "Failed to get absolute path for " << path; + return ""; + } + return std::string(abs_path); +} + +inline bool create_directories(const std::string& path) { + char temp_path[256]; + snprintf(temp_path, sizeof(temp_path), "%s", path.c_str()); + + for (char* p = temp_path + 1; *p; ++p) { + if (*p == '/') { + *p = '\0'; + if (mkdir(temp_path, 0755) != 0 && errno != EEXIST) { + std::cerr << "Error creating directory: " << temp_path << std::endl; + return false; + } + *p = '/'; + } + } + if (mkdir(temp_path, 0755) != 0 && errno != EEXIST) { + std::cerr << "Error creating directory: " << temp_path << std::endl; + return false; + } + return true; +} + inline std::vector split_string(const std::string& str, char delimiter) { std::vector tokens; diff --git a/grape/utils/concurrent_queue.h b/grape/utils/concurrent_queue.h index 938e0758..a0312daf 100644 --- a/grape/utils/concurrent_queue.h +++ b/grape/utils/concurrent_queue.h @@ -136,6 +136,17 @@ class BlockingQueue { } } + bool TryGetAll(std::deque& items) { + { + std::unique_lock lk(lock_); + if (!queue_.empty()) { + std::swap(items, queue_); + full_.notify_all(); + } + return (producer_num_ != 0); + } + } + size_t Size() const { return queue_.size(); } private: @@ -156,8 +167,7 @@ class SpinLock { public: void lock() { while 
(locked.test_and_set(std::memory_order_acquire)) { - { - } + {} } } diff --git a/grape/utils/pthash_utils/ef_sequence_view.h b/grape/utils/pthash_utils/ef_sequence_view.h new file mode 100644 index 00000000..0a56d700 --- /dev/null +++ b/grape/utils/pthash_utils/ef_sequence_view.h @@ -0,0 +1,149 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GRAPE_UTILS_PTHASH_UTILS_EF_SEQUENCE_VIEW_H_ +#define GRAPE_UTILS_PTHASH_UTILS_EF_SEQUENCE_VIEW_H_ + +#include +#include +#include + +#include + +#include "grape/utils/ref_vector.h" +#include "pthash/encoders/util.hpp" + +namespace grape { + +// This code is an adaptation from +// https://github.com/jermp/pthash/blob/master/include/encoders/bit_vector.hpp +struct bit_vector_view { + const uint64_t* data() const { return m_bits.data(); } + + template + void load(Loader& loader) { + loader.load(m_size); + loader.load_ref_vec(m_bits); + } + + size_t m_size; + ref_vector m_bits; +}; + +// This code is an adaptation from +// https://github.com/jermp/pthash/blob/master/include/encoders/darray.hpp +struct darray1_view { + inline uint64_t select(const bit_vector_view& bv, uint64_t idx) const { + assert(idx < m_positions); + uint64_t block = idx / block_size; + int64_t block_pos = m_block_inventory[block]; + if (block_pos < 0) { // sparse super-block + uint64_t overflow_pos = uint64_t(-block_pos - 1); + return 
m_overflow_positions[overflow_pos + (idx & (block_size - 1))]; + } + + size_t subblock = idx / subblock_size; + size_t start_pos = uint64_t(block_pos) + m_subblock_inventory[subblock]; + size_t reminder = idx & (subblock_size - 1); + if (!reminder) { + return start_pos; + } + + const uint64_t* data = bv.data(); + size_t word_idx = start_pos >> 6; + size_t word_shift = start_pos & 63; + uint64_t word = data[word_idx] & (uint64_t(-1) << word_shift); + + while (true) { + size_t popcnt = pthash::util::popcount(word); + if (reminder < popcnt) { + break; + } + reminder -= popcnt; + word = data[++word_idx]; + } + + return (word_idx << 6) + pthash::util::select_in_word(word, reminder); + } + + template + void load(Loader& loader) { + loader.load(m_positions); + loader.load_ref_vec(m_block_inventory); + loader.load_ref_vec(m_subblock_inventory); + loader.load_ref_vec(m_overflow_positions); + } + + static const size_t block_size = 1024; // 2048 + static const size_t subblock_size = 32; + static const size_t max_in_block_distance = 1 << 16; + + size_t m_positions; + ref_vector m_block_inventory; + ref_vector m_subblock_inventory; + ref_vector m_overflow_positions; +}; + +// This code is an adaptation from +// https://github.com/jermp/pthash/blob/master/include/encoders/compact_vector.hpp +struct compact_vector_view { + inline uint64_t size() const { return m_size; } + inline uint64_t width() const { return m_width; } + inline uint64_t access(uint64_t pos) const { + assert(pos < size()); + uint64_t i = pos * m_width; + const char* ptr = reinterpret_cast(m_bits.data()); + return (*(reinterpret_cast(ptr + (i >> 3))) >> (i & 7)) & + m_mask; + } + + template + void load(Loader& loader) { + loader.load(m_size); + loader.load(m_width); + loader.load(m_mask); + loader.load_ref_vec(m_bits); + } + + uint64_t m_size; + uint64_t m_width; + uint64_t m_mask; + ref_vector m_bits; +}; + +// This code is an adaptation from +// 
https://github.com/jermp/pthash/blob/master/include/encoders/ef_sequence.hpp +struct ef_sequence_view { + uint64_t access(uint64_t i) const { + assert(i < m_low_bits.size()); + return ((m_high_bits_d1.select(m_high_bits, i) - i) << m_low_bits.width()) | + m_low_bits.access(i); + } + + template + void load(Loader& loader) { + m_high_bits.load(loader); + m_high_bits_d1.load(loader); + m_low_bits.load(loader); + } + + bit_vector_view m_high_bits; + darray1_view m_high_bits_d1; + compact_vector_view m_low_bits; +}; + +} // namespace grape + +#endif // GRAPE_UTILS_PTHASH_UTILS_EF_SEQUENCE_VIEW_H_ diff --git a/grape/utils/pthash_utils/encoders_view.h b/grape/utils/pthash_utils/encoders_view.h new file mode 100644 index 00000000..85615c4f --- /dev/null +++ b/grape/utils/pthash_utils/encoders_view.h @@ -0,0 +1,62 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#ifndef GRAPE_UTILS_PTHASH_UTILS_ENCODERS_VIEW_H_ +#define GRAPE_UTILS_PTHASH_UTILS_ENCODERS_VIEW_H_ + +#include "grape/utils/pthash_utils/ef_sequence_view.h" + +namespace grape { + +// This code is an adaptation from +// https://github.com/jermp/pthash/blob/master/include/encoders/encoders.hpp +struct dictionary_view { + size_t size() const { return m_ranks.size(); } + uint64_t access(uint64_t i) const { + uint64_t rank = m_ranks.access(i); + return m_dict.access(rank); + } + + template + void load(Loader& loader) { + m_ranks.load(loader); + m_dict.load(loader); + } + + compact_vector_view m_ranks; + compact_vector_view m_dict; +}; + +struct dual_dictionary_view { + uint64_t access(uint64_t i) const { + if (i < m_front.size()) { + return m_front.access(i); + } + return m_back.access(i - m_front.size()); + } + + template + void load(Loader& loader) { + m_front.load(loader); + m_back.load(loader); + } + + dictionary_view m_front; + dictionary_view m_back; +}; + +} // namespace grape + +#endif // GRAPE_UTILS_PTHASH_UTILS_ENCODERS_VIEW_H_ diff --git a/grape/utils/pthash_utils/ph_indexer_view.h b/grape/utils/pthash_utils/ph_indexer_view.h new file mode 100644 index 00000000..101ede47 --- /dev/null +++ b/grape/utils/pthash_utils/ph_indexer_view.h @@ -0,0 +1,81 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef GRAPE_UTILS_PTHASH_UTILS_PH_INDEXER_VIEW_H_ +#define GRAPE_UTILS_PTHASH_UTILS_PH_INDEXER_VIEW_H_ + +#include "grape/graph/id_indexer.h" +#include "grape/utils/pthash_utils/single_phf_view.h" +#include "grape/utils/ref_vector.h" + +namespace grape { + +template +class PHIndexerView { + public: + PHIndexerView() {} + ~PHIndexerView() {} + + void init(const void* buffer, size_t size) { + buffer_ = buffer; + buffer_size_ = size; + + mem_loader loader(reinterpret_cast(buffer), size); + phf_view_.load(loader); + keys_view_.load(loader); + } + + size_t entry_num() const { return keys_view_.size(); } + + bool empty() const { return keys_view_.empty(); } + + bool get_key(INDEX_T lid, KEY_T& oid) const { + if (lid >= keys_view_.size()) { + return false; + } + oid = keys_view_.get(lid); + return true; + } + + bool get_index(const KEY_T& oid, INDEX_T& lid) const { + auto idx = phf_view_(oid); + if (idx < keys_view_.size() && keys_view_.get(idx) == oid) { + lid = idx; + return true; + } + return false; + } + + size_t size() const { return keys_view_.size(); } + + template + void Serialize(std::unique_ptr& writer) { + writer->Write(&buffer_size_, sizeof(size_t)); + if (buffer_size_ > 0) { + writer->Write(const_cast(buffer_), buffer_size_); + } + } + + private: + SinglePHFView phf_view_; + id_indexer_impl::KeyBufferView keys_view_; + + const void* buffer_; + size_t buffer_size_; +}; + +} // namespace grape + +#endif // GRAPE_UTILS_PTHASH_UTILS_PH_INDEXER_VIEW_H_ diff --git a/grape/utils/pthash_utils/single_phf_view.h b/grape/utils/pthash_utils/single_phf_view.h new file mode 100644 index 00000000..e25aad68 --- /dev/null +++ b/grape/utils/pthash_utils/single_phf_view.h @@ -0,0 +1,218 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GRAPE_UTILS_PTHASH_UTILS_SINGLE_PHF_VIEW_H_ +#define GRAPE_UTILS_PTHASH_UTILS_SINGLE_PHF_VIEW_H_ + +#include "grape/utils/pthash_utils/encoders_view.h" +#include "pthash/builders/util.hpp" +#include "pthash/encoders/encoders.hpp" +#include "pthash/single_phf.hpp" +#include "pthash/utils/bucketers.hpp" +#include "pthash/utils/hasher.hpp" + +namespace grape { + +struct mem_dumper { + public: + mem_dumper() = default; + ~mem_dumper() = default; + + template + void dump(const T& val) { + static_assert(std::is_pod::value, "T must be POD type"); + const char* ptr = reinterpret_cast(&val); + buf_.insert(buf_.end(), ptr, ptr + sizeof(T)); + } + + template + void dump_vec(const std::vector& vec) { + static_assert(std::is_pod::value, "T must be POD type"); + size_t n = vec.size(); + dump(n); + const char* ptr = reinterpret_cast(vec.data()); + buf_.insert(buf_.end(), ptr, ptr + sizeof(T) * n); + } + + const std::vector& buffer() const { return buf_; } + std::vector& buffer() { return buf_; } + + size_t size() const { return buf_.size(); } + + private: + std::vector buf_; +}; + +struct external_mem_dumper { + public: + external_mem_dumper(void* buf, size_t size) : buf_(buf), size_(size) {} + + ~external_mem_dumper() = default; + + template + void dump(const T& val) { + static_assert(std::is_pod::value, "T must be POD type"); + const char* ptr = reinterpret_cast(&val); + if (pos_ + sizeof(T) > size_) { + return; + } + memcpy(reinterpret_cast(buf_) + pos_, ptr, sizeof(T)); + pos_ += sizeof(T); + } + + template + void dump_vec(const std::vector& 
vec) { + static_assert(std::is_pod::value, "T must be POD type"); + size_t n = vec.size(); + if (pos_ + sizeof(T) * n + sizeof(size_t) > size_) { + return; + } + dump(n); + const char* ptr = reinterpret_cast(vec.data()); + memcpy(reinterpret_cast(buf_) + pos_, ptr, sizeof(T) * n); + pos_ += sizeof(T) * n; + } + + const void* buffer() const { return buf_; } + + size_t size() const { return size_; } + + private: + void* buf_ = nullptr; + size_t pos_ = 0; + size_t size_ = 0; +}; + +struct mem_loader { + public: + mem_loader(const char* buf, size_t size) + : begin_(buf), ptr_(buf), end_(buf + size) {} + ~mem_loader() = default; + + template + void load(T& val) { + memcpy(&val, ptr_, sizeof(T)); + ptr_ += sizeof(T); + } + + template + void load_vec(std::vector& vec) { + static_assert(std::is_pod::value, "T must be POD type"); + size_t n; + load(n); + vec.resize(n); + memcpy(vec.data(), ptr_, n * sizeof(T)); + ptr_ += (n * sizeof(T)); + } + + template + void load_ref_vec(ref_vector& vec) { + ptr_ += vec.init(ptr_, end_ - ptr_); + } + + const char* data() const { return ptr_; } + size_t remaining() const { return end_ - ptr_; } + size_t used() const { return ptr_ - begin_; } + + private: + const char* begin_; + const char* ptr_; + const char* end_; +}; + +// This code is an adaptation from +// https://github.com/jermp/pthash/blob/master/include/single_phf.hpp +template +struct SinglePHFView { + public: + SinglePHFView() = default; + ~SinglePHFView() = default; + + template + uint64_t operator()(T const& key) const { + auto hash = Hasher::hash(key, m_seed); + return position(hash); + } + + uint64_t position(typename Hasher::hash_type hash) const { + uint64_t bucket = m_bucketer.bucket(hash.first()); + uint64_t pilot = m_pilots.access(bucket); + uint64_t hashed_pilot = pthash::default_hash64(pilot, m_seed); + uint64_t p = + fastmod::fastmod_u64(hash.second() ^ hashed_pilot, m_M, m_table_size); + if (PTHASH_LIKELY(p < m_num_keys)) + return p; + return 
m_free_slots.access(p - m_num_keys); + } + + template + void load(Loader& loader) { + loader.load(m_seed); + loader.load(m_num_keys); + loader.load(m_table_size); + loader.load(m_M); + m_bucketer.load(loader); + m_pilots.load(loader); + m_free_slots.load(loader); + } + + template + static void build(Iterator keys, uint64_t n, Dumper& dumper, int thread_num) { + pthash::build_configuration config; + config.c = 7.0; + config.alpha = 0.94; + config.num_threads = thread_num; + config.minimal_output = true; + config.verbose_output = false; + + pthash::single_phf phf; + phf.build_in_internal_memory(keys, n, config); + std::set idx; + for (uint64_t k = 0; k < n; ++k) { + idx.insert(phf(*keys)); + ++keys; + } + phf.dump(dumper); + } + + template + static void build( + Iterator keys, uint64_t n, + pthash::single_phf& + phf, + int thread_num) { + pthash::build_configuration config; + config.c = 7.0; + config.alpha = 0.94; + config.num_threads = thread_num; + config.minimal_output = true; + config.verbose_output = false; + + phf.build_in_internal_memory(keys, n, config); + } + + private: + uint64_t m_seed; + uint64_t m_num_keys; + uint64_t m_table_size; + __uint128_t m_M; + pthash::skew_bucketer m_bucketer; + dual_dictionary_view m_pilots; + ef_sequence_view m_free_slots; +}; + +} // namespace grape + +#endif // GRAPE_UTILS_PTHASH_UTILS_SINGLE_PHF_VIEW_H_ diff --git a/grape/utils/ref_vector.h b/grape/utils/ref_vector.h new file mode 100644 index 00000000..b0233553 --- /dev/null +++ b/grape/utils/ref_vector.h @@ -0,0 +1,85 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifndef GRAPE_UTILS_REF_VECTOR_H_ +#define GRAPE_UTILS_REF_VECTOR_H_ + +#include +#include + +#include "grape/types.h" + +namespace grape { + +template +struct ref_vector { + static_assert(std::is_pod::value, "T must be POD type"); + ref_vector() : buffer_(nullptr), size_(0) {} + ~ref_vector() {} + + size_t init(const void* buffer, size_t size) { + const void* ptr = buffer; + size_ = *reinterpret_cast(ptr); + ptr = reinterpret_cast(ptr) + sizeof(size_t); + buffer_ = reinterpret_cast(ptr); + return size_ * sizeof(T) + sizeof(size_t); + } + + size_t size() const { return size_; } + + T get(size_t idx) const { return buffer_[idx]; } + + const T* data() const { return buffer_; } + + const T& operator[](size_t idx) const { return buffer_[idx]; } + + template + void load(Loader& loader) { + loader.load_ref_vec(*this); + } + + private: + const T* buffer_; + size_t size_; +}; + +template +void encode_vec(const std::vector& vec, std::vector& buf) { + size_t old_size = buf.size(); + size_t vec_size = vec.size(); + buf.resize(old_size + sizeof(size_t) + vec_size * sizeof(T)); + char* ptr = buf.data() + old_size; + memcpy(ptr, &vec_size, sizeof(size_t)); + ptr += sizeof(size_t); + memcpy(ptr, vec.data(), sizeof(T) * vec_size); +} + +template +void encode_val(const T& val, std::vector& buf) { + size_t old_size = buf.size(); + buf.resize(old_size + sizeof(T)); + char* ptr = buf.data() + old_size; + memcpy(ptr, &val, sizeof(T)); +} + +template +const char* decode_val(T& val, const char* buf) { + memcpy(&val, buf, sizeof(T)); + return buf + sizeof(T); +} + 
+} // namespace grape + +#endif // GRAPE_UTILS_REF_VECTOR_H_ diff --git a/grape/utils/string_view_vector.h b/grape/utils/string_view_vector.h index 7adef9ce..5c13804d 100644 --- a/grape/utils/string_view_vector.h +++ b/grape/utils/string_view_vector.h @@ -19,9 +19,11 @@ limitations under the License. #include #include +#include #include #include "grape/types.h" +#include "grape/utils/ref_vector.h" namespace grape { @@ -74,11 +76,136 @@ class StringViewVector { offsets_.swap(rhs.offsets_); } + template + void serialize(std::unique_ptr& writer) const { + size_t content_buffer_size = content_buffer().size(); + CHECK(writer->Write(&content_buffer_size, sizeof(size_t))); + if (content_buffer_size > 0) { + CHECK(writer->Write(const_cast(content_buffer().data()), + content_buffer_size * sizeof(char))); + } + size_t offset_buffer_size = offset_buffer().size(); + CHECK(writer->Write(&offset_buffer_size, sizeof(size_t))); + if (offset_buffer_size > 0) { + CHECK(writer->Write(const_cast(offset_buffer().data()), + offset_buffer_size * sizeof(size_t))); + } + } + + template + void deserialize(std::unique_ptr& reader) { + size_t content_buffer_size; + CHECK(reader->Read(&content_buffer_size, sizeof(size_t))); + if (content_buffer_size > 0) { + content_buffer().resize(content_buffer_size); + CHECK(reader->Read(content_buffer().data(), + content_buffer_size * sizeof(char))); + } + size_t offset_buffer_size; + CHECK(reader->Read(&offset_buffer_size, sizeof(size_t))); + if (offset_buffer_size > 0) { + offset_buffer().resize(offset_buffer_size); + CHECK(reader->Read(offset_buffer().data(), + offset_buffer_size * sizeof(size_t))); + } + } + + void serialize_to_mem(std::vector& buf) const { + encode_vec(buffer_, buf); + encode_vec(offsets_, buf); + } + private: std::vector buffer_; std::vector offsets_; }; +template <> +struct ref_vector { + ref_vector() {} + ~ref_vector() {} + + size_t init(const void* buffer, size_t size) { + size_t buffer_size = buffer_.init(buffer, size); + const 
void* ptr = reinterpret_cast(buffer) + buffer_size; + size_t offset_size = offsets_.init(ptr, size - buffer_size); + return buffer_size + offset_size; + } + + ref_vector& buffer() { return buffer_; } + ref_vector& offsets() { return offsets_; } + + const ref_vector& buffer() const { return buffer_; } + const ref_vector& offsets() const { return offsets_; } + + size_t size() const { + if (offsets_.size() == 0) { + return 0; + } + return offsets_.size() - 1; + } + + nonstd::string_view get(size_t idx) const { + size_t from = offsets_.get(idx); + size_t to = offsets_.get(idx + 1); + return nonstd::string_view(buffer_.data() + from, to - from); + } + + template + void load(Loader& loader) { + loader.load_ref_vec(buffer_); + loader.load_ref_vec(offsets_); + } + + private: + ref_vector buffer_; + ref_vector offsets_; +}; + +#if __cplusplus >= 201703L +template <> +struct ref_vector { + ref_vector() {} + ~ref_vector() {} + + size_t init(const void* buffer, size_t size) { + size_t buffer_size = buffer_.init(buffer, size); + const void* ptr = reinterpret_cast(buffer) + buffer_size; + size_t offset_size = offsets_.init(ptr, size - buffer_size); + return buffer_size + offset_size; + } + + ref_vector& buffer() { return buffer_; } + ref_vector& offsets() { return offsets_; } + + const ref_vector& buffer() const { return buffer_; } + const ref_vector& offsets() const { return offsets_; } + + size_t size() const { + if (offsets_.size() == 0) { + return 0; + } + return offsets_.size() - 1; + } + + std::string_view get(size_t idx) const { + size_t from = offsets_.get(idx); + size_t to = offsets_.get(idx + 1); + return std::string_view(buffer_.data() + from, to - from); + } + + template + void load(Loader& loader) { + loader.load_ref_vec(buffer_); + loader.load_ref_vec(offsets_); + } + + private: + ref_vector buffer_; + ref_vector offsets_; +}; +#endif + } // namespace grape #endif // GRAPE_UTILS_STRING_VIEW_VECTOR_H_ diff --git a/grape/vertex_map/global_vertex_map.h 
b/grape/vertex_map/global_vertex_map.h deleted file mode 100644 index 1a5a468b..00000000 --- a/grape/vertex_map/global_vertex_map.h +++ /dev/null @@ -1,318 +0,0 @@ -/** Copyright 2020 Alibaba Group Holding Limited. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#ifndef GRAPE_VERTEX_MAP_GLOBAL_VERTEX_MAP_H_ -#define GRAPE_VERTEX_MAP_GLOBAL_VERTEX_MAP_H_ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "grape/config.h" -#include "grape/fragment/partitioner.h" -#include "grape/graph/id_indexer.h" -#include "grape/serialization/in_archive.h" -#include "grape/serialization/out_archive.h" -#include "grape/vertex_map/vertex_map_base.h" -#include "grape/worker/comm_spec.h" - -namespace grape { - -template -class GlobalVertexMap; - -template -class GlobalVertexMapBuilder { - using internal_oid_t = typename InternalOID::type; - - private: - GlobalVertexMapBuilder(fid_t fid, IdIndexer& indexer, - const PARTITIONER_T& partitioner, - const IdParser& id_parser) - : fid_(fid), - indexer_(indexer), - partitioner_(partitioner), - id_parser_(id_parser) {} - - public: - ~GlobalVertexMapBuilder() {} - - void add_local_vertex(const internal_oid_t& id, VID_T& gid) { - assert(partitioner_.GetPartitionId(id) == fid_); - indexer_.add(id, gid); - id_parser_.generate_global_id(fid_, gid); - } - - void add_vertex(const internal_oid_t& id) { - if (partitioner_.GetPartitionId(id) == fid_) { - indexer_._add(id); - } - } - - void finish(GlobalVertexMap& vertex_map) { 
- const CommSpec& comm_spec = vertex_map.GetCommSpec(); - int worker_id = comm_spec.worker_id(); - int worker_num = comm_spec.worker_num(); - fid_t fnum = comm_spec.fnum(); - { - std::thread recv_thread([&]() { - int src_worker_id = (worker_id + 1) % worker_num; - while (src_worker_id != worker_id) { - for (fid_t fid = 0; fid < fnum; ++fid) { - if (comm_spec.FragToWorker(fid) != src_worker_id) { - continue; - } - sync_comm::Recv(vertex_map.indexers_[fid], src_worker_id, 0, - comm_spec.comm()); - } - src_worker_id = (src_worker_id + 1) % worker_num; - } - }); - std::thread send_thread([&]() { - int dst_worker_id = (worker_id + worker_num - 1) % worker_num; - while (dst_worker_id != worker_id) { - for (fid_t fid = 0; fid < fnum; ++fid) { - if (comm_spec.FragToWorker(fid) != worker_id) { - continue; - } - sync_comm::Send(indexer_, dst_worker_id, 0, comm_spec.comm()); - } - dst_worker_id = (dst_worker_id + worker_num - 1) % worker_num; - } - }); - send_thread.join(); - recv_thread.join(); - } - } - - private: - template - friend class GlobalVertexMap; - - fid_t fid_; - IdIndexer& indexer_; - const PARTITIONER_T& partitioner_; - const IdParser& id_parser_; -}; - -/** - * @brief a kind of VertexMapBase which holds global mapping information in - * each worker. 
- * - * @tparam OID_T - * @tparam VID_T - */ -template > -class GlobalVertexMap : public VertexMapBase { - // TODO(lxj): to support shared-memory for workers on same host (auto apps) - - using base_t = VertexMapBase; - using internal_oid_t = typename InternalOID::type; - - public: - explicit GlobalVertexMap(const CommSpec& comm_spec) : base_t(comm_spec) {} - ~GlobalVertexMap() = default; - void Init() { indexers_.resize(comm_spec_.fnum()); } - - size_t GetTotalVertexSize() const { - size_t size = 0; - for (const auto& v : indexers_) { - size += v.size(); - } - return size; - } - - size_t GetInnerVertexSize(fid_t fid) const { return indexers_[fid].size(); } - void AddVertex(const OID_T& oid) { - fid_t fid = partitioner_.GetPartitionId(oid); - indexers_[fid]._add(oid); - } - - using base_t::Lid2Gid; - bool AddVertex(const OID_T& oid, VID_T& gid) { - fid_t fid = partitioner_.GetPartitionId(oid); - internal_oid_t internal_oid(oid); - if (indexers_[fid].add(std::move(internal_oid), gid)) { - gid = Lid2Gid(fid, gid); - return true; - } - gid = Lid2Gid(fid, gid); - return false; - } - - bool AddVertex(OID_T&& oid, VID_T& gid) { - fid_t fid = partitioner_.GetPartitionId(oid); - internal_oid_t internal_oid(std::move(oid)); - if (indexers_[fid].add(std::move(internal_oid), gid)) { - gid = Lid2Gid(fid, gid); - return true; - } - gid = Lid2Gid(fid, gid); - return false; - } - - using base_t::GetFidFromGid; - using base_t::GetLidFromGid; - bool GetOid(const VID_T& gid, OID_T& oid) const { - fid_t fid = GetFidFromGid(gid); - VID_T lid = GetLidFromGid(gid); - return GetOid(fid, lid, oid); - } - - bool GetOid(fid_t fid, const VID_T& lid, OID_T& oid) const { - internal_oid_t internal_oid; - if (indexers_[fid].get_key(lid, internal_oid)) { - oid = InternalOID::FromInternal(internal_oid); - return true; - } - return false; - } - - bool _GetGid(fid_t fid, const internal_oid_t& oid, VID_T& gid) const { - if (indexers_[fid].get_index(oid, gid)) { - gid = Lid2Gid(fid, gid); - return 
true; - } - return false; - } - - bool GetGid(fid_t fid, const OID_T& oid, VID_T& gid) const { - internal_oid_t internal_oid(oid); - return _GetGid(fid, internal_oid, gid); - } - - bool _GetGid(const internal_oid_t& oid, VID_T& gid) const { - fid_t fid = partitioner_.GetPartitionId(oid); - return _GetGid(fid, oid, gid); - } - - bool GetGid(const OID_T& oid, VID_T& gid) const { - fid_t fid = partitioner_.GetPartitionId(oid); - return GetGid(fid, oid, gid); - } - - GlobalVertexMapBuilder GetLocalBuilder() { - fid_t fid = comm_spec_.fid(); - return GlobalVertexMapBuilder( - fid, indexers_[fid], partitioner_, id_parser_); - } - - private: - template - void serialize(const std::string& path) { - auto io_adaptor = std::unique_ptr(new IOADAPTOR_T(path)); - io_adaptor->Open("wb"); - base_t::serialize(io_adaptor); - for (fid_t i = 0; i < comm_spec_.fnum(); ++i) { - indexers_[i].Serialize(io_adaptor); - } - io_adaptor->Close(); - } - - public: - template - void Serialize(const std::string& prefix) { - char fbuf[1024]; - snprintf(fbuf, sizeof(fbuf), "%s/%s", prefix.c_str(), - kSerializationVertexMapFilename); - std::string path = std::string(fbuf); - if (comm_spec_.worker_id() == 0) { - serialize(path); - } - MPI_Barrier(comm_spec_.comm()); - auto exists_file = [](const std::string& name) { - std::ifstream f(name.c_str()); - return f.good(); - }; - if (!exists_file(path) && comm_spec_.local_id() == 0) { - serialize(path); - } - MPI_Barrier(comm_spec_.comm()); - if (!exists_file(path)) { - serialize(path); - } - } - - template - void Deserialize(const std::string& prefix, fid_t fid) { - char fbuf[1024]; - snprintf(fbuf, sizeof(fbuf), "%s/%s", prefix.c_str(), - kSerializationVertexMapFilename); - - auto io_adaptor = - std::unique_ptr(new IOADAPTOR_T(std::string(fbuf))); - io_adaptor->Open(); - - base_t::deserialize(io_adaptor); - - indexers_.resize(comm_spec_.fnum()); - for (fid_t i = 0; i < comm_spec_.fnum(); ++i) { - indexers_[i].Deserialize(io_adaptor); - } - 
io_adaptor->Close(); - } - - void UpdateToBalance(std::vector& vnum_list, - std::vector>& gid_maps) { - fid_t fnum = comm_spec_.fnum(); - std::vector> oid_lists(fnum); - for (fid_t i = 0; i < fnum; ++i) { - oid_lists[i].resize(vnum_list[i]); - } - for (fid_t fid = 0; fid < fnum; ++fid) { - auto& old_indexer = indexers_[fid]; - VID_T vnum = old_indexer.size(); - for (VID_T i = 0; i < vnum; ++i) { - VID_T new_gid = gid_maps[fid][i]; - internal_oid_t oid; - fid_t new_fid = GetFidFromGid(new_gid); - CHECK(old_indexer.get_key(i, oid)); - if (new_fid != fid) { - OID_T id = InternalOID::FromInternal(oid); - partitioner_.SetPartitionId(id, new_fid); - } - VID_T new_lid = GetLidFromGid(new_gid); - oid_lists[new_fid][new_lid] = oid; - } - } - std::vector> new_indexers(fnum); - for (fid_t i = 0; i < fnum; ++i) { - auto& indexer = new_indexers[i]; - for (auto& oid : oid_lists[i]) { - indexer._add(oid); - } - } - std::swap(indexers_, new_indexers); - } - - private: - template - friend class GlobalVertexMapBuilder; - - std::vector> indexers_; - using base_t::comm_spec_; - using base_t::id_parser_; - using base_t::partitioner_; -}; - -} // namespace grape - -#endif // GRAPE_VERTEX_MAP_GLOBAL_VERTEX_MAP_H_ diff --git a/grape/vertex_map/idxers/hashmap_idxer.h b/grape/vertex_map/idxers/hashmap_idxer.h new file mode 100644 index 00000000..ae079f89 --- /dev/null +++ b/grape/vertex_map/idxers/hashmap_idxer.h @@ -0,0 +1,130 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef GRAPE_VERTEX_MAP_IDXERS_HASHMAP_IDXER_H_ +#define GRAPE_VERTEX_MAP_IDXERS_HASHMAP_IDXER_H_ + +#include "grape/graph/id_indexer.h" +#include "grape/vertex_map/idxers/idxer_base.h" + +namespace grape { + +template +class HashMapIdxer : public IdxerBase { + using internal_oid_t = typename InternalOID::type; + + public: + HashMapIdxer() {} + explicit HashMapIdxer(IdIndexer&& indexer) + : indexer_(std::move(indexer)) {} + + bool get_key(VID_T vid, internal_oid_t& oid) const override { + return indexer_.get_key(vid, oid); + } + + bool get_index(const internal_oid_t& oid, VID_T& vid) const override { + return indexer_.get_index(oid, vid); + } + + IdxerType type() const override { return IdxerType::kHashMapIdxer; } + + void serialize(std::unique_ptr& writer) override { + indexer_.Serialize(writer); + } + void deserialize(std::unique_ptr& reader) override { + indexer_.Deserialize(reader); + } + + size_t size() const override { return indexer_.size(); } + + void add(const internal_oid_t& oid) { indexer_._add(oid); } + + size_t memory_usage() const override { return indexer_.memory_usage(); } + + private: + IdIndexer indexer_; +}; + +template +class HashMapIdxerDummyBuilder : public IdxerBuilderBase { + public: + using internal_oid_t = typename InternalOID::type; + void add(const internal_oid_t& oid) override {} + + std::unique_ptr> finish() override { + return std::unique_ptr>( + new HashMapIdxer(std::move(indexer_))); + } + + void sync_request(const CommSpec& comm_spec, int target, int tag) override { + int req_type = 0; + sync_comm::Send(req_type, target, tag, comm_spec.comm()); + sync_comm::Recv(indexer_, target, tag + 1, comm_spec.comm()); + } + void sync_response(const CommSpec& comm_spec, int source, int tag) override { + LOG(ERROR) + << "HashMapIdxerDummyBuilder should not be used to sync response"; + } + + private: + IdIndexer indexer_; +}; + +template 
+class HashMapIdxerBuilder : public IdxerBuilderBase { + public: + using internal_oid_t = typename InternalOID::type; + void add(const internal_oid_t& oid) override { indexer_._add(oid); } + + std::unique_ptr> finish() override { + return std::unique_ptr>( + new HashMapIdxer(std::move(indexer_))); + } + + void sync_request(const CommSpec& comm_spec, int target, int tag) override { + LOG(ERROR) << "HashMapIdxerBuilder should not be used to sync request"; + } + + void sync_response(const CommSpec& comm_spec, int source, int tag) override { + int req_type; + sync_comm::Recv(req_type, source, tag, comm_spec.comm()); + if (req_type == 0) { + // request all + sync_comm::Send(indexer_, source, tag + 1, comm_spec.comm()); + } else if (req_type == 1) { + // request partial + typename IdIndexer::key_buffer_t keys; + sync_comm::Recv(keys, source, tag, comm_spec.comm()); + std::vector response; + size_t keys_num = keys.size(); + for (size_t i = 0; i < keys_num; ++i) { + VID_T vid; + if (indexer_.get_index(keys.get(i), vid)) { + response.push_back(vid); + } else { + response.push_back(std::numeric_limits::max()); + } + } + sync_comm::Send(response, source, tag + 1, comm_spec.comm()); + } + } + + private: + IdIndexer indexer_; +}; + +} // namespace grape + +#endif // GRAPE_VERTEX_MAP_IDXERS_HASHMAP_IDXER_H_ diff --git a/grape/vertex_map/idxers/hashmap_idxer_view.h b/grape/vertex_map/idxers/hashmap_idxer_view.h new file mode 100644 index 00000000..8ffe2e55 --- /dev/null +++ b/grape/vertex_map/idxers/hashmap_idxer_view.h @@ -0,0 +1,157 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef GRAPE_VERTEX_MAP_IDXERS_HASHMAP_IDXER_VIEW_H_ +#define GRAPE_VERTEX_MAP_IDXERS_HASHMAP_IDXER_VIEW_H_ + +#include "grape/graph/id_indexer.h" +#include "grape/vertex_map/idxers/idxer_base.h" + +namespace grape { + +template +class HashMapIdxerView : public IdxerBase { + using internal_oid_t = typename InternalOID::type; + + public: + HashMapIdxerView() {} + explicit HashMapIdxerView(Array>&& buf) + : buffer_(std::move(buf)) { + indexer_.Init(buffer_.data(), buffer_.size()); + } + ~HashMapIdxerView() {} + + bool get_key(VID_T vid, internal_oid_t& oid) const override { + return indexer_.get_key(vid, oid); + } + + bool get_index(const internal_oid_t& oid, VID_T& vid) const override { + return indexer_.get_index(oid, vid); + } + + IdxerType type() const override { return IdxerType::kHashMapIdxerView; } + + void serialize(std::unique_ptr& writer) override { + size_t size = buffer_.size(); + writer->Write(&size, sizeof(size_t)); + if (size > 0) { + writer->Write(buffer_.data(), size); + } + } + + void deserialize(std::unique_ptr& reader) override { + size_t size; + CHECK(reader->Read(&size, sizeof(size_t))); + if (size > 0) { + buffer_.resize(size); + CHECK(reader->Read(buffer_.data(), size)); + indexer_.Init(buffer_.data(), size); + } + } + + size_t size() const override { return indexer_.size(); } + + size_t memory_usage() const override { return buffer_.size(); } + + private: + IdIndexerView indexer_; + Array> buffer_; +}; + +template +class HashMapIdxerViewDummyBuilder : public IdxerBuilderBase { + public: + using internal_oid_t = typename InternalOID::type; + 
void add(const internal_oid_t& oid) override {} + + std::unique_ptr> finish() override { + return std::unique_ptr>( + new HashMapIdxerView(std::move(buffer_))); + } + + void sync_request(const CommSpec& comm_spec, int target, int tag) override { + int req_type = 0; + sync_comm::Send(req_type, target, tag, comm_spec.comm()); + sync_comm::Recv(buffer_, target, tag + 1, comm_spec.comm()); + } + void sync_response(const CommSpec& comm_spec, int source, int tag) override { + LOG(ERROR) + << "HashMapIdxerViewDummyBuilder should not be used to sync response"; + } + + private: + Array> buffer_; +}; + +template +class HashMapIdxerViewBuilder : public IdxerBuilderBase { + using internal_oid_t = typename InternalOID::type; + + public: + HashMapIdxerViewBuilder() {} + ~HashMapIdxerViewBuilder() {} + + void add(const internal_oid_t& oid) override { indexer_._add(oid); } + + std::unique_ptr> finish() override { + if (buffer_.empty() && indexer_.size() > 0) { + indexer_.serialize_to_mem(buffer_); + } + Array> buffer; + buffer.resize(buffer_.size()); + memcpy(buffer.data(), buffer_.data(), buffer_.size()); + return std::unique_ptr>( + new HashMapIdxerView(std::move(buffer))); + } + + void sync_request(const CommSpec& comm_spec, int target, int tag) override { + LOG(ERROR) << "HashMapIdxerBuilder should not be used to sync request"; + } + + void sync_response(const CommSpec& comm_spec, int source, int tag) override { + int req_type; + sync_comm::Recv(req_type, source, tag, comm_spec.comm()); + if (req_type == 0) { + // request all + if (buffer_.empty() && indexer_.size() > 0) { + indexer_.serialize_to_mem(buffer_); + } + sync_comm::Send(buffer_, source, tag + 1, comm_spec.comm()); + } else if (req_type == 1) { + // request partial + typename IdIndexer::key_buffer_t keys; + sync_comm::Recv(keys, source, tag, comm_spec.comm()); + std::vector response; + size_t keys_num = keys.size(); + for (size_t i = 0; i < keys_num; ++i) { + VID_T vid; + if (indexer_.get_index(keys.get(i), vid)) { 
+ response.push_back(vid); + } else { + response.push_back(std::numeric_limits::max()); + } + } + sync_comm::Send(response, source, tag + 1, comm_spec.comm()); + } + } + + private: + IdIndexer indexer_; + std::vector buffer_; +}; + +} // namespace grape + +#endif // GRAPE_VERTEX_MAP_IDXERS_HASHMAP_IDXER_VIEW_H_ diff --git a/grape/vertex_map/idxers/idxer_base.h b/grape/vertex_map/idxers/idxer_base.h new file mode 100644 index 00000000..4d3224b2 --- /dev/null +++ b/grape/vertex_map/idxers/idxer_base.h @@ -0,0 +1,105 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef GRAPE_VERTEX_MAP_IDXERS_IDXER_BASE_H_ +#define GRAPE_VERTEX_MAP_IDXERS_IDXER_BASE_H_ + +#include "grape/worker/comm_spec.h" + +namespace grape { + +enum class IdxerType { + kHashMapIdxer, + kLocalIdxer, + kPTHashIdxer, + kHashMapIdxerView, + kSortedArrayIdxer, +}; + +template +class IdxerBase { + using internal_oid_t = typename InternalOID::type; + + public: + virtual ~IdxerBase() = default; + + virtual bool get_key(VID_T vid, internal_oid_t& oid) const = 0; + + virtual bool get_index(const internal_oid_t& oid, VID_T& vid) const = 0; + + virtual IdxerType type() const = 0; + + virtual size_t size() const = 0; + + virtual size_t memory_usage() const = 0; + + virtual void serialize(std::unique_ptr& writer) = 0; + virtual void deserialize(std::unique_ptr& reader) = 0; +}; + +template +class IdxerBuilderBase { + using internal_oid_t = typename InternalOID::type; + + public: + virtual ~IdxerBuilderBase() = default; + + virtual void add(const internal_oid_t& oid) = 0; + + virtual std::unique_ptr> finish() = 0; + + virtual void sync_request(const CommSpec& comm_spec, int target, int tag) = 0; + virtual void sync_response(const CommSpec& comm_spec, int source, + int tag) = 0; +}; + +template +void serialize_idxer(std::unique_ptr& writer, + std::unique_ptr>& idxer) { + int type = static_cast(idxer->type()); + writer->Write(&type, sizeof(type)); + idxer->serialize(writer); +} + +} // namespace grape + +namespace std { +inline ostream& operator<<(ostream& os, const grape::IdxerType& type) { + switch (type) { + case grape::IdxerType::kHashMapIdxer: + os << "HashMapIdxer"; + break; + case grape::IdxerType::kLocalIdxer: + os << "LocalIdxer"; + break; + case grape::IdxerType::kPTHashIdxer: + os << "PTHashIdxer"; + break; + case grape::IdxerType::kHashMapIdxerView: + os << "HashMapIdxerView"; + break; + case grape::IdxerType::kSortedArrayIdxer: + os << "SortedArrayIdxer"; + break; + default: + os << "Unknown"; + break; + } + return os; +} + +} // namespace std + 
+#endif // GRAPE_VERTEX_MAP_IDXERS_IDXER_BASE_H_ diff --git a/grape/vertex_map/idxers/idxers.h b/grape/vertex_map/idxers/idxers.h new file mode 100644 index 00000000..94233d02 --- /dev/null +++ b/grape/vertex_map/idxers/idxers.h @@ -0,0 +1,114 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef GRAPE_VERTEX_MAP_IDXERS_IDXERS_H_ +#define GRAPE_VERTEX_MAP_IDXERS_IDXERS_H_ + +#include "grape/vertex_map/idxers/hashmap_idxer.h" +#include "grape/vertex_map/idxers/hashmap_idxer_view.h" +#include "grape/vertex_map/idxers/local_idxer.h" +#include "grape/vertex_map/idxers/pthash_idxer.h" +#include "grape/vertex_map/idxers/sorted_array_idxer.h" + +namespace grape { + +template +std::unique_ptr> deserialize_idxer( + std::unique_ptr& reader) { + int type; + reader->Read(&type, sizeof(type)); + IdxerType idxer_type = static_cast(type); + switch (idxer_type) { + case IdxerType::kHashMapIdxer: { + auto idxer = std::unique_ptr>( + new HashMapIdxer()); + idxer->deserialize(reader); + return idxer; + } + case IdxerType::kLocalIdxer: { + auto idxer = std::unique_ptr>( + new LocalIdxer()); + idxer->deserialize(reader); + return idxer; + } + case IdxerType::kHashMapIdxerView: { + auto idxer = std::unique_ptr>( + new HashMapIdxerView()); + idxer->deserialize(reader); + return idxer; + } + case IdxerType::kPTHashIdxer: { + auto idxer = std::unique_ptr>( + new PTHashIdxer()); + idxer->deserialize(reader); + return idxer; + } + case 
IdxerType::kSortedArrayIdxer: { + auto idxer = std::unique_ptr>( + new SortedArrayIdxer()); + idxer->deserialize(reader); + return idxer; + } + default: + return nullptr; + } +} + +template +std::unique_ptr> extend_indexer( + std::unique_ptr>&& input, + const std::vector& id_list, VID_T base) { + if (input->type() == IdxerType::kHashMapIdxer) { + auto casted = std::unique_ptr>( + dynamic_cast*>(input.release())); + for (auto& id : id_list) { + casted->add(id); + } + return casted; + } else if (input->type() == IdxerType::kLocalIdxer) { + auto casted = std::unique_ptr>( + dynamic_cast*>(input.release())); + for (auto& id : id_list) { + casted->add(id, base++); + } + return casted; + } else { + LOG(ERROR) << "Only HashMapIdxer or LocalIdxer can be extended"; + return std::move(input); + } +} + +inline IdxerType parse_idxer_type_name(const std::string& name) { + if (name == "hashmap") { + return IdxerType::kHashMapIdxer; + } else if (name == "local") { + return IdxerType::kLocalIdxer; + } else if (name == "pthash") { + return IdxerType::kPTHashIdxer; + } else if (name == "sorted_array") { + return IdxerType::kSortedArrayIdxer; + } else if (name == "hashmap_view") { + return IdxerType::kHashMapIdxerView; + } else { + LOG(INFO) << "unrecognized idxer type: " << name + << ", use hashmap idxer " + "as default"; + return IdxerType::kHashMapIdxer; + } +} + +} // namespace grape + +#endif // GRAPE_VERTEX_MAP_IDXERS_IDXERS_H_ diff --git a/grape/vertex_map/idxers/local_idxer.h b/grape/vertex_map/idxers/local_idxer.h new file mode 100644 index 00000000..2499c43e --- /dev/null +++ b/grape/vertex_map/idxers/local_idxer.h @@ -0,0 +1,121 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef GRAPE_VERTEX_MAP_IDXERS_LOCAL_IDXER_H_ +#define GRAPE_VERTEX_MAP_IDXERS_LOCAL_IDXER_H_ + +#include "grape/vertex_map/idxers/idxer_base.h" + +namespace grape { + +template +class LocalIdxer : public IdxerBase { + using internal_oid_t = typename InternalOID::type; + + public: + LocalIdxer() {} + LocalIdxer(IdIndexer&& oid_indexer, + IdIndexer&& lid_indexer) + : oid_indexer_(std::move(oid_indexer)), + lid_indexer_(std::move(lid_indexer)) {} + + bool get_key(VID_T vid, internal_oid_t& oid) const override { + VID_T idx; + if (lid_indexer_.get_index(vid, idx)) { + return oid_indexer_.get_key(idx, oid); + } else { + return false; + } + } + + bool get_index(const internal_oid_t& oid, VID_T& vid) const override { + VID_T idx; + if (oid_indexer_.get_index(oid, idx)) { + return lid_indexer_.get_key(idx, vid); + } else { + return false; + } + } + + IdxerType type() const override { return IdxerType::kLocalIdxer; } + + void serialize(std::unique_ptr& writer) override { + oid_indexer_.Serialize(writer); + lid_indexer_.Serialize(writer); + } + void deserialize(std::unique_ptr& reader) override { + oid_indexer_.Deserialize(reader); + lid_indexer_.Deserialize(reader); + } + + size_t size() const override { return oid_indexer_.size(); } + + void add(const internal_oid_t& oid, VID_T vid) { + size_t before = oid_indexer_.size(); + oid_indexer_._add(oid); + if (oid_indexer_.size() > before) { + lid_indexer_._add(vid); + } + } + + size_t memory_usage() const override { + return oid_indexer_.memory_usage() + lid_indexer_.memory_usage(); + } + + private: + IdIndexer oid_indexer_; 
// oid -> idx + IdIndexer lid_indexer_; // lid -> idx +}; + +template +class LocalIdxerBuilder : public IdxerBuilderBase { + public: + using internal_oid_t = typename InternalOID::type; + void add(const internal_oid_t& oid) override { oid_indexer_._add(oid); } + + std::unique_ptr> finish() override { + return std::unique_ptr>( + new LocalIdxer(std::move(oid_indexer_), + std::move(lid_indexer_))); + } + + void sync_request(const CommSpec& comm_spec, int target, int tag) override { + int req_type = 1; + sync_comm::Send(req_type, target, tag, comm_spec.comm()); + sync_comm::Send(oid_indexer_.keys(), target, tag, comm_spec.comm()); + std::vector response; + sync_comm::Recv(response, target, tag + 1, comm_spec.comm()); + VID_T sentinel = std::numeric_limits::max(); + for (size_t i = 0; i < oid_indexer_.size(); ++i) { + if (response[i] != std::numeric_limits::max()) { + lid_indexer_._add(response[i]); + } else { + lid_indexer_._add(sentinel); + --sentinel; + } + } + } + void sync_response(const CommSpec& comm_spec, int source, int tag) override { + LOG(ERROR) << "LocalIdxerBuilder should not be used to sync response"; + } + + private: + IdIndexer oid_indexer_; + IdIndexer lid_indexer_; +}; + +} // namespace grape + +#endif // GRAPE_VERTEX_MAP_IDXERS_LOCAL_IDXER_H_ diff --git a/grape/vertex_map/idxers/pthash_idxer.h b/grape/vertex_map/idxers/pthash_idxer.h new file mode 100644 index 00000000..3cd3d4ad --- /dev/null +++ b/grape/vertex_map/idxers/pthash_idxer.h @@ -0,0 +1,186 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +#ifndef GRAPE_VERTEX_MAP_IDXERS_PTHASH_IDXER_H_ +#define GRAPE_VERTEX_MAP_IDXERS_PTHASH_IDXER_H_ + +#include "grape/util.h" +#include "grape/utils/gcontainer.h" +#include "grape/utils/pthash_utils/ph_indexer_view.h" +#include "grape/vertex_map/idxers/idxer_base.h" + +namespace grape { + +template +class PTHashIdxer : public IdxerBase { + using internal_oid_t = typename InternalOID::type; + + public: + PTHashIdxer() {} + explicit PTHashIdxer(Array>&& buf) + : buffer_(std::move(buf)) { + idxer_.init(buffer_.data(), buffer_.size()); + } + ~PTHashIdxer() {} + + void Init(void* buffer, size_t size) { idxer_.init(buffer, size); } + + bool get_key(VID_T vid, internal_oid_t& oid) const override { + return idxer_.get_key(vid, oid); + } + + bool get_index(const internal_oid_t& oid, VID_T& vid) const override { + return idxer_.get_index(oid, vid); + } + + IdxerType type() const override { return IdxerType::kPTHashIdxer; } + + void serialize(std::unique_ptr& writer) override { + idxer_.Serialize(writer); + } + + void deserialize(std::unique_ptr& reader) override { + size_t size; + CHECK(reader->Read(&size, sizeof(size_t))); + if (size > 0) { + buffer_.resize(size); + CHECK(reader->Read(buffer_.data(), size)); + idxer_.init(buffer_.data(), size); + } + } + + size_t size() const override { return idxer_.size(); } + + size_t memory_usage() const override { return buffer_.size(); } + + private: + Array> buffer_; + PHIndexerView idxer_; +}; + +template +class PTHashIdxerDummyBuilder : public IdxerBuilderBase { + public: + using internal_oid_t = typename InternalOID::type; + void add(const internal_oid_t& oid) override {} + + void sync_request(const CommSpec& comm_spec, int target, int tag) override { + sync_comm::Recv(buffer_, target, tag, comm_spec.comm()); + } + + void sync_response(const CommSpec& comm_spec, int source, int tag) override { + LOG(ERROR) << 
"PTHashIdxerDummyBuilder should not be used to sync response"; + } + + std::unique_ptr> finish() override { + return std::unique_ptr>( + new PTHashIdxer(std::move(buffer_))); + } + + private: + Array> buffer_; +}; + +template +class PTHashIdxerBuilder : public IdxerBuilderBase { + using internal_oid_t = typename InternalOID::type; + + public: + PTHashIdxerBuilder() {} + ~PTHashIdxerBuilder() {} + + void add(const internal_oid_t& oid) override { keys_.push_back(OID_T(oid)); } + + void buildPhf() { + if (build_phf_) { + return; + } + DistinctSort(keys_); + SinglePHFView::build(keys_.begin(), keys_.size(), phf_, 1); + std::vector ordered_keys(keys_.size()); + for (auto& key : keys_) { + size_t idx = phf_(key); + ordered_keys[idx] = key; + } + key_buffer_.clear(); + for (auto& key : ordered_keys) { + key_buffer_.push_back(key); + } + build_phf_ = true; + } + + size_t getSerializeSize() { + return phf_.num_bits() / 8 + key_buffer_.dump_size(); + } + + /* + * Finish building the perfect hash index in a allocated buffer. + * After add all keys, call buildPhf to build the perfect hash function. + * And then allocate a buffer with getSerializeSize() bytes. + * Call finishInplace to finish building the index in the buffer. + */ + void finishInplace(void* buffer, size_t size, + PTHashIdxer& idxer) { + external_mem_dumper dumper(reinterpret_cast(buffer), size); + phf_.dump(dumper); + key_buffer_.dump(dumper); + idxer.Init(buffer, size); + } + + /* + * Finish building the perfect hash index in an internal + * buffer(std::vector). After add all keys, call finish to build the + * perfect hash index and serialize it. 
+ */ + std::unique_ptr> finish() override { + buildPhf(); + if (getSerializeSize() != buffer_.size()) { + buffer_.resize(getSerializeSize()); + external_mem_dumper dumper(buffer_.data(), buffer_.size()); + phf_.dump(dumper); + key_buffer_.dump(dumper); + } + return std::unique_ptr>( + new PTHashIdxer(std::move(buffer_))); + } + + void sync_request(const CommSpec& comm_spec, int target, int tag) override { + LOG(ERROR) << "PTHashIdxerBuilder should not be used to sync request"; + } + + void sync_response(const CommSpec& comm_spec, int source, int tag) override { + buildPhf(); + if (getSerializeSize() != buffer_.size()) { + buffer_.resize(getSerializeSize()); + external_mem_dumper dumper(buffer_.data(), buffer_.size()); + phf_.dump(dumper); + key_buffer_.dump(dumper); + } + + sync_comm::Send(buffer_, source, tag, comm_spec.comm()); + } + + private: + std::vector keys_; + id_indexer_impl::KeyBuffer key_buffer_; + pthash::single_phf phf_; + + Array> buffer_; + bool build_phf_ = false; +}; + +} // namespace grape + +#endif // GRAPE_VERTEX_MAP_IDXERS_PTHASH_IDXER_H_ diff --git a/grape/vertex_map/idxers/sorted_array_idxer.h b/grape/vertex_map/idxers/sorted_array_idxer.h new file mode 100644 index 00000000..0eaf48da --- /dev/null +++ b/grape/vertex_map/idxers/sorted_array_idxer.h @@ -0,0 +1,198 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef GRAPE_VERTEX_MAP_IDXERS_SORTED_ARRAY_IDXER_H_ +#define GRAPE_VERTEX_MAP_IDXERS_SORTED_ARRAY_IDXER_H_ + +#include "grape/utils/gcontainer.h" +#include "grape/vertex_map/idxers/idxer_base.h" + +namespace grape { + +template +class SortedArrayIdxer : public IdxerBase { + using internal_oid_t = typename InternalOID::type; + + public: + SortedArrayIdxer() {} + explicit SortedArrayIdxer(Array>&& id_list) + : id_list_(std::move(id_list)) {} + ~SortedArrayIdxer() {} + + bool get_key(VID_T vid, internal_oid_t& oid) const override { + if (vid >= id_list_.size()) { + return false; + } + oid = id_list_[vid]; + return true; + } + + bool get_index(const internal_oid_t& oid, VID_T& vid) const override { + auto it = std::lower_bound(id_list_.begin(), id_list_.end(), oid); + if (it == id_list_.end() || *it != oid) { + return false; + } + vid = it - id_list_.begin(); + return true; + } + + IdxerType type() const override { return IdxerType::kSortedArrayIdxer; } + + void serialize(std::unique_ptr& writer) override { + size_t size = id_list_.size(); + writer->Write(&size, sizeof(size_t)); + writer->Write(id_list_.data(), size * sizeof(OID_T)); + } + + void deserialize(std::unique_ptr& reader) override { + size_t size; + reader->Read(&size, sizeof(size_t)); + id_list_.resize(size); + reader->Read(id_list_.data(), size * sizeof(OID_T)); + } + + size_t size() const override { return id_list_.size(); } + + size_t memory_usage() const override { + return id_list_.size() * sizeof(OID_T); + } + + private: + Array> id_list_; +}; + +template +class SortedArrayIdxer + : public IdxerBase { + using internal_oid_t = typename InternalOID::type; + + public: + SortedArrayIdxer() {} + explicit SortedArrayIdxer( + Array>&& id_list) { + for (auto& id : id_list) { + id_list_.emplace_back(id); + } + } + ~SortedArrayIdxer() {} + + bool get_key(VID_T vid, internal_oid_t& oid) const override { + if (vid >= id_list_.size()) { + return false; + } + oid = internal_oid_t(id_list_[vid]); + return 
true; + } + + bool get_index(const internal_oid_t& oid, VID_T& vid) const override { + size_t num = id_list_.size(); + size_t low = 0, high = num - 1; + nonstd::string_view oid_view(oid); + while (low <= high) { + size_t mid = low + (high - low) / 2; + if (id_list_[mid] == oid_view) { + vid = mid; + return true; + } else if (id_list_[mid] < oid_view) { + low = mid + 1; + } else { + high = mid - 1; + } + } + return false; + } + + IdxerType type() const override { return IdxerType::kSortedArrayIdxer; } + + void serialize(std::unique_ptr& writer) override { + id_list_.serialize(writer); + } + + void deserialize(std::unique_ptr& reader) override { + id_list_.deserialize(reader); + } + + size_t size() const override { return id_list_.size(); } + + size_t memory_usage() const override { + return id_list_.content_buffer().size() + + id_list_.offset_buffer().size() * sizeof(size_t); + } + + private: + StringViewVector id_list_; +}; + +template +class SortedArrayIdxerDummyBuilder : public IdxerBuilderBase { + public: + using internal_oid_t = typename InternalOID::type; + void add(const internal_oid_t& oid) override {} + + std::unique_ptr> finish() override { + return std::unique_ptr>( + new SortedArrayIdxer(std::move(id_list_))); + } + + void sync_request(const CommSpec& comm_spec, int target, int tag) override { + sync_comm::Recv(id_list_, target, tag, comm_spec.comm()); + } + + void sync_response(const CommSpec& comm_spec, int source, int tag) override { + LOG(ERROR) << "SortedArrayIdxerDummyBuilder should not be used to sync " + "response"; + } + + private: + Array> id_list_; +}; + +template +class SortedArrayIdxerBuilder : public IdxerBuilderBase { + public: + using internal_oid_t = typename InternalOID::type; + void add(const internal_oid_t& oid) override { keys_.push_back(OID_T(oid)); } + + std::unique_ptr> finish() override { + if (!sorted_) { + DistinctSort(keys_); + sorted_ = true; + } + Array> id_list(keys_.size()); + std::copy(keys_.begin(), keys_.end(), 
id_list.begin()); + return std::unique_ptr>( + new SortedArrayIdxer(std::move(id_list))); + } + + void sync_request(const CommSpec& comm_spec, int target, int tag) override { + LOG(ERROR) << "HashMapIdxerBuilder should not be used to sync request"; + } + + void sync_response(const CommSpec& comm_spec, int source, int tag) override { + if (!sorted_) { + DistinctSort(keys_); + sorted_ = true; + } + sync_comm::Send(keys_, source, tag, comm_spec.comm()); + } + + private: + std::vector keys_; + bool sorted_ = false; +}; + +} // namespace grape + +#endif // GRAPE_VERTEX_MAP_IDXERS_SORTED_ARRAY_IDXER_H_ diff --git a/grape/vertex_map/local_vertex_map.h b/grape/vertex_map/local_vertex_map.h deleted file mode 100644 index fe6878ea..00000000 --- a/grape/vertex_map/local_vertex_map.h +++ /dev/null @@ -1,280 +0,0 @@ -/** Copyright 2020 Alibaba Group Holding Limited. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-*/ - -#ifndef GRAPE_VERTEX_MAP_LOCAL_VERTEX_MAP_H_ -#define GRAPE_VERTEX_MAP_LOCAL_VERTEX_MAP_H_ - -#include -#include -#include -#include -#include -#include -#include - -#include "grape/config.h" -#include "grape/fragment/partitioner.h" -#include "grape/graph/id_indexer.h" -#include "grape/serialization/in_archive.h" -#include "grape/serialization/out_archive.h" -#include "grape/vertex_map/vertex_map_base.h" -#include "grape/worker/comm_spec.h" - -namespace grape { - -template -class LocalVertexMap; - -template -class LocalVertexMapBuilder { - using internal_oid_t = typename InternalOID::type; - - private: - LocalVertexMapBuilder( - fid_t fid, std::vector>& oid_to_index, - std::vector>& gid_to_index, - const PARTITIONER_T& partitioner, const IdParser& id_parser) - : fid_(fid), - oid_to_index_(oid_to_index), - gid_to_index_(gid_to_index), - partitioner_(partitioner), - id_parser_(id_parser) {} - - public: - ~LocalVertexMapBuilder() {} - - void add_local_vertex(const internal_oid_t& id, VID_T& gid) { - assert(partitioner_.GetPartitionId(id) == fid_); - oid_to_index_[fid_].add(id, gid); - gid = id_parser_.generate_global_id(fid_, gid); - } - - void add_vertex(const internal_oid_t& id) { - fid_t fid = partitioner_.GetPartitionId(id); - oid_to_index_[fid]._add(id); - } - - void finish(LocalVertexMap& vertex_map) { - const CommSpec& comm_spec = vertex_map.GetCommSpec(); - int worker_id = comm_spec.worker_id(); - int worker_num = comm_spec.worker_num(); - std::thread request_thread([&]() { - for (int i = 1; i < worker_num; ++i) { - int dst_worker_id = (worker_id + i) % worker_num; - auto& indexer = oid_to_index_[comm_spec.WorkerToFrag(dst_worker_id)]; - sync_comm::Send(indexer.keys(), dst_worker_id, 0, comm_spec.comm()); - std::vector gid_list(indexer.size()); - sync_comm::Recv(gid_list, dst_worker_id, 1, comm_spec.comm()); - auto& gid_indexer = - gid_to_index_[comm_spec.WorkerToFrag(dst_worker_id)]; - for (auto gid : gid_list) { - gid_indexer._add(gid); - } - } - }); 
- std::thread response_thread([&]() { - for (int i = 1; i < worker_num; ++i) { - int src_worker_id = (worker_id + worker_num - i) % worker_num; - typename IdIndexer::key_buffer_t keys; - sync_comm::Recv(keys, src_worker_id, 0, comm_spec.comm()); - std::vector gid_list(keys.size()); - VID_T gid; - auto& native_indexer = oid_to_index_[fid_]; - for (size_t k = 0; k < keys.size(); ++k) { - CHECK(native_indexer.get_index(keys[k], gid)); - gid = id_parser_.generate_global_id(fid_, gid); - gid_list[k] = gid; - } - sync_comm::Send(gid_list, src_worker_id, 1, comm_spec.comm()); - } - }); - - request_thread.join(); - response_thread.join(); - MPI_Barrier(comm_spec.comm()); - - vertex_map.vertices_num_.resize(comm_spec.fnum()); - vertex_map.vertices_num_[fid_] = oid_to_index_[fid_].size(); - sync_comm::AllGather(vertex_map.vertices_num_, comm_spec.comm()); - } - - private: - template - friend class LocalVertexMap; - - fid_t fid_; - std::vector>& oid_to_index_; - std::vector>& gid_to_index_; - const PARTITIONER_T& partitioner_; - const IdParser id_parser_; -}; - -template > -class LocalVertexMap : public VertexMapBase { - using base_t = VertexMapBase; - using internal_oid_t = typename InternalOID::type; - - public: - explicit LocalVertexMap(const CommSpec& comm_spec) : base_t(comm_spec) {} - ~LocalVertexMap() = default; - void Init() { - oid_to_index_.resize(comm_spec_.fnum()); - gid_to_index_.resize(comm_spec_.fnum()); - } - - size_t GetTotalVertexSize() const { - size_t size = 0; - for (auto v : vertices_num_) { - size += v; - } - return size; - } - - size_t GetInnerVertexSize(fid_t fid) const { return vertices_num_[fid]; } - void AddVertex(const OID_T& oid) { LOG(FATAL) << "not implemented"; } - - using base_t::Lid2Gid; - bool AddVertex(const OID_T& oid, VID_T& gid) { - LOG(FATAL) << "not implemented"; - return false; - } - - using base_t::GetFidFromGid; - using base_t::GetLidFromGid; - bool GetOid(const VID_T& gid, OID_T& oid) const { - fid_t fid = GetFidFromGid(gid); - 
return GetOid(fid, id_parser_.get_local_id(gid), oid); - } - - bool GetOid(fid_t fid, const VID_T& lid, OID_T& oid) const { - internal_oid_t internal_oid; - if (fid == comm_spec_.fid()) { - if (oid_to_index_[fid].get_key(lid, internal_oid)) { - oid = InternalOID::FromInternal(internal_oid); - return true; - } - } else { - VID_T index; - if (gid_to_index_[fid].get_index(id_parser_.generate_global_id(fid, lid), - index)) { - if (oid_to_index_[fid].get_key(index, internal_oid)) { - oid = InternalOID::FromInternal(internal_oid); - return true; - } - } - } - return false; - } - - bool GetGid(fid_t fid, const OID_T& oid, VID_T& gid) const { - internal_oid_t internal_oid(oid); - return _GetGid(fid, internal_oid, gid); - } - - bool _GetGid(fid_t fid, const internal_oid_t& oid, VID_T& gid) const { - VID_T index; - if (fid == comm_spec_.fid()) { - if (oid_to_index_[fid].get_index(oid, index)) { - gid = id_parser_.generate_global_id(fid, index); - return true; - } - } else { - if (oid_to_index_[fid].get_index(oid, index)) { - return gid_to_index_[fid].get_key(index, gid); - } - } - return false; - } - - bool GetGid(const OID_T& oid, VID_T& gid) const { - fid_t fid = partitioner_.GetPartitionId(oid); - return GetGid(fid, oid, gid); - } - - bool _GetGid(const internal_oid_t& oid, VID_T& gid) const { - fid_t fid = partitioner_.GetPartitionId(oid); - return _GetGid(fid, oid, gid); - } - - LocalVertexMapBuilder GetLocalBuilder() { - fid_t fid = comm_spec_.fid(); - return LocalVertexMapBuilder( - fid, oid_to_index_, gid_to_index_, partitioner_, id_parser_); - } - - template - void Serialize(const std::string& prefix) { - char fbuf[1024]; - snprintf(fbuf, sizeof(fbuf), "%s/%s_%d", prefix.c_str(), - kSerializationVertexMapFilename, comm_spec_.fid()); - - auto io_adaptor = - std::unique_ptr(new IOADAPTOR_T(std::string(fbuf))); - io_adaptor->Open("wb"); - - base_t::serialize(io_adaptor); - for (auto& indexer : oid_to_index_) { - indexer.Serialize(io_adaptor); - } - for (auto& indexer : 
gid_to_index_) { - indexer.Serialize(io_adaptor); - } - io_adaptor->Close(); - } - - template - void Deserialize(const std::string& prefix, fid_t fid) { - char fbuf[1024]; - snprintf(fbuf, sizeof(fbuf), "%s/%s_%d", prefix.c_str(), - kSerializationVertexMapFilename, fid); - - auto io_adaptor = - std::unique_ptr(new IOADAPTOR_T(std::string(fbuf))); - io_adaptor->Open(); - - base_t::deserialize(io_adaptor); - oid_to_index_.resize(comm_spec_.fnum()); - for (auto& indexer : oid_to_index_) { - indexer.Deserialize(io_adaptor); - } - gid_to_index_.resize(comm_spec_.fnum()); - for (auto& indexer : gid_to_index_) { - indexer.Deserialize(io_adaptor); - } - io_adaptor->Close(); - } - - void UpdateToBalance(std::vector& vnum_list, - std::vector>& gid_maps) { - LOG(FATAL) << "not implemented"; - } - - private: - template - friend class LocalVertexMapBuilder; - - std::vector> oid_to_index_; - std::vector> gid_to_index_; - using base_t::comm_spec_; - using base_t::id_parser_; - using base_t::partitioner_; - - std::vector vertices_num_; -}; - -} // namespace grape - -#endif // GRAPE_VERTEX_MAP_LOCAL_VERTEX_MAP_H_ diff --git a/grape/vertex_map/partitioner.h b/grape/vertex_map/partitioner.h new file mode 100644 index 00000000..82a3cb50 --- /dev/null +++ b/grape/vertex_map/partitioner.h @@ -0,0 +1,292 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef GRAPE_VERTEX_MAP_PARTITIONER_H_ +#define GRAPE_VERTEX_MAP_PARTITIONER_H_ + +#include +#include "grape/io/io_adaptor_base.h" + +namespace grape { + +enum class PartitionerType { + kHashPartitioner, + kMapPartitioner, + kSegmentedPartitioner, +}; + +inline PartitionerType parse_partitioner_type_name(const std::string& name) { + if (name == "hash") { + return PartitionerType::kHashPartitioner; + } else if (name == "map") { + return PartitionerType::kMapPartitioner; + } else if (name == "segment") { + return PartitionerType::kSegmentedPartitioner; + } else { + LOG(ERROR) << "unrecognized partitioner: " << name + << ", use map partitioner as default"; + return PartitionerType::kMapPartitioner; + } +} + +template +class IPartitioner { + public: + using internal_oid_t = typename InternalOID::type; + + virtual ~IPartitioner() = default; + + virtual fid_t GetPartitionId(const internal_oid_t& oid) const = 0; + + virtual void SetPartitionId(const internal_oid_t& oid, fid_t fid) = 0; + + virtual void serialize(std::unique_ptr& writer) = 0; + + virtual void deserialize(std::unique_ptr& reader) = 0; + + virtual size_t memory_usage() const = 0; + + virtual PartitionerType type() const = 0; +}; + +template > +class HashPartitioner : public IPartitioner { + public: + using internal_oid_t = typename InternalOID::type; + + HashPartitioner() : hash_(), fnum_(1) {} + explicit HashPartitioner(size_t frag_num) : hash_(), fnum_(frag_num) {} + + fid_t GetPartitionId(const internal_oid_t& oid) const override { + return static_cast(hash_(OID_T(oid)) % fnum_); + } + + void SetPartitionId(const internal_oid_t& oid, fid_t fid) override { + if (GetPartitionId(oid) != fid) { + LOG(ERROR) << "HashPartitioner cannot set partition id"; + } + } + + void serialize(std::unique_ptr& writer) override { + CHECK(writer->Write(&fnum_, sizeof(fid_t))); + } + + void deserialize(std::unique_ptr& reader) override { + CHECK(reader->Read(&fnum_, sizeof(fid_t))); + } + + PartitionerType type() const 
override { + return PartitionerType::kHashPartitioner; + } + + size_t memory_usage() const override { return 0; } + + private: + HASH_T hash_; + fid_t fnum_; +}; + +template +class MapPartitioner : public IPartitioner { + public: + using internal_oid_t = typename InternalOID::type; + + MapPartitioner() : fnum_(0) {} + explicit MapPartitioner(fid_t fnum) : fnum_(fnum) {} + MapPartitioner(fid_t fnum, const std::vector& oid_list) { + fnum_ = fnum; + Init(fnum, oid_list); + } + ~MapPartitioner() = default; + + void Init(fid_t fnum, const std::vector& oid_list) { + fnum_ = fnum; + size_t frag_num = fnum; + size_t vnum = oid_list.size(); + size_t frag_vnum = (vnum + frag_num - 1) / frag_num; + o2f_.clear(); + o2f_.reserve(vnum); + for (size_t i = 0; i < vnum; ++i) { + fid_t fid = static_cast(i / frag_vnum); + o2f_.emplace(oid_list[i], fid); + } + } + + void Init(const std::vector>& oid_lists) { + size_t frag_num = oid_lists.size(); + fnum_ = frag_num; + o2f_.clear(); + for (size_t i = 0; i < frag_num; ++i) { + for (const auto& oid : oid_lists[i]) { + o2f_.emplace(oid, i); + } + } + } + + fid_t GetPartitionId(const internal_oid_t& oid) const override { + auto iter = o2f_.find(OID_T(oid)); + if (iter == o2f_.end()) { + return fnum_; + } + return iter->second; + } + + void SetPartitionId(const internal_oid_t& oid, fid_t fid) override { + o2f_[OID_T(oid)] = fid; + } + + void serialize(std::unique_ptr& writer) override { + InArchive arc; + arc << fnum_ << o2f_; + CHECK(writer->WriteArchive(arc)); + } + + void deserialize(std::unique_ptr& reader) override { + OutArchive arc; + CHECK(reader->ReadArchive(arc)); + arc >> fnum_ >> o2f_; + } + + PartitionerType type() const override { + return PartitionerType::kMapPartitioner; + } + + size_t memory_usage() const override { return o2f_.memory_usage(); } + + private: + fid_t fnum_; + ska::flat_hash_map o2f_; +}; + +template +class SegmentedPartitioner : public IPartitioner { + using internal_oid_t = typename InternalOID::type; + + 
public: + SegmentedPartitioner() : fnum_(0) {} + SegmentedPartitioner(fid_t fnum, const std::vector& sorted_oid_list) { + fnum_ = fnum; + size_t part_size = (sorted_oid_list.size() + fnum - 1) / fnum; + for (size_t i = 1; i < fnum; ++i) { + boundaries_.emplace_back(sorted_oid_list[i * part_size]); + } + } + explicit SegmentedPartitioner(const std::vector& boundaries) + : fnum_(boundaries.size() + 1), boundaries_(boundaries) {} + ~SegmentedPartitioner() = default; + + void Init(fid_t fnum, const std::vector& boundaries) { + fnum_ = fnum; + boundaries_ = boundaries; + CHECK_EQ(fnum_, boundaries_.size() + 1); + } + + fid_t GetPartitionId(const internal_oid_t& oid) const override { + auto iter = + std::upper_bound(boundaries_.begin(), boundaries_.end(), OID_T(oid)); + return static_cast(iter - boundaries_.begin()); + } + + void SetPartitionId(const internal_oid_t& oid, fid_t fid) override { + LOG(FATAL) << "SegmentedPartitioner cannot set partition id"; + } + + PartitionerType type() const override { + return PartitionerType::kSegmentedPartitioner; + } + + void serialize(std::unique_ptr& writer) override { + InArchive arc; + arc << fnum_ << boundaries_; + CHECK(writer->WriteArchive(arc)); + } + + void deserialize(std::unique_ptr& reader) override { + OutArchive arc; + CHECK(reader->ReadArchive(arc)); + arc >> fnum_ >> boundaries_; + } + + size_t memory_usage() const override { + return boundaries_.size() * sizeof(OID_T); + } + + private: + fid_t fnum_; + std::vector boundaries_; +}; + +template +void serialize_partitioner(std::unique_ptr& writer, + std::unique_ptr>& partitioner) { + int type = static_cast(partitioner->type()); + writer->Write(&type, sizeof(type)); + partitioner->serialize(writer); +} + +template +std::unique_ptr> deserialize_partitioner( + std::unique_ptr& reader) { + int type; + reader->Read(&type, sizeof(type)); + std::unique_ptr> partitioner(nullptr); + switch (static_cast(type)) { + case PartitionerType::kHashPartitioner: + partitioner = + 
std::unique_ptr>(new HashPartitioner()); + break; + case PartitionerType::kMapPartitioner: + partitioner = + std::unique_ptr>(new MapPartitioner()); + break; + case PartitionerType::kSegmentedPartitioner: + partitioner = + std::unique_ptr>(new SegmentedPartitioner()); + break; + default: + LOG(FATAL) << "Unknown partitioner type"; + } + if (partitioner) { + partitioner->deserialize(reader); + } + return partitioner; +} + +} // namespace grape + +#include + +namespace std { + +inline ostream& operator<<(ostream& os, const grape::PartitionerType& type) { + switch (type) { + case grape::PartitionerType::kHashPartitioner: + os << "hash"; + break; + case grape::PartitionerType::kMapPartitioner: + os << "map"; + break; + case grape::PartitionerType::kSegmentedPartitioner: + os << "segment"; + break; + default: + os << "unknown"; + } + return os; +} + +} // namespace std + +#endif // GRAPE_VERTEX_MAP_PARTITIONER_H_ diff --git a/grape/vertex_map/vertex_map.h b/grape/vertex_map/vertex_map.h new file mode 100644 index 00000000..a1c385bd --- /dev/null +++ b/grape/vertex_map/vertex_map.h @@ -0,0 +1,525 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#ifndef GRAPE_VERTEX_MAP_VERTEX_MAP_H_ +#define GRAPE_VERTEX_MAP_VERTEX_MAP_H_ + +#include + +#include "grape/fragment/id_parser.h" +#include "grape/util.h" +#include "grape/vertex_map/idxers/idxers.h" +#include "grape/vertex_map/partitioner.h" + +namespace grape { + +template +class VertexMapBuilder; + +template +class VertexMap { + public: + using oid_t = OID_T; + using vid_t = VID_T; + using internal_oid_t = typename InternalOID::type; + + VertexMap(const VertexMap&) = delete; + VertexMap() : partitioner_(nullptr) {} + ~VertexMap() {} + + fid_t GetFragmentNum() const { return fnum_; } + + fid_t GetFragmentId(const OID_T& oid) const { + internal_oid_t internal_oid(oid); + return partitioner_->GetPartitionId(internal_oid); + } + + const IdParser& GetIdParser() const { return id_parser_; } + + const IPartitioner& GetPartitioner() const { return *partitioner_; } + + VID_T Lid2Gid(fid_t fid, const VID_T& lid) const { + return id_parser_.generate_global_id(fid, lid); + } + + fid_t GetFidFromGid(const VID_T& gid) const { + return id_parser_.get_fragment_id(gid); + } + + VID_T GetLidFromGid(const VID_T& gid) const { + return id_parser_.get_local_id(gid); + } + + VID_T MaxVertexNum() const { return id_parser_.max_local_id(); } + + size_t GetTotalVertexSize() const { return total_vertex_size_; } + + size_t GetInnerVertexSize(fid_t fid) const { return inner_vertex_size_[fid]; } + + void UpdateToBalance(const CommSpec& comm_spec, + const std::vector& vnum_list, + const std::vector>& gid_maps); + + bool GetOid(const VID_T& gid, OID_T& oid) const { + fid_t fid = GetFidFromGid(gid); + return GetOid(fid, GetLidFromGid(gid), oid); + } + + bool GetOid(fid_t fid, const VID_T& lid, OID_T& oid) const { + internal_oid_t internal_oid; + if (fid >= fnum_) { + return false; + } + if (idxers_[fid]->get_key(lid, internal_oid)) { + oid = InternalOID::FromInternal(internal_oid); + return true; + } + return false; + } + + bool GetGid(fid_t fid, const OID_T& oid, VID_T& gid) const { + 
internal_oid_t internal_oid(oid); + if (fid >= fnum_) { + return false; + } + if (idxers_[fid]->get_index(internal_oid, gid)) { + gid = Lid2Gid(fid, gid); + return true; + } + return false; + } + + bool GetGid(const OID_T& oid, VID_T& gid) const { + fid_t fid = partitioner_->GetPartitionId(oid); + if (fid == fnum_) { + return false; + } + return GetGid(fid, oid, gid); + } + + void reset() { idxers_.clear(); } + + void ExtendVertices(const CommSpec& comm_spec, + std::vector&& local_vertices_to_add) { + int worker_id = comm_spec.worker_id(); + DistinctSort(local_vertices_to_add); + bool unpartitioned_id = false; + for (size_t i = 0; i < local_vertices_to_add.size();) { + fid_t fid = partitioner_->GetPartitionId(local_vertices_to_add[i]); + if (fid == fnum_) { + unpartitioned_id = true; + } else if (comm_spec.FragToWorker(fid) != worker_id) { + LOG(ERROR) << "Partition id is not consistent for vertex - " + << local_vertices_to_add[i] << ", discarded..."; + std::swap(local_vertices_to_add[i], local_vertices_to_add.back()); + local_vertices_to_add.pop_back(); + continue; + } else { + vid_t index; + if (idxers_[fid]->get_index(internal_oid_t(local_vertices_to_add[i]), + index)) { + LOG(ERROR) << "Vertex already exists - " << local_vertices_to_add[i]; + std::swap(local_vertices_to_add[i], local_vertices_to_add.back()); + local_vertices_to_add.pop_back(); + continue; + } + } + ++i; + } + int state = 0; + if (unpartitioned_id) { + state = 1; + } + std::vector states(comm_spec.fnum(), 0); + states[worker_id] = state; + sync_comm::AllGather(states, comm_spec.comm()); + // need to update partitioner with new vertices + std::vector> global_vertices_to_add(comm_spec.fnum()); + global_vertices_to_add[comm_spec.fid()] = std::move(local_vertices_to_add); + sync_comm::AllGather(global_vertices_to_add, comm_spec.comm()); + for (fid_t fid = 0; fid < fnum_; ++fid) { + if (states[fid] == 1) { + CHECK(partitioner_->type() == PartitionerType::kMapPartitioner); + for (auto& v : 
global_vertices_to_add[fid]) { + partitioner_->SetPartitionId(v, fid); + } + } + idxers_[fid] = + extend_indexer(std::move(idxers_[fid]), global_vertices_to_add[fid], + static_cast(inner_vertex_size_[fid])); + inner_vertex_size_[fid] += global_vertices_to_add[fid].size(); + total_vertex_size_ += global_vertices_to_add[fid].size(); + } + } + + template + void Serialize(const std::string& prefix, const CommSpec& comm_spec) { + if (idxer_type_ != IdxerType::kLocalIdxer) { + char fbuf[1024]; + snprintf(fbuf, sizeof(fbuf), "%s/%s", prefix.c_str(), + kSerializationVertexMapFilename); + std::string path = std::string(fbuf); + if (comm_spec.worker_id() == 0) { + serialize_impl(path); + } + MPI_Barrier(comm_spec.comm()); + if (!exists_file(path) && comm_spec.local_id() == 0) { + serialize_impl(path); + } + MPI_Barrier(comm_spec.comm()); + if (!exists_file(path)) { + serialize_impl(path); + } + } else { + char fbuf[1024]; + snprintf(fbuf, sizeof(fbuf), "%s/%s_%d", prefix.c_str(), + kSerializationVertexMapFilename, comm_spec.fid()); + serialize_impl(std::string(fbuf)); + } + } + + template + void Deserialize(const std::string& prefix, const CommSpec& comm_spec) { + char local_fbuf[1024]; + snprintf(local_fbuf, sizeof(local_fbuf), "%s/%s_%d", prefix.c_str(), + kSerializationVertexMapFilename, comm_spec.fid()); + if (exists_file(local_fbuf)) { + deserialize_impl(std::string(local_fbuf)); + } else { + char global_fbuf[1024]; + snprintf(global_fbuf, sizeof(global_fbuf), "%s/%s", prefix.c_str(), + kSerializationVertexMapFilename); + if (exists_file(global_fbuf)) { + deserialize_impl(std::string(global_fbuf)); + } else { + LOG(FATAL) << "Cannot find vertex map file."; + } + } + + id_parser_.init(fnum_); + } + + VertexMap& operator=(VertexMap&& other) { + if (this == &other) { + return *this; + } + + this->fid_ = other.fid_; + this->fnum_ = other.fnum_; + this->idxer_type_ = other.idxer_type_; + this->total_vertex_size_ = other.total_vertex_size_; + this->inner_vertex_size_ = 
std::move(other.inner_vertex_size_); + + this->idxers_ = std::move(other.idxers_); + this->partitioner_ = std::move(other.partitioner_); + this->id_parser_.init(fnum_); + + other.idxers_.clear(); + other.total_vertex_size_ = 0; + other.inner_vertex_size_.clear(); + + return *this; + } + + PartitionerType partitioner_type() const { return partitioner_->type(); } + IdxerType idxer_type() const { return idxer_type_; } + + private: + template + void serialize_impl(const std::string& path) { + auto io_adaptor = std::unique_ptr(new IOADAPTOR(path)); + io_adaptor->Open("wb"); + InArchive arc; + arc << fid_ << fnum_ << idxer_type_ << total_vertex_size_ + << inner_vertex_size_; + io_adaptor->WriteArchive(arc); + for (fid_t fid = 0; fid < fnum_; ++fid) { + serialize_idxer(io_adaptor, idxers_[fid]); + } + serialize_partitioner(io_adaptor, partitioner_); + } + + template + void deserialize_impl(const std::string& path) { + auto io_adaptor = std::unique_ptr(new IOADAPTOR(path)); + io_adaptor->Open(); + OutArchive arc; + io_adaptor->ReadArchive(arc); + arc >> fid_ >> fnum_ >> idxer_type_ >> total_vertex_size_ >> + inner_vertex_size_; + for (fid_t fid = 0; fid < fnum_; ++fid) { + idxers_.emplace_back(deserialize_idxer(io_adaptor)); + } + partitioner_ = deserialize_partitioner(io_adaptor); + } + + template + friend class VertexMapBuilder; + + fid_t fid_; + fid_t fnum_; + + IdxerType idxer_type_; + + size_t total_vertex_size_; + std::vector inner_vertex_size_; + + std::unique_ptr> partitioner_; + std::vector>> idxers_; + IdParser id_parser_; +}; + +template +class VertexMapBuilder { + using internal_oid_t = typename InternalOID::type; + + public: + VertexMapBuilder(fid_t fid, fid_t fnum, + std::unique_ptr>&& partitioner, + IdxerType idxer_type) + : fid_(fid), + fnum_(fnum), + idxer_type_(idxer_type), + partitioner_(std::move(partitioner)) { + if (idxer_type_ == IdxerType::kSortedArrayIdxer) { + for (fid_t i = 0; i < fnum; ++i) { + if (i != fid) { + idxer_builders_.emplace_back( + 
new SortedArrayIdxerDummyBuilder()); + } else { + idxer_builders_.emplace_back( + new SortedArrayIdxerBuilder()); + } + } + } else if (idxer_type_ == IdxerType::kHashMapIdxer) { + for (fid_t i = 0; i < fnum; ++i) { + if (i != fid) { + idxer_builders_.emplace_back( + new HashMapIdxerDummyBuilder()); + } else { + idxer_builders_.emplace_back(new HashMapIdxerBuilder()); + } + } + } else if (idxer_type_ == IdxerType::kPTHashIdxer) { + for (fid_t i = 0; i < fnum; ++i) { + if (i != fid) { + idxer_builders_.emplace_back( + new PTHashIdxerDummyBuilder()); + } else { + idxer_builders_.emplace_back(new PTHashIdxerBuilder()); + } + } + } else if (idxer_type_ == IdxerType::kLocalIdxer) { + for (fid_t i = 0; i < fnum; ++i) { + if (i != fid) { + idxer_builders_.emplace_back(new LocalIdxerBuilder()); + } else { + idxer_builders_.emplace_back(new HashMapIdxerBuilder()); + } + } + } else if (idxer_type == IdxerType::kHashMapIdxerView) { + for (fid_t i = 0; i < fnum; ++i) { + if (i != fid) { + idxer_builders_.emplace_back( + new HashMapIdxerViewDummyBuilder()); + } else { + idxer_builders_.emplace_back( + new HashMapIdxerViewBuilder()); + } + } + } else { + LOG(FATAL) << "Unknown idxer type"; + } + } + + ~VertexMapBuilder() {} + + fid_t get_fragment_id(const internal_oid_t& oid) const { + return partitioner_->GetPartitionId(oid); + } + + void add_vertex(const internal_oid_t& id) { + fid_t fid = partitioner_->GetPartitionId(id); + if (fid < fnum_) { + idxer_builders_[fid]->add(id); + } else { + LOG(ERROR) << "add vertex - " << id << " failed, unknwon partition id"; + } + } + + void finish(const CommSpec& comm_spec, VertexMap& vertex_map) { + int worker_id = comm_spec.worker_id(); + int worker_num = comm_spec.worker_num(); + fid_t fnum = comm_spec.fnum(); + { + std::thread response_thread = std::thread([&]() { + int dst_worker_id = (worker_id + worker_num - 1) % worker_num; + while (dst_worker_id != worker_id) { + for (fid_t fid = 0; fid < fnum; ++fid) { + if 
(comm_spec.FragToWorker(fid) != worker_id) { + continue; + } + idxer_builders_[fid]->sync_response(comm_spec, dst_worker_id, 0); + } + dst_worker_id = (dst_worker_id + worker_num - 1) % worker_num; + } + }); + std::thread request_thread = std::thread([&]() { + int src_worker_id = (worker_id + 1) % worker_num; + while (src_worker_id != worker_id) { + for (fid_t fid = 0; fid < fnum; ++fid) { + if (comm_spec.FragToWorker(fid) != src_worker_id) { + continue; + } + idxer_builders_[fid]->sync_request(comm_spec, src_worker_id, 0); + } + src_worker_id = (src_worker_id + 1) % worker_num; + } + }); + + request_thread.join(); + response_thread.join(); + MPI_Barrier(comm_spec.comm()); + } + + vertex_map.reset(); + vertex_map.fid_ = fid_; + vertex_map.fnum_ = fnum; + vertex_map.idxer_type_ = idxer_type_; + vertex_map.partitioner_ = std::move(partitioner_); + for (fid_t fid = 0; fid < fnum; ++fid) { + vertex_map.idxers_.emplace_back(idxer_builders_[fid]->finish()); + } + idxer_builders_.clear(); + vertex_map.id_parser_.init(fnum); + + vertex_map.inner_vertex_size_.resize(fnum, 0); + vertex_map.inner_vertex_size_[fid_] = vertex_map.idxers_[fid_]->size(); + + sync_comm::AllGather(vertex_map.inner_vertex_size_, comm_spec.comm()); + + size_t total = 0; + for (fid_t i = 0; i < fnum; ++i) { + total += vertex_map.inner_vertex_size_[i]; + } + vertex_map.total_vertex_size_ = total; + } + + private: + fid_t fid_; + fid_t fnum_; + IdxerType idxer_type_; + std::unique_ptr> partitioner_; + std::vector>> idxer_builders_; +}; + +template +void VertexMap::UpdateToBalance( + const CommSpec& comm_spec, const std::vector& vnum_list, + const std::vector>& gid_maps) { + fid_t fnum = comm_spec.fnum(); + std::vector> oid_lists(fnum); + std::vector> unresolved_lids(fnum); + std::vector>> unresolved_vertices(fnum); + std::vector> unresolved_oids(fnum); + for (fid_t fid = 0; fid < fnum; ++fid) { + VID_T num = inner_vertex_size_[fid]; + CHECK_EQ(num, gid_maps[fid].size()); + for (VID_T lid = 0; lid < num; 
++lid) { + VID_T new_gid = gid_maps[fid][lid]; + internal_oid_t oid; + fid_t new_fid = GetFidFromGid(new_gid); + VID_T new_lid = GetLidFromGid(new_gid); + if (!idxers_[fid]->get_key(lid, oid)) { + unresolved_lids[fid].push_back(lid); + unresolved_vertices[fid].push_back(std::make_pair(new_fid, new_lid)); + } else { + if (oid_lists[new_fid].size() <= new_lid) { + oid_lists[new_fid].resize(new_lid + 1); + } + oid_lists[new_fid][new_lid] = oid_t(oid); + } + } + } + { + std::thread request_thread = std::thread([&]() { + int src_worker_id = (comm_spec.worker_id() + 1) % comm_spec.worker_num(); + while (src_worker_id != comm_spec.worker_id()) { + for (fid_t fid = 0; fid < fnum; ++fid) { + if (comm_spec.FragToWorker(fid) != src_worker_id) { + continue; + } + sync_comm::Send(unresolved_lids[fid], src_worker_id, 0, + comm_spec.comm()); + sync_comm::Recv(unresolved_oids[fid], src_worker_id, 1, + comm_spec.comm()); + } + src_worker_id = (src_worker_id + 1) % comm_spec.worker_num(); + } + }); + std::thread response_thread = std::thread([&]() { + int dst_worker_id = (comm_spec.worker_id() + comm_spec.worker_num() - 1) % + comm_spec.worker_num(); + while (dst_worker_id != comm_spec.worker_id()) { + for (fid_t fid = 0; fid < fnum; ++fid) { + if (comm_spec.FragToWorker(fid) != comm_spec.worker_id()) { + continue; + } + std::vector lid_list; + sync_comm::Recv(lid_list, dst_worker_id, 0, comm_spec.comm()); + std::vector oid_list; + for (auto lid : lid_list) { + OID_T oid{}; + if (!GetOid(fid, lid, oid)) { + LOG(ERROR) << "Cannot find oid for lid " << lid; + } + oid_list.push_back(oid); + } + sync_comm::Send(oid_list, dst_worker_id, 1, comm_spec.comm()); + } + dst_worker_id = (dst_worker_id + comm_spec.worker_num() - 1) % + comm_spec.worker_num(); + } + }); + response_thread.join(); + request_thread.join(); + MPI_Barrier(comm_spec.comm()); + } + for (fid_t fid = 0; fid < fnum; ++fid) { + for (size_t i = 0; i < unresolved_lids[fid].size(); ++i) { + OID_T oid = unresolved_oids[fid][i]; 
+ const auto& pair = unresolved_vertices[fid][i]; + oid_lists[pair.first][pair.second] = oid; + } + } + + std::unique_ptr> new_partitioner( + new MapPartitioner(fnum_)); + new_partitioner->Init(oid_lists); + + VertexMapBuilder builder( + comm_spec.fid(), comm_spec.fnum(), std::move(new_partitioner), true, + idxers_[0]->type() == IdxerType::kPTHashIdxer); + for (auto& oid : oid_lists[comm_spec.fid()]) { + internal_oid_t internal_oid(oid); + builder.add_vertex(internal_oid); + } + + builder.finish(comm_spec, *this); +} + +} // namespace grape + +#endif // GRAPE_VERTEX_MAP_VERTEX_MAP_H_ diff --git a/grape/vertex_map/vertex_map_base.h b/grape/vertex_map/vertex_map_base.h deleted file mode 100644 index 3ba30bcb..00000000 --- a/grape/vertex_map/vertex_map_base.h +++ /dev/null @@ -1,147 +0,0 @@ -/** Copyright 2020 Alibaba Group Holding Limited. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -#ifndef GRAPE_VERTEX_MAP_VERTEX_MAP_BASE_H_ -#define GRAPE_VERTEX_MAP_VERTEX_MAP_BASE_H_ - -#include -#include - -#include "grape/config.h" -#include "grape/fragment/id_parser.h" -#include "grape/serialization/in_archive.h" -#include "grape/serialization/out_archive.h" -#include "grape/worker/comm_spec.h" - -namespace grape { - -/** - * @brief VertexMapBase manages some mapping about vertices. 
- * - * a manages: - * - * 1) which fragment a vertex resides in as a inner_vertex, for edge-cut - * distributed graphs; - * - * 2) which fragment a vertex resides in as a master_vertex, - * for vertex-cut distributed graphs; - * - * 3) the mapping from ids. There are 3 kinds of vertex ids in grape. - * - * - original_id (a.k.a., OID), is provided by the origin dataset, it may be - * not continoues, or even strings. - * - * - local_id (a.k.a., LID), is allocated WITHIN a fragment, it is continoues - * and increased from 1. - * - * - global_id (a.k.a., GID), is unique in the distributed graph and works as - * the identifier of a vertex in libgrape-lite. It consists of two parts and - * formatted as fid|local_id. - * - * @note: The pure virtual functions in the class work as interfaces, - * instructing sub-classes to implement. The override functions in the derived - * classes would be invoked directly, not via virtual functions. - * - * @tparam OID_T - * @tparam VID_T - */ -template -class VertexMapBase { - public: - using partitioner_t = PARTITIONER_T; - using oid_t = OID_T; - using vid_t = VID_T; - explicit VertexMapBase(const CommSpec& comm_spec) - : comm_spec_(comm_spec), partitioner_() { - comm_spec_.Dup(); - id_parser_.init(comm_spec_.fnum()); - } - virtual ~VertexMapBase() = default; - - void SetPartitioner(const PARTITIONER_T& partitioner) { - partitioner_ = partitioner; - } - - void SetPartitioner(PARTITIONER_T&& partitioner) { - partitioner_ = std::move(partitioner); - } - - fid_t GetFragmentNum() const { return comm_spec_.fnum(); } - - VID_T Lid2Gid(fid_t fid, const VID_T& lid) const { - return id_parser_.generate_global_id(fid, lid); - } - - fid_t GetFidFromGid(const VID_T& gid) const { - return id_parser_.get_fragment_id(gid); - } - - VID_T GetLidFromGid(const VID_T& gid) const { - return id_parser_.get_local_id(gid); - } - - VID_T MaxVertexNum() const { return id_parser_.max_local_id(); } - - const CommSpec& GetCommSpec() const { return comm_spec_; } - 
- template - void serialize(std::unique_ptr& writer) { - partitioner_.template serialize(writer); - } - - template - void deserialize(std::unique_ptr& reader) { - id_parser_.init(comm_spec_.fnum()); - partitioner_.template deserialize(reader); - } - - fid_t GetFragmentId(const OID_T& oid) const { - return partitioner_.GetPartitionId(oid); - } - - const PARTITIONER_T& GetPartitioner() const { return partitioner_; } - - PARTITIONER_T& GetPartitioner() { return partitioner_; } - - protected: - CommSpec comm_spec_; - PARTITIONER_T partitioner_; - IdParser id_parser_; - - public: - // get metadata of the graph. - virtual size_t GetTotalVertexSize() const = 0; - virtual size_t GetInnerVertexSize(fid_t fid) const = 0; - - // for constructing the vertexmap. - virtual void AddVertex(const OID_T& oid) = 0; - virtual bool AddVertex(const OID_T& oid, VID_T& gid) = 0; - - virtual void UpdateToBalance(std::vector& vnum_list, - std::vector>& gid_maps) = 0; - - // convert the vertex ids with the help of mappings. 
- virtual bool GetOid(const VID_T& gid, OID_T& oid) const = 0; - - virtual bool GetOid(fid_t fid, const VID_T& lid, OID_T& oid) const = 0; - - virtual bool GetGid(fid_t fid, const OID_T& oid, VID_T& gid) const = 0; - - virtual bool GetGid(const OID_T& oid, VID_T& gid) const = 0; -}; - -} // namespace grape - -#endif // GRAPE_VERTEX_MAP_VERTEX_MAP_BASE_H_ diff --git a/misc/app_tests.sh b/misc/app_tests.sh index 4e27305f..f9d1bbc9 100755 --- a/misc/app_tests.sh +++ b/misc/app_tests.sh @@ -99,7 +99,7 @@ function BasicTests() { RunApp ${np} cdlp_auto --cdlp_mr=10 ExactVerify ${GRAPE_HOME}/dataset/${GRAPH}-CDLP - RunApp ${np} lcc --serialize=true --serialization_prefix=./serial/${GRAPH} + RunApp ${np} lcc --deserialize=true --serialization_prefix=./serial/${GRAPH} ExactVerify ${GRAPE_HOME}/dataset/${GRAPH}-LCC RunApp ${np} lcc_auto --deserialize=true --serialization_prefix=./serial/${GRAPH} @@ -163,10 +163,20 @@ function MutableFragmentTests() { WCCVerify ${GRAPE_HOME}/dataset/${GRAPH}-WCC } -function VertexMapTest() { +function LoadTest() { NP=$1; shift - cmd="mpirun -n ${NP} ./vertex_map_tests --vfile ${GRAPE_HOME}/dataset/${GRAPH}.v --efile ${GRAPE_HOME}/dataset/${GRAPH}.e --out_prefix ./extra_tests_output --sssp_source=6 $@" + cmd="mpirun -n ${NP} ./load_tests --vfile ${GRAPE_HOME}/dataset/${GRAPH}.v --efile ${GRAPE_HOME}/dataset/${GRAPH}.e --out_prefix ./extra_tests_output --sssp_source=6 $@" + + echo ${cmd} + eval ${cmd} +} + +function VertexMapTest() { + NP=$1; + + rm -rf ./vm_serial + cmd="mpirun -n ${NP} ./vertex_map_tests --vfile ${GRAPE_HOME}/dataset/${GRAPH}.v --efile ${GRAPE_HOME}/dataset/${GRAPH}.e --mutable_efile_base ${GRAPE_HOME}/dataset/${GRAPH}.e.mutable_base --mutable_efile_delta ${GRAPE_HOME}/dataset/${GRAPH}.e.mutable_delta --serialization_prefix=./vm_serial/${GRAPH}" echo ${cmd} eval ${cmd} @@ -181,38 +191,32 @@ function VertexMapTestOnMutableFragment() { eval ${cmd} } -function VertexMapTests() { +function LoadTests() { np=$1; shift - 
VertexMapTest ${np} --string_id + LoadTest ${np} --loader_type basic ExactVerify ${GRAPE_HOME}/dataset/${GRAPH}-SSSP - - VertexMapTest ${np} --nosegmented_partition + LoadTest ${np} --loader_type rb ExactVerify ${GRAPE_HOME}/dataset/${GRAPH}-SSSP - - VertexMapTest ${np} --string_id --nosegmented_partition + LoadTest ${np} --loader_type efile ExactVerify ${GRAPE_HOME}/dataset/${GRAPH}-SSSP - - VertexMapTest ${np} --noglobal_vertex_map + LoadTest ${np} --loader_type local ExactVerify ${GRAPE_HOME}/dataset/${GRAPH}-SSSP - VertexMapTest ${np} --string_id --noglobal_vertex_map + LoadTest ${np} --loader_type basic --string_id ExactVerify ${GRAPE_HOME}/dataset/${GRAPH}-SSSP - - VertexMapTest ${np} --nosegmented_partition --noglobal_vertex_map + LoadTest ${np} --loader_type rb --string_id ExactVerify ${GRAPE_HOME}/dataset/${GRAPH}-SSSP - - VertexMapTest ${np} --string_id --nosegmented_partition --noglobal_vertex_map + LoadTest ${np} --loader_type efile --string_id ExactVerify ${GRAPE_HOME}/dataset/${GRAPH}-SSSP - - VertexMapTestOnMutableFragment ${np} --string_id + LoadTest ${np} --loader_type local --string_id ExactVerify ${GRAPE_HOME}/dataset/${GRAPH}-SSSP +} - VertexMapTestOnMutableFragment ${np} --nosegmented_partition - ExactVerify ${GRAPE_HOME}/dataset/${GRAPH}-SSSP +function VertexMapTests() { + np=$1; shift - VertexMapTestOnMutableFragment ${np} --string_id --nosegmented_partition - ExactVerify ${GRAPE_HOME}/dataset/${GRAPH}-SSSP + VertexMapTest ${np} } pushd ${GRAPE_HOME}/build @@ -230,6 +234,7 @@ for np in ${proc_list}; do BasicTests ${np} MutableFragmentTests ${np} VertexMapTests ${np} + LoadTests ${np} done popd diff --git a/misc/cuda_app_tests.sh b/misc/cuda_app_tests.sh index 799a3545..1ef8a383 100755 --- a/misc/cuda_app_tests.sh +++ b/misc/cuda_app_tests.sh @@ -61,7 +61,7 @@ function RunAppWithELoader() { NP=$1; shift APP=$1; shift - cmd="mpirun -n ${NP} ./run_cuda_app --efile ${GRAPE_HOME}/dataset/${GRAPH}.e --application ${APP} --out_prefix 
./extra_tests_output --nosegmented_partition $@" + cmd="mpirun -n ${NP} ./run_cuda_app --efile ${GRAPE_HOME}/dataset/${GRAPH}.e --application ${APP} --out_prefix ./extra_tests_output --partitioner_type=hash $@" echo ${cmd} eval ${cmd} } @@ -117,16 +117,16 @@ for np in ${proc_list}; do RunApp ${np} wcc -lb=${lb} ${SER} --serialization_prefix=./serial/${GRAPH}_wcc WCCVerify ${GRAPE_HOME}/dataset/${GRAPH}-WCC - RunApp ${np} wcc_opt -segmented_partition=true -rebalance=true -lb=${lb} ${SER} --serialization_prefix=./serial/${GRAPH}_wcc_opt_tt + RunApp ${np} wcc_opt -rebalance=true -lb=${lb} ${SER} --serialization_prefix=./serial/${GRAPH}_wcc_opt_tt WCCVerify ${GRAPE_HOME}/dataset/${GRAPH}-WCC - RunApp ${np} wcc_opt -segmented_partition=true -rebalance=false -lb=${lb} ${SER} --serialization_prefix=./serial/${GRAPH}_wcc_opt_tf + RunApp ${np} wcc_opt -rebalance=false -lb=${lb} ${SER} --serialization_prefix=./serial/${GRAPH}_wcc_opt_tf WCCVerify ${GRAPE_HOME}/dataset/${GRAPH}-WCC - RunApp ${np} wcc_opt -segmented_partition=false -rebalance=true -lb=${lb} ${SER} --serialization_prefix=./serial/${GRAPH}_wcc_opt_ft + RunApp ${np} wcc_opt --partitioner_type=hash -rebalance=true -lb=${lb} ${SER} --serialization_prefix=./serial/${GRAPH}_wcc_opt_ft WCCVerify ${GRAPE_HOME}/dataset/${GRAPH}-WCC - RunApp ${np} wcc_opt -segmented_partition=false -rebalance=false -lb=${lb} ${SER} --serialization_prefix=./serial/${GRAPH}_wcc_opt_ff + RunApp ${np} wcc_opt --partitioner_type=hash -rebalance=false -lb=${lb} ${SER} --serialization_prefix=./serial/${GRAPH}_wcc_opt_ff WCCVerify ${GRAPE_HOME}/dataset/${GRAPH}-WCC done done diff --git a/misc/load_tests.cc b/misc/load_tests.cc new file mode 100644 index 00000000..f69dc068 --- /dev/null +++ b/misc/load_tests.cc @@ -0,0 +1,178 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "sssp/sssp.h" + +#ifndef __AFFINITY__ +#define __AFFINITY__ false +#endif + +DEFINE_string(efile, "", "edge file"); +DEFINE_string(vfile, "", "vertex file"); +DEFINE_string(out_prefix, "", "output directory of results"); +DEFINE_int64(sssp_source, 0, "source vertex of sssp."); +DEFINE_string(loader_type, "basic", "loader type: basic, rb, efile or local"); +DEFINE_bool(string_id, false, "whether to use string as origin id"); + +void Init() { + if (FLAGS_out_prefix.empty()) { + LOG(FATAL) << "Please assign an output prefix."; + } + if (FLAGS_efile.empty()) { + LOG(FATAL) << "Please assign input edge files."; + } + if (access(FLAGS_out_prefix.c_str(), 0) != 0) { + mkdir(FLAGS_out_prefix.c_str(), 0777); + } + + grape::InitMPIComm(); + grape::CommSpec comm_spec; + comm_spec.Init(MPI_COMM_WORLD); + if (comm_spec.worker_id() == grape::kCoordinatorRank) { + VLOG(1) << "Workers of libgrape-lite initialized."; + } +} + +void Finalize() { + grape::FinalizeMPIComm(); + VLOG(1) << "Workers finalized."; +} + +template +void DoQuery(std::shared_ptr fragment, std::shared_ptr app, + const grape::CommSpec& comm_spec, + const grape::ParallelEngineSpec& spec, + const std::string& out_prefix, Args... 
args) { + auto worker = APP_T::CreateWorker(app, fragment); + worker->Init(comm_spec, spec); + worker->Query(std::forward(args)...); + + std::ofstream ostream; + std::string output_path = + grape::GetResultFilename(out_prefix, fragment->fid()); + ostream.open(output_path); + worker->Output(ostream); + ostream.close(); + worker->Finalize(); +} + +template +struct ParamConverter {}; + +template <> +struct ParamConverter { + static int64_t FromInt64(int64_t val) { return val; } +}; + +template <> +struct ParamConverter { + static std::string FromInt64(int64_t val) { return std::to_string(val); } +}; + +template class APP_T, + typename... Args> +void CreateAndQuery(const grape::CommSpec& comm_spec, + const grape::LoadGraphSpec& graph_spec, + const std::string& out_prefix, + const grape::ParallelEngineSpec& spec, Args... args) { + using FRAG_T = grape::ImmutableEdgecutFragment; + std::shared_ptr fragment = + grape::LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); + using AppType = APP_T; + auto app = std::make_shared(); + DoQuery(fragment, app, comm_spec, spec, out_prefix, + args...); +} + +int main(int argc, char* argv[]) { + FLAGS_stderrthreshold = 0; + + grape::gflags::SetUsageMessage( + "Usage: mpiexec [mpi_opts] ./run_app [grape_opts]"); + if (argc == 1) { + grape::gflags::ShowUsageWithFlagsRestrict(argv[0], "analytical_apps"); + exit(1); + } + grape::gflags::ParseCommandLineFlags(&argc, &argv, true); + grape::gflags::ShutDownCommandLineFlags(); + + google::InitGoogleLogging("analytical_apps"); + google::InstallFailureSignalHandler(); + + Init(); + + { + grape::CommSpec comm_spec; + comm_spec.Init(MPI_COMM_WORLD); + grape::LoadGraphSpec graph_spec = grape::DefaultLoadGraphSpec(); + graph_spec.set_directed(false); + if (FLAGS_loader_type == "rb") { + graph_spec.set_rebalance(true, 0); + graph_spec.partitioner_type = grape::PartitionerType::kMapPartitioner; + // idxer_type = kMapIdxer; + } else if (FLAGS_loader_type == "efile") { + FLAGS_vfile = ""; + 
graph_spec.set_rebalance(false, 0); + graph_spec.partitioner_type = grape::PartitionerType::kHashPartitioner; + // idxer_type = kMapIdxer; + } else if (FLAGS_loader_type == "local") { + graph_spec.set_rebalance(false, 0); + graph_spec.partitioner_type = grape::PartitionerType::kHashPartitioner; + graph_spec.idxer_type = grape::IdxerType::kLocalIdxer; + } else { + CHECK_EQ(FLAGS_loader_type, "basic"); + graph_spec.set_rebalance(false, 0); + + // partitioner_type = kMapPartitioner; + // idxer_type = kMapIdxer; + } + if (FLAGS_string_id) { + CreateAndQuery( + comm_spec, graph_spec, FLAGS_out_prefix, + grape::DefaultParallelEngineSpec(), + ParamConverter::FromInt64(FLAGS_sssp_source)); + } else { + CreateAndQuery( + comm_spec, graph_spec, FLAGS_out_prefix, + grape::DefaultParallelEngineSpec(), + ParamConverter::FromInt64(FLAGS_sssp_source)); + } + } + Finalize(); + + google::ShutdownGoogleLogging(); +} diff --git a/misc/mutable_fragment_tests.cc b/misc/mutable_fragment_tests.cc new file mode 100644 index 00000000..a018845f --- /dev/null +++ b/misc/mutable_fragment_tests.cc @@ -0,0 +1,271 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include "bfs/bfs.h" +#include "bfs/bfs_auto.h" +#include "cdlp/cdlp.h" +#include "cdlp/cdlp_auto.h" +#include "flags.h" +#include "lcc/lcc.h" +#include "lcc/lcc_auto.h" +#include "pagerank/pagerank.h" +#include "pagerank/pagerank_auto.h" +#include "pagerank/pagerank_local.h" +#include "pagerank/pagerank_local_parallel.h" +#include "pagerank/pagerank_parallel.h" +#include "sssp/sssp.h" +#include "sssp/sssp_auto.h" +#include "timer.h" +#include "wcc/wcc.h" +#include "wcc/wcc_auto.h" + +#ifndef __AFFINITY__ +#define __AFFINITY__ false +#endif + +DEFINE_string(efile, "", "edge file"); +DEFINE_string(vfile, "", "vertex file"); +DEFINE_string(delta_efile, "", "delta edge file"); +DEFINE_string(delta_vfile, "", "delta vertex file"); +DEFINE_string(out_prefix, "", "output directory of results"); +DEFINE_int64(bfs_source, 0, "source vertex of bfs."); +DEFINE_int32(cdlp_mr, 10, "max rounds of cdlp."); +DEFINE_int64(sssp_source, 0, "source vertex of sssp."); +DEFINE_double(pr_d, 0.85, "damping_factor of pagerank"); +DEFINE_int32(pr_mr, 10, "max rounds of pagerank"); +DEFINE_bool(directed, false, "input graph is directed or not."); +DEFINE_string(application, "", "application name"); + +void Init() { + if (FLAGS_out_prefix.empty()) { + LOG(FATAL) << "Please assign an output prefix."; + } + if (FLAGS_efile.empty()) { + LOG(FATAL) << "Please assign input edge files."; + } + if (access(FLAGS_out_prefix.c_str(), 0) != 0) { + mkdir(FLAGS_out_prefix.c_str(), 0777); + } + + grape::InitMPIComm(); + grape::CommSpec comm_spec; + comm_spec.Init(MPI_COMM_WORLD); + if (comm_spec.worker_id() == grape::kCoordinatorRank) { + VLOG(1) << "Workers of libgrape-lite initialized."; + } +} + +void Finalize() { + grape::FinalizeMPIComm(); + VLOG(1) << "Workers finalized."; +} + +template +void DoQuery(std::shared_ptr fragment, std::shared_ptr app, + const grape::CommSpec& 
comm_spec, + const grape::ParallelEngineSpec& spec, + const std::string& out_prefix, Args... args) { + timer_next("load application"); + auto worker = APP_T::CreateWorker(app, fragment); + worker->Init(comm_spec, spec); + timer_next("run algorithm"); + worker->Query(std::forward(args)...); + timer_next("print output"); + + std::ofstream ostream; + std::string output_path = + grape::GetResultFilename(out_prefix, fragment->fid()); + ostream.open(output_path); + worker->Output(ostream); + ostream.close(); + worker->Finalize(); + timer_end(); + VLOG(1) << "Worker-" << comm_spec.worker_id() << " finished: " << output_path; +} + +template +struct ParamConverter {}; + +template <> +struct ParamConverter { + static int64_t FromInt64(int64_t val) { return val; } +}; + +template <> +struct ParamConverter { + static std::string FromInt64(int64_t val) { return std::to_string(val); } +}; + +template class APP_T, + typename... Args> +void CreateAndQuery(const grape::CommSpec& comm_spec, + const std::string& out_prefix, int fnum, + const grape::ParallelEngineSpec& spec, Args... args) { + timer_next("load graph"); + grape::LoadGraphSpec graph_spec = grape::DefaultLoadGraphSpec(); + graph_spec.set_directed(FLAGS_directed); + graph_spec.set_rebalance(false, 0); + graph_spec.idxer_type = grape::IdxerType::kHashMapIdxer; + using FRAG_T = grape::MutableEdgecutFragment; + std::shared_ptr fragment = grape::LoadGraphAndMutate( + FLAGS_efile, FLAGS_vfile, FLAGS_delta_efile, FLAGS_delta_vfile, comm_spec, + graph_spec); + using AppType = APP_T; + auto app = std::make_shared(); + DoQuery(fragment, app, comm_spec, spec, out_prefix, + args...); +} + +template +void Run() { + grape::CommSpec comm_spec; + comm_spec.Init(MPI_COMM_WORLD); + + bool is_coordinator = comm_spec.worker_id() == grape::kCoordinatorRank; + timer_start(is_coordinator); + + // FIXME: no barrier apps. more manager? or use a dynamic-cast. 
+ std::string efile = FLAGS_efile; + std::string vfile = FLAGS_vfile; + std::string delta_efile = FLAGS_delta_efile; + std::string delta_vfile = FLAGS_delta_vfile; + std::string out_prefix = FLAGS_out_prefix; + auto spec = grape::MultiProcessSpec(comm_spec, __AFFINITY__); + int fnum = comm_spec.fnum(); + std::string name = FLAGS_application; + if (name.find("sssp") != std::string::npos) { + if (name == "sssp") { + CreateAndQuery( + comm_spec, out_prefix, fnum, spec, + ParamConverter::FromInt64(FLAGS_sssp_source)); + } else if (name == "sssp_auto") { + CreateAndQuery( + comm_spec, out_prefix, fnum, spec, + ParamConverter::FromInt64(FLAGS_sssp_source)); + } else { + LOG(FATAL) << "No avaiable application named [" << name << "]."; + } + } else { + if (name == "bfs") { + CreateAndQuery( + comm_spec, out_prefix, fnum, spec, + ParamConverter::FromInt64(FLAGS_bfs_source)); + } else if (name == "bfs_auto") { + CreateAndQuery( + comm_spec, out_prefix, fnum, spec, + ParamConverter::FromInt64(FLAGS_bfs_source)); + } else if (name == "pagerank_local") { + CreateAndQuery(comm_spec, out_prefix, fnum, spec, FLAGS_pr_d, + FLAGS_pr_mr); + } else if (name == "pagerank_local_parallel") { + CreateAndQuery( + comm_spec, out_prefix, fnum, spec, FLAGS_pr_d, FLAGS_pr_mr); + } else if (name == "pagerank") { + CreateAndQuery(comm_spec, out_prefix, fnum, spec, FLAGS_pr_d, + FLAGS_pr_mr); + } else if (name == "pagerank_auto") { + CreateAndQuery(comm_spec, out_prefix, fnum, spec, FLAGS_pr_d, + FLAGS_pr_mr); + } else if (name == "pagerank_parallel") { + CreateAndQuery(comm_spec, out_prefix, fnum, spec, FLAGS_pr_d, + FLAGS_pr_mr); + } else if (name == "cdlp") { + CreateAndQuery( + comm_spec, out_prefix, fnum, spec, FLAGS_cdlp_mr); + } else if (name == "cdlp_auto") { + CreateAndQuery( + comm_spec, out_prefix, fnum, spec, FLAGS_cdlp_mr); + } else if (name == "wcc") { + CreateAndQuery( + comm_spec, out_prefix, fnum, spec); + } else if (name == "wcc_auto") { + CreateAndQuery( + comm_spec, 
out_prefix, fnum, spec); + } else if (name == "lcc") { + CreateAndQuery( + comm_spec, out_prefix, fnum, spec); + } else if (name == "lcc_auto") { + CreateAndQuery( + comm_spec, out_prefix, fnum, spec); + } else { + LOG(FATAL) << "No avaiable application named [" << name << "]."; + } + } +} + +int main(int argc, char* argv[]) { + FLAGS_stderrthreshold = 0; + + grape::gflags::SetUsageMessage( + "Usage: mpiexec [mpi_opts] ./run_app [grape_opts]"); + if (argc == 1) { + grape::gflags::ShowUsageWithFlagsRestrict(argv[0], "analytical_apps"); + exit(1); + } + grape::gflags::ParseCommandLineFlags(&argc, &argv, true); + grape::gflags::ShutDownCommandLineFlags(); + + google::InitGoogleLogging("analytical_apps"); + google::InstallFailureSignalHandler(); + + Init(); + + std::string name = FLAGS_application; + if (name.find("sssp") != std::string::npos) { + Run(); + } else { + Run(); + } + + Finalize(); + + google::ShutdownGoogleLogging(); +} diff --git a/misc/vertex_map_tests.cc b/misc/vertex_map_tests.cc new file mode 100644 index 00000000..2977c442 --- /dev/null +++ b/misc/vertex_map_tests.cc @@ -0,0 +1,330 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#ifndef __AFFINITY__ +#define __AFFINITY__ false +#endif + +DEFINE_string(efile, "", "edge file"); +DEFINE_string(vfile, "", "vertex file"); +DEFINE_string(mutable_efile_base, "", "base of mutable edge file"); +DEFINE_string(mutable_efile_delta, "", "delta of mutable edge file"); +DEFINE_string(serialization_prefix, "", + "directory to place serialization files"); + +void Init() { + if (FLAGS_efile.empty()) { + LOG(FATAL) << "Please assign input edge files."; + } + if (access(FLAGS_serialization_prefix.c_str(), 0) != 0) { + mkdir(FLAGS_serialization_prefix.c_str(), 0777); + } + + grape::InitMPIComm(); + grape::CommSpec comm_spec; + comm_spec.Init(MPI_COMM_WORLD); + if (comm_spec.worker_id() == grape::kCoordinatorRank) { + VLOG(1) << "Workers of libgrape-lite initialized."; + } +} + +void Finalize() { + grape::FinalizeMPIComm(); + VLOG(1) << "Workers finalized."; +} + +template +bool verify_vertex_map(const grape::CommSpec& comm_spec, + const grape::VertexMap& vertex_map) { + grape::fid_t fnum = comm_spec.fnum(); + std::vector>> all_maps_g2o(fnum); + std::vector>> all_maps_o2g(fnum); + bool ret = true; + for (grape::fid_t fid = 0; fid != fnum; ++fid) { + VID_T frag_vnum = vertex_map.GetInnerVertexSize(fid); + for (VID_T lid = 0; lid < frag_vnum; ++lid) { + OID_T oid_a, oid_b; + if (vertex_map.GetOid(fid, lid, oid_a)) { + VID_T gid_a{}, gid_b{}; + if (!vertex_map.GetGid(fid, oid_a, gid_a)) { + LOG(ERROR) << "Vertex " << oid_a << " not found by fid+oid in vertex " + << "map."; + ret = false; + continue; + } + if (!vertex_map.GetGid(oid_a, gid_b)) { + LOG(ERROR) << "Vertex " << oid_a << " not found by oid in vertex " + << "map."; + ret = false; + continue; + } + if (gid_a != gid_b) { + LOG(ERROR) << "Vertex " << oid_a << " gid not consistent."; + ret = false; + continue; + } + if (!vertex_map.GetOid(gid_a, 
oid_b)) { + LOG(ERROR) << "Vertex " << gid_a << " not found by gid in vertex " + << "map."; + ret = false; + continue; + } + if (oid_a != oid_b) { + LOG(ERROR) << "Vertex " << gid_a << " oid not consistent."; + ret = false; + continue; + } + all_maps_g2o[gid_a % fnum].emplace_back(gid_a, oid_a); + all_maps_o2g[std::hash{}(oid_a) % fnum].emplace_back(oid_a, + gid_a); + } + } + } + + { + std::vector>> all_maps_g2o_in(fnum); + grape::sync_comm::AllToAll(all_maps_g2o, all_maps_g2o_in, comm_spec.comm()); + + std::vector> all_maps_merged; + for (auto& maps : all_maps_g2o_in) { + all_maps_merged.insert(all_maps_merged.end(), maps.begin(), maps.end()); + } + + std::sort(all_maps_merged.begin(), all_maps_merged.end()); + for (size_t i = 1; i < all_maps_merged.size(); ++i) { + if (all_maps_merged[i].first == all_maps_merged[i - 1].first) { + if (all_maps_merged[i].second != all_maps_merged[i - 1].second) { + LOG(ERROR) << "Vertex " << all_maps_merged[i].first + << " has different oid in different fragments."; + ret = false; + } + } + } + } + + { + std::vector>> all_maps_o2g_in(fnum); + grape::sync_comm::AllToAll(all_maps_o2g, all_maps_o2g_in, comm_spec.comm()); + + std::vector> all_maps_merged; + for (auto& maps : all_maps_o2g_in) { + all_maps_merged.insert(all_maps_merged.end(), maps.begin(), maps.end()); + } + + std::sort(all_maps_merged.begin(), all_maps_merged.end()); + for (size_t i = 1; i < all_maps_merged.size(); ++i) { + if (all_maps_merged[i].first == all_maps_merged[i - 1].first) { + if (all_maps_merged[i].second != all_maps_merged[i - 1].second) { + LOG(ERROR) << "Vertex " << all_maps_merged[i].first + << " has different gid in different fragments."; + ret = false; + } + } + } + } + + return ret; +} + +template +bool verify_fragment_vertex_map(const FRAG_T& frag, + const VERTEX_MAP_T& vertex_map) { + auto inner_vertices = frag.InnerVertices(); + auto outer_vertices = frag.OuterVertices(); + using vid_t = typename FRAG_T::vid_t; + using oid_t = typename 
FRAG_T::oid_t; + for (auto v : inner_vertices) { + vid_t gid = frag.GetInnerVertexGid(v); + oid_t oid; + if (!vertex_map.GetOid(gid, oid)) { + LOG(ERROR) << "Vertex " << gid << " not found in vertex map."; + return false; + } + } + for (auto v : outer_vertices) { + vid_t gid = frag.GetOuterVertexGid(v); + oid_t oid; + if (!vertex_map.GetOid(gid, oid)) { + LOG(ERROR) << "Vertex " << gid << " not found in vertex map."; + return false; + } + } + return true; +} + +template +void test_build_vertex_map(const std::string& efile, const std::string& vfile, + const grape::LoadGraphSpec& graph_spec, + const grape::CommSpec& comm_spec) { + using FRAG_T = + grape::ImmutableEdgecutFragment; + std::shared_ptr fragment = + grape::LoadGraph(efile, vfile, comm_spec, graph_spec); + + verify_fragment_vertex_map(*fragment, fragment->GetVertexMap()); + verify_vertex_map(comm_spec, fragment->GetVertexMap()); +} + +template +void test_mutate_vertex_map(const std::string& efile_base, + const std::string& vfile, + const std::string& efile_delta, + const grape::LoadGraphSpec& graph_spec, + const grape::CommSpec& comm_spec) { + using FRAG_T = grape::MutableEdgecutFragment; + std::shared_ptr fragment = grape::LoadGraphAndMutate( + efile_base, vfile, efile_delta, "", comm_spec, graph_spec); + + verify_fragment_vertex_map(*fragment, fragment->GetVertexMap()); + verify_vertex_map(comm_spec, fragment->GetVertexMap()); +} + +int main(int argc, char* argv[]) { + FLAGS_stderrthreshold = 0; + grape::gflags::SetUsageMessage( + "Usage: mpiexec [mpi_opts] ./run_app [grape_opts]"); + if (argc == 1) { + grape::gflags::ShowUsageWithFlagsRestrict(argv[0], "vertex_map_tests"); + exit(1); + } + grape::gflags::ParseCommandLineFlags(&argc, &argv, true); + grape::gflags::ShutDownCommandLineFlags(); + + google::InitGoogleLogging("vertex_map_tests"); + google::InstallFailureSignalHandler(); + + Init(); + + std::vector string_id_options({false, true}); + std::vector rebalance_options({false, true}); + std::vector 
partitioner_options( + {grape::PartitionerType::kHashPartitioner, + grape::PartitionerType::kMapPartitioner, + grape::PartitionerType::kSegmentedPartitioner}); + std::vector idxer_options( + {grape::IdxerType::kHashMapIdxer, grape::IdxerType::kHashMapIdxerView, + grape::IdxerType::kPTHashIdxer, grape::IdxerType::kSortedArrayIdxer, + grape::IdxerType::kLocalIdxer}); + + { + grape::CommSpec comm_spec; + comm_spec.Init(MPI_COMM_WORLD); + int idx = 0; + for (auto string_id : string_id_options) { + for (auto rebalance : rebalance_options) { + for (auto partitioner_type : partitioner_options) { + for (auto idxer_type : idxer_options) { + if (rebalance) { + if (partitioner_type == + grape::PartitionerType::kHashPartitioner) { + continue; + } + } + if (idxer_type == grape::IdxerType::kLocalIdxer) { + if (partitioner_type != + grape::PartitionerType::kHashPartitioner) { + continue; + } + } + bool vm_extendable = + (idxer_type == grape::IdxerType::kHashMapIdxer || + idxer_type == grape::IdxerType::kLocalIdxer); + VLOG(2) << "Test " << idx++ << ": string_id=" << string_id + << ", rebalance=" << rebalance + << ", partitioner_type=" << partitioner_type + << ", idxer_type=" << idxer_type; + grape::LoadGraphSpec graph_spec = grape::DefaultLoadGraphSpec(); + graph_spec.set_directed(false); + if (rebalance) { + graph_spec.set_rebalance(true, 0); + } else { + graph_spec.set_rebalance(false, 0); + } + graph_spec.partitioner_type = partitioner_type; + graph_spec.idxer_type = idxer_type; + + graph_spec.set_serialize(true, FLAGS_serialization_prefix); + if (string_id) { + test_build_vertex_map(FLAGS_efile, FLAGS_vfile, + graph_spec, comm_spec); + if (vm_extendable) { + test_mutate_vertex_map( + FLAGS_mutable_efile_base, FLAGS_vfile, + FLAGS_mutable_efile_delta, graph_spec, comm_spec); + } + } else { + test_build_vertex_map(FLAGS_efile, FLAGS_vfile, + graph_spec, comm_spec); + if (vm_extendable) { + test_mutate_vertex_map( + FLAGS_mutable_efile_base, FLAGS_vfile, + 
FLAGS_mutable_efile_delta, graph_spec, comm_spec); + } + } + + graph_spec.set_deserialize(true, FLAGS_serialization_prefix); + if (string_id) { + test_build_vertex_map(FLAGS_efile, FLAGS_vfile, + graph_spec, comm_spec); + if (vm_extendable) { + test_mutate_vertex_map( + FLAGS_mutable_efile_base, FLAGS_vfile, + FLAGS_mutable_efile_delta, graph_spec, comm_spec); + } + } else { + test_build_vertex_map(FLAGS_efile, FLAGS_vfile, + graph_spec, comm_spec); + if (vm_extendable) { + test_mutate_vertex_map( + FLAGS_mutable_efile_base, FLAGS_vfile, + FLAGS_mutable_efile_delta, graph_spec, comm_spec); + } + } + } + } + } + } + } + + Finalize(); + + google::ShutdownGoogleLogging(); +} diff --git a/tests/load_tests.cc b/tests/load_tests.cc new file mode 100644 index 00000000..f69dc068 --- /dev/null +++ b/tests/load_tests.cc @@ -0,0 +1,178 @@ +/** Copyright 2020 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "sssp/sssp.h" + +#ifndef __AFFINITY__ +#define __AFFINITY__ false +#endif + +DEFINE_string(efile, "", "edge file"); +DEFINE_string(vfile, "", "vertex file"); +DEFINE_string(out_prefix, "", "output directory of results"); +DEFINE_int64(sssp_source, 0, "source vertex of sssp."); +DEFINE_string(loader_type, "basic", "loader type: basic, rb, efile or local"); +DEFINE_bool(string_id, false, "whether to use string as origin id"); + +void Init() { + if (FLAGS_out_prefix.empty()) { + LOG(FATAL) << "Please assign an output prefix."; + } + if (FLAGS_efile.empty()) { + LOG(FATAL) << "Please assign input edge files."; + } + if (access(FLAGS_out_prefix.c_str(), 0) != 0) { + mkdir(FLAGS_out_prefix.c_str(), 0777); + } + + grape::InitMPIComm(); + grape::CommSpec comm_spec; + comm_spec.Init(MPI_COMM_WORLD); + if (comm_spec.worker_id() == grape::kCoordinatorRank) { + VLOG(1) << "Workers of libgrape-lite initialized."; + } +} + +void Finalize() { + grape::FinalizeMPIComm(); + VLOG(1) << "Workers finalized."; +} + +template +void DoQuery(std::shared_ptr fragment, std::shared_ptr app, + const grape::CommSpec& comm_spec, + const grape::ParallelEngineSpec& spec, + const std::string& out_prefix, Args... 
args) { + auto worker = APP_T::CreateWorker(app, fragment); + worker->Init(comm_spec, spec); + worker->Query(std::forward(args)...); + + std::ofstream ostream; + std::string output_path = + grape::GetResultFilename(out_prefix, fragment->fid()); + ostream.open(output_path); + worker->Output(ostream); + ostream.close(); + worker->Finalize(); +} + +template +struct ParamConverter {}; + +template <> +struct ParamConverter { + static int64_t FromInt64(int64_t val) { return val; } +}; + +template <> +struct ParamConverter { + static std::string FromInt64(int64_t val) { return std::to_string(val); } +}; + +template class APP_T, + typename... Args> +void CreateAndQuery(const grape::CommSpec& comm_spec, + const grape::LoadGraphSpec& graph_spec, + const std::string& out_prefix, + const grape::ParallelEngineSpec& spec, Args... args) { + using FRAG_T = grape::ImmutableEdgecutFragment; + std::shared_ptr fragment = + grape::LoadGraph(FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); + using AppType = APP_T; + auto app = std::make_shared(); + DoQuery(fragment, app, comm_spec, spec, out_prefix, + args...); +} + +int main(int argc, char* argv[]) { + FLAGS_stderrthreshold = 0; + + grape::gflags::SetUsageMessage( + "Usage: mpiexec [mpi_opts] ./run_app [grape_opts]"); + if (argc == 1) { + grape::gflags::ShowUsageWithFlagsRestrict(argv[0], "analytical_apps"); + exit(1); + } + grape::gflags::ParseCommandLineFlags(&argc, &argv, true); + grape::gflags::ShutDownCommandLineFlags(); + + google::InitGoogleLogging("analytical_apps"); + google::InstallFailureSignalHandler(); + + Init(); + + { + grape::CommSpec comm_spec; + comm_spec.Init(MPI_COMM_WORLD); + grape::LoadGraphSpec graph_spec = grape::DefaultLoadGraphSpec(); + graph_spec.set_directed(false); + if (FLAGS_loader_type == "rb") { + graph_spec.set_rebalance(true, 0); + graph_spec.partitioner_type = grape::PartitionerType::kMapPartitioner; + // idxer_type = kMapIdxer; + } else if (FLAGS_loader_type == "efile") { + FLAGS_vfile = ""; + 
graph_spec.set_rebalance(false, 0); + graph_spec.partitioner_type = grape::PartitionerType::kHashPartitioner; + // idxer_type = kMapIdxer; + } else if (FLAGS_loader_type == "local") { + graph_spec.set_rebalance(false, 0); + graph_spec.partitioner_type = grape::PartitionerType::kHashPartitioner; + graph_spec.idxer_type = grape::IdxerType::kLocalIdxer; + } else { + CHECK_EQ(FLAGS_loader_type, "basic"); + graph_spec.set_rebalance(false, 0); + + // partitioner_type = kMapPartitioner; + // idxer_type = kMapIdxer; + } + if (FLAGS_string_id) { + CreateAndQuery( + comm_spec, graph_spec, FLAGS_out_prefix, + grape::DefaultParallelEngineSpec(), + ParamConverter::FromInt64(FLAGS_sssp_source)); + } else { + CreateAndQuery( + comm_spec, graph_spec, FLAGS_out_prefix, + grape::DefaultParallelEngineSpec(), + ParamConverter::FromInt64(FLAGS_sssp_source)); + } + } + Finalize(); + + google::ShutdownGoogleLogging(); +} diff --git a/tests/mutable_fragment_tests.cc b/tests/mutable_fragment_tests.cc index 7e619e39..a018845f 100644 --- a/tests/mutable_fragment_tests.cc +++ b/tests/mutable_fragment_tests.cc @@ -26,7 +26,6 @@ limitations under the License. #include #include #include -#include #include "bfs/bfs.h" #include "bfs/bfs_auto.h" @@ -133,6 +132,7 @@ void CreateAndQuery(const grape::CommSpec& comm_spec, grape::LoadGraphSpec graph_spec = grape::DefaultLoadGraphSpec(); graph_spec.set_directed(FLAGS_directed); graph_spec.set_rebalance(false, 0); + graph_spec.idxer_type = grape::IdxerType::kHashMapIdxer; using FRAG_T = grape::MutableEdgecutFragment; std::shared_ptr fragment = grape::LoadGraphAndMutate( diff --git a/tests/vertex_map_tests.cc b/tests/vertex_map_tests.cc index 03d98ee3..2977c442 100644 --- a/tests/vertex_map_tests.cc +++ b/tests/vertex_map_tests.cc @@ -28,14 +28,8 @@ limitations under the License. 
#include #include #include -#include #include #include -#include -#include - -#include "sssp/sssp.h" -#include "timer.h" #ifndef __AFFINITY__ #define __AFFINITY__ false @@ -43,26 +37,17 @@ limitations under the License. DEFINE_string(efile, "", "edge file"); DEFINE_string(vfile, "", "vertex file"); -DEFINE_string(delta_efile, "", "delta edge file"); -DEFINE_string(delta_vfile, "", "delta vertex file"); -DEFINE_string(out_prefix, "", "output directory of results"); -DEFINE_int64(sssp_source, 0, "source vertex of sssp."); -DEFINE_bool(string_id, false, "whether to use string as origin id"); -DEFINE_bool(segmented_partition, true, - "whether to use segmented partitioning."); -DEFINE_bool(rebalance, false, "whether to rebalance graph after loading."); -DEFINE_int32(rebalance_vertex_factor, 0, "vertex factor of rebalancing."); -DEFINE_bool(global_vertex_map, true, "whether to use global vertex map."); +DEFINE_string(mutable_efile_base, "", "base of mutable edge file"); +DEFINE_string(mutable_efile_delta, "", "delta of mutable edge file"); +DEFINE_string(serialization_prefix, "", + "directory to place serialization files"); void Init() { - if (FLAGS_out_prefix.empty()) { - LOG(FATAL) << "Please assign an output prefix."; - } if (FLAGS_efile.empty()) { LOG(FATAL) << "Please assign input edge files."; } - if (access(FLAGS_out_prefix.c_str(), 0) != 0) { - mkdir(FLAGS_out_prefix.c_str(), 0777); + if (access(FLAGS_serialization_prefix.c_str(), 0) != 0) { + mkdir(FLAGS_serialization_prefix.c_str(), 0777); } grape::InitMPIComm(); @@ -78,175 +63,265 @@ void Finalize() { VLOG(1) << "Workers finalized."; } -template -void DoQuery(std::shared_ptr fragment, std::shared_ptr app, - const grape::CommSpec& comm_spec, - const grape::ParallelEngineSpec& spec, - const std::string& out_prefix, Args... 
args) { - timer_next("load application"); - auto worker = APP_T::CreateWorker(app, fragment); - worker->Init(comm_spec, spec); - timer_next("run algorithm"); - worker->Query(std::forward(args)...); - timer_next("print output"); - - std::ofstream ostream; - std::string output_path = - grape::GetResultFilename(out_prefix, fragment->fid()); - ostream.open(output_path); - worker->Output(ostream); - ostream.close(); - worker->Finalize(); - timer_end(); - VLOG(1) << "Worker-" << comm_spec.worker_id() << " finished: " << output_path; -} +template +bool verify_vertex_map(const grape::CommSpec& comm_spec, + const grape::VertexMap& vertex_map) { + grape::fid_t fnum = comm_spec.fnum(); + std::vector>> all_maps_g2o(fnum); + std::vector>> all_maps_o2g(fnum); + bool ret = true; + for (grape::fid_t fid = 0; fid != fnum; ++fid) { + VID_T frag_vnum = vertex_map.GetInnerVertexSize(fid); + for (VID_T lid = 0; lid < frag_vnum; ++lid) { + OID_T oid_a, oid_b; + if (vertex_map.GetOid(fid, lid, oid_a)) { + VID_T gid_a{}, gid_b{}; + if (!vertex_map.GetGid(fid, oid_a, gid_a)) { + LOG(ERROR) << "Vertex " << oid_a << " not found by fid+oid in vertex " + << "map."; + ret = false; + continue; + } + if (!vertex_map.GetGid(oid_a, gid_b)) { + LOG(ERROR) << "Vertex " << oid_a << " not found by oid in vertex " + << "map."; + ret = false; + continue; + } + if (gid_a != gid_b) { + LOG(ERROR) << "Vertex " << oid_a << " gid not consistent."; + ret = false; + continue; + } + if (!vertex_map.GetOid(gid_a, oid_b)) { + LOG(ERROR) << "Vertex " << gid_a << " not found by gid in vertex " + << "map."; + ret = false; + continue; + } + if (oid_a != oid_b) { + LOG(ERROR) << "Vertex " << gid_a << " oid not consistent."; + ret = false; + continue; + } + all_maps_g2o[gid_a % fnum].emplace_back(gid_a, oid_a); + all_maps_o2g[std::hash{}(oid_a) % fnum].emplace_back(oid_a, + gid_a); + } + } + } -template -struct ParamConverter {}; - -template <> -struct ParamConverter { - static int64_t FromInt64(int64_t val) { return 
val; } -}; - -template <> -struct ParamConverter { - static std::string FromInt64(int64_t val) { return std::to_string(val); } -}; - -template class APP_T, - typename... Args> -void CreateAndQuery(const grape::CommSpec& comm_spec, - const std::string& out_prefix, int fnum, - const grape::ParallelEngineSpec& spec, Args... args) { - timer_next("load graph"); - grape::LoadGraphSpec graph_spec = grape::DefaultLoadGraphSpec(); - graph_spec.set_directed(false); - graph_spec.set_rebalance(FLAGS_rebalance, FLAGS_rebalance_vertex_factor); - if (!FLAGS_delta_efile.empty() || !FLAGS_delta_vfile.empty()) { - graph_spec.set_rebalance(false, 0); - if (FLAGS_global_vertex_map) { - using VertexMapType = grape::GlobalVertexMap; - using FRAG_T = - grape::MutableEdgecutFragment; - std::shared_ptr fragment = grape::LoadGraphAndMutate( - FLAGS_efile, FLAGS_vfile, FLAGS_delta_efile, FLAGS_delta_vfile, - comm_spec, graph_spec); - using AppType = APP_T; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, args...); - } else { - using VertexMapType = grape::LocalVertexMap; - using FRAG_T = - grape::MutableEdgecutFragment; - std::shared_ptr fragment = grape::LoadGraphAndMutate( - FLAGS_efile, FLAGS_vfile, FLAGS_delta_efile, FLAGS_delta_vfile, - comm_spec, graph_spec); - using AppType = APP_T; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, args...); + { + std::vector>> all_maps_g2o_in(fnum); + grape::sync_comm::AllToAll(all_maps_g2o, all_maps_g2o_in, comm_spec.comm()); + + std::vector> all_maps_merged; + for (auto& maps : all_maps_g2o_in) { + all_maps_merged.insert(all_maps_merged.end(), maps.begin(), maps.end()); } - } else { - if (FLAGS_segmented_partition) { - if (FLAGS_global_vertex_map) { - using VertexMapType = - grape::GlobalVertexMap>; - using FRAG_T = - grape::ImmutableEdgecutFragment; - std::shared_ptr fragment = grape::LoadGraph( - FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - using AppType = 
APP_T; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, args...); - } else { - using VertexMapType = - grape::LocalVertexMap>; - using FRAG_T = - grape::ImmutableEdgecutFragment; - std::shared_ptr fragment = grape::LoadGraph( - FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - using AppType = APP_T; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, args...); + + std::sort(all_maps_merged.begin(), all_maps_merged.end()); + for (size_t i = 1; i < all_maps_merged.size(); ++i) { + if (all_maps_merged[i].first == all_maps_merged[i - 1].first) { + if (all_maps_merged[i].second != all_maps_merged[i - 1].second) { + LOG(ERROR) << "Vertex " << all_maps_merged[i].first + << " has different oid in different fragments."; + ret = false; + } } - } else { - graph_spec.set_rebalance(false, 0); - if (FLAGS_global_vertex_map) { - using VertexMapType = grape::GlobalVertexMap; - using FRAG_T = - grape::ImmutableEdgecutFragment; - std::shared_ptr fragment = grape::LoadGraph( - FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - using AppType = APP_T; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, args...); - } else { - using VertexMapType = grape::LocalVertexMap; - using FRAG_T = - grape::ImmutableEdgecutFragment; - std::shared_ptr fragment = grape::LoadGraph( - FLAGS_efile, FLAGS_vfile, comm_spec, graph_spec); - using AppType = APP_T; - auto app = std::make_shared(); - DoQuery(fragment, app, comm_spec, spec, - out_prefix, args...); + } + } + + { + std::vector>> all_maps_o2g_in(fnum); + grape::sync_comm::AllToAll(all_maps_o2g, all_maps_o2g_in, comm_spec.comm()); + + std::vector> all_maps_merged; + for (auto& maps : all_maps_o2g_in) { + all_maps_merged.insert(all_maps_merged.end(), maps.begin(), maps.end()); + } + + std::sort(all_maps_merged.begin(), all_maps_merged.end()); + for (size_t i = 1; i < all_maps_merged.size(); ++i) { + if (all_maps_merged[i].first == 
all_maps_merged[i - 1].first) { + if (all_maps_merged[i].second != all_maps_merged[i - 1].second) { + LOG(ERROR) << "Vertex " << all_maps_merged[i].first + << " has different gid in different fragments."; + ret = false; + } } } } + + return ret; +} + +template +bool verify_fragment_vertex_map(const FRAG_T& frag, + const VERTEX_MAP_T& vertex_map) { + auto inner_vertices = frag.InnerVertices(); + auto outer_vertices = frag.OuterVertices(); + using vid_t = typename FRAG_T::vid_t; + using oid_t = typename FRAG_T::oid_t; + for (auto v : inner_vertices) { + vid_t gid = frag.GetInnerVertexGid(v); + oid_t oid; + if (!vertex_map.GetOid(gid, oid)) { + LOG(ERROR) << "Vertex " << gid << " not found in vertex map."; + return false; + } + } + for (auto v : outer_vertices) { + vid_t gid = frag.GetOuterVertexGid(v); + oid_t oid; + if (!vertex_map.GetOid(gid, oid)) { + LOG(ERROR) << "Vertex " << gid << " not found in vertex map."; + return false; + } + } + return true; } template -void Run() { - grape::CommSpec comm_spec; - comm_spec.Init(MPI_COMM_WORLD); +void test_build_vertex_map(const std::string& efile, const std::string& vfile, + const grape::LoadGraphSpec& graph_spec, + const grape::CommSpec& comm_spec) { + using FRAG_T = + grape::ImmutableEdgecutFragment; + std::shared_ptr fragment = + grape::LoadGraph(efile, vfile, comm_spec, graph_spec); + + verify_fragment_vertex_map(*fragment, fragment->GetVertexMap()); + verify_vertex_map(comm_spec, fragment->GetVertexMap()); +} - bool is_coordinator = comm_spec.worker_id() == grape::kCoordinatorRank; - timer_start(is_coordinator); +template +void test_mutate_vertex_map(const std::string& efile_base, + const std::string& vfile, + const std::string& efile_delta, + const grape::LoadGraphSpec& graph_spec, + const grape::CommSpec& comm_spec) { + using FRAG_T = grape::MutableEdgecutFragment; + std::shared_ptr fragment = grape::LoadGraphAndMutate( + efile_base, vfile, efile_delta, "", comm_spec, graph_spec); - // FIXME: no barrier apps. 
more manager? or use a dynamic-cast. - auto spec = grape::MultiProcessSpec(comm_spec, __AFFINITY__); - int fnum = comm_spec.fnum(); - CreateAndQuery( - comm_spec, FLAGS_out_prefix, fnum, spec, - ParamConverter::FromInt64(FLAGS_sssp_source)); + verify_fragment_vertex_map(*fragment, fragment->GetVertexMap()); + verify_vertex_map(comm_spec, fragment->GetVertexMap()); } int main(int argc, char* argv[]) { FLAGS_stderrthreshold = 0; - grape::gflags::SetUsageMessage( "Usage: mpiexec [mpi_opts] ./run_app [grape_opts]"); if (argc == 1) { - grape::gflags::ShowUsageWithFlagsRestrict(argv[0], "analytical_apps"); + grape::gflags::ShowUsageWithFlagsRestrict(argv[0], "vertex_map_tests"); exit(1); } grape::gflags::ParseCommandLineFlags(&argc, &argv, true); grape::gflags::ShutDownCommandLineFlags(); - google::InitGoogleLogging("analytical_apps"); + google::InitGoogleLogging("vertex_map_tests"); google::InstallFailureSignalHandler(); Init(); - if (FLAGS_string_id) { - Run(); - } else { - Run(); + std::vector string_id_options({false, true}); + std::vector rebalance_options({false, true}); + std::vector partitioner_options( + {grape::PartitionerType::kHashPartitioner, + grape::PartitionerType::kMapPartitioner, + grape::PartitionerType::kSegmentedPartitioner}); + std::vector idxer_options( + {grape::IdxerType::kHashMapIdxer, grape::IdxerType::kHashMapIdxerView, + grape::IdxerType::kPTHashIdxer, grape::IdxerType::kSortedArrayIdxer, + grape::IdxerType::kLocalIdxer}); + + { + grape::CommSpec comm_spec; + comm_spec.Init(MPI_COMM_WORLD); + int idx = 0; + for (auto string_id : string_id_options) { + for (auto rebalance : rebalance_options) { + for (auto partitioner_type : partitioner_options) { + for (auto idxer_type : idxer_options) { + if (rebalance) { + if (partitioner_type == + grape::PartitionerType::kHashPartitioner) { + continue; + } + } + if (idxer_type == grape::IdxerType::kLocalIdxer) { + if (partitioner_type != + grape::PartitionerType::kHashPartitioner) { + continue; + } + } + 
bool vm_extendable = + (idxer_type == grape::IdxerType::kHashMapIdxer || + idxer_type == grape::IdxerType::kLocalIdxer); + VLOG(2) << "Test " << idx++ << ": string_id=" << string_id + << ", rebalance=" << rebalance + << ", partitioner_type=" << partitioner_type + << ", idxer_type=" << idxer_type; + grape::LoadGraphSpec graph_spec = grape::DefaultLoadGraphSpec(); + graph_spec.set_directed(false); + if (rebalance) { + graph_spec.set_rebalance(true, 0); + } else { + graph_spec.set_rebalance(false, 0); + } + graph_spec.partitioner_type = partitioner_type; + graph_spec.idxer_type = idxer_type; + + graph_spec.set_serialize(true, FLAGS_serialization_prefix); + if (string_id) { + test_build_vertex_map(FLAGS_efile, FLAGS_vfile, + graph_spec, comm_spec); + if (vm_extendable) { + test_mutate_vertex_map( + FLAGS_mutable_efile_base, FLAGS_vfile, + FLAGS_mutable_efile_delta, graph_spec, comm_spec); + } + } else { + test_build_vertex_map(FLAGS_efile, FLAGS_vfile, + graph_spec, comm_spec); + if (vm_extendable) { + test_mutate_vertex_map( + FLAGS_mutable_efile_base, FLAGS_vfile, + FLAGS_mutable_efile_delta, graph_spec, comm_spec); + } + } + + graph_spec.set_deserialize(true, FLAGS_serialization_prefix); + if (string_id) { + test_build_vertex_map(FLAGS_efile, FLAGS_vfile, + graph_spec, comm_spec); + if (vm_extendable) { + test_mutate_vertex_map( + FLAGS_mutable_efile_base, FLAGS_vfile, + FLAGS_mutable_efile_delta, graph_spec, comm_spec); + } + } else { + test_build_vertex_map(FLAGS_efile, FLAGS_vfile, + graph_spec, comm_spec); + if (vm_extendable) { + test_mutate_vertex_map( + FLAGS_mutable_efile_base, FLAGS_vfile, + FLAGS_mutable_efile_delta, graph_spec, comm_spec); + } + } + } + } + } + } } Finalize(); diff --git a/thirdparty/flat_hash_map/flat_hash_map.hpp b/thirdparty/flat_hash_map/flat_hash_map.hpp index 67760a45..b1f5bb16 100644 --- a/thirdparty/flat_hash_map/flat_hash_map.hpp +++ b/thirdparty/flat_hash_map/flat_hash_map.hpp @@ -913,6 +913,11 @@ class sherwood_v3_table : 
private EntryAlloc, public Hasher, public Equal deallocate_data(entries, num_slots_minus_one, max_lookups); } + size_t memory_usage() const + { + return (num_slots_minus_one + max_lookups + 1) * sizeof(Entry); + } + const allocator_type & get_allocator() const { return static_cast(*this); diff --git a/thirdparty/pthash/builders/external_memory_builder_single_phf.hpp b/thirdparty/pthash/builders/external_memory_builder_single_phf.hpp new file mode 100644 index 00000000..605f4809 --- /dev/null +++ b/thirdparty/pthash/builders/external_memory_builder_single_phf.hpp @@ -0,0 +1,753 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "pthash/builders/search.hpp" +#include "pthash/builders/util.hpp" +#include "pthash/mm_file/mm_file.hpp" + +#include "pthash/utils/bucketers.hpp" +#include "pthash/utils/hasher.hpp" +#include "pthash/utils/logger.hpp" + +namespace pthash { + +template +struct external_memory_builder_single_phf { + typedef Hasher hasher_type; + + external_memory_builder_single_phf() + : m_pilots_filename(""), m_free_slots_filename("") {} + // non construction-copyable + external_memory_builder_single_phf( + external_memory_builder_single_phf const&) = delete; + // non copyable + external_memory_builder_single_phf& operator=( + external_memory_builder_single_phf const&) = delete; + + ~external_memory_builder_single_phf() { + if (m_pilots_filename != "") + std::remove(m_pilots_filename.c_str()); + m_pilots_filename = ""; + if (m_free_slots_filename != "") + std::remove(m_free_slots_filename.c_str()); + m_free_slots_filename = ""; + } + + template + build_timings build_from_keys(Iterator keys, uint64_t num_keys, + build_configuration const& config) { + assert(num_keys > 1); + if (config.alpha == 0 or config.alpha > 1.0) { + throw std::invalid_argument("load factor must be > 0 and <= 1.0"); + } + + build_timings time; + uint64_t table_size = static_cast(num_keys) / config.alpha; + if ((table_size & (table_size - 1)) == 0) + table_size += 1; + uint64_t num_buckets = + std::ceil((config.c * num_keys) / std::log2(num_keys)); + + if (sizeof(bucket_id_type) != sizeof(uint64_t) and + num_buckets > (1ULL << (sizeof(bucket_id_type) * 8))) { + throw std::runtime_error( + "using too many buckets: change bucket_id_type to uint64_t or use a " + "smaller c"); + } + + m_num_keys = num_keys; + m_table_size = table_size; + m_num_buckets = num_buckets; + m_seed = + config.seed == constants::invalid_seed ? 
random_value() : config.seed; + m_bucketer.init(num_buckets); + + uint64_t ram = config.ram; + + uint64_t bitmap_taken_bytes = 8 * ((table_size + 63) / 64); + uint64_t hashed_pilots_cache_bytes = search_cache_size * sizeof(uint64_t); + if (bitmap_taken_bytes + hashed_pilots_cache_bytes >= ram) { + std::stringstream ss; + ss << "not enough RAM available, the bitmap alone takes " + << static_cast(bitmap_taken_bytes) / 1000000000 + << " GB of space."; + throw std::runtime_error(ss.str()); + } + + if (config.verbose_output) { + constexpr uint64_t GB = 1000000000; + uint64_t peak = + num_keys * (sizeof(bucket_payload_pair) + sizeof(uint64_t)) + + (num_keys + num_buckets) * sizeof(uint64_t); + std::cout << "c = " << config.c << std::endl; + std::cout << "alpha = " << config.alpha << std::endl; + std::cout << "num_keys = " << num_keys << std::endl; + std::cout << "table_size = " << table_size << std::endl; + std::cout << "num_buckets = " << num_buckets << std::endl; + std::cout << "using " << static_cast(ram) / GB << " GB of RAM" + << " (" << static_cast(bitmap_taken_bytes) / GB + << " GB occupied by the bitmap)" << std::endl; + std::cout << "using a peak of " << static_cast(peak) / GB + << " GB of disk space" << std::endl; + } + + uint64_t run_identifier = clock_type::now().time_since_epoch().count(); + temporary_files_manager tfm(config.tmp_dir, run_identifier); + + uint64_t num_non_empty_buckets = 0; + + try { + auto start = clock_type::now(); + { + auto start = clock_type::now(); + std::vector pairs_blocks; + map(keys, num_keys, pairs_blocks, tfm, config); + auto stop = clock_type::now(); + if (config.verbose_output) { + std::cout << " == map+sort " << tfm.get_num_pairs_files() + << " files(s) took: " << seconds(stop - start) << " seconds" + << std::endl; + } + start = clock_type::now(); + buckets_t buckets = tfm.buckets(config); + merge(pairs_blocks, buckets, config.verbose_output); + buckets.flush(); + for (auto& pairs_block : pairs_blocks) + pairs_block.close(); + 
num_non_empty_buckets = buckets.num_buckets(); + tfm.remove_all_pairs_files(); + stop = clock_type::now(); + if (config.verbose_output) { + std::cout << " == merge+check took: " << seconds(stop - start) + << " seconds" << std::endl; + std::cout << " == max bucket size = " << int(tfm.max_bucket_size()) + << std::endl; + } + } + auto stop = clock_type::now(); + time.mapping_ordering_seconds = seconds(stop - start); + if (config.verbose_output) { + std::cout << " == map+ordering took " << time.mapping_ordering_seconds + << " seconds" << std::endl; + } + } catch (...) { + tfm.remove_all_pairs_files(); + tfm.remove_all_merge_files(); + throw; + } + + try { + auto start = clock_type::now(); + bit_vector_builder taken(m_table_size); + + { // search + auto buckets_iterator = tfm.buckets_iterator(); + + // write all bucket-pilot pairs to files + uint64_t ram_for_pilots = + ram - bitmap_taken_bytes - hashed_pilots_cache_bytes; + auto pilots = tfm.get_multifile_pairs_writer(num_non_empty_buckets, + ram_for_pilots, 1, 0); + + search(m_num_keys, m_num_buckets, num_non_empty_buckets, m_seed, config, + buckets_iterator, taken, pilots); + + pilots.flush(); + buckets_iterator.close(); + // merge all sorted bucket-pilot pairs on a single file, saving only the + // pilot + pilots_merger_t pilots_merger(tfm.get_pilots_filename(), ram); + merge(tfm.pairs_blocks(), pilots_merger, false); + pilots_merger.finalize_and_close(m_num_buckets); + + if (m_pilots_filename != "") + std::remove(m_pilots_filename.c_str()); + m_pilots_filename = tfm.get_pilots_filename(); + + // remove unused temporary files + tfm.remove_all_pairs_files(); + tfm.remove_all_merge_files(); + } + + if (config.minimal_output) { // fill free slots + // write all free slots to file + buffered_file_t writer(tfm.get_free_slots_filename(), + ram - bitmap_taken_bytes); + fill_free_slots(taken, num_keys, writer); + writer.close(); + if (m_free_slots_filename != "") + std::remove(m_free_slots_filename.c_str()); + 
m_free_slots_filename = tfm.get_free_slots_filename(); + } + + auto stop = clock_type::now(); + time.searching_seconds = seconds(stop - start); + if (config.verbose_output) { + std::cout << " == search took " << time.searching_seconds << " seconds" + << std::endl; + } + } catch (...) { + tfm.remove_all_pairs_files(); + tfm.remove_all_merge_files(); + throw; + } + + return time; + } + + uint64_t seed() const { return m_seed; } + + uint64_t num_keys() const { return m_num_keys; } + + uint64_t table_size() const { return m_table_size; } + + skew_bucketer bucketer() const { return m_bucketer; } + + mm::file_source pilots() const { + return mm::file_source(m_pilots_filename); + } + + mm::file_source free_slots() const { + return mm::file_source(m_free_slots_filename); + } + + private: + uint64_t m_seed; + uint64_t m_num_keys; + uint64_t m_table_size; + uint64_t m_num_buckets; + skew_bucketer m_bucketer; + std::string m_pilots_filename; + std::string m_free_slots_filename; + + template + struct buffer_t { + buffer_t(uint64_t ram) : m_buffer_capacity(ram / sizeof(T)) { + m_buffer.reserve(m_buffer_capacity); + assert(m_buffer_capacity > 0); + } + + template + void emplace_back(_Args&&... 
__args) { + m_buffer.emplace_back(std::forward<_Args>(__args)...); + if (--m_buffer_capacity == 0) + flush(); + } + + void flush() { + if (!m_buffer.empty()) { + uint64_t buffer_size = m_buffer.size(); + flush_impl(m_buffer); + m_buffer_capacity += buffer_size; + m_buffer.clear(); + } + } + + protected: + virtual void flush_impl(std::vector& buffer) = 0; + + private: + uint64_t m_buffer_capacity; + std::vector m_buffer; + }; + + template + struct buffered_file_t : buffer_t { + buffered_file_t(std::string const& filename, uint64_t ram) + : buffer_t(ram) { + m_out.open(filename, std::ofstream::out | std::ofstream::binary); + if (!m_out.is_open()) + throw std::runtime_error("cannot open binary file in write mode"); + } + + void close() { + buffer_t::flush(); + m_out.close(); + } + + protected: + void flush_impl(std::vector& buffer) { + m_out.write(reinterpret_cast(buffer.data()), + buffer.size() * sizeof(T)); + } + + private: + std::ofstream m_out; + }; + + template + struct memory_view { + typedef T* iterator; + typedef const T* const_iterator; + + memory_view() : m_begin(nullptr), m_end(nullptr){}; + memory_view(T* begin, uint64_t size) + : m_begin(begin), m_end(begin + size) {} + + inline T* begin() const { return m_begin; } + inline T* end() const { return m_end; } + inline T& operator[](uint64_t pos) const { return *(m_begin + pos); } + inline uint64_t size() const { return std::distance(m_begin, m_end); } + + protected: + T *m_begin, *m_end; + }; + + template + struct reader_t : memory_view { + void open(std::string const& filename) { + if (m_is.is_open()) + m_is.close(); + m_is.open(filename, mm::advice::sequential); + if (!m_is.is_open()) + throw std::runtime_error("cannot open temporary file (read)"); + memory_view::m_begin = m_is.data(); + memory_view::m_end = m_is.data() + m_is.size(); + } + + void close() { m_is.close(); } + + private: + mm::file_source m_is; + }; + + typedef reader_t pairs_t; + + struct pairs_merger_t { + pairs_merger_t(std::string const& 
filename, uint64_t ram) + : m_buffer(filename, ram) {} + + template + void add(bucket_id_type bucket_id, bucket_size_type bucket_size, + HashIterator hashes) { + for (uint64_t k = 0; k != bucket_size; ++k, ++hashes) { + m_buffer.emplace_back(bucket_id, *hashes); + } + } + + void close() { m_buffer.close(); } + + private: + buffered_file_t m_buffer; + }; + + struct buckets_t { // merger + buckets_t(std::vector const& filenames, uint64_t ram, + std::vector& used_bucket_sizes) + : m_filenames(filenames), + m_buffers(filenames.size()), + m_buffer_capacity(ram / (sizeof(uint64_t) * 2)), + m_ram(ram / (sizeof(uint64_t) * 2)), + m_used_bucket_sizes(used_bucket_sizes), + m_outs(filenames.size()), + m_num_buckets(0) { + assert(m_filenames.size() == m_used_bucket_sizes.size()); + m_non_empty_buckets.reserve(filenames.size()); + for (uint64_t i = 0; i != filenames.size(); ++i) { + if (m_used_bucket_sizes[i]) { + throw std::runtime_error("One of the output files is already open"); + } + } + } + + template + void add(bucket_id_type bucket_id, bucket_size_type bucket_size, + HashIterator hashes) { + assert(bucket_size > 0 and bucket_size <= MAX_BUCKET_SIZE); + ensure_capacity(bucket_size); + uint64_t i = bucket_size - 1; + if (m_buffers[i].empty()) + m_non_empty_buckets.push_back(bucket_size - 1); + m_buffers[i].push_back(bucket_id); + for (uint64_t k = 0; k != bucket_size; ++k, ++hashes) + m_buffers[i].push_back(*hashes); + m_buffer_capacity -= bucket_size + 1; + ++m_num_buckets; + } + + uint64_t num_buckets() const { return m_num_buckets; }; + + void flush() { + for (uint64_t i = 0; i != m_buffers.size(); ++i) + flush_i(i); + m_non_empty_buckets.clear(); + } + + private: + void ensure_capacity(uint64_t bucket_size) { + if (bucket_size + 1 > m_buffer_capacity) { + std::sort(m_non_empty_buckets.begin(), m_non_empty_buckets.end(), + [&](uint64_t i, uint64_t j) { + return m_buffers[i].size() < m_buffers[j].size(); + }); + + uint64_t target = + std::max((uint64_t) std::ceil(0.999 * 
m_ram), bucket_size + 1); + while (m_buffer_capacity < target) { + flush_i(m_non_empty_buckets.back()); + m_non_empty_buckets.pop_back(); + } + } + } + + void flush_i(uint64_t i) { + if (m_buffers[i].size() == 0) + return; + if (!m_used_bucket_sizes[i]) { + m_outs[i].open(m_filenames[i].c_str(), + std::ofstream::out | std::ofstream::binary); + if (!m_outs[i].is_open()) { + throw std::runtime_error("cannot open temporary file (write)"); + } + m_used_bucket_sizes[i] = true; + } + m_outs[i].write(reinterpret_cast(m_buffers[i].data()), + m_buffers[i].size() * sizeof(uint64_t)); + m_buffer_capacity += m_buffers[i].size(); + std::vector().swap(m_buffers[i]); + } + + std::vector m_filenames; + std::vector> m_buffers; + uint64_t m_buffer_capacity; + uint64_t m_ram; + std::vector m_non_empty_buckets; + std::vector& m_used_bucket_sizes; + std::vector m_outs; + uint64_t m_num_buckets; + }; + + struct buckets_iterator_t { + buckets_iterator_t( + std::vector> const& + sizes_filenames) + : m_sizes(sizes_filenames.size()), m_sources(sizes_filenames.size()) { + m_pos = sizes_filenames.size(); + for (uint64_t i = 0, i_end = m_pos; i < i_end; ++i) { + m_sizes[i] = sizes_filenames[i].first; + m_sources[i].open(sizes_filenames[i].second, mm::advice::sequential); + assert(i == 0 or m_sizes[i - 1] < m_sizes[i]); + } + read_next_file(); + } + + void close() { + for (auto& is : m_sources) + is.close(); + } + + inline bucket_t operator*() { + bucket_t bucket; + bucket.init(m_it, m_bucket_size); + return bucket; + } + + void operator++() { + m_it += m_bucket_size + 1; + if (m_it >= m_end) + read_next_file(); + } + + private: + void read_next_file() { + if (m_pos == 0) { + m_it = m_end; + return; + } + --m_pos; + m_bucket_size = m_sizes[m_pos]; + m_it = m_sources[m_pos].data(); + m_end = m_it + m_sources[m_pos].size(); + } + + uint64_t m_pos; + std::vector m_sizes; + std::vector> m_sources; + bucket_size_type m_bucket_size; + uint64_t const* m_it; + uint64_t const* m_end; + }; + + struct 
pilots_merger_t { + pilots_merger_t(std::string const& filename, uint64_t ram) + : m_buffer(filename, ram), m_next_bucket_id(0) {} + + template + void add(bucket_id_type bucket_id, bucket_size_type bucket_size, + HashIterator hashes) { + assert(bucket_size == 1); + (void) bucket_size; // avoid unused warning in release mode + emplace_back_and_fill(bucket_id, *hashes); + } + + void finalize_and_close(uint64_t num_buckets) { + if (m_next_bucket_id < num_buckets) + emplace_back_and_fill(num_buckets - 1, 0); + m_buffer.close(); + } + + private: + inline void emplace_back_and_fill(bucket_id_type bucket_id, + uint64_t pilot) { + assert(m_next_bucket_id <= bucket_id); + + while (m_next_bucket_id++ < bucket_id) { + m_buffer.emplace_back(0); + } + m_buffer.emplace_back(pilot); + } + + buffered_file_t m_buffer; + uint64_t m_next_bucket_id; + }; + + struct multifile_pairs_writer : buffer_t { + multifile_pairs_writer(std::vector const& filenames, + uint64_t& num_pairs_files, uint64_t num_pairs, + uint64_t ram, uint64_t num_threads_sort = 1, + uint64_t ram_parallel_merge = 0) + : buffer_t(get_balanced_ram(num_pairs, ram)), + m_filenames(filenames), + m_num_pairs_files(num_pairs_files), + m_num_threads_sort(num_threads_sort), + m_ram_parallel_merge(ram_parallel_merge) { + assert(num_threads_sort > 1 or ram_parallel_merge == 0); + } + + protected: + void flush_impl(std::vector& buffer) { + const uint64_t size = buffer.size(); + + if (m_num_threads_sort > 1) { // parallel + std::vector> blocks; + uint64_t num_keys_per_thread = + (size + m_num_threads_sort - 1) / m_num_threads_sort; + auto exe = [&](uint64_t tid) { + std::sort(blocks[tid].begin(), blocks[tid].end()); + }; + + std::vector threads(m_num_threads_sort); + for (uint64_t i = 0; i != m_num_threads_sort; ++i) { + auto begin = buffer.data() + i * num_keys_per_thread; + auto end = + buffer.data() + std::min((i + 1) * num_keys_per_thread, size); + uint64_t block_size = std::distance(begin, end); + + blocks.emplace_back(begin, 
block_size); + threads[i] = std::thread(exe, i); + } + for (uint64_t i = 0; i != m_num_threads_sort; ++i) { + if (threads[i].joinable()) + threads[i].join(); + } + pairs_merger_t pairs_merger(m_filenames[m_num_pairs_files], + m_ram_parallel_merge); + ++m_num_pairs_files; + merge(blocks, pairs_merger, false); + pairs_merger.close(); + } else { // sequential + std::ofstream out(m_filenames[m_num_pairs_files], + std::ofstream::out | std::ofstream::binary); + if (!out.is_open()) + throw std::runtime_error("cannot open temporary file (write)"); + ++m_num_pairs_files; + std::sort(buffer.begin(), buffer.end()); + out.write(reinterpret_cast(buffer.data()), + size * sizeof(bucket_payload_pair)); + out.close(); + } + } + + private: + std::vector m_filenames; + uint64_t& m_num_pairs_files; + uint64_t m_num_threads_sort; + uint64_t m_ram_parallel_merge; + + static uint64_t get_balanced_ram(uint64_t num_pairs, uint64_t ram) { + uint64_t num_pairs_per_file = ram / sizeof(bucket_payload_pair); + uint64_t num_temporary_files = + (num_pairs + num_pairs_per_file - 1) / num_pairs_per_file; + uint64_t balanced_num_pairs_per_temporary_file = + (num_pairs + num_temporary_files - 1) / num_temporary_files; + uint64_t balanced_ram = + balanced_num_pairs_per_temporary_file * sizeof(bucket_payload_pair); + assert(balanced_ram <= ram); + + return balanced_ram; + } + }; + + struct temporary_files_manager { + temporary_files_manager(std::string const& dir_name, + uint64_t run_identifier) + : m_dir_name(dir_name), + m_run_identifier(run_identifier), + m_num_pairs_files(0), + m_used_bucket_sizes(MAX_BUCKET_SIZE) { + std::fill(m_used_bucket_sizes.begin(), m_used_bucket_sizes.end(), false); + } + + multifile_pairs_writer get_multifile_pairs_writer( + uint64_t num_pairs, uint64_t ram, uint64_t num_threads_sort = 1, + uint64_t ram_parallel_merge = 0) { + uint64_t num_pairs_per_file = ram / sizeof(bucket_payload_pair); + uint64_t num_temporary_files = + (num_pairs + num_pairs_per_file - 1) / 
num_pairs_per_file; + std::vector filenames; + filenames.reserve(num_temporary_files); + for (uint64_t i = 0; i < num_temporary_files; ++i) { + filenames.emplace_back(get_pairs_filename(m_num_pairs_files + i)); + } + return multifile_pairs_writer(filenames, m_num_pairs_files, num_pairs, + ram, num_threads_sort, ram_parallel_merge); + } + + uint64_t get_num_pairs_files() const { return m_num_pairs_files; } + + void remove_all_pairs_files() { + while (m_num_pairs_files > 0) { + std::remove(get_pairs_filename(--m_num_pairs_files).c_str()); + } + } + + void remove_all_merge_files() { + for (uint64_t i = 0; i != MAX_BUCKET_SIZE; ++i) { + if (m_used_bucket_sizes[i]) { + std::remove(get_buckets_filename(i + 1).c_str()); + m_used_bucket_sizes[i] = false; + } + } + } + + std::vector pairs_blocks() const { + std::vector result(m_num_pairs_files); + for (uint64_t i = 0; i != m_num_pairs_files; ++i) + result[i].open(get_pairs_filename(i)); + return result; + }; + + buckets_t buckets(build_configuration const& config) { + std::vector filenames; + filenames.reserve(MAX_BUCKET_SIZE); + for (uint64_t bucket_size = 1; bucket_size <= MAX_BUCKET_SIZE; + ++bucket_size) { + filenames.emplace_back(get_buckets_filename(bucket_size)); + } + return buckets_t(filenames, config.ram, m_used_bucket_sizes); + } + + buckets_iterator_t buckets_iterator() { + std::vector> sizes_filenames; + for (uint64_t i = 0; i != MAX_BUCKET_SIZE; ++i) { + if (m_used_bucket_sizes[i]) { + uint64_t bucket_size = i + 1; + sizes_filenames.emplace_back(bucket_size, + get_buckets_filename(bucket_size)); + } + } + assert(sizes_filenames.size() > 0); + return buckets_iterator_t(sizes_filenames); + } + + bucket_size_type max_bucket_size() { + bucket_size_type bucket_size = 0; + for (uint64_t i = 0, i_end = m_used_bucket_sizes.size(); i < i_end; ++i) { + if (m_used_bucket_sizes[i]) + bucket_size = i; + } + return bucket_size + 1; + } + + std::string get_pilots_filename() const { + std::stringstream filename; + filename << 
m_dir_name << "/pthash.tmp.run" << m_run_identifier + << ".pilots" + << ".bin"; + return filename.str(); + } + + std::string get_free_slots_filename() const { + std::stringstream filename; + filename << m_dir_name << "/pthash.tmp.run" << m_run_identifier + << ".free_slots" + << ".bin"; + return filename.str(); + } + + private: + std::string get_pairs_filename(uint32_t file_id) const { + std::stringstream filename; + filename << m_dir_name << "/pthash.tmp.run" << m_run_identifier + << ".pairs" << file_id << ".bin"; + return filename.str(); + } + + std::string get_buckets_filename(bucket_size_type bucket_size) const { + std::stringstream filename; + filename << m_dir_name << "/pthash.tmp.run" << m_run_identifier << ".size" + << static_cast(bucket_size) << ".bin"; + return filename.str(); + } + + std::string m_dir_name; + uint64_t m_run_identifier; + uint64_t m_num_pairs_files; + std::vector m_used_bucket_sizes; + }; + + template + void map(Iterator keys, uint64_t num_keys, std::vector& pairs_blocks, + temporary_files_manager& tfm, build_configuration const& config) { + progress_logger logger(num_keys, " == processed ", " keys from input", + config.verbose_output); + + uint64_t ram = config.ram; + uint64_t ram_parallel_merge = 0; + if (config.num_threads > 1) { + ram_parallel_merge = ram * 0.01; + assert(ram_parallel_merge >= + MAX_BUCKET_SIZE * sizeof(bucket_payload_pair)); + } + + auto writer = + tfm.get_multifile_pairs_writer(num_keys, ram - ram_parallel_merge, + config.num_threads, ram_parallel_merge); + try { + for (uint64_t i = 0; i != num_keys; ++i, ++keys) { + auto const& key = *keys; + auto hash = hasher_type::hash(key, m_seed); + bucket_id_type bucket_id = m_bucketer.bucket(hash.first()); + writer.emplace_back(bucket_id, hash.second()); + logger.log(); + } + writer.flush(); + logger.finalize(); + } catch (std::runtime_error const& e) { throw e; } + + auto tmp = tfm.pairs_blocks(); + pairs_blocks.swap(tmp); + } +}; + +} // namespace pthash diff --git 
a/thirdparty/pthash/builders/internal_memory_builder_single_phf.hpp b/thirdparty/pthash/builders/internal_memory_builder_single_phf.hpp new file mode 100644 index 00000000..df9be299 --- /dev/null +++ b/thirdparty/pthash/builders/internal_memory_builder_single_phf.hpp @@ -0,0 +1,365 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "pthash/builders/search.hpp" +#include "pthash/builders/util.hpp" +#include "pthash/utils/bucketers.hpp" +#include "pthash/utils/hasher.hpp" +#include "pthash/utils/logger.hpp" + +namespace pthash { + +template +struct internal_memory_builder_single_phf { + typedef Hasher hasher_type; + + template + build_timings build_from_keys(RandomAccessIterator keys, uint64_t num_keys, + build_configuration const& config) { + if (config.seed == constants::invalid_seed) { + for (auto attempt = 0; attempt < 10; ++attempt) { + m_seed = random_value(); + try { + return build_from_hashes( + hash_generator(keys, m_seed), num_keys, + config); + } catch (seed_runtime_error const& error) { + std::cout << "attempt " << attempt + 1 << " failed" << std::endl; + } + } + throw seed_runtime_error(); + } + m_seed = config.seed; + return build_from_hashes(hash_generator(keys, m_seed), + num_keys, config); + } + + template + build_timings build_from_hashes(RandomAccessIterator hashes, + uint64_t num_keys, + build_configuration const& config) { + assert(num_keys > 1); + if (config.alpha == 0 or config.alpha > 1.0) { + throw std::invalid_argument("load factor must be > 0 and <= 1.0"); + } + + clock_type::time_point start; + + start = clock_type::now(); + + build_timings time; + + uint64_t table_size = static_cast(num_keys) / config.alpha; + if ((table_size & (table_size - 1)) == 0) + table_size += 1; + uint64_t num_buckets = + (config.num_buckets == constants::invalid_num_buckets) + ? 
(std::ceil((config.c * num_keys) / std::log2(num_keys))) + : config.num_buckets; + + m_num_keys = num_keys; + m_table_size = table_size; + m_num_buckets = num_buckets; + m_bucketer.init(m_num_buckets); + + if (config.verbose_output) { + std::cout << "c = " << config.c << std::endl; + std::cout << "alpha = " << config.alpha << std::endl; + std::cout << "num_keys = " << num_keys << std::endl; + std::cout << "table_size = " << table_size << std::endl; + std::cout << "num_buckets = " << num_buckets << std::endl; + } + + buckets_t buckets; + { + auto start = clock_type::now(); + std::vector pairs_blocks; + map(hashes, num_keys, pairs_blocks, config); + auto elapsed = seconds(clock_type::now() - start); + if (config.verbose_output) { + std::cout << " == map+sort took: " << elapsed << " seconds" + << std::endl; + } + + start = clock_type::now(); + merge(pairs_blocks, buckets, config.verbose_output); + elapsed = seconds(clock_type::now() - start); + if (config.verbose_output) { + std::cout << " == merge+check took: " << elapsed << " seconds" + << std::endl; + } + } + auto buckets_iterator = buckets.begin(); + time.mapping_ordering_seconds = seconds(clock_type::now() - start); + if (config.verbose_output) { + std::cout << " == mapping+ordering took " << time.mapping_ordering_seconds + << " seconds " << std::endl; + std::cout << " == max bucket size = " << int((*buckets_iterator).size()) + << std::endl; + } + + start = clock_type::now(); + { + m_pilots.resize(num_buckets); + std::fill(m_pilots.begin(), m_pilots.end(), 0); + bit_vector_builder taken(m_table_size); + uint64_t num_non_empty_buckets = buckets.num_buckets(); + pilots_wrapper_t pilots_wrapper(m_pilots); + search(m_num_keys, m_num_buckets, num_non_empty_buckets, m_seed, config, + buckets_iterator, taken, pilots_wrapper); + if (config.minimal_output) { + m_free_slots.clear(); + m_free_slots.reserve(taken.size() - num_keys); + fill_free_slots(taken, num_keys, m_free_slots); + } + } + time.searching_seconds = 
seconds(clock_type::now() - start); + if (config.verbose_output) { + std::cout << " == search took " << time.searching_seconds << " seconds" + << std::endl; + } + + return time; + } + + uint64_t seed() const { return m_seed; } + + uint64_t num_keys() const { return m_num_keys; } + + uint64_t table_size() const { return m_table_size; } + + skew_bucketer bucketer() const { return m_bucketer; } + + std::vector const& pilots() const { return m_pilots; } + + std::vector const& free_slots() const { return m_free_slots; } + + void swap(internal_memory_builder_single_phf& other) { + std::swap(m_seed, other.m_seed); + std::swap(m_num_keys, other.m_num_keys); + std::swap(m_num_buckets, other.m_num_buckets); + std::swap(m_table_size, other.m_table_size); + std::swap(m_bucketer, other.m_bucketer); + m_pilots.swap(other.m_pilots); + m_free_slots.swap(other.m_free_slots); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_seed); + visitor.visit(m_num_keys); + visitor.visit(m_num_buckets); + visitor.visit(m_table_size); + visitor.visit(m_bucketer); + visitor.visit(m_pilots); + visitor.visit(m_free_slots); + } + + static size_t estimate_num_bytes_for_construction( + uint64_t num_keys, build_configuration const& config) { + uint64_t table_size = static_cast(num_keys) / config.alpha; + if ((table_size & (table_size - 1)) == 0) + table_size += 1; + uint64_t num_buckets = + (config.num_buckets == constants::invalid_num_buckets) + ? (std::ceil((config.c * num_keys) / std::log2(num_keys))) + : config.num_buckets; + + size_t mapping_bytes = + num_keys * sizeof(bucket_payload_pair) // pairs + + (num_keys + num_buckets) * sizeof(uint64_t); // buckets + + size_t search_bytes = + num_buckets * sizeof(uint64_t) // pilots + + num_buckets * sizeof(uint64_t) // buckets + + (config.minimal_output ? 
(table_size - num_keys) * sizeof(uint64_t) + : 0) // free_slots + + num_keys * sizeof(uint64_t) // hashes + + table_size / 8; // bitmap taken + return std::max(mapping_bytes, search_bytes); + } + + private: + uint64_t m_seed; + uint64_t m_num_keys; + uint64_t m_num_buckets; + uint64_t m_table_size; + skew_bucketer m_bucketer; + std::vector m_pilots; + std::vector m_free_slots; + + template + struct hash_generator { + hash_generator(RandomAccessIterator keys, uint64_t seed) + : m_iterator(keys), m_seed(seed) {} + + inline typename hasher_type::hash_type operator*() { + return hasher_type::hash(*m_iterator, m_seed); + } + + inline void operator++() { ++m_iterator; } + + inline hash_generator operator+(uint64_t offset) const { + return hash_generator(m_iterator + offset, m_seed); + } + + private: + RandomAccessIterator m_iterator; + uint64_t m_seed; + }; + + typedef std::vector pairs_t; + + struct buckets_iterator_t { + buckets_iterator_t(std::vector> const& buffers) + : m_buffers_it(buffers.end() - 1), m_bucket_size(buffers.size()) { + m_bucket.init(m_buffers_it->data(), m_bucket_size); + skip_empty_buckets(); + } + + inline void operator++() { + uint64_t const* begin = m_bucket.begin() + m_bucket_size; + uint64_t const* end = m_buffers_it->data() + m_buffers_it->size(); + m_bucket.init(begin, m_bucket_size); + if ((m_bucket.begin() - 1) == end and m_bucket_size != 0) { + --m_bucket_size; + --m_buffers_it; + skip_empty_buckets(); + } + } + + inline bucket_t operator*() const { return m_bucket; } + + private: + std::vector>::const_iterator m_buffers_it; + bucket_size_type m_bucket_size; + bucket_t m_bucket; + + void skip_empty_buckets() { + while (m_bucket_size != 0 and m_buffers_it->empty()) { + --m_bucket_size; + --m_buffers_it; + } + if (m_bucket_size != 0) + m_bucket.init(m_buffers_it->data(), m_bucket_size); + } + }; + + struct buckets_t { + buckets_t() : m_buffers(MAX_BUCKET_SIZE), m_num_buckets(0) {} + + template + void add(bucket_id_type bucket_id, 
bucket_size_type bucket_size, + HashIterator hashes) { + assert(bucket_size > 0); + uint64_t i = bucket_size - 1; + m_buffers[i].push_back(bucket_id); + for (uint64_t k = 0; k != bucket_size; ++k, ++hashes) + m_buffers[i].push_back(*hashes); + ++m_num_buckets; + } + + uint64_t num_buckets() const { return m_num_buckets; }; + + buckets_iterator_t begin() const { return buckets_iterator_t(m_buffers); } + + private: + std::vector> m_buffers; + uint64_t m_num_buckets; + }; + + struct pilots_wrapper_t { + pilots_wrapper_t(std::vector& pilots) : m_pilots(pilots) {} + + inline void emplace_back(bucket_id_type bucket_id, uint64_t pilot) { + m_pilots[bucket_id] = pilot; + } + + private: + std::vector& m_pilots; + }; + + template + void map_sequential(RandomAccessIterator hashes, uint64_t num_keys, + std::vector& pairs_blocks, + build_configuration const&) const { + pairs_t pairs(num_keys); + RandomAccessIterator begin = hashes; + for (uint64_t i = 0; i != num_keys; ++i, ++begin) { + auto hash = *begin; + auto bucket_id = m_bucketer.bucket(hash.first()); + pairs[i] = {static_cast(bucket_id), hash.second()}; + } + std::sort(pairs.begin(), pairs.end()); + pairs_blocks.resize(1); + pairs_blocks.front().swap(pairs); + } + + template + void map_parallel(RandomAccessIterator hashes, uint64_t num_keys, + std::vector& pairs_blocks, + build_configuration const& config) const { + pairs_blocks.resize(config.num_threads); + uint64_t num_keys_per_thread = + (num_keys + config.num_threads - 1) / config.num_threads; + + auto exe = [&](uint64_t tid) { + auto& local_pairs = pairs_blocks[tid]; + RandomAccessIterator begin = hashes + tid * num_keys_per_thread; + uint64_t local_num_keys = (tid != config.num_threads - 1) + ? 
num_keys_per_thread + : (num_keys - tid * num_keys_per_thread); + local_pairs.resize(local_num_keys); + + for (uint64_t local_i = 0; local_i != local_num_keys; + ++local_i, ++begin) { + auto hash = *begin; + auto bucket_id = m_bucketer.bucket(hash.first()); + local_pairs[local_i] = {static_cast(bucket_id), + hash.second()}; + } + std::sort(local_pairs.begin(), local_pairs.end()); + }; + + std::vector threads(config.num_threads); + for (uint64_t i = 0; i != config.num_threads; ++i) + threads[i] = std::thread(exe, i); + for (auto& t : threads) { + if (t.joinable()) + t.join(); + } + } + + template + void map(RandomAccessIterator hashes, uint64_t num_keys, + std::vector& pairs_blocks, + build_configuration const& config) const { + if (config.num_threads > 1) { + map_parallel(hashes, num_keys, pairs_blocks, config); + } else { + map_sequential(hashes, num_keys, pairs_blocks, config); + } + } +}; + +} // namespace pthash diff --git a/thirdparty/pthash/builders/search.hpp b/thirdparty/pthash/builders/search.hpp new file mode 100644 index 00000000..39244430 --- /dev/null +++ b/thirdparty/pthash/builders/search.hpp @@ -0,0 +1,358 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include // for pow, round, log2 +#include // for stringbuf +#include +#include "pthash/essentials/essentials.hpp" + +#include "pthash/builders/util.hpp" +#include "pthash/encoders/bit_vector.hpp" +#include "pthash/utils/hasher.hpp" + +namespace pthash { + +constexpr uint64_t search_cache_size = 1000; + +struct search_logger { + search_logger(uint64_t num_keys, uint64_t table_size, uint64_t num_buckets) + : m_num_keys(num_keys), + m_table_size(table_size), + m_num_buckets(num_buckets), + m_step(m_num_buckets > 20 ? m_num_buckets / 20 : 1), + m_bucket(0), + m_placed_keys(0), + m_trials(0), + m_total_trials(0), + m_expected_trials(0.0), + m_total_expected_trials(0.0) {} + + void init() { + essentials::logger("search starts"); + m_timer.start(); + } + + /* If X_i is the random variable counting the number of trials + for bucket i, then Pr(X_i <= N - 1) = 1 - (1 - p_i)^N, + where p_i is the success probability for bucket i. + By solving 1 - (1 - p_i)^N >= T wrt N and for a given target + probability T < 1, we obtain N <= log_{1-p_i}(1-T), that is: + we get a pilot <= N with probability T. + Of course, the closer T is to 1, the higher N becomes. + In practice T = 0.65 suffices to have + N > # trials per bucket, for all buckets. 
+ */ + double pilot_wp_T(double T, double p) { + assert(T > 0 and p > 0); + double x = std::log2(1.0 - T) / std::log2(1.0 - p); + return round(x); + } + + void update(uint64_t bucket, uint64_t bucket_size, uint64_t pilot) { + if (bucket > 0) { + double base = + static_cast(m_table_size - m_placed_keys) / m_table_size; + double p = pow(base, bucket_size); + double e = 1.0 / p; + m_expected_trials += e; + m_total_expected_trials += e; + } + + m_placed_keys += bucket_size; + m_trials += pilot + 1; + m_total_trials += pilot + 1; + + if (bucket > 0 and bucket % m_step == 0) + print(bucket); + } + + void finalize(uint64_t bucket) { + m_step = bucket - m_bucket; + print(bucket); + essentials::logger("search ends"); + std::cout << " == " << m_num_buckets - bucket << " empty buckets (" + << ((m_num_buckets - bucket) * 100.0) / m_num_buckets << "%)" + << std::endl; + std::cout << " == total trials = " << m_total_trials << std::endl; + std::cout << " == total expected trials = " + << uint64_t(m_total_expected_trials) << std::endl; + } + + private: + uint64_t m_num_keys; + uint64_t m_table_size; + uint64_t m_num_buckets; + uint64_t m_step; + uint64_t m_bucket; + uint64_t m_placed_keys; + + uint64_t m_trials; + uint64_t m_total_trials; + double m_expected_trials; + double m_total_expected_trials; + + essentials::timer + m_timer; + + void print(uint64_t bucket) { + m_timer.stop(); + std::stringbuf buffer; + std::ostream os(&buffer); + os << m_step << " buckets done in " << m_timer.elapsed() << " seconds (" + << (m_placed_keys * 100.0) / m_num_keys << "% of keys, " + << (bucket * 100.0) / m_num_buckets << "% of buckets, " + << static_cast(m_trials) / m_step << " trials per bucket, " + << m_expected_trials / m_step << " expected trials per bucket)"; + essentials::logger(buffer.str()); + m_bucket = bucket; + m_trials = 0; + m_expected_trials = 0.0; + m_timer.reset(); + m_timer.start(); + } +}; + +template +void search_sequential(uint64_t num_keys, uint64_t num_buckets, + uint64_t 
num_non_empty_buckets, uint64_t seed, + build_configuration const& config, + BucketsIterator& buckets, bit_vector_builder& taken, + PilotsBuffer& pilots) { + uint64_t max_bucket_size = (*buckets).size(); + uint64_t table_size = taken.size(); + std::vector positions; + positions.reserve(max_bucket_size); + __uint128_t M = fastmod::computeM_u64(table_size); + + std::vector hashed_pilots_cache(search_cache_size); + for (uint64_t pilot = 0; pilot != search_cache_size; ++pilot) { + hashed_pilots_cache[pilot] = default_hash64(pilot, seed); + } + + search_logger log(num_keys, table_size, num_buckets); + if (config.verbose_output) + log.init(); + + uint64_t processed_buckets = 0; + for (; processed_buckets < num_non_empty_buckets; + ++processed_buckets, ++buckets) { + auto const& bucket = *buckets; + assert(bucket.size() > 0); + + for (uint64_t pilot = 0; true; ++pilot) { + uint64_t hashed_pilot = PTHASH_LIKELY(pilot < search_cache_size) + ? hashed_pilots_cache[pilot] + : default_hash64(pilot, seed); + + positions.clear(); + + auto bucket_begin = bucket.begin(), bucket_end = bucket.end(); + for (; bucket_begin != bucket_end; ++bucket_begin) { + uint64_t hash = *bucket_begin; + uint64_t p = fastmod::fastmod_u64(hash ^ hashed_pilot, M, table_size); + if (taken.get(p)) + break; + positions.push_back(p); + } + + if (bucket_begin == + bucket_end) { // all keys do not have collisions with taken + + // check for in-bucket collisions + std::sort(positions.begin(), positions.end()); + auto it = std::adjacent_find(positions.begin(), positions.end()); + if (it != positions.end()) + continue; // in-bucket collision detected, try next pilot + + pilots.emplace_back(bucket.id(), pilot); + for (auto p : positions) { + assert(taken.get(p) == false); + taken.set(p, true); + } + if (config.verbose_output) + log.update(processed_buckets, bucket.size(), pilot); + break; + } + } + } + + if (config.verbose_output) + log.finalize(processed_buckets); +} + +template +void search_parallel(uint64_t 
num_keys, uint64_t num_buckets, + uint64_t num_non_empty_buckets, uint64_t seed, + build_configuration const& config, + BucketsIterator& buckets, bit_vector_builder& taken, + PilotsBuffer& pilots) { + uint64_t max_bucket_size = (*buckets).size(); + uint64_t table_size = taken.size(); + __uint128_t M = fastmod::computeM_u64(table_size); + + const uint64_t num_threads = config.num_threads; + std::vector hashed_pilots_cache(search_cache_size); + for (uint64_t pilot = 0; pilot != search_cache_size; ++pilot) { + hashed_pilots_cache[pilot] = default_hash64(pilot, seed); + } + + search_logger log(num_keys, table_size, num_buckets); + if (config.verbose_output) + log.init(); + + volatile uint64_t next_bucket_idx = 0; + + auto exe = [&](uint64_t local_bucket_idx, bucket_t bucket) { + std::vector positions; + positions.reserve(max_bucket_size); + + while (true) { + uint64_t pilot = 0; + bool pilot_checked = false; + + while (true) { + uint64_t local_next_bucket_idx = next_bucket_idx; + + for (; true; ++pilot) { + if (PTHASH_LIKELY(!pilot_checked)) { + uint64_t hashed_pilot = PTHASH_LIKELY(pilot < search_cache_size) + ? 
hashed_pilots_cache[pilot] + : default_hash64(pilot, seed); + + positions.clear(); + + auto bucket_begin = bucket.begin(), bucket_end = bucket.end(); + for (; bucket_begin != bucket_end; ++bucket_begin) { + uint64_t hash = *bucket_begin; + uint64_t p = + fastmod::fastmod_u64(hash ^ hashed_pilot, M, table_size); + if (taken.get(p)) + break; + positions.push_back(p); + } + + if (bucket_begin == bucket_end) { + std::sort(positions.begin(), positions.end()); + auto it = std::adjacent_find(positions.begin(), positions.end()); + if (it != positions.end()) + continue; + + // I can stop the pilot search as there are not collisions + pilot_checked = true; + break; + } + } else { + // I already computed the positions and checked the in-bucket + // collisions I must only check the bitmap again + for (auto p : positions) { + if (taken.get(p)) { + pilot_checked = false; + break; + } + } + // I can stop the pilot search as there are not collisions + if (pilot_checked) + break; + } + } + + // I am the first thread: this is the only condition that can stop the + // loop + if (local_next_bucket_idx == local_bucket_idx) + break; + + // active wait until another thread pushes a change in the bitmap + while (local_next_bucket_idx == next_bucket_idx) + ; + } + assert(local_bucket_idx == next_bucket_idx); + + /* thread-safe from now on */ + + pilots.emplace_back(bucket.id(), pilot); + for (auto p : positions) { + assert(taken.get(p) == false); + taken.set(p, true); + } + if (config.verbose_output) + log.update(local_bucket_idx, bucket.size(), pilot); + + // update (local) local_bucket_idx + local_bucket_idx = next_bucket_idx + num_threads; + + if (local_bucket_idx >= num_non_empty_buckets) { // stop the thread + // update (global) next_bucket_idx, which may unlock other threads + ++next_bucket_idx; + break; + } + + // read the next bucket and advance the iterator + bucket = (*buckets); + ++buckets; + + // update (global) next_bucket_idx, which may unlock other threads + 
++next_bucket_idx; + } + }; + + std::vector threads; + threads.reserve(num_threads); + next_bucket_idx = static_cast( + -1); // avoid that some thread advances the iterator + for (uint64_t i = 0; i != num_threads and i < num_non_empty_buckets; + ++i, ++buckets) { + bucket_t bucket = *buckets; + threads.emplace_back(exe, i, bucket); + } + + next_bucket_idx = 0; // notify the first thread + for (auto& t : threads) { + if (t.joinable()) + t.join(); + } + assert(next_bucket_idx == num_non_empty_buckets); + + if (config.verbose_output) + log.finalize(next_bucket_idx); +} + +template +void search(uint64_t num_keys, uint64_t num_buckets, + uint64_t num_non_empty_buckets, uint64_t seed, + build_configuration const& config, BucketsIterator& buckets, + bit_vector_builder& taken, PilotsBuffer& pilots) { + if (config.num_threads > 1) { + if (config.num_threads > std::thread::hardware_concurrency()) { + throw std::invalid_argument( + "parallel search should use at most " + + std::to_string(std::thread::hardware_concurrency()) + " threads"); + } + search_parallel(num_keys, num_buckets, num_non_empty_buckets, seed, config, + buckets, taken, pilots); + } else { + search_sequential(num_keys, num_buckets, num_non_empty_buckets, seed, + config, buckets, taken, pilots); + } +} + +} // namespace pthash diff --git a/thirdparty/pthash/builders/util.hpp b/thirdparty/pthash/builders/util.hpp new file mode 100644 index 00000000..98ac7c3c --- /dev/null +++ b/thirdparty/pthash/builders/util.hpp @@ -0,0 +1,301 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#include "pthash/encoders/bit_vector.hpp" +#include "pthash/utils/logger.hpp" +#include "pthash/utils/util.hpp" + +namespace pthash { + +typedef uint32_t bucket_id_type; +typedef uint8_t bucket_size_type; +#define MAX_BUCKET_SIZE static_cast(100) + +static inline std::string get_tmp_builder_filename(std::string const& dir_name, + uint64_t id) { + return dir_name + "/pthash.temp." + std::to_string(id) + ".builder"; +} + +struct build_timings { + build_timings() + : partitioning_seconds(0.0), + mapping_ordering_seconds(0.0), + searching_seconds(0.0), + encoding_seconds(0.0) {} + + double partitioning_seconds; + double mapping_ordering_seconds; + double searching_seconds; + double encoding_seconds; +}; + +struct build_configuration { + build_configuration() + : c(4.5), + alpha(0.98), + num_partitions(1), + num_buckets(constants::invalid_num_buckets), + num_threads(1), + seed(constants::invalid_seed), + ram(static_cast(constants::available_ram) * 0.75), + tmp_dir(constants::default_tmp_dirname), + minimal_output(false), + verbose_output(true) {} + + double c; + double alpha; + uint64_t num_partitions; + uint64_t num_buckets; + uint64_t num_threads; + uint64_t seed; + uint64_t ram; + std::string tmp_dir; + bool minimal_output; + bool verbose_output; +}; + +struct seed_runtime_error : public std::runtime_error { + seed_runtime_error() : std::runtime_error("seed did not work") {} +}; + +#pragma pack(push, 4) +struct bucket_payload_pair { + bucket_id_type bucket_id; + uint64_t payload; + + bucket_payload_pair() {} + 
bucket_payload_pair(bucket_id_type bucket_id, uint64_t payload) + : bucket_id(bucket_id), payload(payload) {} + + bool operator<(bucket_payload_pair const& other) const { + return (bucket_id < other.bucket_id) or + (bucket_id == other.bucket_id and payload < other.payload); + } +}; +#pragma pack(pop) + +struct bucket_t { + bucket_t() : m_begin(nullptr), m_size(0) {} + + void init(uint64_t const* begin, bucket_size_type size) { + m_begin = begin; + m_size = size; + } + + inline bucket_id_type id() const { return *m_begin; } + + inline uint64_t const* begin() const { return m_begin + 1; } + + inline uint64_t const* end() const { return m_begin + 1 + m_size; } + + inline bucket_size_type size() const { return m_size; } + + private: + uint64_t const* m_begin; + bucket_size_type m_size; +}; + +template +struct payload_iterator { + payload_iterator(PairsRandomAccessIterator const& iterator) + : m_iterator(iterator) {} + + uint64_t operator*() const { return (*m_iterator).payload; } + + void operator++() { ++m_iterator; } + + private: + PairsRandomAccessIterator m_iterator; +}; + +template +void merge_single_block(Pairs const& pairs, Merger& merger, bool verbose) { + progress_logger logger(pairs.size(), " == merged ", " pairs", verbose); + + bucket_size_type bucket_size = 1; + uint64_t num_pairs = pairs.size(); + logger.log(); + for (uint64_t i = 1; i != num_pairs; ++i) { + if (pairs[i].bucket_id == pairs[i - 1].bucket_id) { + if (PTHASH_LIKELY(pairs[i].payload != pairs[i - 1].payload)) { + ++bucket_size; + } else { + throw seed_runtime_error(); + } + } else { + merger.add(pairs[i - 1].bucket_id, bucket_size, + payload_iterator( + pairs.begin() + i - bucket_size)); + bucket_size = 1; + } + logger.log(); + } + + // add the last bucket + merger.add(pairs[num_pairs - 1].bucket_id, bucket_size, + payload_iterator(pairs.end() - + bucket_size)); + logger.finalize(); +} + +template +void merge_multiple_blocks(std::vector const& pairs_blocks, + Merger& merger, bool verbose) { + 
uint64_t num_pairs = std::accumulate( + pairs_blocks.begin(), pairs_blocks.end(), static_cast(0), + [](uint64_t sum, Pairs const& pairs) { return sum + pairs.size(); }); + progress_logger logger(num_pairs, " == merged ", " pairs", verbose); + + // input iterators and heap + std::vector iterators; + std::vector idx_heap; + iterators.reserve(pairs_blocks.size()); + idx_heap.reserve(pairs_blocks.size()); + + // heap functions + auto stdheap_idx_comparator = [&](uint32_t idxa, uint32_t idxb) { + return !((*iterators[idxa]) < (*iterators[idxb])); + }; + auto advance_heap_head = [&]() { + auto idx = idx_heap[0]; + ++iterators[idx]; + if (PTHASH_LIKELY(iterators[idx] != pairs_blocks[idx].end())) { + // percolate down the head + uint64_t pos = 0; + uint64_t size = idx_heap.size(); + while (2 * pos + 1 < size) { + uint64_t i = 2 * pos + 1; + if (i + 1 < size and + stdheap_idx_comparator(idx_heap[i], idx_heap[i + 1])) + ++i; + if (stdheap_idx_comparator(idx_heap[i], idx_heap[pos])) + break; + std::swap(idx_heap[pos], idx_heap[i]); + pos = i; + } + } else { + std::pop_heap(idx_heap.begin(), idx_heap.end(), stdheap_idx_comparator); + idx_heap.pop_back(); + } + }; + + // create the input iterators and the heap + for (uint64_t i = 0; i != pairs_blocks.size(); ++i) { + iterators.push_back(pairs_blocks[i].begin()); + idx_heap.push_back(i); + } + std::make_heap(idx_heap.begin(), idx_heap.end(), stdheap_idx_comparator); + + bucket_id_type bucket_id; + std::vector bucket_payloads; + bucket_payloads.reserve(MAX_BUCKET_SIZE); + + // read the first pair + { + bucket_payload_pair pair = (*iterators[idx_heap[0]]); + bucket_id = pair.bucket_id; + bucket_payloads.push_back(pair.payload); + advance_heap_head(); + logger.log(); + } + + // merge + for (uint64_t i = 0; (PTHASH_LIKELY(idx_heap.size())); + ++i, advance_heap_head()) { + bucket_payload_pair pair = (*iterators[idx_heap[0]]); + + if (pair.bucket_id == bucket_id) { + if (PTHASH_LIKELY(pair.payload != bucket_payloads.back())) { + 
bucket_payloads.push_back(pair.payload); + } else { + throw seed_runtime_error(); + } + } else { + merger.add(bucket_id, bucket_payloads.size(), bucket_payloads.begin()); + bucket_id = pair.bucket_id; + bucket_payloads.clear(); + bucket_payloads.push_back(pair.payload); + } + logger.log(); + } + + // add the last bucket + merger.add(bucket_id, bucket_payloads.size(), bucket_payloads.begin()); + logger.finalize(); +} + +template +void merge(std::vector const& pairs_blocks, Merger& merger, + bool verbose) { + if (pairs_blocks.size() == 1) { + merge_single_block(pairs_blocks[0], merger, verbose); + } else { + merge_multiple_blocks(pairs_blocks, merger, verbose); + } +} + +template +void fill_free_slots(bit_vector_builder const& taken, uint64_t num_keys, + FreeSlots& free_slots) { + uint64_t table_size = taken.size(); + if (table_size <= num_keys) + return; + + uint64_t next_used_slot = num_keys; + uint64_t last_free_slot = 0, last_valid_free_slot = 0; + + while (true) { + // find the next free slot (on the left) + while (last_free_slot < num_keys && taken.get(last_free_slot)) + ++last_free_slot; + // exit condition + if (last_free_slot == num_keys) + break; + // fill with the last free slot (on the left) until I find a new used slot + // (on the right) note: since I found a free slot on the left, there must be + // an used slot on the right + assert(next_used_slot < table_size); + while (!taken.get(next_used_slot)) { + free_slots.emplace_back(last_free_slot); + ++next_used_slot; + } + assert(next_used_slot < table_size); + // fill the used slot (on the right) with the last free slot and advance all + // cursors + free_slots.emplace_back(last_free_slot); + last_valid_free_slot = last_free_slot; + ++next_used_slot; + ++last_free_slot; + } + // fill the tail with the last valid slot that I found + while (next_used_slot != table_size) { + free_slots.emplace_back(last_valid_free_slot); + ++next_used_slot; + } + assert(next_used_slot == table_size); +} + +} // namespace 
pthash diff --git a/thirdparty/pthash/encoders/bit_vector.hpp b/thirdparty/pthash/encoders/bit_vector.hpp new file mode 100644 index 00000000..27547a7d --- /dev/null +++ b/thirdparty/pthash/encoders/bit_vector.hpp @@ -0,0 +1,347 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +#include "pthash/encoders/util.hpp" +#include "pthash/essentials/essentials.hpp" + +namespace pthash { + +struct bit_vector_builder { + bit_vector_builder(uint64_t size = 0, bool init = 0) : m_size(size) { + m_bits.resize(essentials::words_for(size), uint64_t(-init)); + if (size) { + m_cur_word = &m_bits.back(); + // clear padding bits + if (init && (size & 63)) { + *m_cur_word >>= 64 - (size & 63); + } + } + } + + void reserve(uint64_t num_bits) { + m_bits.reserve(essentials::words_for(num_bits)); + } + + inline void push_back(bool b) { + uint64_t pos_in_word = m_size % 64; + if (pos_in_word == 0) { + m_bits.push_back(0); + m_cur_word = &m_bits.back(); + } + *m_cur_word |= (uint64_t) b << pos_in_word; + ++m_size; + } + + inline void zero_extend(uint64_t n) { + m_size += n; + uint64_t needed = essentials::words_for(m_size) - m_bits.size(); + if (needed) { + m_bits.insert(m_bits.end(), needed, 0); + m_cur_word = &m_bits.back(); + } + } + + inline void set(uint64_t pos, bool b = true) { + assert(pos < size()); + uint64_t word = pos >> 6; + uint64_t pos_in_word = pos & 63; + m_bits[word] &= ~(uint64_t(1) << pos_in_word); + m_bits[word] |= uint64_t(b) << pos_in_word; + } + + inline uint64_t get(uint64_t pos) const { + assert(pos < size()); + uint64_t word = pos >> 6; + uint64_t pos_in_word = pos & 63; + return m_bits[word] >> pos_in_word & uint64_t(1); + } + + inline void set_bits(uint64_t pos, uint64_t bits, size_t len) { + assert(pos + len <= size()); + // check there are no spurious bits + assert(len == 64 || (bits >> len) == 0); + if (!len) + return; + uint64_t mask = (len == 64) ? 
uint64_t(-1) : ((uint64_t(1) << len) - 1); + uint64_t word = pos >> 6; + uint64_t pos_in_word = pos & 63; + + m_bits[word] &= ~(mask << pos_in_word); + m_bits[word] |= bits << pos_in_word; + + uint64_t stored = 64 - pos_in_word; + if (stored < len) { + m_bits[word + 1] &= ~(mask >> stored); + m_bits[word + 1] |= bits >> stored; + } + } + + inline void append_bits(uint64_t bits, size_t len) { + // check there are no spurious bits + assert(len == 64 || (bits >> len) == 0); + if (!len) + return; + uint64_t pos_in_word = m_size & 63; + m_size += len; + if (pos_in_word == 0) { + m_bits.push_back(bits); + } else { + *m_cur_word |= bits << pos_in_word; + if (len > 64 - pos_in_word) { + m_bits.push_back(bits >> (64 - pos_in_word)); + } + } + m_cur_word = &m_bits.back(); + } + + inline uint64_t get_word64(uint64_t pos) const { + assert(pos < size()); + uint64_t block = pos >> 6; + uint64_t shift = pos & 63; + uint64_t word = m_bits[block] >> shift; + if (shift && block + 1 < m_bits.size()) { + word |= m_bits[block + 1] << (64 - shift); + } + return word; + } + + void append(bit_vector_builder const& rhs) { + if (!rhs.size()) + return; + + uint64_t pos = m_bits.size(); + uint64_t shift = size() % 64; + m_size = size() + rhs.size(); + m_bits.resize(essentials::words_for(m_size)); + + if (shift == 0) { // word-aligned, easy case + std::copy(rhs.m_bits.begin(), rhs.m_bits.end(), + m_bits.begin() + ptrdiff_t(pos)); + } else { + uint64_t* cur_word = &m_bits.front() + pos - 1; + for (size_t i = 0; i < rhs.m_bits.size() - 1; ++i) { + uint64_t w = rhs.m_bits[i]; + *cur_word |= w << shift; + *++cur_word = w >> (64 - shift); + } + *cur_word |= rhs.m_bits.back() << shift; + if (cur_word < &m_bits.back()) { + *++cur_word = rhs.m_bits.back() >> (64 - shift); + } + } + m_cur_word = &m_bits.back(); + } + + void resize(uint64_t size) { + m_size = size; + m_bits.resize(essentials::words_for(m_size)); + } + + void swap(bit_vector_builder& other) { + m_bits.swap(other.m_bits); + 
std::swap(m_size, other.m_size); + std::swap(m_cur_word, other.m_cur_word); + } + + std::vector& data() { return m_bits; } + + uint64_t size() const { return m_size; } + + private: + std::vector m_bits; + uint64_t m_size; + uint64_t* m_cur_word; +}; + +struct bit_vector { + bit_vector() : m_size(0) {} + + void build(bit_vector_builder* in) { + m_size = in->size(); + m_bits.swap(in->data()); + } + + bit_vector(bit_vector_builder* in) { build(in); } + + void swap(bit_vector& other) { + std::swap(other.m_size, m_size); + other.m_bits.swap(m_bits); + } + + inline size_t size() const { return m_size; } + + uint64_t bytes() const { + return sizeof(m_size) + essentials::vec_bytes(m_bits); + } + + // get i-th bit + inline uint64_t operator[](uint64_t i) const { + assert(i < size()); + uint64_t block = i >> 6; + uint64_t shift = i & 63; + return m_bits[block] >> shift & uint64_t(1); + } + + inline uint64_t get_bits(uint64_t pos, uint64_t len) const { + assert(pos + len <= size()); + if (!len) + return 0; + uint64_t block = pos >> 6; + uint64_t shift = pos & 63; + uint64_t mask = -(len == 64) | ((1ULL << len) - 1); + if (shift + len <= 64) { + return m_bits[block] >> shift & mask; + } else { + return (m_bits[block] >> shift) | + (m_bits[block + 1] << (64 - shift) & mask); + } + } + + // fast and unsafe version: it retrieves at least 56 bits + inline uint64_t get_word56(uint64_t pos) const { + const char* base_ptr = reinterpret_cast(m_bits.data()); + return *(reinterpret_cast(base_ptr + (pos >> 3))) >> + (pos & 7); + } + + // pad with zeros if extension further size is needed + inline uint64_t get_word64(uint64_t pos) const { + assert(pos < size()); + uint64_t block = pos >> 6; + uint64_t shift = pos & 63; + uint64_t word = m_bits[block] >> shift; + if (shift && block + 1 < m_bits.size()) { + word |= m_bits[block + 1] << (64 - shift); + } + return word; + } + + inline uint64_t predecessor1(uint64_t pos) const { + assert(pos < m_size); + uint64_t block = pos / 64; + uint64_t 
shift = 64 - pos % 64 - 1; + uint64_t word = m_bits[block]; + word = (word << shift) >> shift; + + unsigned long ret; + while (!util::msb(word, ret)) { + assert(block); + word = m_bits[--block]; + }; + return block * 64 + ret; + } + + std::vector const& data() const { return m_bits; } + + struct unary_iterator { + unary_iterator() : m_data(0), m_position(0), m_buf(0) {} + + unary_iterator(bit_vector const& bv, uint64_t pos = 0) { + m_data = bv.data().data(); + m_position = pos; + m_buf = m_data[pos >> 6]; + // clear low bits + m_buf &= uint64_t(-1) << (pos & 63); + } + + uint64_t position() const { return m_position; } + + uint64_t next() { + unsigned long pos_in_word; + uint64_t buf = m_buf; + while (!util::lsb(buf, pos_in_word)) { + m_position += 64; + buf = m_data[m_position >> 6]; + } + + m_buf = buf & (buf - 1); // clear LSB + m_position = (m_position & ~uint64_t(63)) + pos_in_word; + return m_position; + } + + // skip to the k-th one after the current position + void skip(uint64_t k) { + uint64_t skipped = 0; + uint64_t buf = m_buf; + uint64_t w = 0; + while (skipped + (w = util::popcount(buf)) <= k) { + skipped += w; + m_position += 64; + buf = m_data[m_position / 64]; + } + assert(buf); + uint64_t pos_in_word = util::select_in_word(buf, k - skipped); + m_buf = buf & (uint64_t(-1) << pos_in_word); + m_position = (m_position & ~uint64_t(63)) + pos_in_word; + } + + // skip to the k-th zero after the current position + void skip0(uint64_t k) { + uint64_t skipped = 0; + uint64_t pos_in_word = m_position % 64; + uint64_t buf = ~m_buf & (uint64_t(-1) << pos_in_word); + uint64_t w = 0; + while (skipped + (w = util::popcount(buf)) <= k) { + skipped += w; + m_position += 64; + buf = ~m_data[m_position / 64]; + } + assert(buf); + pos_in_word = util::select_in_word(buf, k - skipped); + m_buf = ~buf & (uint64_t(-1) << pos_in_word); + m_position = (m_position & ~uint64_t(63)) + pos_in_word; + } + + private: + uint64_t const* m_data; + uint64_t m_position; + uint64_t 
m_buf; + }; + + template + void visit(Visitor& visitor) { + visitor.visit(m_size); + visitor.visit(m_bits); + } + + template + void load(Loader& loader) { + loader.load(m_size); + loader.load_vec(m_bits); + } + + template + void dump(Dumper& dumper) const { + dumper.dump(m_size); + dumper.dump_vec(m_bits); + } + + protected: + size_t m_size; + std::vector m_bits; +}; + +} // namespace pthash diff --git a/thirdparty/pthash/encoders/compact_vector.hpp b/thirdparty/pthash/encoders/compact_vector.hpp new file mode 100644 index 00000000..b2ec2a69 --- /dev/null +++ b/thirdparty/pthash/encoders/compact_vector.hpp @@ -0,0 +1,306 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include + +namespace pthash { + +struct compact_vector { + template + struct enumerator { + enumerator() {} + + enumerator(Data const* data, uint64_t i = 0) + : m_i(i), + m_cur_val(0), + m_cur_block((i * data->m_width) >> 6), + m_cur_shift((i * data->m_width) & 63), + m_data(data) {} + + uint64_t operator*() { + read(); + return m_cur_val; + } + + enumerator& operator++() { + ++m_i; + return *this; + } + + inline uint64_t value() { + read(); + return m_cur_val; + } + + inline void next() { ++m_i; } + + bool operator==(enumerator const& other) const { return m_i == other.m_i; } + + bool operator!=(enumerator const& other) const { return !(*this == other); } + + private: + uint64_t m_i; + uint64_t m_cur_val; + uint64_t m_cur_block; + int64_t m_cur_shift; + Data const* m_data; + + void read() { + if (m_cur_shift + m_data->m_width <= 64) { + m_cur_val = m_data->m_bits[m_cur_block] >> m_cur_shift & m_data->m_mask; + } else { + uint64_t res_shift = 64 - m_cur_shift; + m_cur_val = + (m_data->m_bits[m_cur_block] >> m_cur_shift) | + (m_data->m_bits[m_cur_block + 1] << res_shift & m_data->m_mask); + ++m_cur_block; + m_cur_shift = -res_shift; + } + + m_cur_shift += m_data->m_width; + + if (m_cur_shift == 64) { + m_cur_shift = 0; + ++m_cur_block; + } + } + }; + + struct builder { + builder() + : m_size(0), + m_width(0), + m_mask(0), + m_back(0), + m_cur_block(0), + m_cur_shift(0) {} + + builder(uint64_t n, uint64_t w) { resize(n, w); } + + void resize(size_t n, uint64_t w) { + m_size = n; + m_width = w; + m_mask = -(w == 64) | ((uint64_t(1) << w) - 1); + m_back = 0; + m_cur_block = 0; + m_cur_shift = 0; + m_bits.resize( + /* use 1 word more for safe access() */ + essentials::words_for(m_size * m_width) + 1, 0); + } + + template + builder(Iterator begin, uint64_t n, uint64_t w) : builder(n, w) { + fill(begin, n); + } + + template + void fill(Iterator begin, uint64_t n) { + if (!m_width) + throw std::runtime_error("width must be greater than 
0"); + for (uint64_t i = 0; i != n; ++i, ++begin) + push_back(*begin); + } + + void set(uint64_t i, uint64_t v) { + assert(m_width); + assert(i < m_size); + if (i == m_size - 1) + m_back = v; + + uint64_t pos = i * m_width; + uint64_t block = pos >> 6; + uint64_t shift = pos & 63; + + m_bits[block] &= ~(m_mask << shift); + m_bits[block] |= v << shift; + + uint64_t res_shift = 64 - shift; + if (res_shift < m_width) { + m_bits[block + 1] &= ~(m_mask >> res_shift); + m_bits[block + 1] |= v >> res_shift; + } + } + + void push_back(uint64_t v) { + assert(m_width); + m_back = v; + m_bits[m_cur_block] &= ~(m_mask << m_cur_shift); + m_bits[m_cur_block] |= v << m_cur_shift; + + uint64_t res_shift = 64 - m_cur_shift; + if (res_shift < m_width) { + ++m_cur_block; + m_bits[m_cur_block] &= ~(m_mask >> res_shift); + m_bits[m_cur_block] |= v >> res_shift; + m_cur_shift = -res_shift; + } + + m_cur_shift += m_width; + + if (m_cur_shift == 64) { + m_cur_shift = 0; + ++m_cur_block; + } + } + + friend struct enumerator; + + typedef enumerator iterator; + + iterator begin() const { return iterator(this); } + + iterator end() const { return iterator(this, size()); } + + void build(compact_vector& cv) { + cv.m_size = m_size; + cv.m_width = m_width; + cv.m_mask = m_mask; + cv.m_bits.swap(m_bits); + builder().swap(*this); + } + + void swap(compact_vector::builder& other) { + std::swap(m_size, other.m_size); + std::swap(m_width, other.m_width); + std::swap(m_mask, other.m_mask); + std::swap(m_cur_block, other.m_cur_block); + std::swap(m_cur_shift, other.m_cur_shift); + m_bits.swap(other.m_bits); + } + + uint64_t back() const { return m_back; } + + uint64_t size() const { return m_size; } + + uint64_t width() const { return m_width; } + + std::vector& bits() { return m_bits; } + + private: + uint64_t m_size; + uint64_t m_width; + uint64_t m_mask; + uint64_t m_back; + uint64_t m_cur_block; + int64_t m_cur_shift; + std::vector m_bits; + }; + + compact_vector() : m_size(0), m_width(0), 
m_mask(0) {} + + template + void build(Iterator begin, uint64_t n) { + assert(n > 0); + uint64_t max = *std::max_element(begin, begin + n); + uint64_t width = max == 0 ? 1 : std::ceil(std::log2(max + 1)); + build(begin, n, width); + } + + template + void build(Iterator begin, uint64_t n, uint64_t w) { + compact_vector::builder builder(begin, n, w); + builder.build(*this); + } + + inline uint64_t operator[](uint64_t i) const { + assert(i < size()); + uint64_t pos = i * m_width; + uint64_t block = pos >> 6; + uint64_t shift = pos & 63; + return shift + m_width <= 64 + ? m_bits[block] >> shift & m_mask + : (m_bits[block] >> shift) | + (m_bits[block + 1] << (64 - shift) & m_mask); + } + + // it retrieves at least 57 bits + inline uint64_t access(uint64_t pos) const { + assert(pos < size()); + uint64_t i = pos * m_width; + const char* ptr = reinterpret_cast(m_bits.data()); + return (*(reinterpret_cast(ptr + (i >> 3))) >> (i & 7)) & + m_mask; + } + + uint64_t back() const { return operator[](size() - 1); } + + inline uint64_t size() const { return m_size; } + + inline uint64_t width() const { return m_width; } + + typedef enumerator iterator; + + iterator begin() const { return iterator(this); } + + iterator end() const { return iterator(this, size()); } + + iterator at(uint64_t pos) const { return iterator(this, pos); } + + std::vector const& bits() const { return m_bits; } + + size_t bytes() const { + return sizeof(m_size) + sizeof(m_width) + sizeof(m_mask) + + essentials::vec_bytes(m_bits); + } + + void swap(compact_vector& other) { + std::swap(m_size, other.m_size); + std::swap(m_width, other.m_width); + std::swap(m_mask, other.m_mask); + m_bits.swap(other.m_bits); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_size); + visitor.visit(m_width); + visitor.visit(m_mask); + visitor.visit(m_bits); + } + + template + void load(Loader& loader) { + loader.load(m_size); + loader.load(m_width); + loader.load(m_mask); + loader.load_vec(m_bits); + } + + 
template + void dump(Dumper& dumper) const { + dumper.dump(m_size); + dumper.dump(m_width); + dumper.dump(m_mask); + dumper.dump_vec(m_bits); + } + + private: + uint64_t m_size; + uint64_t m_width; + uint64_t m_mask; + std::vector m_bits; +}; + +} // namespace pthash \ No newline at end of file diff --git a/thirdparty/pthash/encoders/darray.hpp b/thirdparty/pthash/encoders/darray.hpp new file mode 100644 index 00000000..48de5991 --- /dev/null +++ b/thirdparty/pthash/encoders/darray.hpp @@ -0,0 +1,185 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "pthash/encoders/bit_vector.hpp" +#include "pthash/encoders/util.hpp" + +namespace pthash { +namespace detail { + +template +struct darray { + darray() : m_positions() {} + + darray(bit_vector const& bv) : m_positions() { + std::vector const& data = bv.data(); + std::vector cur_block_positions; + std::vector block_inventory; + std::vector subblock_inventory; + std::vector overflow_positions; + + for (size_t word_idx = 0; word_idx < data.size(); ++word_idx) { + size_t cur_pos = word_idx << 6; + uint64_t cur_word = WordGetter()(data, word_idx); + unsigned long l; + while (util::lsb(cur_word, l)) { + cur_pos += l; + cur_word >>= l; + if (cur_pos >= bv.size()) + break; + + cur_block_positions.push_back(cur_pos); + + if (cur_block_positions.size() == block_size) { + flush_cur_block(cur_block_positions, block_inventory, + subblock_inventory, overflow_positions); + } + + // can't do >>= l + 1, can be 64 + cur_word >>= 1; + cur_pos += 1; + m_positions += 1; + } + } + if (cur_block_positions.size()) { + flush_cur_block(cur_block_positions, block_inventory, subblock_inventory, + overflow_positions); + } + m_block_inventory.swap(block_inventory); + m_subblock_inventory.swap(subblock_inventory); + m_overflow_positions.swap(overflow_positions); + } + + void swap(darray& other) { + std::swap(other.m_positions, m_positions); + m_block_inventory.swap(other.m_block_inventory); + m_subblock_inventory.swap(other.m_subblock_inventory); + m_overflow_positions.swap(other.m_overflow_positions); + } + + inline uint64_t select(bit_vector const& bv, uint64_t idx) const { + assert(idx < num_positions()); + uint64_t block = idx / block_size; + int64_t block_pos = m_block_inventory[block]; + if (block_pos < 0) { // sparse super-block + uint64_t overflow_pos = uint64_t(-block_pos - 1); + return m_overflow_positions[overflow_pos + (idx & (block_size - 1))]; + } + + size_t subblock = idx / subblock_size; + size_t start_pos = uint64_t(block_pos) + 
m_subblock_inventory[subblock]; + size_t reminder = idx & (subblock_size - 1); + if (!reminder) + return start_pos; + + std::vector const& data = bv.data(); + size_t word_idx = start_pos >> 6; + size_t word_shift = start_pos & 63; + uint64_t word = WordGetter()(data, word_idx) & (uint64_t(-1) << word_shift); + while (true) { + size_t popcnt = util::popcount(word); + if (reminder < popcnt) + break; + reminder -= popcnt; + word = WordGetter()(data, ++word_idx); + } + return (word_idx << 6) + util::select_in_word(word, reminder); + } + + inline uint64_t num_positions() const { return m_positions; } + + uint64_t bytes() const { + return sizeof(m_positions) + essentials::vec_bytes(m_block_inventory) + + essentials::vec_bytes(m_subblock_inventory) + + essentials::vec_bytes(m_overflow_positions); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_positions); + visitor.visit(m_block_inventory); + visitor.visit(m_subblock_inventory); + visitor.visit(m_overflow_positions); + } + + template + void load(Loader& loader) { + loader.load(m_positions); + loader.load_vec(m_block_inventory); + loader.load_vec(m_subblock_inventory); + loader.load_vec(m_overflow_positions); + } + + template + void dump(Dumper& dumper) const { + dumper.dump(m_positions); + dumper.dump_vec(m_block_inventory); + dumper.dump_vec(m_subblock_inventory); + dumper.dump_vec(m_overflow_positions); + } + + protected: + static void flush_cur_block(std::vector& cur_block_positions, + std::vector& block_inventory, + std::vector& subblock_inventory, + std::vector& overflow_positions) { + if (cur_block_positions.back() - cur_block_positions.front() < + max_in_block_distance) { + block_inventory.push_back(int64_t(cur_block_positions.front())); + for (size_t i = 0; i < cur_block_positions.size(); i += subblock_size) { + subblock_inventory.push_back( + uint16_t(cur_block_positions[i] - cur_block_positions.front())); + } + } else { + block_inventory.push_back(-int64_t(overflow_positions.size()) - 1); + 
for (size_t i = 0; i < cur_block_positions.size(); ++i) { + overflow_positions.push_back(cur_block_positions[i]); + } + for (size_t i = 0; i < cur_block_positions.size(); i += subblock_size) { + subblock_inventory.push_back(uint16_t(-1)); + } + } + cur_block_positions.clear(); + } + + static const size_t block_size = 1024; // 2048 + static const size_t subblock_size = 32; + static const size_t max_in_block_distance = 1 << 16; + + size_t m_positions; + std::vector m_block_inventory; + std::vector m_subblock_inventory; + std::vector m_overflow_positions; +}; + +struct identity_getter { + uint64_t operator()(std::vector const& data, size_t idx) const { + return data[idx]; + } +}; + +} // namespace detail + +typedef detail::darray darray1; + +} // namespace pthash diff --git a/thirdparty/pthash/encoders/ef_sequence.hpp b/thirdparty/pthash/encoders/ef_sequence.hpp new file mode 100644 index 00000000..6c71ca17 --- /dev/null +++ b/thirdparty/pthash/encoders/ef_sequence.hpp @@ -0,0 +1,145 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include "pthash/encoders/bit_vector.hpp" +#include "pthash/encoders/compact_vector.hpp" +#include "pthash/encoders/darray.hpp" + +namespace pthash { + +template +struct ef_sequence { + ef_sequence() {} + + template + void encode(Iterator begin, uint64_t n) { + if (n == 0) + return; + uint64_t u; +#if __cplusplus >= 201703L + if constexpr (encode_prefix_sum) { +#else + if (encode_prefix_sum) { +#endif + u = std::accumulate(begin, begin + n, static_cast(0)); + n = n + 1; // because I will add a zero at the beginning + } else { + u = *(begin + n - 1); + }; + + uint64_t l = uint64_t((n && u / n) ? util::msb(u / n) : 0); + bit_vector_builder bvb_high_bits(n + (u >> l) + 1); + compact_vector::builder cv_builder_low_bits(n, l); + + uint64_t low_mask = (uint64_t(1) << l) - 1; + uint64_t last = 0; + // I add a zero at the beginning +#if __cplusplus >= 201703L + if constexpr (encode_prefix_sum) { +#else + if (encode_prefix_sum) { +#endif + if (l) + cv_builder_low_bits.push_back(0); + bvb_high_bits.set(0, 1); + n = n - 1; // restore n + } + for (size_t i = 0; i < n; ++i, ++begin) { + auto v = *begin; +#if __cplusplus >= 201703L + if constexpr (encode_prefix_sum) { +#else + if (encode_prefix_sum) { +#endif + v = v + last; // prefix sum + } else if (i and v < last) { // check the order + std::cerr << "error at " << i << "/" << n << ":\n"; + std::cerr << "last " << last << "\n"; + std::cerr << "current " << v << "\n"; + throw std::runtime_error("ef_sequence is not sorted"); + } + if (l) + cv_builder_low_bits.push_back(v & low_mask); + bvb_high_bits.set((v >> l) + i + encode_prefix_sum, 1); + last = v; + } + + bit_vector(&bvb_high_bits).swap(m_high_bits); + cv_builder_low_bits.build(m_low_bits); + darray1(m_high_bits).swap(m_high_bits_d1); + } + + inline uint64_t access(uint64_t i) const { + assert(i < size()); + return ((m_high_bits_d1.select(m_high_bits, i) - i) << m_low_bits.width()) | + m_low_bits.access(i); + } + + inline uint64_t diff(uint64_t i) 
const { + assert(i < size() && encode_prefix_sum); + uint64_t low1 = m_low_bits.access(i); + uint64_t low2 = m_low_bits.access(i + 1); + uint64_t l = m_low_bits.width(); + uint64_t pos = m_high_bits_d1.select(m_high_bits, i); + uint64_t h1 = pos - i; + uint64_t h2 = + bit_vector::unary_iterator(m_high_bits, pos + 1).next() - i - 1; + uint64_t val1 = (h1 << l) | low1; + uint64_t val2 = (h2 << l) | low2; + return val2 - val1; + } + + inline uint64_t size() const { return m_low_bits.size(); } + + uint64_t num_bits() const { + return 8 * + (m_high_bits.bytes() + m_high_bits_d1.bytes() + m_low_bits.bytes()); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_high_bits); + visitor.visit(m_high_bits_d1); + visitor.visit(m_low_bits); + } + + template + void load(Loader& loader) { + m_high_bits.load(loader); + m_high_bits_d1.load(loader); + m_low_bits.load(loader); + } + + template + void dump(Dumper& dumper) const { + m_high_bits.dump(dumper); + m_high_bits_d1.dump(dumper); + m_low_bits.dump(dumper); + } + + private: + bit_vector m_high_bits; + darray1 m_high_bits_d1; + compact_vector m_low_bits; +}; + +} // namespace pthash diff --git a/thirdparty/pthash/encoders/encoders.hpp b/thirdparty/pthash/encoders/encoders.hpp new file mode 100644 index 00000000..422119a5 --- /dev/null +++ b/thirdparty/pthash/encoders/encoders.hpp @@ -0,0 +1,161 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "pthash/essentials/essentials.hpp" + +#include "pthash/encoders/compact_vector.hpp" +#include "pthash/encoders/ef_sequence.hpp" + +#include +#include +#include + +namespace pthash { + +template +std::pair, std::vector> +compute_ranks_and_dictionary(Iterator begin, uint64_t n) { + // accumulate frequencies + std::unordered_map distinct; + for (auto it = begin, end = begin + n; it != end; ++it) { + auto find_it = distinct.find(*it); + if (find_it != distinct.end()) { // found + (*find_it).second += 1; + } else { + distinct[*it] = 1; + } + } + std::vector> vec; + vec.reserve(distinct.size()); + for (auto p : distinct) + vec.emplace_back(p.first, p.second); + std::sort(vec.begin(), vec.end(), + [](const std::pair& x, + const std::pair& y) { + return x.second > y.second; + }); + distinct.clear(); + // assign codewords by non-increasing frequency + std::vector dict; + dict.reserve(distinct.size()); + for (uint64_t i = 0; i != vec.size(); ++i) { + auto p = vec[i]; + distinct.insert({p.first, i}); + dict.push_back(p.first); + } + + std::vector ranks; + ranks.reserve(n); + for (auto it = begin, end = begin + n; it != end; ++it) + ranks.push_back(distinct[*it]); + return {ranks, dict}; +} + +struct dictionary { + template + void encode(Iterator begin, uint64_t n) { + auto pair = compute_ranks_and_dictionary(begin, n); + m_ranks.build(pair.first.begin(), pair.first.size()); + m_dict.build(pair.second.begin(), pair.second.size()); + } + + static std::string name() { return "dictionary"; } + + size_t size() const { return 
m_ranks.size(); } + + size_t num_bits() const { return (m_ranks.bytes() + m_dict.bytes()) * 8; } + + uint64_t access(uint64_t i) const { + uint64_t rank = m_ranks.access(i); + return m_dict.access(rank); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_ranks); + visitor.visit(m_dict); + } + + template + void load(Loader& loader) { + m_ranks.load(loader); + m_dict.load(loader); + } + + template + void dump(Dumper& dumper) const { + m_ranks.dump(dumper); + m_dict.dump(dumper); + } + + private: + compact_vector m_ranks; + compact_vector m_dict; +}; + +template +struct dual { + template + void encode(Iterator begin, uint64_t n) { + size_t front_size = n * 0.3; + m_front.encode(begin, front_size); + m_back.encode(begin + front_size, n - front_size); + } + + static std::string name() { return Front::name() + "-" + Back::name(); } + + size_t num_bits() const { return m_front.num_bits() + m_back.num_bits(); } + + uint64_t access(uint64_t i) const { + if (i < m_front.size()) + return m_front.access(i); + return m_back.access(i - m_front.size()); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_front); + visitor.visit(m_back); + } + + template + void load(Loader& loader) { + m_front.load(loader); + m_back.load(loader); + } + + template + void dump(Dumper& dumper) const { + m_front.dump(dumper); + m_back.dump(dumper); + } + + private: + Front m_front; + Back m_back; +}; + +/* dual encoders */ +typedef dual dictionary_dictionary; + +} // namespace pthash diff --git a/thirdparty/pthash/encoders/util.hpp b/thirdparty/pthash/encoders/util.hpp new file mode 100644 index 00000000..6f53b018 --- /dev/null +++ b/thirdparty/pthash/encoders/util.hpp @@ -0,0 +1,114 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. 
+ * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +#if defined(__x86_64__) && __SSE4_2__ +#include +#endif + +namespace pthash::util { + +#if defined(__x86_64__) && __SSE4_2__ +template +inline void prefetch(T const* ptr) { + _mm_prefetch(reinterpret_cast(ptr), _MM_HINT_T0); +} +#endif + +inline uint8_t msb(uint64_t x) { + assert(x); + unsigned long ret = -1U; + if (x) { + ret = (unsigned long) (63 - __builtin_clzll(x)); + } + return (uint8_t) ret; +} + +inline bool bsr64(unsigned long* const index, const uint64_t mask) { + if (mask) { + *index = (unsigned long) (63 - __builtin_clzll(mask)); + return true; + } else { + return false; + } +} + +inline uint8_t msb(uint64_t x, unsigned long& ret) { return bsr64(&ret, x); } + +inline uint8_t lsb(uint64_t x, unsigned long& ret) { + if (x) { + ret = (unsigned long) __builtin_ctzll(x); + return true; + } + return false; +} + +inline uint8_t lsb(uint64_t x) { + assert(x); + unsigned long ret = -1U; + lsb(x, ret); + return (uint8_t) ret; +} + +inline uint64_t popcount(uint64_t x) { +#ifdef __SSE4_2__ + return static_cast(_mm_popcnt_u64(x)); +#elif __cplusplus >= 202002L + return std::popcount(x); +#else + return static_cast(__builtin_popcountll(x)); +#endif +} + +inline uint64_t select64_pdep_tzcnt(uint64_t x, const uint64_t k) { +#if 
defined(__x86_64__) && defined(__BMI2__) || defined(__AVX2__) + uint64_t i = 1ULL << k; + asm("pdep %[x], %[mask], %[x]" : [x] "+r"(x) : [mask] "r"(i)); + asm("tzcnt %[bit], %[index]" : [index] "=r"(i) : [bit] "g"(x) : "cc"); + return i; +#else + uint64_t count = 0; + uint64_t result = 0; + + for (uint64_t bit = 0; bit < 64; ++bit) { + if ((x >> bit) & 1) { + if (count == k) { + result = bit; + break; + } + ++count; + } + } + + return result; +#endif +} + +inline uint64_t select_in_word(const uint64_t x, const uint64_t k) { + assert(k < popcount(x)); + return select64_pdep_tzcnt(x, k); +} + +} // namespace pthash::util \ No newline at end of file diff --git a/thirdparty/pthash/essentials/essentials.hpp b/thirdparty/pthash/essentials/essentials.hpp new file mode 100644 index 00000000..aeed8e06 --- /dev/null +++ b/thirdparty/pthash/essentials/essentials.hpp @@ -0,0 +1,644 @@ +/** Copyright 2019-2021 Giulio Ermanno Pibiri + * + * The following sets forth attribution notices for third party software. + * + * C++ Essentials: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/essentials + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __GNUG__ +#include // for name demangling +#endif + +namespace essentials { + +inline void logger(std::string const& msg) { + time_t t = std::time(nullptr); + std::locale loc; + const std::time_put& tp = std::use_facet>(loc); + const char* fmt = "%F %T"; + tp.put(std::cout, std::cout, ' ', std::localtime(&t), fmt, fmt + strlen(fmt)); + std::cout << ": " << msg << std::endl; +} + +static const uint64_t GB = 1000 * 1000 * 1000; +static const uint64_t GiB = uint64_t(1) << 30; +static const uint64_t MB = 1000 * 1000; +static const uint64_t MiB = uint64_t(1) << 20; +static const uint64_t KB = 1000; +static const uint64_t KiB = uint64_t(1) << 10; + +inline double convert(size_t bytes, uint64_t unit) { + return static_cast(bytes) / unit; +} + +template +size_t vec_bytes(T const& vec) { + return vec.size() * sizeof(vec.front()) + sizeof(typename T::size_type); +} + +template +size_t pod_bytes(T const& pod) { + static_assert(std::is_pod::value); + return sizeof(pod); +} + +inline size_t file_size(char const* filename) { + std::ifstream is(filename, std::ios::binary | std::ios::ate); + if (!is.good()) { + throw std::runtime_error( + "Error in opening binary " + "file."); + } + size_t bytes = (size_t) is.tellg(); + is.close(); + return bytes; +} + +template +uint64_t words_for(uint64_t bits) { + uint64_t word_bits = sizeof(WordType) * 8; + return (bits + word_bits - 1) / word_bits; +} + +template +inline void do_not_optimize_away(T&& value) { + asm volatile("" : "+r"(value)); +} + +inline uint64_t maxrss_in_bytes() { + struct rusage ru; + if (getrusage(RUSAGE_SELF, &ru) == 0) { + // NOTE: ru_maxrss is in kilobytes on Linux, but not on Apple... 
+#ifdef __APPLE__ + return ru.ru_maxrss; +#endif + return ru.ru_maxrss * 1000; + } + return 0; +} + +template +void load_pod(std::istream& is, T& val) { + static_assert(std::is_pod::value); + is.read(reinterpret_cast(&val), sizeof(T)); +} + +template +void load_vec(std::istream& is, std::vector& vec) { + size_t n; + load_pod(is, n); + vec.resize(n); + is.read(reinterpret_cast(vec.data()), + static_cast(sizeof(T) * n)); +} + +template +void save_pod(std::ostream& os, T const& val) { + static_assert(std::is_pod::value); + os.write(reinterpret_cast(&val), sizeof(T)); +} + +template +void save_vec(std::ostream& os, std::vector const& vec) { + static_assert(std::is_pod::value); + size_t n = vec.size(); + save_pod(os, n); + os.write(reinterpret_cast(vec.data()), + static_cast(sizeof(T) * n)); +} + +template +struct timer { + void start() { m_start = ClockType::now(); } + + void stop() { + m_stop = ClockType::now(); + auto elapsed = std::chrono::duration_cast(m_stop - m_start); + m_timings.push_back(elapsed.count()); + } + + size_t runs() const { return m_timings.size(); } + + void reset() { m_timings.clear(); } + + double min() const { + return *std::min_element(m_timings.begin(), m_timings.end()); + } + + double max() const { + return *std::max_element(m_timings.begin(), m_timings.end()); + } + + void discard_first() { + if (runs()) { + m_timings.erase(m_timings.begin()); + } + } + + void discard_min() { + if (runs() > 1) { + m_timings.erase(std::min_element(m_timings.begin(), m_timings.end())); + } + } + + void discard_max() { + if (runs() > 1) { + m_timings.erase(std::max_element(m_timings.begin(), m_timings.end())); + } + } + + double elapsed() { + return std::accumulate(m_timings.begin(), m_timings.end(), 0.0); + } + + double average() { return elapsed() / runs(); } + + private: + typename ClockType::time_point m_start; + typename ClockType::time_point m_stop; + std::vector m_timings; +}; + +typedef std::chrono::high_resolution_clock clock_type; +typedef 
std::chrono::microseconds duration_type; +typedef timer timer_type; + +inline unsigned get_random_seed() { + return std::chrono::system_clock::now().time_since_epoch().count(); +} + +template +struct uniform_int_rng { + uniform_int_rng(IntType from, IntType to, unsigned seed = 13) + : m_rng(seed), m_distr(from, to) {} + + IntType gen() { return m_distr(m_rng); } + + private: + std::mt19937_64 m_rng; + std::uniform_int_distribution m_distr; +}; + +struct loader { + loader(char const* filename) + : m_num_bytes_pods(0), + m_num_bytes_vecs_of_pods(0), + m_is(filename, std::ios::binary) { + if (!m_is.good()) { + throw std::runtime_error( + "Error in opening binary " + "file."); + } + } + + ~loader() { m_is.close(); } + + template + void visit(T& val) { +#if __cplusplus >= 201703L + if constexpr (std::is_pod::value) { +#else + if (std::is_pod::value) { +#endif + load_pod(m_is, val); + m_num_bytes_pods += pod_bytes(val); + } else { + val.visit(*this); + } + } + + template + void visit(std::vector& vec) { + size_t n; + visit(n); + vec.resize(n); +#if __cplusplus >= 201703L + if constexpr (std::is_pod::value) { +#else + if (std::is_pod::value) { +#endif + m_is.read(reinterpret_cast(vec.data()), + static_cast(sizeof(T) * n)); + m_num_bytes_vecs_of_pods += n * sizeof(T); + } else { + for (auto& v : vec) + visit(v); + } + } + + size_t bytes() { return m_is.tellg(); } + + size_t bytes_pods() { return m_num_bytes_pods; } + + size_t bytes_vecs_of_pods() { return m_num_bytes_vecs_of_pods; } + + private: + size_t m_num_bytes_pods; + size_t m_num_bytes_vecs_of_pods; + std::ifstream m_is; +}; + +struct saver { + saver(char const* filename) : m_os(filename, std::ios::binary) { + if (!m_os.good()) { + throw std::runtime_error( + "Error in opening binary " + "file."); + } + } + + ~saver() { m_os.close(); } + + template + void visit(T& val) { +#if __cplusplus >= 201703L + if constexpr (std::is_pod::value) { +#else + if (std::is_pod::value) { +#endif + save_pod(m_os, val); + } else { + 
val.visit(*this); + } + } + + template + void visit(std::vector& vec) { +#if __cplusplus >= 201703L + if constexpr (std::is_pod::value) { +#else + if (std::is_pod::value) { +#endif + save_vec(m_os, vec); + } else { + size_t n = vec.size(); + visit(n); + for (auto& v : vec) + visit(v); + } + } + + size_t bytes() { return m_os.tellp(); } + + private: + std::ofstream m_os; +}; + +inline std::string demangle(char const* mangled_name) { + size_t len = 0; + int status = 0; + std::unique_ptr ptr( + __cxxabiv1::__cxa_demangle(mangled_name, nullptr, &len, &status), + &std::free); + return ptr.get(); +} + +struct sizer { + sizer(std::string const& root_name = "") + : m_root(0, 0, root_name), m_current(&m_root) {} + + struct node { + node(size_t b, size_t d, std::string const& n = "") + : bytes(b), depth(d), name(n) {} + + size_t bytes; + size_t depth; + std::string name; + std::vector children; + }; + + template + void visit(T& val) { +#if __cplusplus >= 201703L + if constexpr (std::is_pod::value) { +#else + if (std::is_pod::value) { +#endif + node n(pod_bytes(val), m_current->depth + 1, demangle(typeid(T).name())); + m_current->children.push_back(n); + m_current->bytes += n.bytes; + } else { + val.visit(*this); + } + } + + template + void visit(std::vector& vec) { +#if __cplusplus >= 201703L + if constexpr (std::is_pod::value) { +#else + if (std::is_pod::value) { +#endif + node n(vec_bytes(vec), m_current->depth + 1, + demangle(typeid(std::vector).name())); + m_current->children.push_back(n); + m_current->bytes += n.bytes; + } else { + size_t n = vec.size(); + m_current->bytes += pod_bytes(n); + node* parent = m_current; + for (auto& v : vec) { + node n(0, parent->depth + 1, demangle(typeid(T).name())); + parent->children.push_back(n); + m_current = &parent->children.back(); + visit(v); + parent->bytes += m_current->bytes; + } + m_current = parent; + } + } + + template + void print(node const& n, size_t total_bytes, Device& device) const { + auto indent = 
std::string(n.depth * 4, ' '); + device << indent << "'" << n.name << "' - bytes = " << n.bytes << " (" + << n.bytes * 100.0 / total_bytes << "%)" << std::endl; + for (auto const& child : n.children) { + device << indent; + print(child, total_bytes, device); + } + } + + template + void print(Device& device) const { + print(m_root, bytes(), device); + } + + size_t bytes() const { return m_root.bytes; } + + private: + node m_root; + node* m_current; +}; + +template +struct allocator : std::allocator { + typedef T value_type; + + allocator() : m_addr(nullptr) {} + + allocator(T* addr) : m_addr(addr) {} + + T* allocate(size_t n) { + if (m_addr == nullptr) + return std::allocator::allocate(n); + return m_addr; + } + + void deallocate(T* p, size_t n) { + if (m_addr == nullptr) + return std::allocator::deallocate(p, n); + } + + private: + T* m_addr; +}; + +struct contiguous_memory_allocator { + contiguous_memory_allocator() : m_begin(nullptr), m_end(nullptr), m_size(0) {} + + struct visitor { + visitor(uint8_t* begin, size_t size, char const* filename) + : m_begin(begin), + m_end(begin), + m_size(size), + m_is(filename, std::ios::binary) { + if (!m_is.good()) { + throw std::runtime_error( + "Error in opening binary " + "file."); + } + } + + ~visitor() { m_is.close(); } + + template + void visit(T& val) { +#if __cplusplus >= 201703L + if constexpr (std::is_pod::value) { +#else + if (std::is_pod::value) { +#endif + load_pod(m_is, val); + } else { + val.visit(*this); + } + } + + template + void visit(std::vector& vec) { +#if __cplusplus >= 201703L + if constexpr (std::is_pod::value) { +#else + if (std::is_pod::value) { +#endif + vec = std::vector(make_allocator()); + load_vec(m_is, vec); + consume(vec.size() * sizeof(T)); + } else { + size_t n; + visit(n); + vec.resize(n); + for (auto& v : vec) + visit(v); + } + } + + uint8_t* end() { return m_end; } + + size_t size() const { return m_size; } + + size_t allocated() const { + assert(m_end >= m_begin); + return m_end - 
m_begin; + } + + template + allocator make_allocator() { + return allocator(reinterpret_cast(m_end)); + } + + void consume(size_t num_bytes) { + if (m_end == nullptr) + return; + if (allocated() + num_bytes > size()) { + throw std::runtime_error("allocation failed"); + } + m_end += num_bytes; + } + + private: + uint8_t* m_begin; + uint8_t* m_end; + size_t m_size; + std::ifstream m_is; + }; + + template + size_t allocate(T& data_structure, char const* filename) { + loader l(filename); + l.visit(data_structure); + m_size = l.bytes_vecs_of_pods(); + m_begin = reinterpret_cast(malloc(m_size)); + if (m_begin == nullptr) + throw std::runtime_error("malloc failed"); + visitor v(m_begin, m_size, filename); + v.visit(data_structure); + m_end = v.end(); + return l.bytes(); + } + + ~contiguous_memory_allocator() { free(m_begin); } + + uint8_t* begin() { return m_begin; } + + uint8_t* end() { return m_end; } + + size_t size() const { return m_size; } + + private: + uint8_t* m_begin; + uint8_t* m_end; + size_t m_size; +}; + +template +size_t visit(T& data_structure, char const* filename) { + Visitor visitor(filename); + visitor.visit(data_structure); + return visitor.bytes(); +} + +template +size_t load(T& data_structure, char const* filename) { + return visit(data_structure, filename); +} + +template +size_t load_with_custom_memory_allocation(T& data_structure, + char const* filename) { + return data_structure.get_allocator().allocate(data_structure, filename); +} + +template +size_t save(T& data_structure, char const* filename) { + return visit(data_structure, filename); +} + +template +size_t print_size(T& data_structure, Device& device) { + sizer visitor(demangle(typeid(T).name())); + visitor.visit(data_structure); + visitor.print(device); + return visitor.bytes(); +} + +#if defined(__CYGWIN__) || defined(_WIN32) || defined(_WIN64) +#else +struct directory { + struct file_name { + std::string name; + std::string fullpath; + std::string extension; + }; + + ~directory() { + 
for (int i = 0; i != items(); ++i) { + free(m_items_names[i]); + } + free(m_items_names); + } + + directory(std::string const& name) : m_name(name) { + m_n = scandir(m_name.c_str(), &m_items_names, NULL, alphasort); + if (m_n < 0) { + throw std::runtime_error("error during scandir"); + } + } + + std::string const& name() const { return m_name; } + + int items() const { return m_n; } + + struct iterator { + iterator(directory const* d, int i) : m_d(d), m_i(i) {} + + file_name operator*() { + file_name fn; + fn.name = m_d->m_items_names[m_i]->d_name; + fn.fullpath = m_d->name() + "/" + fn.name; + size_t p = fn.name.find_last_of("."); + fn.extension = fn.name.substr(p + 1); + return fn; + } + + void operator++() { ++m_i; } + + bool operator==(iterator const& rhs) const { return m_i == rhs.m_i; } + + bool operator!=(iterator const& rhs) const { return !(*this == rhs); } + + private: + directory const* m_d; + int m_i; + }; + + iterator begin() { return iterator(this, 0); } + + iterator end() { return iterator(this, items()); } + + private: + std::string m_name; + struct dirent** m_items_names; + int m_n; +}; +#endif + +inline bool create_directory(std::string const& name) { + if (mkdir(name.c_str(), 0777) != 0) { + if (errno == EEXIST) { + std::cerr << "directory already exists" << std::endl; + } + return false; + } + return true; +} + +inline bool remove_directory(std::string const& name) { + return rmdir(name.c_str()) == 0; +} + +} // namespace essentials diff --git a/thirdparty/pthash/fastmod/fastmod.h b/thirdparty/pthash/fastmod/fastmod.h new file mode 100644 index 00000000..8ac9743d --- /dev/null +++ b/thirdparty/pthash/fastmod/fastmod.h @@ -0,0 +1,209 @@ +// credits to Daniel Lemire: https://github.com/lemire/fastmod + +#ifndef FASTMOD_H +#define FASTMOD_H + +#ifndef __cplusplus +#include +#include +#else +// In C++ / are irelevant as bool is already a type +#include +#endif + +#ifndef __cplusplus +#define FASTMOD_API static inline +#else +// In C++ we 
mark all the functions inline. +// If C++14 relaxed constexpr is supported we use constexpr so functions +// can be used at compile-time. +#if __cpp_constexpr >= 201304 && !defined(_MSC_VER) +// visual studio does not like constexpr +#define FASTMOD_API constexpr +#define FASTMOD_CONSTEXPR constexpr +#else +#define FASTMOD_API inline +#define FASTMOD_CONSTEXPR +#endif +#endif + +#ifdef _MSC_VER +#include +#endif + +#ifdef __cplusplus +namespace fastmod { +#endif + +#ifdef _MSC_VER + +// __umulh is only available in x64 mode under Visual Studio: don't compile to +// 32-bit! +FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) { + return __umulh(lowbits, d); +} + +#else // _MSC_VER NOT defined + +FASTMOD_API uint64_t mul128_u32(uint64_t lowbits, uint32_t d) { + return ((__uint128_t) lowbits * d) >> 64; +} + +FASTMOD_API uint64_t mul128_s32(uint64_t lowbits, int32_t d) { + return ((__int128_t) lowbits * d) >> 64; +} + +// This is for the 64-bit functions. +FASTMOD_API uint64_t mul128_u64(__uint128_t lowbits, uint64_t d) { + __uint128_t bottom_half = + (lowbits & UINT64_C(0xFFFFFFFFFFFFFFFF)) * d; // Won't overflow + bottom_half >>= + 64; // Only need the top 64 bits, as we'll shift the lower half away; + __uint128_t top_half = (lowbits >> 64) * d; + __uint128_t both_halves = + bottom_half + top_half; // Both halves are already shifted down by 64 + both_halves >>= 64; // Get top half of both_halves + return (uint64_t) both_halves; +} + +#endif // _MSC_VER + +/** + * Unsigned integers. + * Usage: + * uint32_t d = ... ; // divisor, should be non-zero + * uint64_t M = computeM_u32(d); // do once + * fastmod_u32(a,M,d) is a % d for all 32-bit a. 
+ * + **/ + +// M = ceil( (1<<64) / d ), d > 0 +FASTMOD_API uint64_t computeM_u32(uint32_t d) { + return UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1; +} + +// fastmod computes (a % d) given precomputed M +FASTMOD_API uint32_t fastmod_u32(uint32_t a, uint64_t M, uint32_t d) { + uint64_t lowbits = M * a; + return (uint32_t) (mul128_u32(lowbits, d)); +} + +// fastmod computes (a / d) given precomputed M for d>1 +FASTMOD_API uint32_t fastdiv_u32(uint32_t a, uint64_t M) { + return (uint32_t) (mul128_u32(M, a)); +} + +// given precomputed M, checks whether n % d == 0 +FASTMOD_API bool is_divisible(uint32_t n, uint64_t M) { return n * M <= M - 1; } + +/** + * signed integers + * Usage: + * int32_t d = ... ; // should be non-zero and between [-2147483647,2147483647] + * int32_t positive_d = d < 0 ? -d : d; // absolute value + * uint64_t M = computeM_s32(d); // do once + * fastmod_s32(a,M,positive_d) is a % d for all 32-bit a. + **/ + +// M = floor( (1<<64) / d ) + 1 +// you must have that d is different from 0 and -2147483648 +// if d = -1 and a = -2147483648, the result is undefined +FASTMOD_API uint64_t computeM_s32(int32_t d) { + if (d < 0) + d = -d; + return UINT64_C(0xFFFFFFFFFFFFFFFF) / d + 1 + ((d & (d - 1)) == 0 ? 1 : 0); +} + +// fastmod computes (a % d) given precomputed M, +// you should pass the absolute value of d +FASTMOD_API int32_t fastmod_s32(int32_t a, uint64_t M, int32_t positive_d) { + uint64_t lowbits = M * a; + int32_t highbits = mul128_u32(lowbits, positive_d); + return highbits - ((positive_d - 1) & (a >> 31)); +} + +#ifndef _MSC_VER +// fastmod computes (a / d) given precomputed M, assumes that d must not +// be one of -1, 1, or -2147483648 +FASTMOD_API int32_t fastdiv_s32(int32_t a, uint64_t M, int32_t d) { + uint64_t highbits = mul128_s32(M, a); + highbits += (a < 0 ? 1 : 0); + if (d < 0) + return -(int32_t) (highbits); + return (int32_t) (highbits); +} + +// What follows is the 64-bit functions. 
+// They are currently not supported on Visual Studio +// due to the lack of a mul128_u64 function. +// They may not be faster than what the compiler +// can produce. + +FASTMOD_API __uint128_t computeM_u64(uint64_t d) { + // what follows is just ((__uint128_t)0 - 1) / d) + 1 spelled out + __uint128_t M = UINT64_C(0xFFFFFFFFFFFFFFFF); + M <<= 64; + M |= UINT64_C(0xFFFFFFFFFFFFFFFF); + M /= d; + M += 1; + return M; +} + +FASTMOD_API __uint128_t computeM_s64(int64_t d) { + if (d < 0) + d = -d; + __uint128_t M = UINT64_C(0xFFFFFFFFFFFFFFFF); + M <<= 64; + M |= UINT64_C(0xFFFFFFFFFFFFFFFF); + M /= d; + M += 1; + M += ((d & (d - 1)) == 0 ? 1 : 0); + return M; +} + +FASTMOD_API uint64_t fastmod_u64(uint64_t a, __uint128_t M, uint64_t d) { + __uint128_t lowbits = M * a; + return mul128_u64(lowbits, d); +} + +FASTMOD_API uint64_t fastdiv_u64(uint64_t a, __uint128_t M) { + return mul128_u64(M, a); +} + +// End of the 64-bit functions + +#endif // #ifndef _MSC_VER + +#ifdef __cplusplus + +template +FASTMOD_API uint32_t fastmod(uint32_t x) { + FASTMOD_CONSTEXPR uint64_t v = computeM_u32(d); + return fastmod_u32(x, v, d); +} +template +FASTMOD_API uint32_t fastdiv(uint32_t x) { + FASTMOD_CONSTEXPR uint64_t v = computeM_u32(d); + return fastdiv_u32(x, v); +} +template +FASTMOD_API int32_t fastmod(int32_t x) { + FASTMOD_CONSTEXPR uint64_t v = computeM_s32(d); + return fastmod_s32(x, v, d); +} +template +FASTMOD_API int32_t fastdiv(int32_t x) { + FASTMOD_CONSTEXPR uint64_t v = computeM_s32(d); + return fastdiv_s32(x, v, d); +} + +} // fastmod +#endif + +// There's no reason to polute the global scope with this macro once its use +// ends This won't create any problems as the preprocessor will have done its +// thing once it reaches this point +#undef FASTMOD_API +#undef FASTMOD_CONSTEXPR + +#endif // FASTMOD_H \ No newline at end of file diff --git a/thirdparty/pthash/mm_file/mm_file.hpp b/thirdparty/pthash/mm_file/mm_file.hpp new file mode 100644 index 00000000..c132a194 --- 
#pragma once

#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>  // close(fd), ftruncate

#include <stdexcept>
#include <string>

namespace mm {

namespace advice {
static const int normal = POSIX_MADV_NORMAL;
static const int random = POSIX_MADV_RANDOM;
static const int sequential = POSIX_MADV_SEQUENTIAL;
}  // namespace advice

// RAII wrapper around a memory-mapped file viewed as an array of T.
// Move-only: the mapping and the file descriptor have exactly one owner.
template <typename T>
struct file {
  file() { init(); }

  // Move transfers ownership of the fd and the mapping; the source is
  // reset to the closed state so its destructor is a no-op.
  file(file&& other) noexcept {
    m_fd = other.m_fd;
    m_size = other.m_size;
    m_data = other.m_data;
    other.init();
  }

  // BUGFIX: destructors must never throw (a throw during stack unwinding
  // calls std::terminate). Failures from munmap during teardown are
  // swallowed here; call close() explicitly to observe them.
  ~file() {
    try {
      close();
    } catch (...) {
      // nothing sensible can be done while destroying the object
    }
  }

  file(file const&) = delete;             // non construction-copyable
  file& operator=(file const&) = delete;  // non copyable

  bool is_open() const { return m_fd != -1; }

  // Unmaps and closes the file. Throws std::runtime_error if munmap fails.
  void close() {
    if (is_open()) {
      if (munmap((char*) m_data, m_size) == -1) {
        throw std::runtime_error("munmap failed when closing file");
      }
      ::close(m_fd);
      init();
    }
  }

  // Size of the mapping in bytes.
  size_t bytes() const { return m_size; }

  // Number of whole T elements in the mapping.
  size_t size() const { return m_size / sizeof(T); }

  T* data() const { return m_data; }

  // Minimal forward iterator over the mapped array.
  struct iterator {
    iterator(T* addr, size_t offset = 0) : m_ptr(addr + offset) {}

    T operator*() { return *m_ptr; }  // yields a copy of the element

    void operator++() { ++m_ptr; }

    bool operator==(iterator const& rhs) const { return m_ptr == rhs.m_ptr; }

    bool operator!=(iterator const& rhs) const { return !((*this) == rhs); }

   private:
    T* m_ptr;
  };

  iterator begin() const { return iterator(m_data); }

  iterator end() const { return iterator(m_data, size()); }

 protected:
  int m_fd;
  size_t m_size;
  T* m_data;

  // Resets to the closed state (does not release resources).
  void init() {
    m_fd = -1;
    m_size = 0;
    m_data = nullptr;
  }

  void check_fd() {
    if (m_fd == -1)
      throw std::runtime_error("cannot open file");
  }
};

// Maps [0, size) bytes of fd with the given protection flags.
// Throws std::runtime_error on failure.
template <typename Pointer>
Pointer mmap(int fd, size_t size, int prot) {
  static const size_t offset = 0;
  Pointer p =
      static_cast<Pointer>(::mmap(NULL, size, prot, MAP_SHARED, fd, offset));
  if (p == MAP_FAILED)
    throw std::runtime_error("mmap failed");
  return p;
}

// Read-only view of an existing file as an array of T const.
template <typename T>
struct file_source : public file<T const> {
  typedef file<T const> base;

  file_source() {}

  file_source(std::string const& path, int adv = advice::normal) {
    open(path, adv);
  }

  void open(std::string const& path, int adv = advice::normal) {
    base::m_fd = ::open(path.c_str(), O_RDONLY);
    base::check_fd();
    struct stat fs;
    if (fstat(base::m_fd, &fs) == -1) {
      throw std::runtime_error("cannot stat file");
    }
    base::m_size = fs.st_size;
    base::m_data = mmap<T const*>(base::m_fd, base::m_size, PROT_READ);
    if (posix_madvise((void*) base::m_data, base::m_size, adv)) {
      throw std::runtime_error("madvise failed");
    }
  }
};

// Read/write view of a file as an array of T; the two-argument open
// creates/truncates the file to hold exactly n elements.
template <typename T>
struct file_sink : public file<T> {
  typedef file<T> base;

  file_sink() {}

  file_sink(std::string const& path) { open(path); }

  file_sink(std::string const& path, size_t n) { open(path, n); }

  void open(std::string const& path) {
    static const mode_t mode = 0600;  // read/write
    base::m_fd = ::open(path.c_str(), O_RDWR, mode);
    base::check_fd();
    struct stat fs;
    if (fstat(base::m_fd, &fs) == -1) {
      throw std::runtime_error("cannot stat file");
    }
    base::m_size = fs.st_size;
    base::m_data =
        mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
  }

  void open(std::string const& path, size_t n) {
    static const mode_t mode = 0600;  // read/write
    base::m_fd = ::open(path.c_str(), O_RDWR | O_CREAT | O_TRUNC, mode);
    base::check_fd();
    base::m_size = n * sizeof(T);
    // BUGFIX: the ftruncate result was silently ignored; a failed resize
    // (e.g. disk full) previously led to mapping past EOF and SIGBUS on
    // first write.
    if (ftruncate(base::m_fd, base::m_size) == -1) {
      throw std::runtime_error("ftruncate failed when resizing file");
    }
    base::m_data =
        mmap<T*>(base::m_fd, base::m_size, PROT_READ | PROT_WRITE);
  }
};

}  // namespace mm
+ * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "pthash/builders/external_memory_builder_single_phf.hpp" +#include "pthash/builders/internal_memory_builder_single_phf.hpp" +#include "pthash/builders/util.hpp" +#include "pthash/encoders/ef_sequence.hpp" +#include "pthash/utils/bucketers.hpp" + +namespace pthash { + +template +struct single_phf { + typedef Encoder encoder_type; + static constexpr bool minimal = Minimal; + + template + build_timings build_in_internal_memory(Iterator keys, uint64_t n, + build_configuration const& config) { + internal_memory_builder_single_phf builder; + auto timings = builder.build_from_keys(keys, n, config); + timings.encoding_seconds = build(builder, config); + return timings; + } + + template + build_timings build_in_external_memory(Iterator keys, uint64_t n, + build_configuration const& config) { + external_memory_builder_single_phf builder; + auto timings = builder.build_from_keys(keys, n, config); + timings.encoding_seconds = build(builder, config); + return timings; + } + + template + double build(Builder const& builder, build_configuration const&) { + auto start = clock_type::now(); + m_seed = builder.seed(); + m_num_keys = builder.num_keys(); + m_table_size = builder.table_size(); + m_M = fastmod::computeM_u64(m_table_size); + m_bucketer = 
builder.bucketer(); + m_pilots.encode(builder.pilots().data(), m_bucketer.num_buckets()); +#if __cplusplus >= 201703L + if constexpr (Minimal) { +#else + if (Minimal) { +#endif + m_free_slots.encode(builder.free_slots().data(), + m_table_size - m_num_keys); + } + auto stop = clock_type::now(); + return seconds(stop - start); + } + + template + uint64_t operator()(T const& key) const { + auto hash = Hasher::hash(key, m_seed); + return position(hash); + } + + uint64_t position(typename Hasher::hash_type hash) const { + uint64_t bucket = m_bucketer.bucket(hash.first()); + uint64_t pilot = m_pilots.access(bucket); + uint64_t hashed_pilot = default_hash64(pilot, m_seed); + uint64_t p = + fastmod::fastmod_u64(hash.second() ^ hashed_pilot, m_M, m_table_size); +#if __cplusplus >= 201703L + if constexpr (Minimal) { +#else + if (Minimal) { +#endif + if (PTHASH_LIKELY(p < num_keys())) + return p; + return m_free_slots.access(p - num_keys()); + } + return p; + } + + size_t num_bits_for_pilots() const { + return 8 * (sizeof(m_seed) + sizeof(m_num_keys) + sizeof(m_table_size) + + sizeof(m_M)) + + m_bucketer.num_bits() + m_pilots.num_bits(); + } + + size_t num_bits_for_mapper() const { return m_free_slots.num_bits(); } + + size_t num_bits() const { + return num_bits_for_pilots() + num_bits_for_mapper(); + } + + inline uint64_t num_keys() const { return m_num_keys; } + + inline uint64_t table_size() const { return m_table_size; } + + template + void visit(Visitor& visitor) { + visitor.visit(m_seed); + visitor.visit(m_num_keys); + visitor.visit(m_table_size); + visitor.visit(m_M); + visitor.visit(m_bucketer); + visitor.visit(m_pilots); + visitor.visit(m_free_slots); + } + + template + void load(Loader& loader) { + loader.load(m_seed); + loader.load(m_num_keys); + loader.load(m_table_size); + loader.load(m_M); + m_bucketer.load(loader); + m_pilots.load(loader); + m_free_slots.load(loader); + } + + template + void dump(Dumper& dumper) const { + dumper.dump(m_seed); + 
dumper.dump(m_num_keys); + dumper.dump(m_table_size); + dumper.dump(m_M); + m_bucketer.dump(dumper); + m_pilots.dump(dumper); + m_free_slots.dump(dumper); + } + + private: + uint64_t m_seed; + uint64_t m_num_keys; + uint64_t m_table_size; + __uint128_t m_M; + skew_bucketer m_bucketer; + Encoder m_pilots; + ef_sequence m_free_slots; +}; + +} // namespace pthash diff --git a/thirdparty/pthash/utils/bucketers.hpp b/thirdparty/pthash/utils/bucketers.hpp new file mode 100644 index 00000000..3af0ce06 --- /dev/null +++ b/thirdparty/pthash/utils/bucketers.hpp @@ -0,0 +1,92 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. + * + * PTHash: + * The software includes components licensed by Giulio Ermanno Pibiri and + * Roberto Trani, available at https://github.com/jermp/pthash + * + * Licensed under the MIT License (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://opensource.org/licenses/MIT + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "pthash/utils/util.hpp" + +namespace pthash { + +struct skew_bucketer { + skew_bucketer() {} + + void init(uint64_t num_buckets) { + m_num_dense_buckets = 0.3 * num_buckets; + m_num_sparse_buckets = num_buckets - m_num_dense_buckets; + m_M_num_dense_buckets = fastmod::computeM_u64(m_num_dense_buckets); + m_M_num_sparse_buckets = fastmod::computeM_u64(m_num_sparse_buckets); + } + + inline uint64_t bucket(uint64_t hash) const { + static const uint64_t T = UINT64_MAX / 5 * 3; + return (hash < T) ? 
fastmod::fastmod_u64(hash, m_M_num_dense_buckets, + m_num_dense_buckets) + : m_num_dense_buckets + + fastmod::fastmod_u64(hash, m_M_num_sparse_buckets, + m_num_sparse_buckets); + } + + uint64_t num_buckets() const { + return m_num_dense_buckets + m_num_sparse_buckets; + } + + size_t num_bits() const { + return 8 * (sizeof(m_num_dense_buckets) + sizeof(m_num_sparse_buckets) + + sizeof(m_M_num_dense_buckets) + sizeof(m_M_num_sparse_buckets)); + } + + void swap(skew_bucketer& other) { + std::swap(m_num_dense_buckets, other.m_num_dense_buckets); + std::swap(m_num_sparse_buckets, other.m_num_sparse_buckets); + std::swap(m_M_num_dense_buckets, other.m_M_num_dense_buckets); + std::swap(m_M_num_sparse_buckets, other.m_M_num_sparse_buckets); + } + + template + void visit(Visitor& visitor) { + visitor.visit(m_num_dense_buckets); + visitor.visit(m_num_sparse_buckets); + visitor.visit(m_M_num_dense_buckets); + visitor.visit(m_M_num_sparse_buckets); + } + + template + void load(Loader& loader) { + loader.load(m_num_dense_buckets); + loader.load(m_num_sparse_buckets); + loader.load(m_M_num_dense_buckets); + loader.load(m_M_num_sparse_buckets); + } + + template + void dump(Dumper& dumper) const { + dumper.dump(m_num_dense_buckets); + dumper.dump(m_num_sparse_buckets); + dumper.dump(m_M_num_dense_buckets); + dumper.dump(m_M_num_sparse_buckets); + } + + private: + uint64_t m_num_dense_buckets, m_num_sparse_buckets; + __uint128_t m_M_num_dense_buckets, m_M_num_sparse_buckets; +}; + +} // namespace pthash \ No newline at end of file diff --git a/thirdparty/pthash/utils/hasher.hpp b/thirdparty/pthash/utils/hasher.hpp new file mode 100644 index 00000000..9856b3be --- /dev/null +++ b/thirdparty/pthash/utils/hasher.hpp @@ -0,0 +1,188 @@ +/** Copyright 2020-2024 Giulio Ermanno Pibiri and Roberto Trani + * + * The following sets forth attribution notices for third party software. 
#pragma once

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <string>

// See also https://github.com/jermp/bench_hash_functions

namespace pthash {

// Non-owning view over a contiguous range of bytes to be hashed.
struct byte_range {
  uint8_t const* begin;
  uint8_t const* end;
};

/*
    This code is an adaptation from
    https://github.com/aappleby/smhasher/blob/master/src/MurmurHash2.cpp
    by Austin Appleby
*/
inline uint64_t MurmurHash2_64(void const* key, size_t len, uint64_t seed) {
  const uint64_t m = 0xc6a4a7935bd1e995ULL;
  const int r = 47;

  uint64_t h = seed ^ (len * m);

  // BUGFIX/portability: the original read the buffer through a uint64_t*
  // on non-ARM targets, which is undefined behavior for unaligned or
  // aliased input (std::string data has no 8-byte alignment guarantee).
  // memcpy is always well-defined, compiles to the same single load on
  // mainstream compilers, and produces identical hash values.
  const unsigned char* data = static_cast<const unsigned char*>(key);
  const unsigned char* end = data + (len / 8) * sizeof(uint64_t);

  while (data != end) {
    uint64_t k;
    std::memcpy(&k, data, sizeof(k));
    data += sizeof(k);

    k *= m;
    k ^= k >> r;
    k *= m;

    h ^= k;
    h *= m;
  }

  // Tail: the remaining 0..7 bytes, mixed in via deliberate fallthrough.
  const unsigned char* data2 = data;

  switch (len & 7) {
    // fall through
    case 7:
      h ^= uint64_t(data2[6]) << 48;
    // fall through
    case 6:
      h ^= uint64_t(data2[5]) << 40;
    // fall through
    case 5:
      h ^= uint64_t(data2[4]) << 32;
    // fall through
    case 4:
      h ^= uint64_t(data2[3]) << 24;
    // fall through
    case 3:
      h ^= uint64_t(data2[2]) << 16;
    // fall through
    case 2:
      h ^= uint64_t(data2[1]) << 8;
    // fall through
    case 1:
      h ^= uint64_t(data2[0]);
      h *= m;
  };

  h ^= h >> r;
  h *= m;
  h ^= h >> r;

  return h;
}

// Hashes a single 64-bit value with the given seed.
inline uint64_t default_hash64(uint64_t val, uint64_t seed) {
  return MurmurHash2_64(&val, sizeof(uint64_t), seed);
}

// 64-bit hash wrapper; first() and second() both expose the same value.
struct hash64 {
  hash64() {}
  hash64(uint64_t hash) : m_hash(hash) {}

  inline uint64_t first() const { return m_hash; }

  inline uint64_t second() const { return m_hash; }

  inline uint64_t mix() const {
    // From:
    // http://zimbry.blogspot.com/2011/09/better-bit-mixing-improving-on.html
    // 13-th variant
    uint64_t z = m_hash;
    z = (z ^ (z >> 30)) * 0xbf58476d1ce4e5b9;
    z = (z ^ (z >> 27)) * 0x94d049bb133111eb;
    return z ^ (z >> 31);
  }

 private:
  uint64_t m_hash;
};

// 128-bit hash as two independent 64-bit halves.
struct hash128 {
  hash128() {}
  hash128(uint64_t first, uint64_t second) : m_first(first), m_second(second) {}

  inline uint64_t first() const { return m_first; }

  inline uint64_t second() const { return m_second; }

  inline uint64_t mix() const { return m_first ^ m_second; }

 private:
  uint64_t m_first, m_second;
};

// Hasher policy producing 64-bit hashes.
struct murmurhash2_64 {
  typedef hash64 hash_type;

  // generic range of bytes
  static inline hash64 hash(byte_range range, uint64_t seed) {
    return MurmurHash2_64(range.begin, range.end - range.begin, seed);
  }

  // specialization for std::string
  static inline hash64 hash(std::string const& val, uint64_t seed) {
    return MurmurHash2_64(val.data(), val.size(), seed);
  }

  // specialization for uint64_t
  static inline hash64 hash(uint64_t val, uint64_t seed) {
    return MurmurHash2_64(reinterpret_cast<void const*>(&val), sizeof(val),
                          seed);
  }
};

// Hasher policy producing 128-bit hashes by hashing twice, with seed and
// the bitwise complement of seed.
struct murmurhash2_128 {
  typedef hash128 hash_type;

  // generic range of bytes
  static inline hash128 hash(byte_range range, uint64_t seed) {
    return {MurmurHash2_64(range.begin, range.end - range.begin, seed),
            MurmurHash2_64(range.begin, range.end - range.begin, ~seed)};
  }

  // specialization for std::string
  static inline hash128 hash(std::string const& val, uint64_t seed) {
    return {MurmurHash2_64(val.data(), val.size(), seed),
            MurmurHash2_64(val.data(), val.size(), ~seed)};
  }

  // specialization for uint64_t
  static inline hash128 hash(uint64_t val, uint64_t seed) {
    return {
        MurmurHash2_64(reinterpret_cast<void const*>(&val), sizeof(val), seed),
        MurmurHash2_64(reinterpret_cast<void const*>(&val), sizeof(val),
                       ~seed)};
  }
};

}  // namespace pthash
#pragma once

#include <unistd.h>

#include <cassert>
#include <chrono>
#include <cstdint>
#include <iostream>
#include <random>
#include <string>

// These headers are re-exported for the rest of pthash; they are guarded
// so this header also parses standalone (e.g. under tooling or isolated
// compilation).
#if defined(__has_include)
#if __has_include("pthash/essentials/essentials.hpp")
#include "pthash/essentials/essentials.hpp"
#include "pthash/fastmod/fastmod.h"
#endif
#endif

#define PTHASH_LIKELY(expr) __builtin_expect((bool) (expr), true)

namespace pthash {

typedef std::chrono::high_resolution_clock clock_type;

namespace constants {
static const uint64_t available_ram =
    sysconf(_SC_PAGESIZE) * sysconf(_SC_PHYS_PAGES);
static const uint64_t invalid_seed = uint64_t(-1);
static const uint64_t invalid_num_buckets = uint64_t(-1);
static const std::string default_tmp_dirname(".");
}  // namespace constants

// Returns a pseudo-random 64-bit value seeded from the wall clock
// (NOT deterministic across runs).
inline uint64_t random_value() {
  unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
  std::mt19937_64 rng(seed);
  return rng();
}

// Converts any std::chrono duration to fractional seconds with
// millisecond resolution.
template <typename DurationType>
double seconds(DurationType const& d) {
  return static_cast<double>(
             std::chrono::duration_cast<std::chrono::milliseconds>(d)
                 .count()) /
         1000;  // better resolution than std::chrono::seconds
}

// Prints coarse-grained progress ("<prefix><perc>%<suffix>") to stdout,
// rewriting the same line via '\r'. With enable == false it only counts
// events and never prints.
struct progress_logger {
  progress_logger(uint64_t total_events, std::string const& prefix = "",
                  std::string const& suffix = "", bool enable = true)
      : m_total_events(total_events),
        m_prefix(prefix),
        m_suffix(suffix),
        m_logged_events(0) {
    // TODO: improve the computation of log_step using timings !
    uint64_t perc_fraction = (total_events >= 100000000) ? 100 : 20;
    m_log_step = (total_events + perc_fraction - 1) / perc_fraction;
    // uint64_t(-1) acts as the "logging disabled" sentinel.
    m_next_event_to_log = static_cast<uint64_t>(-1);
    if (enable) {
      m_next_event_to_log = m_log_step;
      update(false);
    }
  }

  // Records one event; prints when the next logging threshold is crossed.
  inline void log() {
    if (++m_logged_events >= m_next_event_to_log) {
      update(false);
      m_next_event_to_log += m_log_step;
      // the following ensures the last update on 100%
      if (m_next_event_to_log > m_total_events)
        m_next_event_to_log = m_total_events;
    }
  }

  // Emits the final "100%" line (with newline) if logging is enabled.
  void finalize() {
    if (m_next_event_to_log != static_cast<uint64_t>(-1)) {
      assert(m_next_event_to_log == m_total_events);
      assert(m_logged_events == m_total_events);
      update(true);
    }
  }

  uint64_t total_events() const { return m_total_events; }

  uint64_t logged_events() const { return m_logged_events; }

 private:
  inline void update(bool final) const {
    // BUGFIX: guard against total_events == 0, which previously divided
    // by zero (SIGFPE) in this very constructor-triggered update. An
    // empty workload is reported as 100% complete.
    uint64_t perc =
        m_total_events ? (100 * m_logged_events / m_total_events) : 100;
    std::cout << "\r" << m_prefix << perc << "%" << m_suffix;
    if (final) {
      std::cout << std::endl;
    } else {
      std::cout << std::flush;
    }
  }

  const uint64_t m_total_events;
  const std::string m_prefix;
  const std::string m_suffix;
  uint64_t m_logged_events;
  uint64_t m_log_step;
  uint64_t m_next_event_to_log;
};

}  // namespace pthash