-
Notifications
You must be signed in to change notification settings - Fork 615
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Executor 2.0: Stream assignment #5602
Merged
Merged
Changes from 6 commits
Commits
Show all changes
7 commits
Select commit
Hold shift + click to select a range
538eb3b
Add ExecNode stream assignment algorithms and tests.
mzient 6aebb84
Add workarounds for the temporary absence of exec2.h.
mzient 01e9a1c
Return stream ids to free pool when skipping.
mzient ba0fb15
Add tests for simple assignment policies. Improve comments.
mzient c199977
Minimize assignment with many-to-many graphs.
mzient 771d840
Improve robustness.
mzient bd304b7
Remove PerOperator policy.
mzient File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,376 @@ | ||
// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// http://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
#ifndef DALI_PIPELINE_EXECUTOR_EXECUTOR2_STREAM_ASSIGNMENT_H_ | ||
#define DALI_PIPELINE_EXECUTOR_EXECUTOR2_STREAM_ASSIGNMENT_H_ | ||
|
||
#include <algorithm>
#include <cassert>
#include <functional>
#include <iostream>
#include <optional>
#include <queue>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>
#include "dali/pipeline/graph/graph_util.h"
#include "dali/pipeline/executor/executor2/exec_graph.h"
// TODO(michalz): This is here for review process only. Remove when exec2.h is available
// #include "dali/pipeline/executor/executor2/exec2.h"
#include "dali/pipeline/graph/op_graph2.h"
|
||
namespace dali { | ||
namespace exec2 { | ||
|
||
// TODO(michalz): This is here for review process only. Remove when exec2.h is available
/** Determines how CUDA streams are distributed among the operators in the graph. */
enum class StreamPolicy : int {
  Single,      //!< There's just one stream that's used by all operators
  PerBackend,  //!< Operators are scheduled on a stream specific to their backend (mixed or GPU)
  PerOperator  //!< Independent operators are executed on separate streams.

  // TODO(michalz): Check if this is legal with existing operator implementations - likely not
  // PerIteration,  //!< Streams are cycled on a per-iteration basis
};
|
||
|
||
/** Maps graph nodes to CUDA stream indices according to the selected policy.
 *
 * Each specialization provides:
 *  - operator[](const ExecNode *) -> std::optional<int>  (nullopt = no stream needed)
 *  - NumStreams() -> int  (total number of streams the assignment uses)
 */
template <StreamPolicy policy>
class StreamAssignment;
|
||
inline bool NeedsStream(const ExecNode *node) { | ||
if (node->is_pipeline_output) { | ||
for (auto &pipe_out : node->inputs) { | ||
if (pipe_out->device == StorageDevice::GPU) | ||
return true; | ||
} | ||
} else { | ||
return node->backend != OpType::CPU; | ||
} | ||
return false; | ||
} | ||
|
||
/** Determines the backend type of a node for the purpose of stream assignment.
 *
 * For a regular operator this is simply the node's backend.
 * The pipeline output node is classified as GPU if any GPU-resident output is produced by a
 * GPU operator, MIXED if GPU-resident outputs come only from mixed operators, CPU otherwise.
 */
inline OpType NodeType(const ExecNode *node) {
  if (!node->is_pipeline_output)
    return node->backend;

  OpType result = OpType::CPU;
  for (auto &pipe_out : node->inputs) {
    if (pipe_out->device != StorageDevice::GPU)
      continue;
    OpType producer_type = pipe_out->producer->backend;
    if (producer_type == OpType::GPU)
      return OpType::GPU;  // strongest classification - no need to look further
    if (producer_type == OpType::MIXED)
      result = OpType::MIXED;
  }
  return result;
}
|
||
/** A trivial stream policy, with just one stream shared by all non-CPU operaotrs. */ | ||
template <> | ||
class StreamAssignment<StreamPolicy::Single> { | ||
public: | ||
explicit StreamAssignment(ExecGraph &graph) { | ||
for (auto &node : graph.Nodes()) { | ||
if (NeedsStream(&node)) { | ||
needs_stream_ = true; | ||
break; | ||
} | ||
} | ||
} | ||
|
||
std::optional<int> operator[](const ExecNode *node) const { | ||
if (NeedsStream(node)) | ||
return 0; | ||
else | ||
return std::nullopt; | ||
} | ||
|
||
int NumStreams() const { | ||
return needs_stream_ ? 1 : 0; | ||
} | ||
|
||
private: | ||
bool needs_stream_ = false; | ||
}; | ||
|
||
|
||
/** A simple stream policy where all mixed and GPU operators share their respective streams.
 *
 * In this policy there are 0..2 streams, depending on the number of mixed and GPU nodes:
 * 0 - only CPU nodes
 * 1 - there are some mixed or some GPU nodes, but not both
 * 2 - there are both mixed and GPU nodes present.
 */
template <>
class StreamAssignment<StreamPolicy::PerBackend> {
 public:
  explicit StreamAssignment(ExecGraph &graph) {
    // Scan the graph to discover which backends are present; stop as soon as both are seen.
    for (auto &node : graph.Nodes()) {
      switch (NodeType(&node)) {
        case OpType::GPU:
          has_gpu_ = true;
          if (has_mixed_)
            return;  // we already have both, nothing more can happen
          break;
        case OpType::MIXED:
          has_mixed_ = true;
          if (has_gpu_)
            return;  // we already have both, nothing more can happen
          break;
        default:
          break;
      }
    }
  }

  /** Returns a stream index for a non-CPU operator.
   *
   * If the node is a Mixed node, it gets stream index 0.
   * If the node is a GPU node it gets stream index 1 if there are any mixed nodes, otherwise
   * the only stream is the GPU stream and the returned index is 0.
   */
  std::optional<int> operator[](const ExecNode *node) const {
    switch (NodeType(node)) {
      case OpType::CPU:
        return std::nullopt;
      case OpType::GPU:
        return has_mixed_ ? 1 : 0;
      case OpType::MIXED:
        return 0;
      default:
        assert(false && "Unreachable");
        return std::nullopt;
    }
  }

  /** Number of distinct streams used: one per backend that's present (bool + bool). */
  int NumStreams() const {
    return has_gpu_ + has_mixed_;
  }

 private:
  bool has_gpu_ = false;
  bool has_mixed_ = false;
};
|
||
/** Implements per-operator stream assignment. | ||
* | ||
* This policy implements stream assingment such that independent GPU/Mixed operators get | ||
* separate streams. When there's a dependency then one dependent operator shares the stream of | ||
* its predecessor. | ||
* | ||
* Example - numbers are stream indices, "X" means no stream, "s" means synchronization | ||
* ``` | ||
* CPU(X) ---- GPU(0) --- GPU(0) -- GPU(0) -- output 0 | ||
* \ s | ||
* \ / | ||
* ----- GPU(1) ---- | ||
* \ | ||
* \ | ||
* CPU(X) --- GPU(2) ----s GPU(1) ----------s output 1 | ||
* ``` | ||
*/ | ||
template <> | ||
class StreamAssignment<StreamPolicy::PerOperator> { | ||
public: | ||
explicit StreamAssignment(ExecGraph &graph) { | ||
Assign(graph); | ||
} | ||
|
||
std::optional<int> operator[](const ExecNode *node) const { | ||
auto it = node_ids_.find(node); | ||
assert(it != node_ids_.end()); | ||
return stream_assignment_[it->second]; | ||
} | ||
|
||
/** Gets the total number of streams required to run independent operators on separate streams. */ | ||
int NumStreams() const { | ||
return total_streams_; | ||
} | ||
|
||
private: | ||
void Assign(ExecGraph &graph) { | ||
// pre-fill the id pool with sequential numbers | ||
int num_nodes = graph.Nodes().size(); | ||
for (int i = 0; i < num_nodes; i++) { | ||
free_stream_ids_.insert(i); | ||
} | ||
|
||
// the nodes in the graph must be sorted topologically | ||
sorted_nodes_.reserve(num_nodes); | ||
stream_assignment_.resize(num_nodes); | ||
for (auto &node : graph.Nodes()) { | ||
int idx = sorted_nodes_.size(); | ||
sorted_nodes_.push_back(&node); | ||
node_ids_[&node] = idx; | ||
if (node.inputs.empty()) { | ||
queue_.push({ node_ids_[&node], AssignStreamId(&node).first.value_or(kInvalidStreamIdx) }); | ||
} else { | ||
for (auto &inp : node.inputs) { | ||
assert(node_ids_.count(inp->producer) >= 0 && "Nodes must be topologically sorted."); | ||
} | ||
} | ||
} | ||
|
||
assert(static_cast<size_t>(num_nodes) == sorted_nodes_.size()); | ||
|
||
FindGPUContributors(graph); | ||
|
||
graph::ClearVisitMarkers(graph.Nodes()); | ||
Traverse(); | ||
ClearCPUStreams(); | ||
total_streams_ = CalcNumStreams(); | ||
} | ||
|
||
void Traverse() { | ||
while (!queue_.empty()) { | ||
// PrintQueue(); /* uncomment for debugging */ | ||
auto [idx, stream_idx] = queue_.top(); | ||
std::optional<int> stream_id; | ||
if (stream_idx != kInvalidStreamIdx) | ||
stream_id = stream_idx; | ||
|
||
queue_.pop(); | ||
auto *node = sorted_nodes_[idx]; | ||
// This will be true for nodes which have no outputs or which don't contribute to any | ||
// GPU nodes. | ||
bool keep_stream_id = stream_id.has_value(); | ||
|
||
if (stream_id.has_value()) | ||
free_stream_ids_.insert(*stream_id); | ||
|
||
graph::Visit v(node); | ||
if (!v) { | ||
assert(stream_assignment_[idx].value_or(kInvalidStreamIdx) <= stream_idx); | ||
continue; // we've been here already - skip | ||
} | ||
|
||
stream_assignment_[idx] = stream_id; | ||
|
||
for (auto &output_desc : node->outputs) { | ||
for (auto *out : output_desc.consumers) { | ||
auto [out_stream_id, is_new] = AssignStreamId(out->consumer, stream_id); | ||
if (out_stream_id == stream_id) | ||
keep_stream_id = false; | ||
if (is_new) | ||
queue_.push({node_ids_[out->consumer], out_stream_id.value_or(kInvalidStreamIdx)}); | ||
} | ||
} | ||
if (stream_id.has_value() && keep_stream_id) | ||
free_stream_ids_.erase(*stream_id); | ||
} | ||
} | ||
|
||
void ClearCPUStreams() { | ||
for (int i = 0, n = sorted_nodes_.size(); i < n; i++) { | ||
if (!NeedsStream(sorted_nodes_[i])) | ||
stream_assignment_[i] = std::nullopt; | ||
} | ||
} | ||
|
||
int CalcNumStreams() { | ||
int max = -1; | ||
for (auto a : stream_assignment_) { | ||
if (a.has_value()) | ||
max = std::max(max, *a); | ||
} | ||
return max + 1; | ||
} | ||
|
||
void PrintQueue(std::ostream &os = std::cout) { | ||
auto q2 = queue_; | ||
while (!q2.empty()) { | ||
auto [idx, stream_idx] = q2.top(); | ||
q2.pop(); | ||
auto *node = sorted_nodes_[idx]; | ||
if (!node->instance_name.empty()) | ||
os << node->instance_name; | ||
else if (node->is_pipeline_output) | ||
os << "<output>"; | ||
else | ||
os << "[" << idx << "]"; | ||
os << "("; | ||
if (stream_idx != kInvalidStreamIdx) | ||
os << stream_idx; | ||
else | ||
os << "none"; | ||
os << ") "; | ||
} | ||
os << "\n"; | ||
} | ||
|
||
std::pair<std::optional<int>, bool> AssignStreamId( | ||
const ExecNode *node, | ||
std::optional<int> prev_stream_id = std::nullopt) { | ||
// If the preceding node had a stream, then we have to pass it on through CPU nodes if | ||
// there are any GPU nodes down the graph. | ||
// If the preceding node didn't have a stream, then we only need a stream if the current | ||
// node needs one. | ||
bool needs_stream = prev_stream_id.has_value() | ||
? gpu_contributors_.count(node) != 0 | ||
: NeedsStream(node); | ||
if (needs_stream) { | ||
assert(!free_stream_ids_.empty()); | ||
auto b = free_stream_ids_.begin(); | ||
int next_free = *b; | ||
auto ¤t = stream_assignment_[node_ids_[node]]; | ||
if (!current.has_value() || *current > next_free) { | ||
current = next_free; | ||
free_stream_ids_.erase(b); | ||
return { next_free, true }; | ||
} else { | ||
return { *current, false }; | ||
} | ||
} else { | ||
return { std::nullopt, true }; | ||
} | ||
} | ||
|
||
void FindGPUContributors(ExecGraph &graph) { | ||
// Run DFS, output to input, and find nodes which contribute to any node that requires a stream | ||
graph::ClearVisitMarkers(graph.Nodes()); | ||
for (auto it = graph.Nodes().rbegin(); it != graph.Nodes().rend(); ++it) { | ||
auto &node = *it; | ||
FindGPUContributors(&node, false); | ||
} | ||
} | ||
|
||
void FindGPUContributors(const ExecNode *node, bool is_gpu_contributor) { | ||
graph::Visit v(node); | ||
if (!v) | ||
return; | ||
if (!is_gpu_contributor) | ||
is_gpu_contributor = NeedsStream(node); | ||
if (is_gpu_contributor) | ||
gpu_contributors_.insert(node); | ||
for (auto *inp : node->inputs) | ||
FindGPUContributors(inp->producer, is_gpu_contributor); | ||
} | ||
|
||
|
||
static constexpr int kInvalidStreamIdx = 0x7fffffff; | ||
std::vector<std::optional<int>> stream_assignment_; | ||
int total_streams_ = 0; | ||
std::unordered_map<const ExecNode *, int> node_ids_; // topologically sorted nodes | ||
std::set<const ExecNode *> gpu_contributors_; | ||
std::vector<const ExecNode *> sorted_nodes_; | ||
std::set<int> free_stream_ids_; | ||
std::priority_queue<std::pair<int, int>, std::vector<std::pair<int, int>>, std::greater<>> queue_; | ||
}; | ||
|
||
} // namespace exec2 | ||
} // namespace dali | ||
|
||
#endif // DALI_PIPELINE_EXECUTOR_EXECUTOR2_STREAM_ASSIGNMENT_H_ |
Oops, something went wrong.
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As for the minimality of the assignment, what happens for the bipartite graphs? Take 2x2 example. The left A, B ops contribute to both right C, D ops. Assuming that the output edges are visited in the topological order (i.e. C comes before D in A and B lists), can we end up with A0, B1, C0, D2 assignment?
If so, for larger bipartite graphs, it seems the need for free_stream_ids can exceed the number of nodes.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed. The algorithm has been reworked and now the assignment is made as soon as a node is pushed to the queue. It's never re-pushed with a worse index.