Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Executor 2.0: Stream assignment #5602

Merged
merged 7 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
376 changes: 376 additions & 0 deletions dali/pipeline/executor/executor2/stream_assignment.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,376 @@
// Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef DALI_PIPELINE_EXECUTOR_EXECUTOR2_STREAM_ASSIGNMENT_H_
#define DALI_PIPELINE_EXECUTOR_EXECUTOR2_STREAM_ASSIGNMENT_H_

#include <algorithm>
#include <cassert>
#include <functional>
#include <iostream>
#include <optional>
#include <queue>
#include <set>
#include <unordered_map>
#include <utility>
#include <vector>
#include "dali/pipeline/executor/executor2/exec_graph.h"
// TODO(michalz): This is here for review process only. Remove when exec2.h is available
// #include "dali/pipeline/executor/executor2/exec2.h"
#include "dali/pipeline/graph/graph_util.h"
#include "dali/pipeline/graph/op_graph2.h"

namespace dali {
namespace exec2 {

// TODO(michalz): This is here for review process only. Remove when exec2.h is available
/** Selects how CUDA streams are distributed among the operators in an executor graph. */
enum class StreamPolicy : int {
  Single,      //!< There's just one stream that's used by all operators
  PerBackend,  //!< Operators are scheduled on a stream specific to their backend (mixed or GPU)
  PerOperator  //!< Independent operators are executed on separate streams.

  // TODO(michalz): Check if this is legal with existing operator implementations - likely not
  // PerIteration,  //!< Streams are cycled on a per-iteration basis
};


template <StreamPolicy policy>
class StreamAssignment;

/** Checks whether a node requires a CUDA stream to run.
 *
 * Regular non-CPU operators always need a stream. The pipeline-output node needs a stream
 * only when at least one of its inputs resides on the GPU.
 */
inline bool NeedsStream(const ExecNode *node) {
  if (!node->is_pipeline_output)
    return node->backend != OpType::CPU;
  for (auto &input : node->inputs) {
    if (input->device == StorageDevice::GPU)
      return true;
  }
  return false;
}

/** Returns the effective backend of a node for the purpose of stream assignment.
 *
 * For regular operators this is simply the node's backend. The pipeline-output node is
 * classified by its GPU inputs: GPU if any GPU input comes from a GPU operator, otherwise
 * MIXED if any comes from a mixed operator, otherwise CPU.
 */
inline OpType NodeType(const ExecNode *node) {
  if (!node->is_pipeline_output)
    return node->backend;

  OpType result = OpType::CPU;
  for (auto &input : node->inputs) {
    if (input->device != StorageDevice::GPU)
      continue;  // CPU inputs don't affect the classification
    OpType producer_type = input->producer->backend;
    if (producer_type == OpType::GPU)
      return OpType::GPU;  // strongest classification - we can stop looking
    if (producer_type == OpType::MIXED)
      result = OpType::MIXED;
  }
  return result;
}

/** A trivial stream policy, with just one stream shared by all non-CPU operators. */
template <>
class StreamAssignment<StreamPolicy::Single> {
 public:
  explicit StreamAssignment(ExecGraph &graph) {
    auto &&nodes = graph.Nodes();
    needs_stream_ = std::any_of(nodes.begin(), nodes.end(),
                                [](auto &node) { return NeedsStream(&node); });
  }

  /** Returns stream index 0 for any node that needs a stream, nullopt otherwise. */
  std::optional<int> operator[](const ExecNode *node) const {
    if (!NeedsStream(node))
      return std::nullopt;
    return 0;
  }

  /** 1 if any node in the graph needs a stream, 0 for a purely-CPU pipeline. */
  int NumStreams() const {
    return needs_stream_ ? 1 : 0;
  }

 private:
  bool needs_stream_ = false;
};


/** A simple stream policy where all mixed and GPU operators share their respective streams.
*
* In this policy there are 0..2 streams, depending on the number of mixed and GPU nodes:
* 0 - only CPU nodes
* 1 - there are some mixed or some GPU nodes, but not both
* 2 - there are both mixed and CPU nodes present.
*/
template <>
class StreamAssignment<StreamPolicy::PerBackend> {
public:
explicit StreamAssignment(ExecGraph &graph) {
for (auto &node : graph.Nodes()) {
switch (NodeType(&node)) {
case OpType::GPU:
has_gpu_ = true;
if (has_mixed_)
return; // we already have both, nothing more can happen
break;
case OpType::MIXED:
has_mixed_ = true;
if (has_gpu_)
return; // we already have both, nothing more can happen
break;
default:
break;
}
}
}

/** Returns a stream index for a non-CPU operator.
*
* If the node is a Mixed node, it gets stream index 0.
* If the node is a GPU node it gets stream index 1 if there are any mixed nodes, otherwise
* the only stream is the GPU stream and the returned index is 0.
*/
std::optional<int> operator[](const ExecNode *node) const {
switch (NodeType(node)) {
case OpType::CPU:
return std::nullopt;
case OpType::GPU:
return has_mixed_ ? 1 : 0;
case OpType::MIXED:
return 0;
default:
assert(false && "Unreachable");
return std::nullopt;
}
}

int NumStreams() const {
return has_gpu_ + has_mixed_;
}

private:
bool has_gpu_ = false;
bool has_mixed_ = false;
};

/** Implements per-operator stream assignment.
*
* This policy implements stream assingment such that independent GPU/Mixed operators get
* separate streams. When there's a dependency then one dependent operator shares the stream of
* its predecessor.
*
* Example - numbers are stream indices, "X" means no stream, "s" means synchronization
* ```
* CPU(X) ---- GPU(0) --- GPU(0) -- GPU(0) -- output 0
* \ s
* \ /
* ----- GPU(1) ----
* \
* \
* CPU(X) --- GPU(2) ----s GPU(1) ----------s output 1
* ```
*/
template <>
class StreamAssignment<StreamPolicy::PerOperator> {
public:
explicit StreamAssignment(ExecGraph &graph) {
Assign(graph);
}

std::optional<int> operator[](const ExecNode *node) const {
auto it = node_ids_.find(node);
assert(it != node_ids_.end());
return stream_assignment_[it->second];
}

/** Gets the total number of streams required to run independent operators on separate streams. */
int NumStreams() const {
return total_streams_;
}

private:
void Assign(ExecGraph &graph) {
// pre-fill the id pool with sequential numbers
int num_nodes = graph.Nodes().size();
for (int i = 0; i < num_nodes; i++) {
free_stream_ids_.insert(i);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As for the minimality of the assignment, what happens for the bipartite graphs? Take 2x2 example. The left A, B ops contribute to both right C, D ops. Assuming that the output edges are visited in the topological order (i.e. C comes before D in A and B lists), can we end up with A0, B1, C0, D2 assignment?

If so, for larger bipartite graphs, it seems the need for free_stream_ids can exceed the number of nodes.

A -- C
 \  /
 /  \
B -- D

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Fixed. The algorithm has been reworked and now the assignment is made as soon as a node is pushed to the queue. It's never re-pushed with a worse index.

}

// the nodes in the graph must be sorted topologically
sorted_nodes_.reserve(num_nodes);
stream_assignment_.resize(num_nodes);
for (auto &node : graph.Nodes()) {
int idx = sorted_nodes_.size();
sorted_nodes_.push_back(&node);
node_ids_[&node] = idx;
if (node.inputs.empty()) {
queue_.push({ node_ids_[&node], AssignStreamId(&node).first.value_or(kInvalidStreamIdx) });
} else {
for (auto &inp : node.inputs) {
assert(node_ids_.count(inp->producer) >= 0 && "Nodes must be topologically sorted.");
}
}
}

assert(static_cast<size_t>(num_nodes) == sorted_nodes_.size());

FindGPUContributors(graph);

graph::ClearVisitMarkers(graph.Nodes());
Traverse();
ClearCPUStreams();
total_streams_ = CalcNumStreams();
}

void Traverse() {
while (!queue_.empty()) {
// PrintQueue(); /* uncomment for debugging */
auto [idx, stream_idx] = queue_.top();
std::optional<int> stream_id;
if (stream_idx != kInvalidStreamIdx)
stream_id = stream_idx;

queue_.pop();
auto *node = sorted_nodes_[idx];
// This will be true for nodes which have no outputs or which don't contribute to any
// GPU nodes.
bool keep_stream_id = stream_id.has_value();

if (stream_id.has_value())
free_stream_ids_.insert(*stream_id);

graph::Visit v(node);
if (!v) {
assert(stream_assignment_[idx].value_or(kInvalidStreamIdx) <= stream_idx);
continue; // we've been here already - skip
}

stream_assignment_[idx] = stream_id;

for (auto &output_desc : node->outputs) {
for (auto *out : output_desc.consumers) {
auto [out_stream_id, is_new] = AssignStreamId(out->consumer, stream_id);
if (out_stream_id == stream_id)
keep_stream_id = false;
if (is_new)
queue_.push({node_ids_[out->consumer], out_stream_id.value_or(kInvalidStreamIdx)});
}
}
if (stream_id.has_value() && keep_stream_id)
free_stream_ids_.erase(*stream_id);
}
}

void ClearCPUStreams() {
for (int i = 0, n = sorted_nodes_.size(); i < n; i++) {
if (!NeedsStream(sorted_nodes_[i]))
stream_assignment_[i] = std::nullopt;
}
}

int CalcNumStreams() {
int max = -1;
for (auto a : stream_assignment_) {
if (a.has_value())
max = std::max(max, *a);
}
return max + 1;
}

void PrintQueue(std::ostream &os = std::cout) {
auto q2 = queue_;
while (!q2.empty()) {
auto [idx, stream_idx] = q2.top();
q2.pop();
auto *node = sorted_nodes_[idx];
if (!node->instance_name.empty())
os << node->instance_name;
else if (node->is_pipeline_output)
os << "<output>";
else
os << "[" << idx << "]";
os << "(";
if (stream_idx != kInvalidStreamIdx)
os << stream_idx;
else
os << "none";
os << ") ";
}
os << "\n";
}

std::pair<std::optional<int>, bool> AssignStreamId(
const ExecNode *node,
std::optional<int> prev_stream_id = std::nullopt) {
// If the preceding node had a stream, then we have to pass it on through CPU nodes if
// there are any GPU nodes down the graph.
// If the preceding node didn't have a stream, then we only need a stream if the current
// node needs one.
bool needs_stream = prev_stream_id.has_value()
? gpu_contributors_.count(node) != 0
: NeedsStream(node);
if (needs_stream) {
assert(!free_stream_ids_.empty());
auto b = free_stream_ids_.begin();
int next_free = *b;
auto &current = stream_assignment_[node_ids_[node]];
if (!current.has_value() || *current > next_free) {
current = next_free;
free_stream_ids_.erase(b);
return { next_free, true };
} else {
return { *current, false };
}
} else {
return { std::nullopt, true };
}
}

void FindGPUContributors(ExecGraph &graph) {
// Run DFS, output to input, and find nodes which contribute to any node that requires a stream
graph::ClearVisitMarkers(graph.Nodes());
for (auto it = graph.Nodes().rbegin(); it != graph.Nodes().rend(); ++it) {
auto &node = *it;
FindGPUContributors(&node, false);
}
}

void FindGPUContributors(const ExecNode *node, bool is_gpu_contributor) {
graph::Visit v(node);
if (!v)
return;
if (!is_gpu_contributor)
is_gpu_contributor = NeedsStream(node);
if (is_gpu_contributor)
gpu_contributors_.insert(node);
for (auto *inp : node->inputs)
FindGPUContributors(inp->producer, is_gpu_contributor);
}


static constexpr int kInvalidStreamIdx = 0x7fffffff;
std::vector<std::optional<int>> stream_assignment_;
int total_streams_ = 0;
std::unordered_map<const ExecNode *, int> node_ids_; // topologically sorted nodes
std::set<const ExecNode *> gpu_contributors_;
std::vector<const ExecNode *> sorted_nodes_;
std::set<int> free_stream_ids_;
std::priority_queue<std::pair<int, int>, std::vector<std::pair<int, int>>, std::greater<>> queue_;
};

} // namespace exec2
} // namespace dali

#endif // DALI_PIPELINE_EXECUTOR_EXECUTOR2_STREAM_ASSIGNMENT_H_
Loading
Loading