Skip to content

Commit

Permalink
[Graph Debugger] Expose way to benchmark individual nodes. (#11000)
Browse files Browse the repository at this point in the history
* initial

* secondary commit

* docs

* match tests

* fix test

* use std::fixed, max precision, typed pack func, fix isnan

* comments on docs

* address tristan comments

* add test

* tristan comments

* use skipif

* empty commit

* empty commit

* jostle again

* remove assert statement
  • Loading branch information
AndrewZhaoLuo committed Apr 28, 2022
1 parent 94269a8 commit 9fd279b
Show file tree
Hide file tree
Showing 3 changed files with 214 additions and 68 deletions.
55 changes: 50 additions & 5 deletions python/tvm/contrib/debugger/debug_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,18 @@
# under the License.
"""Graph debug runtime executes TVM debug packed functions."""

import logging
import os
import tempfile
import shutil
import logging
import tvm._ffi
import tempfile

import tvm._ffi
from tvm._ffi.base import string_types
from tvm.contrib import graph_executor
from . import debug_result
from tvm.runtime.module import BenchmarkResult

from ...runtime.profiling import Report
from . import debug_result

_DUMP_ROOT_PREFIX = "tvmdbg_"
_DUMP_PATH_PREFIX = "_tvmdbg_"
Expand Down Expand Up @@ -111,6 +113,7 @@ def __init__(self, module, device, graph_json_str, dump_root):
self._dump_root = dump_root
self._dump_path = None
self._run_individual = module["run_individual"]
self._run_individual_node = module["run_individual_node"]
self._debug_get_output = module["debug_get_output"]
self._execute_node = module["execute_node"]
self._get_node_output = module["get_node_output"]
Expand Down Expand Up @@ -223,7 +226,6 @@ def _run_debug(self):
"""Execute the node specified with index will be executed.
Each debug output will be copied to the buffer
Time consumed for each execution will be set as debug output.
"""
# Get timing.
self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
Expand Down Expand Up @@ -281,6 +283,49 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0):
ret = self._run_individual(number, repeat, min_repeat_ms)
return ret.strip(",").split(",") if ret else []

def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
    """Benchmark a single node in the serialized graph.

    This does not do any data transfers and uses arrays already on the device.

    Parameters
    ----------
    index : int
        The index of the node, see `self.debug_datum.get_graph_nodes`

    number: int
        The number of times to run this function for taking average.
        We call these runs as one `repeat` of measurement.

    repeat: int, optional
        The number of times to repeat the measurement.
        In total, the function will be invoked (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.

    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameters `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
        will be automatically increased.

    Returns
    -------
    A module BenchmarkResult
    """
    # The C++ side cannot return FloatImm over the FFI boundary, so the
    # per-repeat averages arrive as a comma-separated string which we
    # deserialize here. Blank fields (e.g. from a trailing comma) are skipped.
    ret = self._run_individual_node(index, number, repeat, min_repeat_ms)
    times = [float(field) for field in ret.split(",") if field.strip()]
    return BenchmarkResult(times)

def profile(self, collectors=None, **input_dict):
"""Run forward execution of the graph and collect overall and per-op
performance metrics.
Expand Down
113 changes: 72 additions & 41 deletions src/runtime/graph_executor/debug/graph_executor_debug.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
#include <tvm/runtime/registry.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstring>
#include <sstream>

#include "../../rpc/rpc_session.h"
#include "../graph_executor.h"

namespace tvm {
Expand Down Expand Up @@ -67,44 +69,14 @@ class GraphExecutorDebug : public GraphExecutor {
time_sec_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms);
}
} else {
for (int i = 0; i < repeat; ++i) {
std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds>
tbegin, tend;
double duration_ms = 0.0;
do {
std::fill(time_sec_per_op.begin(), time_sec_per_op.end(), 0);
if (duration_ms > 0.0) {
number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / number) + 1),
number * 1.618)); // 1.618 is chosen by random
}
tbegin = std::chrono::high_resolution_clock::now();
std::vector<std::vector<Timer>> op_timers;
for (size_t index = 0; index < op_execs_.size(); index++) {
op_timers.push_back({});
}
for (int k = 0; k < number; k++) {
for (size_t index = 0; index < op_execs_.size(); ++index) {
if (op_execs_[index]) {
op_timers[index].push_back(RunOpHost(index));
}
}
}
for (size_t index = 0; index < op_execs_.size(); ++index) {
for (auto t : op_timers[index]) {
time_sec_per_op[index] += t->SyncAndGetElapsedNanos() / 1e9;
}
}
tend = std::chrono::high_resolution_clock::now();
duration_ms =
std::chrono::duration_cast<std::chrono::duration<double>>(tend - tbegin).count() *
1000;
} while (duration_ms < min_repeat_ms);

LOG(INFO) << "Iteration: " << i;
int op = 0;
for (size_t index = 0; index < time_sec_per_op.size(); index++) {
for (size_t index = 0; index < op_execs_.size(); ++index) {
std::vector<double> results = RunIndividualNode(index, number, repeat, min_repeat_ms);
for (size_t cur_repeat = 0; cur_repeat < results.size(); cur_repeat++) {
time_sec_per_op[index] = results[cur_repeat];

LOG(INFO) << "Iteration: " << cur_repeat;
int op = 0;
if (op_execs_[index]) {
time_sec_per_op[index] /= number;
LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": "
<< time_sec_per_op[index] * 1e6 << " us/iter";
}
Expand All @@ -114,15 +86,50 @@ class GraphExecutorDebug : public GraphExecutor {

std::ostringstream os;
for (size_t index = 0; index < time_sec_per_op.size(); index++) {
os << time_sec_per_op[index] << ",";
double time = time_sec_per_op[index];
// To have good behavior when calculating total time, etc.
if (std::isnan(time)) {
time = 0;
}
os << time << ",";
}
return os.str();
}

/*!
 * \brief Benchmark a single node of the graph on the host device.
 * \param node_index Index of the node in the serialized graph.
 * \param number Runs averaged into one repeat measurement.
 * \param repeat Number of repeat measurements to take.
 * \param min_repeat_ms Minimum wall time of one repeat; `number` is grown until met.
 * \return One averaged duration in seconds per repeat; all zeros for nodes
 *         with no executable (e.g. inputs/parameters).
 */
std::vector<double> RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms) {
  std::string tkey = module_->type_key();

  // Pre-filled with zeros so nodes that are skipped report zero time.
  std::vector<double> results_in_seconds(repeat, 0);

  if (tkey == "rpc") {
    LOG(FATAL) << "RPC measurements should not use RunIndividualNode!";
  }

  if (!op_execs_[node_index]) {
    // Node has no op to execute (e.g. an input/parameter); nothing to time.
    return results_in_seconds;
  }

  // assume host runs things which is first device
  Device& d = devices_[0];
  PackedFunc time_evaluator = WrapTimeEvaluator(
      TypedPackedFunc<void()>([this, node_index]() { this->RunOpHost(node_index); }), d, number,
      repeat, min_repeat_ms);
  // The evaluator packs its per-repeat averages as raw doubles inside a
  // std::string. The string buffer is not guaranteed to be aligned for
  // double access, so copy each value out with memcpy instead of
  // reinterpret_cast-ing the buffer (misaligned reads are UB). Also clamp
  // to `repeat` entries so a longer-than-expected payload cannot write
  // past the end of the result vector.
  std::string result = time_evaluator();
  constexpr size_t double_bytes = sizeof(double);
  size_t num_results = std::min(result.size() / double_bytes, results_in_seconds.size());
  for (size_t i = 0; i < num_results; i++) {
    std::memcpy(&results_in_seconds[i], result.data() + i * double_bytes, double_bytes);
  }
  return results_in_seconds;
}

double RunOpRPC(int index, int number, int repeat, int min_repeat_ms) {
// Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes which
// represent inputs/parameters to the graph. Other types may be supported in the future, but
// consideration would be needed as to how to do that over RPC before we support it here.
// Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes
// which represent inputs/parameters to the graph. Other types may be supported in the
// future, but consideration would be needed as to how to do that over RPC before we support
// it here.
if (nodes_[index].op_type != "tvm_op") {
CHECK_EQ(nodes_[index].op_type, "null")
<< "Don't know how to run op type " << nodes_[index].op_type
Expand Down Expand Up @@ -362,6 +369,30 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
ICHECK_GE(min_repeat_ms, 0);
*rv = this->RunIndividual(number, repeat, min_repeat_ms);
});
} else if (name == "run_individual_node") {
return TypedPackedFunc<std::string(int, int, int, int)>(
[sptr_to_self, this](int node_index, int number, int repeat, int min_repeat_ms) {
ICHECK_GE(node_index, 0);
ICHECK_LT(node_index, nodes_.size());
ICHECK_GT(number, 0);
ICHECK_GT(repeat, 0);
ICHECK_GE(min_repeat_ms, 0);
std::vector<double> results =
this->RunIndividualNode(node_index, number, repeat, min_repeat_ms);

// Have problems returning FloatImm so serialize to string results as hack.
std::stringstream s;

// use maximum precision available and use fixed representation
s << std::fixed;
s.precision(std::numeric_limits<double>::max_digits10);

for (double cur : results) {
s << cur << ", ";
}

return s.str();
});
} else if (name == "profile") {
return TypedPackedFunc<profiling::Report(Array<profiling::MetricCollector>)>(
[sptr_to_self, this](Array<profiling::MetricCollector> collectors) {
Expand Down
114 changes: 92 additions & 22 deletions tests/python/unittest/test_runtime_graph_debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,56 @@
import re
import sys
import time
from distutils.log import debug

import numpy as np
import pytest

import tvm
import tvm.testing
from tvm import te
import numpy as np
from tvm import rpc
from tvm import rpc, te
from tvm._ffi.base import TVMError
from tvm.contrib import utils
from tvm.contrib.debugger import debug_executor


@tvm.testing.requires_llvm
@tvm.testing.requires_rpc
def test_graph_simple():
n = 4
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
s = te.create_schedule(B.op)
# Constants for creating simple graphs, fixtures to avoid free globals
@pytest.fixture
def n():
    """Length of the 1-D tensors used by the simple test graphs."""
    return 4


@pytest.fixture
def A(n):
    """Input placeholder: a 1-D tensor of length ``n``."""
    return te.placeholder((n,), name="A")


@pytest.fixture
def B(A):
    """Elementwise computation over A: B[i] = A[i] + 1.0."""
    return te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")


@pytest.fixture
def s(B):
    """Default schedule for the B computation."""
    return te.create_schedule(B.op)


@pytest.fixture
def mlib(s, A, B):
    """LLVM module containing the compiled "myadd" kernel."""
    return tvm.build(s, [A, B], "llvm", name="myadd")


@pytest.fixture
def myadd(mlib):
    """Wrap the compiled "myadd" kernel with an artificial 0.25 s delay.

    The delay makes per-node execution time large enough for the timing
    assertions in the tests to be meaningful.
    """

    def slow_add(*args):
        result = mlib["myadd"](*args)
        time.sleep(0.25)
        return result

    return slow_add


@pytest.fixture
def graph():
node0 = {"op": "null", "name": "x", "inputs": []}
node1 = {
"op": "tvm_op",
Expand All @@ -64,21 +94,19 @@ def test_graph_simple():
"attrs": attrs,
}
graph = json.dumps(graph)
return graph

def check_verify():
mlib = tvm.build(s, [A, B], "llvm", name="myadd")

def myadd(*args):
to_return = mlib["myadd"](*args)
time.sleep(0.25)
return to_return

@tvm.testing.requires_llvm
@tvm.testing.requires_rpc
@pytest.mark.skipif(
tvm.support.libinfo()["USE_PROFILER"] != "ON", reason="TVM was not built with profiler support"
)
def test_end_to_end_graph_simple(graph, n, A, B, s, myadd):
def check_verify():
mlib_proxy = tvm.support.FrontendTestModule()
mlib_proxy["myadd"] = myadd
try:
mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))
except ValueError:
return
mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))

a = np.random.uniform(size=(n,)).astype(A.dtype)
mod.set_input(x=a)
Expand Down Expand Up @@ -185,5 +213,47 @@ def check_remote(server):
check_remote(rpc.Server("127.0.0.1"))


@tvm.testing.requires_llvm
@pytest.mark.skipif(
    tvm.support.libinfo()["USE_PROFILER"] != "ON", reason="TVM was not built with profiler support"
)
def test_run_single_node(graph, n, A, myadd):
    """Exercise GraphModuleDebug.run_individual_node on a two-node graph."""
    proxy_module = tvm.support.FrontendTestModule()
    proxy_module["myadd"] = myadd
    mod: debug_executor.GraphModuleDebug = debug_executor.create(graph, proxy_module, tvm.cpu(0))

    input_data = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=input_data)

    graph_nodes = mod.debug_datum.get_graph_nodes()
    assert len(graph_nodes) == 2
    assert graph_nodes[0]["op"] == "param"
    assert graph_nodes[1]["op"] == "myadd"

    # A node with no associated function returns instantly with zero runtime.
    assert mod.run_individual_node(0, number=1).mean == 0

    # The real op takes measurable time.
    single_repeat = mod.run_individual_node(1, repeat=1)
    assert single_repeat.mean > 0

    # Three repeats accumulate more total time than one.
    triple_repeat = mod.run_individual_node(1, repeat=3)
    assert sum(triple_repeat.results) > sum(single_repeat.results)

    # `repeat` controls how many measurements come back.
    assert len(mod.run_individual_node(1, repeat=10).results) == 10

    # min_repeat_ms forces the measurement to take at least that long.
    start = time.time()
    mod.run_individual_node(1, min_repeat_ms=500)
    elapsed_seconds = time.time() - start
    assert elapsed_seconds >= 0.5

    # An out-of-range node index raises a TVM error.
    with pytest.raises(TVMError):
        mod.run_individual_node(2)


# Allow running this test file directly: forward CLI args to pytest and
# propagate its exit status.
if __name__ == "__main__":
    sys.exit(pytest.main([__file__] + sys.argv[1:]))

0 comments on commit 9fd279b

Please sign in to comment.