Skip to content

Commit

Permalink
[Graph Debugger] Expose way to benchmark individual nodes. (#11000)
Browse files Browse the repository at this point in the history
* initial

* secondary commit

* docs

* match tests

* fix test

* use std::fixed, max precision, typed pack func, fix isnan

* comments on docs

* address tristan comments

* add test

* tristan comments

* use skipif

* empty commit

* empty commit

* jostle again

* remove assert statement
  • Loading branch information
AndrewZhaoLuo committed Apr 28, 2022
1 parent 94269a8 commit 9fd279b
Show file tree
Hide file tree
Showing 3 changed files with 214 additions and 68 deletions.
55 changes: 50 additions & 5 deletions python/tvm/contrib/debugger/debug_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,18 @@
# under the License.
"""Graph debug runtime executes TVM debug packed functions."""

import logging
import os
import tempfile
import shutil
import logging
import tvm._ffi
import tempfile

import tvm._ffi
from tvm._ffi.base import string_types
from tvm.contrib import graph_executor
from . import debug_result
from tvm.runtime.module import BenchmarkResult

from ...runtime.profiling import Report
from . import debug_result

_DUMP_ROOT_PREFIX = "tvmdbg_"
_DUMP_PATH_PREFIX = "_tvmdbg_"
Expand Down Expand Up @@ -111,6 +113,7 @@ def __init__(self, module, device, graph_json_str, dump_root):
self._dump_root = dump_root
self._dump_path = None
self._run_individual = module["run_individual"]
self._run_individual_node = module["run_individual_node"]
self._debug_get_output = module["debug_get_output"]
self._execute_node = module["execute_node"]
self._get_node_output = module["get_node_output"]
Expand Down Expand Up @@ -223,7 +226,6 @@ def _run_debug(self):
"""Execute the node specified with index will be executed.
Each debug output will be copied to the buffer
Time consumed for each execution will be set as debug output.
"""
# Get timing.
self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)]
Expand Down Expand Up @@ -281,6 +283,49 @@ def run_individual(self, number, repeat=1, min_repeat_ms=0):
ret = self._run_individual(number, repeat, min_repeat_ms)
return ret.strip(",").split(",") if ret else []

def run_individual_node(self, index, number=10, repeat=1, min_repeat_ms=0):
    """Benchmark a single node in the serialized graph.

    This does not do any data transfers and uses arrays already on the device.

    Parameters
    ----------
    index : int
        The index of the node, see `self.debug_datum.get_graph_nodes`

    number: int
        The number of times to run this function for taking average.
        We call these runs as one `repeat` of measurement.

    repeat: int, optional
        The number of times to repeat the measurement.
        In total, the function will be invoked (1 + number x repeat) times,
        where the first one is warm up and will be discarded.
        The returned result contains `repeat` costs,
        each of which is an average of `number` costs.

    min_repeat_ms: int, optional
        The minimum duration of one `repeat` in milliseconds.
        By default, one `repeat` contains `number` runs. If this parameter is set,
        the parameters `number` will be dynamically adjusted to meet the
        minimum duration requirement of one `repeat`.
        i.e., When the run time of one `repeat` falls below this time, the `number` parameter
        will be automatically increased.

    Returns
    -------
    A module BenchmarkResult
    """
    # The C++ side cannot return FloatImm over the FFI boundary, so the
    # per-repeat averages arrive as a comma-separated string which we
    # deserialize here. Blank fields (e.g. from a trailing comma) are skipped.
    ret = self._run_individual_node(index, number, repeat, min_repeat_ms)
    times = [float(field) for field in ret.split(",") if field.strip()]
    return BenchmarkResult(times)

def profile(self, collectors=None, **input_dict):
"""Run forward execution of the graph and collect overall and per-op
performance metrics.
Expand Down
113 changes: 72 additions & 41 deletions src/runtime/graph_executor/debug/graph_executor_debug.cc
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,10 @@
#include <tvm/runtime/registry.h>

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstring>
#include <sstream>

#include "../../rpc/rpc_session.h"
#include "../graph_executor.h"

namespace tvm {
Expand Down Expand Up @@ -67,44 +69,14 @@ class GraphExecutorDebug : public GraphExecutor {
time_sec_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms);
}
} else {
for (int i = 0; i < repeat; ++i) {
std::chrono::time_point<std::chrono::high_resolution_clock, std::chrono::nanoseconds>
tbegin, tend;
double duration_ms = 0.0;
do {
std::fill(time_sec_per_op.begin(), time_sec_per_op.end(), 0);
if (duration_ms > 0.0) {
number = static_cast<int>(std::max((min_repeat_ms / (duration_ms / number) + 1),
number * 1.618)); // 1.618 is chosen by random
}
tbegin = std::chrono::high_resolution_clock::now();
std::vector<std::vector<Timer>> op_timers;
for (size_t index = 0; index < op_execs_.size(); index++) {
op_timers.push_back({});
}
for (int k = 0; k < number; k++) {
for (size_t index = 0; index < op_execs_.size(); ++index) {
if (op_execs_[index]) {
op_timers[index].push_back(RunOpHost(index));
}
}
}
for (size_t index = 0; index < op_execs_.size(); ++index) {
for (auto t : op_timers[index]) {
time_sec_per_op[index] += t->SyncAndGetElapsedNanos() / 1e9;
}
}
tend = std::chrono::high_resolution_clock::now();
duration_ms =
std::chrono::duration_cast<std::chrono::duration<double>>(tend - tbegin).count() *
1000;
} while (duration_ms < min_repeat_ms);

LOG(INFO) << "Iteration: " << i;
int op = 0;
for (size_t index = 0; index < time_sec_per_op.size(); index++) {
for (size_t index = 0; index < op_execs_.size(); ++index) {
std::vector<double> results = RunIndividualNode(index, number, repeat, min_repeat_ms);
for (size_t cur_repeat = 0; cur_repeat < results.size(); cur_repeat++) {
time_sec_per_op[index] = results[cur_repeat];

LOG(INFO) << "Iteration: " << cur_repeat;
int op = 0;
if (op_execs_[index]) {
time_sec_per_op[index] /= number;
LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": "
<< time_sec_per_op[index] * 1e6 << " us/iter";
}
Expand All @@ -114,15 +86,50 @@ class GraphExecutorDebug : public GraphExecutor {

std::ostringstream os;
for (size_t index = 0; index < time_sec_per_op.size(); index++) {
os << time_sec_per_op[index] << ",";
double time = time_sec_per_op[index];
// To have good behavior when calculating total time, etc.
if (std::isnan(time)) {
time = 0;
}
os << time << ",";
}
return os.str();
}

/*!
 * \brief Benchmark a single node of the graph on the host device.
 * \param node_index Index of the node in the serialized graph.
 * \param number Runs averaged into one repeat measurement.
 * \param repeat Number of repeat measurements to take.
 * \param min_repeat_ms Minimum wall time of one repeat; `number` is grown until met.
 * \return One averaged duration in seconds per repeat; all zeros for nodes
 *         with no executable (e.g. inputs/parameters).
 */
std::vector<double> RunIndividualNode(int node_index, int number, int repeat, int min_repeat_ms) {
  std::string tkey = module_->type_key();

  // Pre-filled with zeros so nodes that are skipped report zero time.
  std::vector<double> results_in_seconds(repeat, 0);

  if (tkey == "rpc") {
    LOG(FATAL) << "RPC measurements should not use RunIndividualNode!";
  }

  if (!op_execs_[node_index]) {
    // Node has no op to execute (e.g. an input/parameter); nothing to time.
    return results_in_seconds;
  }

  // assume host runs things which is first device
  Device& d = devices_[0];
  PackedFunc time_evaluator = WrapTimeEvaluator(
      TypedPackedFunc<void()>([this, node_index]() { this->RunOpHost(node_index); }), d, number,
      repeat, min_repeat_ms);
  // The evaluator packs its per-repeat averages as raw doubles inside a
  // std::string. The string buffer is not guaranteed to be aligned for
  // double access, so copy each value out with memcpy instead of
  // reinterpret_cast-ing the buffer (misaligned reads are UB). Also clamp
  // to `repeat` entries so a longer-than-expected payload cannot write
  // past the end of the result vector.
  std::string result = time_evaluator();
  constexpr size_t double_bytes = sizeof(double);
  size_t num_results = std::min(result.size() / double_bytes, results_in_seconds.size());
  for (size_t i = 0; i < num_results; i++) {
    std::memcpy(&results_in_seconds[i], result.data() + i * double_bytes, double_bytes);
  }
  return results_in_seconds;
}

double RunOpRPC(int index, int number, int repeat, int min_repeat_ms) {
// Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes which
// represent inputs/parameters to the graph. Other types may be supported in the future, but
// consideration would be needed as to how to do that over RPC before we support it here.
// Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes
// which represent inputs/parameters to the graph. Other types may be supported in the
// future, but consideration would be needed as to how to do that over RPC before we support
// it here.
if (nodes_[index].op_type != "tvm_op") {
CHECK_EQ(nodes_[index].op_type, "null")
<< "Don't know how to run op type " << nodes_[index].op_type
Expand Down Expand Up @@ -362,6 +369,30 @@ PackedFunc GraphExecutorDebug::GetFunction(const std::string& name,
ICHECK_GE(min_repeat_ms, 0);
*rv = this->RunIndividual(number, repeat, min_repeat_ms);
});
} else if (name == "run_individual_node") {
return TypedPackedFunc<std::string(int, int, int, int)>(
[sptr_to_self, this](int node_index, int number, int repeat, int min_repeat_ms) {
ICHECK_GE(node_index, 0);
ICHECK_LT(node_index, nodes_.size());
ICHECK_GT(number, 0);
ICHECK_GT(repeat, 0);
ICHECK_GE(min_repeat_ms, 0);
std::vector<double> results =
this->RunIndividualNode(node_index, number, repeat, min_repeat_ms);

// Have problems returning FloatImm so serialize to string results as hack.
std::stringstream s;

// use maximum precision available and use fixed representation
s << std::fixed;
s.precision(std::numeric_limits<double>::max_digits10);

for (double cur : results) {
s << cur << ", ";
}

return s.str();
});
} else if (name == "profile") {
return TypedPackedFunc<profiling::Report(Array<profiling::MetricCollector>)>(
[sptr_to_self, this](Array<profiling::MetricCollector> collectors) {
Expand Down
114 changes: 92 additions & 22 deletions tests/python/unittest/test_runtime_graph_debug.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,26 +19,56 @@
import re
import sys
import time
from distutils.log import debug

import numpy as np
import pytest

import tvm
import tvm.testing
from tvm import te
import numpy as np
from tvm import rpc
from tvm import rpc, te
from tvm._ffi.base import TVMError
from tvm.contrib import utils
from tvm.contrib.debugger import debug_executor


@tvm.testing.requires_llvm
@tvm.testing.requires_rpc
def test_graph_simple():
n = 4
A = te.placeholder((n,), name="A")
B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")
s = te.create_schedule(B.op)
# Constants for creating simple graphs, fixtures to avoid free globals
@pytest.fixture
def n():
    """Length of the 1-D tensors used by the simple test graphs."""
    return 4


@pytest.fixture
def A(n):
    """Input placeholder: a 1-D tensor of length ``n``."""
    return te.placeholder((n,), name="A")


@pytest.fixture
def B(A):
    """Elementwise computation over A: B[i] = A[i] + 1.0."""
    return te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B")


@pytest.fixture
def s(B):
    """Default schedule for the B computation."""
    return te.create_schedule(B.op)


@pytest.fixture
def mlib(s, A, B):
    """LLVM module containing the compiled "myadd" kernel."""
    return tvm.build(s, [A, B], "llvm", name="myadd")


@pytest.fixture
def myadd(mlib):
    """Wrap the compiled "myadd" kernel with an artificial 0.25 s delay.

    The delay makes per-node execution time large enough for the timing
    assertions in the tests to be meaningful.
    """

    def slow_add(*args):
        result = mlib["myadd"](*args)
        time.sleep(0.25)
        return result

    return slow_add


@pytest.fixture
def graph():
node0 = {"op": "null", "name": "x", "inputs": []}
node1 = {
"op": "tvm_op",
Expand All @@ -64,21 +94,19 @@ def test_graph_simple():
"attrs": attrs,
}
graph = json.dumps(graph)
return graph

def check_verify():
mlib = tvm.build(s, [A, B], "llvm", name="myadd")

def myadd(*args):
to_return = mlib["myadd"](*args)
time.sleep(0.25)
return to_return

@tvm.testing.requires_llvm
@tvm.testing.requires_rpc
@pytest.mark.skipif(
tvm.support.libinfo()["USE_PROFILER"] != "ON", reason="TVM was not built with profiler support"
)
def test_end_to_end_graph_simple(graph, n, A, B, s, myadd):
def check_verify():
mlib_proxy = tvm.support.FrontendTestModule()
mlib_proxy["myadd"] = myadd
try:
mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))
except ValueError:
return
mod = debug_executor.create(graph, mlib_proxy, tvm.cpu(0))

a = np.random.uniform(size=(n,)).astype(A.dtype)
mod.set_input(x=a)
Expand Down Expand Up @@ -185,5 +213,47 @@ def check_remote(server):
check_remote(rpc.Server("127.0.0.1"))


@tvm.testing.requires_llvm
@pytest.mark.skipif(
    tvm.support.libinfo()["USE_PROFILER"] != "ON", reason="TVM was not built with profiler support"
)
def test_run_single_node(graph, n, A, myadd):
    """Exercise GraphModuleDebug.run_individual_node on a two-node graph."""
    proxy_module = tvm.support.FrontendTestModule()
    proxy_module["myadd"] = myadd
    mod: debug_executor.GraphModuleDebug = debug_executor.create(graph, proxy_module, tvm.cpu(0))

    input_data = np.random.uniform(size=(n,)).astype(A.dtype)
    mod.set_input(x=input_data)

    graph_nodes = mod.debug_datum.get_graph_nodes()
    assert len(graph_nodes) == 2
    assert graph_nodes[0]["op"] == "param"
    assert graph_nodes[1]["op"] == "myadd"

    # A node with no associated function returns instantly with zero runtime.
    assert mod.run_individual_node(0, number=1).mean == 0

    # The real op takes measurable time.
    single_repeat = mod.run_individual_node(1, repeat=1)
    assert single_repeat.mean > 0

    # Three repeats accumulate more total time than one.
    triple_repeat = mod.run_individual_node(1, repeat=3)
    assert sum(triple_repeat.results) > sum(single_repeat.results)

    # `repeat` controls how many measurements come back.
    assert len(mod.run_individual_node(1, repeat=10).results) == 10

    # min_repeat_ms forces the measurement to take at least that long.
    start = time.time()
    mod.run_individual_node(1, min_repeat_ms=500)
    elapsed_seconds = time.time() - start
    assert elapsed_seconds >= 0.5

    # An out-of-range node index raises a TVM error.
    with pytest.raises(TVMError):
        mod.run_individual_node(2)


# Allow running this test file directly: forward CLI args to pytest and
# propagate its exit status.
if __name__ == "__main__":
    sys.exit(pytest.main([__file__] + sys.argv[1:]))

0 comments on commit 9fd279b

Please sign in to comment.