diff --git a/docs/conf.py b/docs/conf.py index 5826526d55b02..7ece63bd7aa86 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -198,7 +198,6 @@ '../tutorials/language', '../tutorials/optimize', '../tutorials/autotvm', - '../tutorials/ansor', '../tutorials/dev', '../tutorials/topi', '../tutorials/deployment', diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index 95476ed61bdd6..750a8a43163c3 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -296,19 +296,6 @@ struct LayoutTransformAttrs : public tvm::AttrsNode { } }; -/*! \brief Attributes for KernelLayoutTransform operator */ -struct KernelLayoutTransformAttrs : public tvm::AttrsNode { - std::string src_layout; - std::string dst_layout; - - TVM_DECLARE_ATTRS(KernelLayoutTransformAttrs, "relay.attrs.KernelLayoutTransformAttrs") { - TVM_ATTR_FIELD(src_layout) - .describe("The source layout of the tensor. (e.g. 1N32C112H112W)"); - TVM_ATTR_FIELD(dst_layout) - .describe("The destination layout of the tensor. (e.g. 1N2C112H112W16c)"); - } -}; - /*! \brief Attributes for ShapeOf operator */ struct ShapeOfAttrs : public tvm::AttrsNode { DataType dtype; diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index 5f5d9b643633e..1b8b31aee5d10 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -277,20 +277,6 @@ TVM_DLL Pass CanonicalizeOps(); */ TVM_DLL Pass AlterOpLayout(); -/*! - * \brief Alternate the layouts of kernels. - * - * \return The pass. - */ -TVM_DLL Pass KernelLayoutTransform(); - -/*! - * \brief The reverse of FuseOps. - * - * \return The pass. - */ -TVM_DLL Pass DeFuseOps(); - /*! * \brief Given a dest layout, this pass transforms the expr such that most of the ops input data * layout is changed to the dest layout. In ideal situation, there are only 2 layout transforms, one diff --git a/python/tvm/ansor/__init__.py b/python/tvm/ansor/__init__.py index c629c1049a87b..8b8c03a142b34 100644 --- a/python/tvm/ansor/__init__.py +++ b/python/tvm/ansor/__init__.py @@ -21,26 +21,14 @@ from . import measure from . import serialization from . import loop_state -from . import auto_schedule from . import utils from . import feature -from . import workload_registry -from . import task_scheduler # Shortcut from .compute_dag import ComputeDAG, LayoutRewriteLevel -from .auto_schedule import SearchTask, SketchSearchPolicy, TuneOption, HardwareParams, \ - PreloadMeasuredStates, PreloadCustomSketchRule, auto_schedule +from .auto_schedule import SearchTask, HardwareParams from .measure import MeasureInput, LocalBuilder, LocalRunner, RPCRunner, LocalRPCMeasureContext -from .cost_model import RandomModel -from .cost_model.xgb_model import XGBModel from .serialization import LogToFile, LogReader, best_measure_pair_in_file, \ load_from_file, write_measure_records_to_file from .workload_registry import register_workload_func, \ workload_key_to_dag, make_workload_key_func -from .task_scheduler import TaskScheduler, SimpleTaskScheduler -from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest as apply_history_best, \ - FallbackContext -from .relay_integration import extract_from_program, extract_from_multiple_program, \ - finish_layout_rewrite, prepare_layout_rewrite, auto_schedule_topi -from .env import GLOBAL_SCOPE diff --git a/python/tvm/ansor/auto_schedule.py b/python/tvm/ansor/auto_schedule.py index a03d9fdacbc2b..41891872b76ec 100644 --- a/python/tvm/ansor/auto_schedule.py +++ b/python/tvm/ansor/auto_schedule.py @@ -22,7 +22,6 @@ import tvm._ffi from tvm.runtime import Object from .measure import LocalBuilder, LocalRunner -from .cost_model import RandomModel, XGBModel from . import _ffi_api @@ -64,206 +63,3 @@ def __init__(self, dag, workload_key, target, target_host=None, self.__init_handle_by_constructor__(_ffi_api.SearchTask, dag, workload_key, target, target_host, hardware_params) - - -@tvm._ffi.register_object("ansor.SearchPolicy") -class SearchPolicy(Object): - """ The base class for search policy """ - def continue_search(self, task, num_measure, verbose, measurer): - return _ffi_api.SearchPolicyContinueSearchOneRound(self, task, - num_measure, verbose, measurer) - - def set_task(self, task): - _ffi_api.SearchPolicySetTask(self, task) - - def set_verbose(self, verbose): - _ffi_api.SearchPolicySetVerbose(self, verbose) - - def run_callbacks(self, callbacks): - _ffi_api.SearchPolicyRunCallbacks(self, callbacks) - - -@tvm._ffi.register_object("ansor.SketchSearchPolicy") -class SketchSearchPolicy(SearchPolicy): - """ The search policy that searches in a hierarchical search space defined by sketches. - The policy randomly samples programs from the space defined by sketches - and use evolutionary search to fine-tune them. - - Parameters - ---------- - program_cost_model: CostModel - Cost model for programs - params: int - Parameters of the search policy. See `src/ansor/search_policy/sketch_search_policy.h` - to find the definitions. See code below to find the default values - seed: int - Random seed - """ - def __init__(self, - program_cost_model, - params=None, - seed=None): - # set default parameters - default_params = { - "eps_greedy": 0.05, - - 'evolutionary_search_population': 2048, - 'evolutionary_search_num_iters': 15, - "evolutionary_search_mutation_prob": 0.85, - "evolutionary_search_use_measured_ratio": 0.2, - - 'cpu_multi_level_tiling_structure': 'SSRSRS', - 'gpu_multi_level_tiling_structure': 'SSSRRSRS', - - 'disable_change_compute_location': 0, - } - - if params is None: - params = default_params - else: - for key, value in default_params.items(): - if key not in params: - params[key] = value - - self.__init_handle_by_constructor__( - _ffi_api.SketchSearchPolicy, program_cost_model, params, - seed or random.randint(1, 1 << 30)) - - -@tvm._ffi.register_object("ansor.SearchCallback") -class SearchCallback(Object): - """Callback function before or after search process""" - pass - - -@tvm._ffi.register_object("ansor.PreloadMeasuredStates") -class PreloadMeasuredStates(SearchCallback): - """ A SearchCallback to load measured states from the log file for a search policy. - This can resume the state of the search policy. - - Parameters - ---------- - filename: str - """ - def __init__(self, filename: str): - self.__init_handle_by_constructor__( - _ffi_api.PreloadMeasuredStates, filename) - - -@tvm._ffi.register_object("ansor.PreloadCustomSketchRule") -class PreloadCustomSketchRule(SearchCallback): - """ - A SearchCallback for SketchSearchPolicy that allowing users to add - custom sketch rule. - - Notes - ----- - This is an advanced feature. Make sure you're clear how it - works and this should only be used in SketchSearchPolicy. - - Parameters - ---------- - meet_condition_func: Function - A function with `(policy, state, stage_id) -> int` - apply_func: Function - A function with `(policy, state, stage_id) -> [[State, int], ...]` - """ - def __init__(self, meet_condition_func, apply_func): - self.__init_handle_by_constructor__( - _ffi_api.PreloadCustomSketchRule, meet_condition_func, apply_func) - - -@tvm._ffi.register_object("ansor.TuneOption") -class TuneOption(Object): - """ The options for tuning - - Parameters - ---------- - n_trials: int - Number of total measurement trials - early_stopping: int - Stops early the tuning if no improvement after n measurements - num_measure_per_iter: int - The number of programs to be measured at each iteration - verbose: int - Verbosity level. 0 means silent. - builder: Builder - Builder which builds the program - runner: Runner - Runner which runs the program and measure time costs - measure_callbacks: List[MeasureCallback] - Callback functions called after each measure - Candidates: - - ansor.LogToFile - pre_search_callbacks: List[SearchCallback] - Callback functions called before the search process - Candidates: - - ansor.PreloadMeasuredStates - - ansor.PreloadCustomSketchRule - """ - def __init__(self, n_trials=0, early_stopping=-1, num_measure_per_iter=64, - verbose=1, builder='local', runner='local', measure_callbacks=None, - pre_search_callbacks=None): - if isinstance(builder, str): - if builder == 'local': - builder = LocalBuilder() - else: - raise ValueError("Invalid builder: " + builder) - - if isinstance(runner, str): - if runner == 'local': - runner = LocalRunner() - else: - raise ValueError("Invalid builder: " + runner) - - if measure_callbacks is None: - measure_callbacks = [] - - if pre_search_callbacks is None: - pre_search_callbacks = [] - - self.__init_handle_by_constructor__( - _ffi_api.TuneOption, n_trials, early_stopping, num_measure_per_iter, - verbose, builder, runner, measure_callbacks, pre_search_callbacks) - - -def auto_schedule(workload, target=None, - target_host=None, search_policy='default', - hardware_params=None, tune_option=None): - """ Do auto scheduling for a computation declaration. - - The workload parameter can be a `string` as workload_key, or directly - passing a `SearchTask` as input. - - Parameters - ---------- - workload : Union[SearchTask, str] - target : Target - target_host : Target = None - search_policy : Union[SearchPolicy, str] - hardware_params : HardwareParams - tune_option : TuneOption - - Returns - ------- - sch : tvm.Schedule - tensors : List[Tensor] - """ - if isinstance(search_policy, str): - if search_policy == 'default': - search_policy = SketchSearchPolicy(RandomModel()) - else: - raise ValueError("Invalid search policy: " + search_policy) - - if tune_option is None: - tune_option = TuneOption(n_trials=0) - - if isinstance(workload, str): - sch, tensors = _ffi_api.AutoScheduleByWorkloadKey( - workload, target, target_host, search_policy, hardware_params, tune_option) - return sch, tensors - elif isinstance(workload, SearchTask): - sch, tensors = _ffi_api.AutoScheduleBySearchTask(workload, search_policy, tune_option) - return sch, tensors - else: - raise ValueError("Invalid workload: " + workload + ". Expect a string or SearchTask") diff --git a/python/tvm/ansor/cost_model/__init__.py b/python/tvm/ansor/cost_model/__init__.py deleted file mode 100644 index 56e4a5f9128b3..0000000000000 --- a/python/tvm/ansor/cost_model/__init__.py +++ /dev/null @@ -1,21 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=unused-import, redefined-builtin -""" Cost model that estimates the performance of programs """ - -from .cost_model import RandomModel -from .xgb_model import XGBModel diff --git a/python/tvm/ansor/cost_model/cost_model.py b/python/tvm/ansor/cost_model/cost_model.py deleted file mode 100644 index 57cc53853b2e1..0000000000000 --- a/python/tvm/ansor/cost_model/cost_model.py +++ /dev/null @@ -1,78 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" Cost model that estimates the performance of programs """ -import ctypes -import numpy as np - -import tvm._ffi -from tvm.runtime import Object -from .. import _ffi_api - - -@tvm._ffi.register_object("ansor.CostModel") -class CostModel(Object): - """The base class for cost model""" - pass - - -@tvm._ffi.register_object("ansor.RandomModel") -class RandomModel(Object): - """A model returns random estimation for all inputs""" - def __init__(self): - self.__init_handle_by_constructor__(_ffi_api.RandomModel) - - -@tvm._ffi.register_func("ansor.cost_model.random_number") -def random_number(n, return_ptr): - """ A random number generator func for c++'s RandomModel """ - if n == 0: - return - return_ptr = ctypes.cast(return_ptr, ctypes.POINTER(ctypes.c_float)) - array_wrapper = np.ctypeslib.as_array(return_ptr, shape=(n,)) - array_wrapper[:] = np.random.uniform(0, 1, (n,)) - - -@tvm._ffi.register_object("ansor.PythonBasedModel") -class PythonBasedModel(CostModel): - """Base class for cost models implemented in python""" - def __init__(self): - def update_func(inputs, results): - self.update(inputs, results) - - def predict_func(task, states, return_ptr): - return_ptr = ctypes.cast(return_ptr, ctypes.POINTER(ctypes.c_float)) - array_wrapper = np.ctypeslib.as_array(return_ptr, shape=(len(states),)) - array_wrapper[:] = self.predict(task, states) - - def predict_stage_func(task, states, return_ptr): - ret = self.predict_stages(task, states) - return_ptr = ctypes.cast(return_ptr, ctypes.POINTER(ctypes.c_float)) - array_wrapper = np.ctypeslib.as_array(return_ptr, shape=ret.shape) - array_wrapper[:] = ret - - self.__init_handle_by_constructor__(_ffi_api.PythonBasedModel, update_func, - predict_func, predict_stage_func) - - def update(self, inputs, results): - raise NotImplementedError - - def predict(self, task, states): - raise NotImplementedError - - def predict_stages(self, task, states): - raise NotImplementedError diff --git a/python/tvm/ansor/cost_model/xgb_model.py b/python/tvm/ansor/cost_model/xgb_model.py deleted file mode 100644 index 42af17daae2c6..0000000000000 --- a/python/tvm/ansor/cost_model/xgb_model.py +++ /dev/null @@ -1,474 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Cost model based on xgboost""" -import multiprocessing -import logging -from collections import defaultdict - -import numpy as np -import xgboost as xgb - -from tvm.autotvm.tuner.xgboost_cost_model import get_rank, recall_curve, max_curve -from .cost_model import PythonBasedModel -from ..feature import get_per_stmt_features_from_measure_pairs, get_per_stmt_features_from_states -from ..serialization import LogReader - -logger = logging.getLogger('ansor') - -class XGBDMatrixContext: - """Context to hold additional attributes of xgb.DMatrix""" - def __init__(self): - self.context_dict = defaultdict(dict) - - def get(self, key, matrix, default=None): - return self.context_dict[key].get(matrix.handle.value, default) - - def put(self, key, matrix, value): - self.context_dict[key][matrix.handle.value] = value - -dmatrix_context = XGBDMatrixContext() - -class XGBModel(PythonBasedModel): - """Train a XGBoost model to predict the runtime cost of a program. - The cost of a program = the sum of the costs of all stages in this program. - i.e. Cost(p) = cost_s0 + cost_s1 + ... + cost_sn, where cost_si is the cost of Stage i - - The xgboost model makes prediction per stage, then we sum them up. - The final predction made by this class is normalized throughtput (from 0 to 1, larger is better) - - To support this stage decomposition, we have to implement a custom loss function for - XGBoost, which is the `pack_sum` in the code below. - """ - def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None): - self.xgb_params = { - 'max_depth': 10, - 'gamma': 0.001, - 'min_child_weight': 0, - 'eta': 0.2, - # todo(lmzheng): automatically decrease learning rate when the loss is too large - - 'n_gpus': 0, - 'nthread': multiprocessing.cpu_count() // 2, - 'verbosity': 0, - 'seed': seed or 43, - 'disable_default_eval_metric': 1 - } - self.bst = None - self.plan_size = 32 - self.num_warmup_sample = num_warmup_sample - self.verbose_eval = verbose_eval - - super().__init__() - - # measurement input/result pairs - self.inputs = [] - self.results = [] - self.inputs_feature_cache = [] - - def update(self, inputs, results): - if len(inputs) <= 0: - return - - self.inputs.extend(inputs) - self.results.extend(results) - - # extract feature - n_cached = len(self.inputs_feature_cache) - features, normalized_throughputs, task_ids = \ - get_per_stmt_features_from_measure_pairs(self.inputs, self.results, - skip_first_n_feature_extraction=n_cached) - if n_cached > 0: - features = list(features) - features[:n_cached] = self.inputs_feature_cache - features = np.array(features) - self.inputs_feature_cache = features - dtrain = pack_sum_xgbmatrix(features, normalized_throughputs, - task_ids, normalized_throughputs) - - # train xgb model - self.bst = xgb.train(self.xgb_params, dtrain, - num_boost_round=10000, - obj=pack_sum_square_error, - callbacks=[custom_callback( - stopping_rounds=50, - metric='tr-p-rmse', - fevals=[ - pack_sum_rmse, pack_sum_average_peak_score(self.plan_size), - ], - evals=[(dtrain, 'tr')], - maximize=False, - verbose_eval=self.verbose_eval)]) - - def predict(self, task, states): - features = get_per_stmt_features_from_states(states, task) - if self.bst is not None and len(self.inputs) > self.num_warmup_sample: - dtest, pack_ids = pack_sum_xgbmatrix_for_prediction(features) - raw_preds = self.bst.predict(dtest) - ret = pack_sum_predict_throughput(raw_preds, pack_ids) - else: - ret = np.random.uniform(0, 1, (len(states),)) - - # Predict 0 for invalid states that failed to be lowered. - for idx, feature in enumerate(features): - if feature.min() == feature.max() == 0: - ret[idx] = float('-inf') - - return ret - - def predict_stages(self, task, states): - # Format: (s0 score, ..., sN score, s0 n_stage, s0 stage 0, ..., s1 n_stage, s1 stage 0,) - features = get_per_stmt_features_from_states(states, task) - if self.bst is not None and len(self.inputs) > self.num_warmup_sample: - dtest, pack_ids = pack_sum_xgbmatrix_for_prediction(features) - raw_preds = self.bst.predict(dtest) - breakdown = pack_sum_predict_throughput(raw_preds, pack_ids) - stage_scores = [[] for _ in range(len(states))] - for pred, pack_id in zip(raw_preds, pack_ids): - stage_scores[pack_id].append(pred) - for idx, stage_score in enumerate(stage_scores): - breakdown = np.append(breakdown, len(stage_score)) - breakdown = np.concatenate((breakdown, -np.array(stage_score))) - else: - breakdown = np.concatenate( - (np.random.uniform(0, 1, (len(states), )), np.zeros(len(states), ))) - - # Predict 0 for invalid states that failed to be lowered. - for idx, feature in enumerate(features): - if feature.min() == feature.max() == 0: - breakdown[idx] = float('-inf') - - return breakdown - - def load_log_file(self, file_name, n_lines=-1): - inputs, results = LogReader(file_name).read_lines(n_lines) - logger.info("XGBModel: Loaded %s lines of history log from %s", len(inputs), file_name) - self.update(inputs, results) - - def save(self, file_name: str): - self.bst.save_model(file_name) - - def load(self, file_name: str): - if self.bst is None: - self.bst = xgb.Booster(self.xgb_params) - self.bst.load_model(file_name) - self.num_warmup_sample = -1 - - -def pack_sum_xgbmatrix_for_prediction(xs): - x_flatten = [] - pack_ids = [] - - for ct, x in enumerate(xs): - for row in x: - x_flatten.append(row) - pack_ids.append(ct) - - return xgb.DMatrix(np.array(x_flatten)), pack_ids - - -def pack_sum_xgbmatrix(xs, ys, gids=None, weights=None): - if gids is not None: - # sort by group - indices = gids.argsort() - xs, ys = xs[indices], ys[indices] - group_sizes = np.bincount(gids) - if weights is not None: - weights = weights[indices] - else: - # assume it has only one group - group_sizes = [len(xs)] - - x_flatten = [] - y_flatten = [] - weights_flatten = [] - pack_ids = [] - - if weights is not None: - for ct, (x, y, w) in enumerate(zip(xs, ys, weights)): - for row in x: - x_flatten.append(row) - y_flatten.append(y) - weights_flatten.append(w) - pack_ids.append(ct) - else: - for ct, (x, y) in enumerate(zip(xs, ys)): - for row in x: - x_flatten.append(row) - y_flatten.append(y) - pack_ids.append(ct) - - ret = xgb.DMatrix(np.array(x_flatten), y_flatten) - if weights is not None: - ret.set_weight(weights_flatten) - dmatrix_context.put('pack_ids', ret, np.array(pack_ids)) - dmatrix_context.put('group_sizes', ret, group_sizes) - return ret - -LOSS_TYPE = 3 - -# Type 0 -# The model predicts cost. Use square error of throughput as loss -# loss = 1/2 * (1 / sum(x_i) - y) ^ 2 -# -# Type 1 -# The model predicts cost. Use square error of cost as loss -# loss = 1/2 * (sum(x_i) - 1 / y) ^ 2 -# -# Type 2 -# The model predicts throughput. Use square error of throughput as loss. -# loss = 1/2 * (1 / sum(1 / x_i) - y) ^ 2 -# -# Type 3 -# The model predicts throughput. Use square error of throughput as loss. -# But approximate 1 / (1 / a_1 + 1 / a_2 + ... + 1 / a_n) with -(b_1 + b_2 + b_3) -# loss = 1/2 * (-sum(x_i) - y) ^ 2 -# -# Type 4 -# The model predicts throughput. Use square error of throughput as loss. -# But approximate 1 / (1 / a_1 + 1 / a_2 + ... + 1 / a_n) with -(b_1 + b_2 + b_3) -# Also add a sigmoid to force the prediction to be within the range of (0, 1) -# loss = 1/2 * (sigmoid(-sum(x_i)) - y) ^ 2 -# - -def pack_sum_predict_throughput(raw_preds, pack_ids): - if LOSS_TYPE == 0: - sum_pred = np.bincount(pack_ids, weights=raw_preds) - return 1 / sum_pred - elif LOSS_TYPE == 1: - sum_pred = np.bincount(pack_ids, weights=raw_preds) - return 1 / sum_pred - elif LOSS_TYPE == 2: - sum_inverse_preds = np.bincount(pack_ids, weights=1 / raw_preds) - return 1 / sum_inverse_preds - elif LOSS_TYPE == 3: - sum_pred = np.bincount(pack_ids, weights=raw_preds) - return - sum_pred # pylint: disable=invalid-unary-operand-type - elif LOSS_TYPE == 4: - sum_pred = np.bincount(pack_ids, weights=raw_preds) - return 1 / (1 + np.exp(sum_pred)) - else: - raise ValueError("Invalid loss type: " + LOSS_TYPE) - -def pack_sum_square_error(preds, dtrain): - pack_ids = dmatrix_context.get("pack_ids", dtrain) - weight = dtrain.get_weight() - - if LOSS_TYPE == 0: - sum_pred = np.bincount(pack_ids, weights=preds) - x = sum_pred[pack_ids] - y = dtrain.get_label() - gradient = (x * y - 1) / np.power(x, 3) - hessian = (3 - 2 * x * y) / np.power(x, 4) - elif LOSS_TYPE == 1: - sum_pred = np.bincount(pack_ids, weights=preds) - x = sum_pred[pack_ids] - y = dtrain.get_label() - gradient = x - 1 / np.minimum(y, 1e6) - hessian = np.ones_like(gradient) - elif LOSS_TYPE == 2: - sum_inverse_preds = np.bincount(pack_ids, weights=1 / preds)[pack_ids] - y = dtrain.get_label() - gradient = (1 / sum_inverse_preds - y) / (np.power(preds * sum_inverse_preds, 2)) - hessian = (2 * preds * y * np.power(sum_inverse_preds, 2) - 2 * y * sum_inverse_preds - 2 * preds * sum_inverse_preds + 3) / (np.power(preds * sum_inverse_preds, 4)) - elif LOSS_TYPE == 3: - sum_pred = np.bincount(pack_ids, weights=preds) - x = sum_pred[pack_ids] - y = dtrain.get_label() - gradient = x + y - hessian = np.ones_like(gradient) - elif LOSS_TYPE == 4: - sum_pred = np.bincount(pack_ids, weights=preds) - exp_x = np.exp(sum_pred[pack_ids]) - exp_2x = np.power(exp_x, 2) - y = dtrain.get_label() - gradient = exp_x * (exp_x * y + y - 1) / np.power(exp_x + 1, 3) - hessian = exp_x * (-exp_2x * y + 2 * exp_x + y - 1) / np.power(exp_x + 1, 4) - else: - raise ValueError("Invalid loss type: " + LOSS_TYPE) - - if len(weight) == 0: - return gradient, hessian - else: - return gradient * weight, hessian * weight - -def pack_sum_rmse(raw_preds, dtrain): - pack_ids = dmatrix_context.get("pack_ids", dtrain) - preds = pack_sum_predict_throughput(raw_preds, pack_ids)[pack_ids] - return 'p-rmse', np.sqrt(np.mean(np.square((preds - dtrain.get_label())))) - -def pack_sum_average_peak_score(N): - """Evaluate pack sum average peak score for xgb""" - - def feval(preds, labels): - group_sizes = dmatrix_context.get('group_sizes', labels, [len(preds)]) - pack_ids = dmatrix_context.get("pack_ids", labels) - - preds = pack_sum_predict_throughput(preds, pack_ids) - labels = (np.bincount(pack_ids, weights=labels.get_label()) - / np.unique(pack_ids, return_counts=True)[1]) - - scores = [] - offset = 0 - for size in group_sizes: - preds_group = preds[offset:offset + size] - labels_group = labels[offset:offset + size] - offset += size - - trials = np.argsort(preds_group)[::-1][:N] - trial_scores = labels_group[trials] - curve = max_curve(trial_scores) / np.max(labels_group) - scores.append(np.mean(curve)) - return "a-peak@%d" % N, np.mean(scores) - return feval - -def pack_sum_average_recall_score(N): - """Evaluate average recall score for xgb""" - - def feval(preds, labels): - group_sizes = dmatrix_context.get('group_sizes', labels, [len(preds)]) - pack_ids = dmatrix_context.get("pack_ids", labels) - - preds = pack_sum_predict_throughput(preds, pack_ids) - labels = (np.bincount(pack_ids, weights=labels.get_label()) - / np.unique(pack_ids, return_counts=True)[1]) - - scores = [] - offset = 0 - for size in group_sizes: - preds_group = preds[offset:offset + size] - labels_group = labels[offset:offset + size] - offset += size - - trials = np.argsort(preds_group)[::-1] - ranks = get_rank(labels_group[trials])[:N] - curve = recall_curve(ranks) - scores.append(np.mean(curve)) - return "a-recall@%d" % N, np.mean(scores) - return feval - - -def custom_callback(stopping_rounds, metric, fevals, evals=(), log_file=None, - maximize=False, verbose_eval=True, skip_every=2): - """Callback function for xgboost to support multiple custom evaluation functions""" - from xgboost.core import EarlyStopException - from xgboost.callback import _fmt_metric - from xgboost.training import aggcv - - state = {} - metric_shortname = metric.split("-")[1] - - def init(env): - """internal function""" - bst = env.model - - state['maximize_score'] = maximize - state['best_iteration'] = 0 - if maximize: - state['best_score'] = float('-inf') - else: - state['best_score'] = float('inf') - - if bst is not None: - if bst.attr('best_score') is not None: - state['best_score'] = float(bst.attr('best_score')) - state['best_iteration'] = int(bst.attr('best_iteration')) - state['best_msg'] = bst.attr('best_msg') - else: - bst.set_attr(best_iteration=str(state['best_iteration'])) - bst.set_attr(best_score=str(state['best_score'])) - else: - assert env.cvfolds is not None - - def callback(env): - """internal function""" - if not state: - init(env) - - bst = env.model - i = env.iteration - cvfolds = env.cvfolds - - res_dict = {} - - if i % skip_every == 1: - return - - ##### evaluation ##### - if cvfolds is not None: - for feval in fevals: - tmp = aggcv([f.eval(i, feval) for f in cvfolds]) - for k, mean, std in tmp: - res_dict[k] = [mean, std] - else: - for feval in fevals: - bst_eval = bst.eval_set(evals, i, feval) - res = [x.split(':') for x in bst_eval.split()] - for kv in res[1:]: - res_dict[kv[0]] = [float(kv[1])] - - eval_res = [] - keys = list(res_dict.keys()) - keys.sort(key=lambda x: x if metric_shortname not in x else "a" + x) - for key in keys: - v = res_dict[key] - eval_res.append([key] + v) - - ##### print eval result ##### - if not isinstance(verbose_eval, bool) and verbose_eval and i % verbose_eval == 0: - infos = ["XGB iter: %3d" % i] - for item in eval_res: - if 'null' in item[0]: - continue - infos.append("%s: %.6f" % (item[0], item[1])) - - logger.debug("\t".join(infos)) - if log_file: - with open(log_file, "a") as fout: - fout.write("\t".join(infos) + '\n') - - ##### choose score and do early stopping ##### - score = None - for item in eval_res: - if item[0] == metric: - score = item[1] - break - assert score is not None - - best_score = state['best_score'] - best_iteration = state['best_iteration'] - maximize_score = state['maximize_score'] - if (maximize_score and score > best_score) or \ - (not maximize_score and score < best_score): - msg = '[%d] %s' % ( - env.iteration, - '\t'.join([_fmt_metric(x) for x in eval_res])) - state['best_msg'] = msg - state['best_score'] = score - state['best_iteration'] = env.iteration - # save the property to attributes, so they will occur in checkpoint. - if env.model is not None: - env.model.set_attr(best_score=str(state['best_score']), - best_iteration=str(state['best_iteration']), - best_msg=state['best_msg']) - elif env.iteration - best_iteration >= stopping_rounds: - best_msg = state['best_msg'] - if verbose_eval and env.rank == 0: - logger.debug("XGB stopped. Best iteration: %s ", best_msg) - raise EarlyStopException(best_iteration) - - return callback diff --git a/python/tvm/ansor/dispatcher.py b/python/tvm/ansor/dispatcher.py deleted file mode 100644 index 0c07fd141bd2c..0000000000000 --- a/python/tvm/ansor/dispatcher.py +++ /dev/null @@ -1,299 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -The global context that dispatches best configurations to workloads -""" -# pylint: disable=invalid-name - -from __future__ import absolute_import as _abs - -import logging - -import numpy as np - -from tvm.tir.expr import FloatImm - -logger = logging.getLogger('auto_scheduler') - - -class DispatchContext(object): - """ - Base class of dispatch context. - """ - current = None - - def __init__(self): - self._old_ctx = DispatchContext.current - - def query(self, target, workload): - """ - Query the context to get the specific config for a workload. - If cannot find the result inside this context, this function will query it - from the upper contexts. - - Parameters - ---------- - target: Target - The current target - workload : str - The current workload - - Returns - ------- - cfg : State - The schedule configuration for the workload - """ - ret = self._query_inside(target, workload) - return ret - - def update(self, target, workload, cfg): - """ - Update the config for a workload - - Parameters - ---------- - target: Target - The current target - workload : Workload - The current workload. - cfg : State - The schedule configuration for the workload - """ - raise NotImplementedError() - - def _query_inside(self, target, workload): - """ - Query the context to get the specific config for a workload. - This function only query config inside this context. - - Parameters - ---------- - target: Target - The current target - workload : Workload - The current workload. - - Returns - ------- - cfg : State or str - The schedule configuration for the workload - """ - raise NotImplementedError() - - def __enter__(self): - self._old_ctx = DispatchContext.current - DispatchContext.current = self - return self - - def __exit__(self, ptype, value, trace): - DispatchContext.current = self._old_ctx - - -class ApplyConfig(DispatchContext): - """Apply a deterministic config for all queries. - - Parameters - ---------- - config : State - The schedule configuration - """ - def __init__(self, config): - super(ApplyConfig, self).__init__() - self._config = config - self.workload = None - - def _query_inside(self, target, workload): - """Override query""" - self.workload = workload - return self._config - - def update(self, target, workload, cfg): - """Override update""" - self.workload = workload - self._config = cfg - - -class ApplyHistoryBest(DispatchContext): - """ - Apply the history best config - - Parameters - ---------- - records : str or iterator of (MeasureInput, MeasureResult) - Collection of tuning records. - If is str, then it should be the filename of a records log file. - Each row of this file is an encoded record pair. - Otherwise, it is an iterator. - n_lines: int (optional) - if it is not None, only load the first `n_lines` lines of log - """ - def __init__(self, records, n_lines=None): - super(ApplyHistoryBest, self).__init__() - - self.best_by_targetkey = {} - self.best_by_model = {} - self._best_user_defined = {} - - if records: - self.load(records, n_lines) - - def load(self, records, n_lines=None): - """Load records to this dispatch context - - Parameters - ---------- - records : str or iterator of (MeasureInput, MeasureResult) - Collection of tuning records. - If is str, then it should be the filename of a records log file. - Each row of this file is an encoded record pair. - Otherwise, it is an iterator. - n_lines: int (optional) - if it is not None, only load the first `n_lines` lines of log - """ - from pathlib import Path - from . import load_from_file - - if isinstance(records, Path): - records = str(records) - - if isinstance(records, str): - records = load_from_file(records) - if not records: - return - - best_by_targetkey = self.best_by_targetkey - best_by_model = self.best_by_model - - counter = 0 - for inp, res in records: - if n_lines is not None and counter >= n_lines: - break - counter += 1 - if res.error_no != 0: - continue - - # use target keys in tvm target system as key to build best map - for k in inp.task.target.keys: - key = (k, inp.task.workload_key) - if key not in best_by_targetkey: - best_by_targetkey[key] = (inp, res) - else: - _, other_res = best_by_targetkey[key] - other_costs = [x.value for x in other_res.costs if isinstance(x, FloatImm)] - costs = [x.value for x in res.costs if isinstance(x, FloatImm)] - if np.mean(other_costs) > np.mean(costs): - best_by_targetkey[key] = (inp, res) - - # use model as key to build best map - key = (inp.task.target.model, inp.task.workload_key) - if key not in best_by_model: - if inp.task.target.model != 'unknown': - best_by_model[key] = (inp, res) - else: - _, other_res = best_by_model[key] - other_costs = [x.value for x in other_res.costs if isinstance(x, FloatImm)] - costs = [x.value for x in res.costs if isinstance(x, FloatImm)] - if np.mean(other_costs) > np.mean(costs): - best_by_model[key] = (inp, res) - - logger.debug("Finish loading %d records", counter) - - def _query_inside(self, target, workload): - if target is None: - raise RuntimeError("Need a target context to find the history best. " - "Hint: If your target is llvm, use `with tvm.target.create('llvm'):`" - " above the dispatcher call. So does other target. ") - - # first try matching by model - key = (target.model, workload) - if key in self._best_user_defined: - return self._best_user_defined[key] - if key in self.best_by_model: - return self.best_by_model[key][0].state - - # then try matching by target key - for k in target.keys: - key = (k, workload) - if key in self._best_user_defined: - return self._best_user_defined[key] - if key in self.best_by_targetkey: - return self.best_by_targetkey[key][0].state - - return None - - def update(self, target, workload, state): - model = target.model - key = (model, workload) - self._best_user_defined[key] = state - - for k in target.keys: - key = (k, workload) - self._best_user_defined[key] = state - - -class FallbackContext(DispatchContext): - """ - A fallback dispatch context. - This is used as the root context. - """ - - def __init__(self): - super(FallbackContext, self).__init__() - self.memory = {} - self.silent = False - - # a set to prevent print duplicated message - self.messages = set() - - def _query_inside(self, target, workload): - key = (str(target), workload) - if key in self.memory: - return self.memory[key] - - if not self.silent: - msg = "Cannot find config for target=%s, workload=%s. A fallback configuration "\ - "is used, which may bring great performance regression." % (target, workload) - if msg not in self.messages: - self.messages.add(msg) - logger.warning(msg) - cfg = None - - # cache this config to avoid duplicated warning message - self.memory[key] = cfg - return cfg - - def clear_cache(self, target, workload): - """Clear fallback cache. Pass the same argument as _query_inside to this function - to clean the cache. - - Parameters - ---------- - target: Target - The current target - workload : Workload - The current workload. - """ - key = (str(target), workload) - if key in self.memory: - del self.memory[key] - - def update(self, target, workload, cfg): - key = (str(target), workload) - self.memory[key] = cfg - - -DispatchContext.current = FallbackContext() diff --git a/python/tvm/ansor/env.py b/python/tvm/ansor/env.py deleted file mode 100644 index 0f35f92acbbc6..0000000000000 --- a/python/tvm/ansor/env.py +++ /dev/null @@ -1,26 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -""" The scope to store global variables in ansor """ - - -class AutoschedulerGlobalScope(object): - def __init__(self): - self.topi_in_compute_rewrite_mode = False - -GLOBAL_SCOPE = AutoschedulerGlobalScope() - diff --git a/python/tvm/ansor/relay_integration.py b/python/tvm/ansor/relay_integration.py deleted file mode 100644 index 3c2eabd3dfacb..0000000000000 --- a/python/tvm/ansor/relay_integration.py +++ /dev/null @@ -1,240 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=unused-variable,invalid-name - -""" -Integrate ansor into relay. It implements the following items: -1. Extract search tasks from a relay program -2. Provide auto-scheduling for all TOPI compute functions -""" -import os -import json -import threading - -from tvm import target, te, transform -from tvm.te.tensor import PlaceholderOp, ComputeOp -from .dispatcher import DispatchContext -from .workload_registry import register_workload_bufs, compute_dag_hash -from .compute_dag import ComputeDAG, LayoutRewriteLevel -from .env import GLOBAL_SCOPE - -def call_all_topi_funcs(mod, target, params): - """Call all TOPI compute + schedule to extract tasks in a relay program""" - # pylint: disable=import-outside-toplevel - from tvm import relay - - with transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): - bld_mod = relay.build_module.BuildModule() - bld_mod.call_all_topi_funcs(mod, target=target, params=params) - -def extract_from_program(mod, params, target, target_host=None): - """ Extract tuning tasks from a relay program. - - This function is the single program version of extract_from_multiple_program. - - Parameters - ---------- - mod : relay.Module - The module to extract. - params: dict of str to numpy array - The associated parameters of the program - ops: List of relay op - List of relay ops to be tuned - target: tvm.target.Target - The compilation target - target_host: tvm.target.Target - The host compilation target - - Returns - ------- - workloads: Array of Tuple(wkl_key, target) - """ - return extract_from_multiple_program([mod], [params], target, target_host) - -def extract_from_multiple_program(mods, params, target, target_host=None): - """ Extract tuning tasks from multiple relay programs. - - Parameters - ---------- - mods : List of relay.Module - The modules to extract. - params: List of dict of str to numpy array - The associated parameters of the programs - ops: List of relay op - List of relay ops to be tuned - target: tvm.target.Target - The compilation target - target_host: tvm.target.Target - The host compilation target - - Returns - ------- - workloads: Array of Tuple(wkl_key, target) - """ - # pylint: disable=import-outside-toplevel - from tvm import relay - - env = TracingEnvironment(TracingMode.EXTRACT_TASK) - with env: - # run compiler to collect all TOPI calls during compilation - for mod, param in zip(mods, params): - # wrap build call in a new thread to avoid the conflict - # between python's multiprocessing and tvm's thread pool - build_thread = threading.Thread(target=call_all_topi_funcs, - args=(mod, target, param)) - build_thread.start() - build_thread.join() - relay.backend.compile_engine.get().clear() - - # create tasks for target - wkl_keys = [] - wkl_weights = [] - for wkl_key, wkl_weight in env.wkl_key_collection.items(): - wkl_keys.append(wkl_key) - wkl_weights.append(wkl_weight) - - return wkl_keys, wkl_weights - - -def prepare_layout_rewrite(mod, params, target): - """ - Prepare for kernel layout rewrite. This function will write layout infos to a global static variable. - Then these layout info will be used by a relay pass `kernel_layout_transform`. - """ - # pylint: disable=import-outside-toplevel - from tvm import relay - - env = TracingEnvironment(TracingMode.PREPARE_LAYOUT_REWRITE) - with env: - # wrap build call in a new thread to avoid the conflict - # between python's multiprocessing and tvm's thread pool - build_thread = threading.Thread(target=call_all_topi_funcs, - args=(mod, target, params)) - build_thread.start() - build_thread.join() - relay.backend.compile_engine.get().clear() - - if env.layout_rewrite_success_ct > 0: - GLOBAL_SCOPE.topi_in_compute_rewrite_mode = True - -def finish_layout_rewrite(): - """Clear the global flag for layout rewrite""" - GLOBAL_SCOPE.topi_in_compute_rewrite_mode = False - - -class TracingMode: - """Two modes for tracing""" - EXTRACT_TASK = 0 # trace all topi calls to extract tasks - PREPARE_LAYOUT_REWRITE = 1 # trace all topi calls to prepare layout rewrite - -class TracingEnvironment: - """Global environment for tracing all topi function calls""" - current = None - - def __init__(self, tracing_mode): - self.tracing_mode = tracing_mode - self.relay_disable_build_cache = "false" - self.layout_rewrite_success_ct = 0 - self.wkl_key_collection = {} - - def __enter__(self): - self.relay_disable_build_cache = os.environ.get("TVM_RELAY_DISABLE_BUILD_CACHE", "false") - os.environ["TVM_RELAY_DISABLE_BUILD_CACHE"] = "true" - TracingEnvironment.current = self - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - os.environ["TVM_RELAY_DISABLE_BUILD_CACHE"] = self.relay_disable_build_cache - TracingEnvironment.current = None - - def add_workload_key(self, key): - """Add the workload key of an Ansor search task - - Parameters - ---------- - key: str - """ - if key in self.wkl_key_collection: - self.wkl_key_collection[key] += 1 - else: - self.wkl_key_collection[key] = 1 - - -def traverse_to_get_io_tensors(outs): - """Traverse from a list of output tensors to get a whole computational DAG""" - layout_free_ops = [] - inputs = [] - - visited = set() - - def traverse(t): - if t in visited: - return - if isinstance(t.op, PlaceholderOp): - inputs.append(t) - elif isinstance(t.op, ComputeOp): - if "layout_free_placeholders" in t.op.attrs: - layout_free_ops.append(t.op) - for x in t.op.input_tensors: - traverse(x) - visited.add(t) - - for t in outs: - traverse(t) - - has_layout_free = (len(layout_free_ops) > 0) - return inputs + [t for t in outs], has_layout_free - - -def auto_schedule_topi(outs): - """ Use ansor to auto-schedule a topi compute declaration """ - io_tensors, has_layout_free = traverse_to_get_io_tensors(outs) - key = register_workload_bufs(io_tensors) - - env = TracingEnvironment.current - if env is None: # in the final build mode - state = DispatchContext.current.query(target.Target.current(), key) - if state is None: - return te.create_schedule([x.op for x in outs]) - - dag = ComputeDAG(io_tensors) - # Only update compute body, layout_rewrite_level = LayoutRewriteLevel.COMPUTE_REWRITE, - # Since kernel layout has already been rewritten in relay pass - schedule, _ = dag.apply_steps_from_state(state, - layout_rewrite_level=LayoutRewriteLevel.COMPUTE_REWRITE) - return schedule - elif env.tracing_mode == TracingMode.EXTRACT_TASK: # in the task extraction mode - env.add_workload_key(key) - return te.create_schedule([x.op for x in outs]) - elif env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE: - # in prepare_layout_rewrite mode - if has_layout_free: - # Rewrite the DAG and update the transform history for - # the new dag in DispatchContext - dispatch_ctx = DispatchContext.current - tgt = target.Target.current() - state = dispatch_ctx.query(tgt, key) - assert state is not None - dag = ComputeDAG(outs) - new_dag = dag.rewrite_layout_from_state(state) - new_key = json.dumps((compute_dag_hash(new_dag),)) - dispatch_ctx.update(tgt, new_key, state) - if new_key != key: - env.layout_rewrite_success_ct += 1 - return te.create_schedule([x.op for x in outs]) - else: - raise ValueError("Invalid tracing mode: " + env.tracing_mode) diff --git a/python/tvm/ansor/task_scheduler.py b/python/tvm/ansor/task_scheduler.py deleted file mode 100644 index 587fe3121e883..0000000000000 --- a/python/tvm/ansor/task_scheduler.py +++ /dev/null @@ -1,293 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""TaskScheduler that allocates the time resources when tuning multiple tasks together""" -from typing import List, Union, Callable -import time - -import numpy as np - -from .auto_schedule import SearchTask, SearchPolicy, SketchSearchPolicy, TuneOption -from .cost_model import RandomModel, XGBModel -from .measure import ProgramMeasurer -from .utils import array_mean, to_str_round - - -class TaskScheduler: - """Allocate the time resources when tuning multiple tasks together""" - def __init__(self, - tasks: List[SearchTask], - objective_func: Callable = None): - self.tasks = tasks - self.objective_func = objective_func or sum - - def compute_score(self, costs: List[float]) -> float: - return self.objective_func(costs) - - -def get_search_policies(search_policy: Union[str, List[SearchPolicy]], tasks: List[SearchTask], - num_measure_per_iter, load_model_file=None, load_log_file=None): - if search_policy == 'default': - search_policy = 'sketch.xgb' - - if isinstance(search_policy, str): - policy_type, model_type = search_policy.split('.') - if model_type == 'xgb': - cost_model = XGBModel(num_warmup_sample=len(tasks) * num_measure_per_iter) - if load_model_file: - print("Load pretrained model...") - cost_model.load(load_model_file) - elif load_log_file: - cost_model.load_log_file(load_log_file) - elif model_type == 'random': - cost_model = RandomModel() - else: - raise ValueError("Invalid search policy: " + search_policy) - - if policy_type == 'sketch': - search_policies = [SketchSearchPolicy(cost_model) for _ in range(len(tasks))] - elif policy_type == 'limit-space': - search_policies = [SketchSearchPolicy(cost_model, - params={'cpu_multi_level_tiling_structure': 'SRS', - 'disable_change_compute_location': 1}) - for _ in range(len(tasks))] - elif policy_type == 'beam-search': - search_policies = [SketchSearchPolicy(cost_model, - params={'use_beam_search': 1}) - for _ in range(len(tasks))] - else: - raise ValueError("Invalid search policy: " + search_policy) - else: - # check type - assert isinstance(search_policy, (tuple, list)) - for item in search_policy: - assert isinstance(item, SearchPolicy) - search_policies = search_policy - - return search_policies - - -class SimpleTaskScheduler(TaskScheduler): - """The default task scheduler with several strategies - - Parameters - ---------- - tasks: List[SearchTask] - All workloads to tune - weights: List[float] - Weights of tasks (i.e. the number of occurrence of a task in the whole network) - strategy: str - The joint tuning strategy. - "sequential" : Tune tasks sequentially. Divide n_trials equally to every task. - "round-robin": Tune tasks in round robin order. - "gradient" : Tune tasks with gradient descent. - load_log_file: str - Load history log file to pre-train cost model - eps-random: float - Always allocate this percent of n_trials to select tasks randomly. This is for encouraging exploration. - verbose: int - The level of verbosity. 0 means silent. - alpha: float - The parameter used for 'gradient' strategy - beta: float - The parameter used for 'gradient' strategy - backward_window_size: int - The parameter used for 'gradient' strategy - """ - def __init__(self, - tasks: List[SearchTask], - objective_func: Callable = None, - strategy: str = 'gradient', - load_log_file: str = None, - load_model_file: str = None, - eps_random: float = 0.05, - verbose: int = 1, - alpha: float = 0.2, - beta: float = 2, - gamma: float = 0.5, - backward_window_size: int = 3, - use_debug_measurement_simulator=None): - super().__init__(tasks, objective_func) - self.strategy = strategy - self.eps_random = eps_random - self.verbose = verbose - self.load_log_file = load_log_file - self.load_model_file = load_model_file - self.alpha = alpha - self.beta = beta - self.gamma = gamma - self.backward_window_size = backward_window_size - self.use_debug_measurement_simulator = use_debug_measurement_simulator - - assert self.strategy in ['round-robin', 'gradient'] - - self.task_cts = [] - self.task_costs_history = [] - self.best_costs = self.cur_score = None - self.tune_option = self.measurer = self.search_policies = self.ct = self.tic = None - self.num_measure_per_iter = None - self.dead_tasks = set() - self.sequential_now_task_idx = 0 - self.sequential_now_task_begin_ct = 0 - - def tune(self, tune_option: TuneOption, search_policy: Union[str, List[SearchPolicy]] = 'default'): - """ Tune tasks. - - Notice: This method does not have return value, make sure to set `LogToFile` - measure callback in `tune_option`. - - Parameters - ---------- - tune_option: TuneOption - search_policy: Str or List[SearchPolicy] - """ - # init members - self.task_cts = [0 for _ in range(len(self.tasks))] - self.task_costs_history = [[] for _ in range(len(self.tasks))] - self.best_costs = 1e10 * np.ones(len(self.tasks)) - self.cur_score = self.compute_score(self.best_costs) - self.tune_option = tune_option - if self.use_debug_measurement_simulator is None: - self.measurer = ProgramMeasurer(tune_option.builder, tune_option.runner, - tune_option.measure_callbacks, tune_option.verbose) - self.ct = 0 - self.tic = time.time() - # reset num_measure_per_iter to make sure every task is tuned at least once - self.num_measure_per_iter = min(tune_option.num_measure_per_iter, - tune_option.n_trials // len(self.tasks)) - self.search_policies = get_search_policies(search_policy, self.tasks, - self.num_measure_per_iter, - self.load_model_file, - self.load_log_file) - self.dead_tasks = set() - self.sequential_now_task_idx = 0 - self.sequential_now_task_begin_ct = 0 - - for i in range(len(self.tasks)): - search_policy = self.search_policies[i] - task = self.tasks[i] - search_policy.set_task(task) - search_policy.set_verbose(tune_option.verbose) - search_policy.run_callbacks(tune_option.pre_search_callbacks) - - # do a round robin first - if self.strategy != 'sequential': - for i in range(len(self.tasks)): - self.tune_task(i) - - # use the specific strategy to choose workload to tune - task_idx = -1 - while self.ct < tune_option.n_trials and len(self.dead_tasks) < len(self.tasks): - if self.strategy == 'sequential': - allocated_total_ct = ((tune_option.n_trials - self.sequential_now_task_begin_ct) - / (len(self.tasks) - self.sequential_now_task_idx)) - used_ct = self.ct - self.sequential_now_task_begin_ct - - if self.sequential_now_task_idx in self.dead_tasks or used_ct >= allocated_total_ct: - self.sequential_now_task_idx += 1 - self.sequential_now_task_begin_ct = self.ct - task_idx = self.sequential_now_task_idx - if task_idx >= len(self.tasks): - break - elif self.strategy == 'round-robin': - task_idx = (task_idx + 1) % len(self.tasks) - while task_idx in self.dead_tasks: - task_idx = (task_idx + 1) % len(self.tasks) - elif self.strategy == 'gradient': - gradients = [] - for i in range(len(self.tasks)): - if i in self.dead_tasks: - gradients.append(0) - continue - - # compute gradient from chain rule : (delta f / delta g_i) - delta = 1e-7 - new_costs = list(self.best_costs) - new_costs[i] -= delta - chain_grad = (self.compute_score(self.best_costs) - self.compute_score(new_costs)) / delta - - # compute (g_i(t_i) - g(t_i - \Delta t)) / (\Delta t) - if self.task_cts[i] - 1 - self.backward_window_size >= 0: - backward_grad = (self.task_costs_history[i][self.task_cts[i] - 1] - - self.task_costs_history[i][self.task_cts[i] - 1 - self.backward_window_size]) \ - / self.backward_window_size - else: - backward_grad = 0 - - # compute (g_i(t_i + \Delta t) - g(t_i)) / (\Delta t) - g_next_1 = self.best_costs[i] - (self.best_costs[i] / self.task_cts[i]) - # todo(lmzheng): this needs adding attribute to topi.compute for similarity check - g_next_2 = self.beta * 1e20 - g_next = min(g_next_1, g_next_2) - forward_grad = g_next - self.best_costs[i] - - # combine all grads - grad = chain_grad * (self.alpha * backward_grad + (1 - self.alpha) * forward_grad) - assert grad <= 0 - gradients.append(grad) - - if max(gradients) == min(gradients): - task_idx = np.random.choice(len(gradients)) - else: - task_idx = np.argmin(gradients) - else: - raise ValueError("Invalid strategy: " + self.strategy) - - if self.verbose >= 1: - print("Next tuning task: %d" % task_idx) - self.tune_task(task_idx) - - def tune_task(self, task_idx): - if self.use_debug_measurement_simulator is not None: - measure_inputs, measure_results = \ - self.use_debug_measurement_simulator.get_next_batch( - self.tasks[task_idx], - self.num_measure_per_iter, - ) - else: - measure_inputs, measure_results = \ - self.search_policies[task_idx].continue_search( - self.tasks[task_idx], - self.num_measure_per_iter, - self.tune_option.verbose, - self.measurer) - - for inp, res in zip(measure_inputs, measure_results): - cost = array_mean(res.costs) - if cost < self.best_costs[task_idx]: - self.best_costs[task_idx] = cost - - if len(measure_inputs) == 0: - self.dead_tasks.add(task_idx) - - self.task_cts[task_idx] += 1 - self.task_costs_history[task_idx].append(self.best_costs[task_idx]) - - self.ct += len(measure_inputs) - self.cur_score = self.compute_score(self.best_costs) - - if self.verbose >= 1: - print(("TaskScheduler\tct: %d\testimated cost (ms): %.3f\ttime elapsed: %.2f\t" + - "best_costs (ms): %s\ttask_ct: %s") % - (self.ct, self.cur_score * 1e3, time.time() - self.tic, - to_str_round(self.best_costs * 1e3, decimal=3), - self.task_cts)) - - def remove_dead_task(self, prob): - for idx in self.dead_tasks: - prob[idx] = 0 - return prob / prob.sum() diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 41bd10cabe3ef..d104c1b1c2f8b 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -74,8 +74,6 @@ def compute_strided_set(attrs, inputs, output_type): # layout_transform _reg.register_injective_schedule("layout_transform") _reg.register_pattern("layout_transform", OpPattern.INJECTIVE) -_reg.register_injective_schedule("kernel_layout_transform") -_reg.register_pattern("kernel_layout_transform", OpPattern.INJECTIVE) # argwhere @_reg.register_compute("argwhere") diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index 58b9269a4c48c..486d63c36ff0f 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -261,9 +261,6 @@ class ClipAttrs(Attrs): class LayoutTransformAttrs(Attrs): """Attributes for transform.layout_transform""" -@tvm._ffi.register_object("relay.attrs.KernelLayoutTransformAttrs") -class KernelLayoutTransformAttrs(Attrs): - """Attributes for transform.kernel_layout_transform""" @tvm._ffi.register_object("relay.attrs.ShapeOfAttrs") class ShapeOfAttrs(Attrs): diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 2a0ddd1329b57..b02db416bdc85 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -16,16 +16,14 @@ # under the License. """Definition of x86 operator strategy.""" # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import +import logging -import os +import re +import topi from tvm.te import SpecializedCondition -from tvm import ansor from .generic import * from .. import op as _op -# Set the priority level to use the Ansor auto-scheduler -ansor_plevel = 11 - logger = logging.getLogger('strategy') _NCHWc_matcher = re.compile("^NCHW[0-9]+c$") @@ -41,7 +39,7 @@ def schedule_injective_cpu(attrs, outs, target): def schedule_reduce_cpu(attrs, outs, target): """schedule reduction ops for x86""" with target: - return ansor.auto_schedule_topi(outs) + return topi.x86.schedule_reduce(outs) @schedule_concatenate.register("cpu") def schedule_concatenate_cpu(attrs, outs, target): @@ -53,13 +51,13 @@ def schedule_concatenate_cpu(attrs, outs, target): def schedule_pool_cpu(attrs, outs, target): """schedule pooling ops for x86""" with target: - return ansor.auto_schedule_topi(outs) + return topi.x86.schedule_pool(outs, attrs.layout) @schedule_adaptive_pool.register("cpu") def schedule_adaptive_pool_cpu(attrs, outs, target): """schedule adaptive pooling ops for x86""" with target: - return ansor.auto_schedule_topi(outs) + return topi.x86.schedule_adaptive_pool(outs) @softmax_strategy.register("cpu") def softmax_strategy_cpu(attrs, inputs, out_type, target): @@ -67,15 +65,15 @@ def softmax_strategy_cpu(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_softmax(topi.nn.softmax), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.x86.schedule_softmax), + name="softmax.x86") return strategy @schedule_log_softmax.register("cpu") def schedule_log_softmax_cpu(attrs, outs, target): """schedule log_softmax op for x86""" with target: - return ansor.auto_schedule_topi(outs) + return topi.x86.schedule_softmax(outs) @conv2d_strategy.register("cpu") def conv2d_strategy_cpu(attrs, inputs, out_type, target): @@ -107,18 +105,18 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target) elif layout == "NHWC": assert kernel_layout == "HWIO" - #logger.warning("For x86 target, NCHW layout is recommended for conv2d.") + logger.warning("For x86 target, NCHW layout is recommended for conv2d.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_nhwc), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc), + name="conv2d_nhwc.x86") elif layout == "HWCN": assert kernel_layout == "HWIO" - #logger.warning("conv2d HWCN layout is not optimized for x86.") + logger.warning("conv2d HWCN layout is not optimized for x86.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.conv2d_hwcn), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn), + name="conv2d_hwcn.generic") else: raise RuntimeError("Unsupported conv2d layout {} for x86".format(layout)) elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups): @@ -145,8 +143,8 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): logger.warning("depthwise_conv2d NHWC layout is not optimized for x86.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc), + name="depthwise_conv2d_nhwc.generic") else: raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout)) else: # group_conv2d @@ -155,8 +153,8 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): logger.warning("group_conv2d is not optimized for x86.") strategy.add_implementation( wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw), + name="group_conv2d_nchw.generic") else: raise RuntimeError("Unsupported group_conv2d layout {}".format(layout)) return strategy @@ -233,8 +231,8 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target): name="conv3d_ncdhw.x86") elif layout == "NDHWC": strategy.add_implementation(wrap_compute_conv3d(topi.x86.conv3d_ndhwc), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc), + name="conv3d_ndhwc.x86") else: raise ValueError("Not support this layout {} yet".format(layout)) return strategy @@ -253,8 +251,8 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target): name="conv1d_ncw.x86") elif layout == "NWC": strategy.add_implementation(wrap_compute_conv1d(topi.nn.conv1d_nwc), - wrap_topi_schedule(ansor.auto_schedule_topi), - name="ansor") + wrap_topi_schedule(topi.x86.schedule_conv1d_nwc), + name="conv1d_nwc.x86") else: raise ValueError("Unsupported conv1d layout {}".format(layout)) return strategy @@ -263,23 +261,16 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target): def dense_strategy_cpu(attrs, inputs, out_type, target): """dense x86 strategy""" strategy = _op.OpStrategy() - - strategy.add_implementation(wrap_compute_dense(topi.nn.dense), - wrap_topi_schedule(ansor.auto_schedule_topi), - name='ansor', - plevel=ansor_plevel) - + m, _ = inputs[0].shape strategy.add_implementation(wrap_compute_dense(topi.x86.dense_nopack), wrap_topi_schedule(topi.x86.schedule_dense_nopack), name="dense_nopack.x86", plevel=10) - if "cblas" in target.libs: strategy.add_implementation(wrap_compute_dense(topi.x86.dense_cblas), wrap_topi_schedule(topi.x86.schedule_dense_cblas), name="dense_cblas.x86", plevel=15) - m, _ = inputs[0].shape with SpecializedCondition(m >= 16): # this implementation may not be well-optimized, so use plevel=8 for now. strategy.add_implementation(wrap_compute_dense(topi.x86.dense_pack), @@ -292,12 +283,6 @@ def dense_strategy_cpu(attrs, inputs, out_type, target): def batch_matmul_strategy_cpu(attrs, inputs, out_type, target): """batch_matmul x86 strategy""" strategy = _op.OpStrategy() - - strategy.add_implementation(wrap_compute_dense(topi.nn.batch_matmul), - wrap_topi_schedule(ansor.auto_schedule_topi), - name='ansor', - plevel=ansor_plevel) - strategy.add_implementation(wrap_compute_batch_matmul(topi.x86.batch_matmul), wrap_topi_schedule(topi.x86.schedule_batch_matmul), name="batch_matmul.x86", diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index f2fa2b5f5b901..a37226ea4f586 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -815,27 +815,6 @@ def layout_transform(data, src_layout, dst_layout): """ return _make.layout_transform(data, src_layout, dst_layout) -def kernel_layout_transform(data, src_layout, dst_layout): - """Transform the layout of a kernel - - Parameters - ---------- - data : relay.Expr - The source tensor to be transformed - - src_layout: str - The source layout. (e.g 1N32C112H112W) - - dst_layout: str - The destination layout. (e.g. 1N2C112H112W16c) - - Returns - ------- - ret : relay.Expr - The transformed tensor. - """ - return _make.kernel_layout_transform(data, src_layout, dst_layout) - def reverse_reshape(data, newshape): """Reshapes the input array where the special values are inferred from diff --git a/python/tvm/te/tensor.py b/python/tvm/te/tensor.py index 6539aabaa48f8..7d73bf42ab7d1 100644 --- a/python/tvm/te/tensor.py +++ b/python/tvm/te/tensor.py @@ -57,10 +57,8 @@ class Tensor(DataProducer, _expr.ExprOp): def __call__(self, *indices): ndim = self.ndim - # After ansor kernel layout rewrite, len(indices) <= ndim, - # and the indices will get modified by Ansor during schedule generation. - # if len(indices) != ndim: - # raise ValueError("Need to provide %d index in tensor slice" % ndim) + if len(indices) != ndim: + raise ValueError("Need to provide %d index in tensor slice" % ndim) indices = convert_to_object(indices) args = [] for x in indices: diff --git a/scripts/common.py b/scripts/common.py deleted file mode 100644 index 8f4fbec09dd0f..0000000000000 --- a/scripts/common.py +++ /dev/null @@ -1,1017 +0,0 @@ -"""Common utility for scripts""" -import argparse -import math -import os -import re -import time -from collections import defaultdict, namedtuple -from typing import Dict, List, Tuple - -import numpy as np -import matplotlib.pyplot as plt - -import topi -import tvm -from tvm import te -from tvm.ansor import (LogReader, make_workload_key_func, - register_workload_func, - write_measure_records_to_file) -from tvm.contrib import ndk, util - -############################################################ -###################### Test Workloads #################### -############################################################ - -@register_workload_func -def min_mn(M, N): - A = te.placeholder((M, N), name='A') - B = topi.min(A, axis=1) - - return [A, B] - -@register_workload_func -def argmin_mn(M, N): - A = te.placeholder((M, N), name='A') - B = topi.argmin(A, axis=1) - - return [A, B] - -@register_workload_func -def softmax_mn(M, N): - A = te.placeholder((M, N), name='A') - B = topi.nn.softmax(A, axis=1) - - return [A, B] - -@register_workload_func -def norm_bmn(B, M, N): - A = te.placeholder((B, M, N), name='A') - i = te.reduce_axis((0, M)) - j = te.reduce_axis((0, N)) - C = te.compute((B,), lambda b: te.sum(A[b][i][j] * A[b][i][j], axis=[i, j]), name='C') - D = te.compute((B,), lambda b: te.sqrt(C[b]), name='D') - - return [A, D] - -@register_workload_func -def add_mn(M, N): - A = te.placeholder((M, N), name='A') - B = te.placeholder((M, N), name='B') - C = te.compute((M, N), lambda i, j: A[i][j] + B[i][j], name='C') - - return [A, B, C] - -@register_workload_func -def matmul_nkkm(N, M, K, in_type='float32', out_type='float32', - tensor_core_support=False): - A = te.placeholder((N, K), name='A', dtype=in_type) - B = te.placeholder((K, M), name='B', dtype=in_type) - k = te.reduce_axis((0, K), name='k') - if in_type == out_type: - if not (in_type == 'float16' and out_type == 'float16'): - tensor_core_support = False - C = te.compute((N, M), - lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), - name='C', - attrs={"ansor_tensor_core_support": "True" if tensor_core_support else "False"}) - else: - if not ((in_type == 'float16' and out_type == 'float32') or \ - (in_type == 'int8' and out_type == 'int32')): - tensor_core_support = False - C = te.compute((N, M), - lambda i, j: te.sum(A[i][k].astype(out_type) * B[k][j].astype(out_type), - axis=[k]), - name='C', - attrs={"ansor_tensor_core_support": "True" if tensor_core_support else "False"}) - - return [A, B, C] - -@register_workload_func -def dense_layer(batch, in_dim, out_dim): - A = te.placeholder((batch, in_dim), name='A') - B = te.placeholder((out_dim, in_dim), name='B') - k = te.reduce_axis((0, in_dim), name='k') - C = te.compute((batch, out_dim), lambda i, j: te.sum(A[i][k] * B[j][k], axis=[k]), name='C') - - return [A, B, C] - -@register_workload_func -def max_pool_2d_nchw(N, C, H, W): - data = te.placeholder((N, C, H, W), name='data') - out = topi.nn.pool(data, (2, 2), (1, 1), (0, 0, 0, 0), pool_type='max', ceil_mode=True, - layout="NCHW", count_include_pad=True) - - return [data, out] - -@register_workload_func -def add_min_relu(M, N): - A = te.placeholder((M, N), name='A') - B = te.placeholder((M, N), name='B') - C = topi.add(A, B) - D = topi.min(C, axis=1) - out = topi.nn.relu(D) - return [A, B, out] - -@register_workload_func -def conv2d_relu_softmax_min(N, H, W, CI, CO, KH, KW, strides, padding, dilation): - data = te.placeholder((N, CI, H, W), name='data') - kernel = te.placeholder((CO, CI, KH, KW), name='kernel') - conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation) - relu = topi.nn.relu(conv) - softmax = topi.nn.softmax(relu, axis=1) - out = topi.min(softmax, axis=1) - - return [data, kernel, out] - -@register_workload_func -def conv2d_nchw_bias(N, H, W, CI, CO, KH, KW, strides, padding, dilation): - data = te.placeholder((N, CI, H, W), name='data') - kernel = te.placeholder((CO, CI, KH, KW), name='kernel') - bias = te.placeholder((CO, 1, 1), name='bias') - conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation) - #out = topi.nn.relu(conv) - out = topi.add(conv, bias) - return [data, kernel, bias, out] - -def conv2d_nhwc_without_layout_rewrite(Input, Filter, stride, padding, dilation, out_dtype='float32'): - """A copy of `topi.nn.conv2d_nhwc` but without the 'layout_free` attribute. - We use this in single op and subgraph evaluation because we don't want to introduce graph level optimization. - """ - assert isinstance(stride, int) or len(stride) == 2 - assert isinstance(dilation, int) or len(dilation) == 2 - - if isinstance(stride, int): - stride_h = stride_w = stride - else: - stride_h, stride_w = stride - - if isinstance(dilation, int): - dilation_h = dilation_w = dilation - else: - dilation_h, dilation_w = dilation - - batch, in_height, in_width, in_channel = Input.shape - if len(Filter.shape) == 10: - kernel_h = Filter.shape[2] * Filter.shape[6] - kernel_w = Filter.shape[3] * Filter.shape[7] - channel = Filter.shape[4] * Filter.shape[8] - num_filter = Filter.shape[0] * Filter.shape[1] * Filter.shape[5] * Filter.shape[9] - #Filter = te.placeholder([kernel_h, kernel_w, channel, num_filter], Filter.dtype, Filter.name) - elif len(Filter.shape) == 11: - kernel_h = Filter.shape[3] * Filter.shape[7] - kernel_w = Filter.shape[4] * Filter.shape[8] - channel = Filter.shape[5] * Filter.shape[9] - num_filter = Filter.shape[0] * Filter.shape[1] * Filter.shape[2] * Filter.shape[6] * Filter.shape[10] - else: - kernel_h, kernel_w, channel, num_filter = Filter.shape - - # compute the output shape - dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 - dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 - pad_top, pad_left, pad_down, pad_right = topi.nn.get_pad_tuple( - padding, (dilated_kernel_h, dilated_kernel_w)) - out_channel = num_filter - out_height = topi.util.simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1) - out_width = topi.util.simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1) - pad_before = [0, pad_top, pad_left, 0] - pad_after = [0, pad_down, pad_right, 0] - PaddedInput = topi.nn.pad(Input, pad_before, pad_after, name="PaddedInput") - rc = te.reduce_axis((0, in_channel), name='rc') - ry = te.reduce_axis((0, kernel_h), name='ry') - rx = te.reduce_axis((0, kernel_w), name='rx') - Output = te.compute( - (batch, out_height, out_width, out_channel), - lambda nn, yy, xx, ff: te.sum( - PaddedInput[nn, yy * stride_h + ry * dilation_h, - xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * - Filter[ry, rx, rc, ff].astype(out_dtype) - , axis=[ry, rx, rc]), - name="Conv2dOutput", tag="conv2d_nhwc") - return Output - - -@register_workload_func -def conv2d_nhwc_bias_with_rewrite(N, H, W, CI, CO, KH, KW, strides, padding, dilation): - data = te.placeholder((N, H, W, CI), name='data') - kernel = te.placeholder((KH, KW, CI, CO), name='kernel') - bias = te.placeholder((CO, ), name='bias') - conv = topi.nn.conv2d_nhwc(data, kernel, strides, padding, dilation) - out = topi.add(conv, bias) - return [data, kernel, bias, out] - -@register_workload_func -def depthwise_conv2d_nhwc_bias_with_rewrite(N, H, W, CI, CO, KH, KW, strides, padding, dilation): - data = te.placeholder((N, H, W, CI), name='data') - kernel = te.placeholder((KH, KW, CI, 1), name='kernel') - bias = te.placeholder((CO, ), name='bias') - conv = topi.nn.depthwise_conv2d_nhwc(data, kernel, strides, padding, dilation) - out = topi.add(conv, bias) - return [data, kernel, bias, out] - -@register_workload_func -def conv2d_nhwc_bias(N, H, W, CI, CO, KH, KW, strides, padding, dilation): - data = te.placeholder((N, H, W, CI), name='data') - kernel = te.placeholder((KH, KW, CI, CO), name='kernel') - bias = te.placeholder((CO, ), name='bias') - conv = conv2d_nhwc_without_layout_rewrite(data, kernel, strides, padding, dilation) - out = topi.add(conv, bias) - return [data, kernel, bias, out] - - -@register_workload_func -def conv2d_nchw_bn_relu(N, H, W, CI, CO, kernel_size, strides, padding, dilation=1): - data = te.placeholder((N, CI, H, W), name='data') - kernel = te.placeholder((CO, CI, kernel_size, kernel_size), name='kernel') - bias = te.placeholder((CO, 1, 1), name='bias') - bn_scale = te.placeholder((CO, 1, 1), name='bn_scale') - bn_offset = te.placeholder((CO, 1, 1), name='bn_offset') - - OH = (H + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1 - OW = (W + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1 - - conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation) - conv = te.compute((N, CO, OH, OW), - lambda i, j, k, l: conv[i, j, k, l] + bias[j, 0, 0], - name='bias_add') - conv = te.compute((N, CO, OH, OW), - lambda i, j, k, l: conv[i, j, k, l] * bn_scale[j, 0, 0], - name='bn_mul') - conv = te.compute((N, CO, OH, OW), - lambda i, j, k, l: conv[i, j, k, l] + bn_offset[j, 0, 0], - name='bn_add') - out = topi.nn.relu(conv) - - return [data, kernel, bias, bn_offset, bn_scale, out] - -@register_workload_func -def conv2d_nhwc_bn_relu(N, H, W, CI, CO, kernel_size, strides, padding, dilation=1): - data = te.placeholder((N, H, W, CI), name='data') - kernel = te.placeholder((kernel_size, kernel_size, CI, CO), name='kernel') - bias = te.placeholder((CO,), name='bias') - bn_scale = te.placeholder((CO,), name='bn_scale') - bn_offset = te.placeholder((CO,), name='bn_offset') - - OH = (H + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1 - OW = (W + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1 - - conv = conv2d_nhwc_without_layout_rewrite(data, kernel, strides, padding, dilation) - conv = te.compute((N, OH, OW, CO), - lambda i, j, k, l: conv[i, j, k, l] + bias[l], - name='bias_add') - conv = te.compute((N, OH, OW, CO), - lambda i, j, k, l: conv[i, j, k, l] * bn_scale[l], - name='bn_mul') - conv = te.compute((N, OH, OW, CO), - lambda i, j, k, l: conv[i, j, k, l] + bn_offset[l], - name='bn_add') - out = topi.nn.relu(conv) - - return [data, kernel, bias, bn_offset, bn_scale, out] - -resnet_conv2d_configs = { - # format : N, H, W, CI, CO, KH, KW, strides, padding, dilation - '18': [ - (1, 224, 224, 3, 64, 7, 7, (2, 2), (3, 3), (1, 1)), - (1, 56, 56, 64, 128, 3, 3, (2, 2), (1, 1), (1, 1)), - (1, 56, 56, 64, 128, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 56, 56, 64, 64, 3, 3, (1, 1), (1, 1), (1, 1)), - (1, 56, 56, 64, 64, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 28, 28, 128, 256, 3, 3, (2, 2), (1, 1), (1, 1)), - (1, 28, 28, 128, 256, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 28, 28, 128, 128, 3, 3, (1, 1), (1, 1), (1, 1)), - (1, 14, 14, 256, 512, 3, 3, (2, 2), (1, 1), (1, 1)), - (1, 14, 14, 256, 512, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 14, 14, 256, 256, 3, 3, (1, 1), (1, 1), (1, 1)), - (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)), - ], - '50': [ - (1, 224, 224, 3, 64, 7, 7, (2, 2), (3, 3), (1, 1)), - (1, 56, 56, 256, 512, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 56, 56, 256, 128, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 56, 56, 256, 64, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 56, 56, 64, 256, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 56, 56, 64, 64, 3, 3, (1, 1), (1, 1), (1, 1)), - (1, 56, 56, 64, 64, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 28, 28, 512, 1024, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 28, 28, 512, 256, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 28, 28, 512, 128, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 28, 28, 128, 512, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 28, 28, 128, 128, 3, 3, (1, 1), (1, 1), (1, 1)), - (1, 14, 14, 1024, 2048, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 14, 14, 1024, 512, 1, 1, (2, 2), (0, 0), (1, 1)), - (1, 14, 14, 1024, 256, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 14, 14, 256, 1024, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 14, 14, 256, 256, 3, 3, (1, 1), (1, 1), (1, 1)), - (1, 7, 7, 2048, 512, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 7, 7, 512, 2048, 1, 1, (1, 1), (0, 0), (1, 1)), - (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)), - ], -} - -# number of appearance for all conv2ds in resnet -resnet_conv2d_weights = { - '18': [1, 1, 1, 4, 1, 1, 1, 3, 1, 1, 3, 3], - '50': [1, 1, 1, 2, 4, 3, 1, 1, 1, 3, 4, 4, 1, 1, 5, 6, 6, 2, 3, 3], -} - - -def parse_workload_name(name: str) -> List[str]: - """Parse workload name with wildcard character and abbreviation to standard names""" - if name.startswith('matmul-'): # e.g. matmul-512, matmul-1024, matmul-+ - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = [256, 512, 1024] - else: - cfg_list = [N] - return ["matmul-%s" % x for x in cfg_list] - elif name.startswith('dense-'): # e.g. dense-1-512-1024, dense-16-512-512 - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = ["1-512-512", "16-512-512"] - else: - cfg_list = [N] - return ["dense-%s" % x for x in cfg_list] - elif name.startswith('min-'): # e.g. min-4096 - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = [4096, 8192, 16384] - else: - cfg_list = [N] - return ["min-%s" % x for x in cfg_list] - elif name.startswith('argmin-'): # e.g. argmin-4096 - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = [4096, 8192, 16384] - else: - cfg_list = [N] - return ["argmin-%s" % x for x in cfg_list] - elif name.startswith('softmax-'): # e.g. softmax-4096 - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = [4096, 8192, 16384] - else: - cfg_list = [N] - return ["softmax-%s" % x for x in cfg_list] - elif name.startswith('add-'): # e.g. add-4096 - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = [4096, 8192, 16384] - else: - cfg_list = [N] - return ["add-%s" % x for x in cfg_list] - elif name.startswith('norm-'): # e.g. norm-1024 - N = name.split('-', maxsplit=1)[1] - if N == '+': - cfg_list = [4096, 8192, 16384] - else: - cfg_list = [N] - return ["norm-%s" % x for x in cfg_list] - elif name.startswith('add-min-relu'): # e.g. add-min-relu-4096 - N = name.split('-', maxsplit=3)[3] - if N == '+': - cfg_list = [4096, 8192, 16384] - else: - cfg_list = [N] - return ["add-min-relu-%s" % x for x in cfg_list] - elif name.startswith('nhwc-resnet-'): # e.g. nhwc-resnet-50.C1 - res = re.match(r'nhwc-resnet-(\d+).C([\d\+]+)(.B(\d+))?', name) - n_layers = res.group(1) - if res.group(2) == '+': - idx_list = range(len(resnet_conv2d_configs[n_layers])) - else: - idx_list = [int(res.group(2))] - - batch_size = 1 if res.group(4) is None else int(res.group(4)) - return ['nhwc-resnet-%s.C%d.B%d' % (n_layers, i, batch_size) for i in idx_list] - elif name.startswith('resnet-'): # e.g. resnet-50.C1, resnet-50.C1.B2, resnet-50.C+.B2 - res = re.match(r'resnet-(\d+).C([\d\+]+)(.B(\d+))?', name) - n_layers = res.group(1) - if res.group(2) == '+': - idx_list = range(len(resnet_conv2d_configs[n_layers])) - else: - idx_list = [int(res.group(2))] - - batch_size = 1 if res.group(4) is None else int(res.group(4)) - return ['resnet-%s.C%d.B%d' % (n_layers, i, batch_size) for i in idx_list] - elif name in ['conv2d-bn-relu', 'conv2d-relu-softmax-min', 'max-pool-2d', 'conv2d-rewrite', 'depthwise-conv2d-rewrite']: - return [name] - else: - raise ValueError("Invalid workload " + name) - - -def get_workload_keys(name: str) -> List[str]: - """Parse workload name and return the workload keys""" - normalized_names = parse_workload_name(name) - - ret = [] - for name in normalized_names: - if name.startswith('matmul-'): - name_split = name.split('-') - in_type = out_type = 'float32' - tensor_core_support = False - if len(name_split) == 2: # e.g. matmul-512 - N = K = M = int(name_split[1]) - elif len(name_split) == 4: # e.g. matmul-32-256-512 - N = int(name_split[1]) - K = int(name_split[2]) - M = int(name_split[3]) - elif len(name_split) == 6: # e.g. matmul-32-512-512-float16-float32 - N = int(name_split[1]) - K = int(name_split[2]) - M = int(name_split[3]) - in_type = name_split[4] - out_type = name_split[5] - elif len(name_split) == 7: # e.g. matmul-32-512-512-float16-float32-tc - N = int(name_split[1]) - K = int(name_split[2]) - M = int(name_split[3]) - in_type = name_split[4] - out_type = name_split[5] - tensor_core_support = name_split[6] == "tc" - else: - raise ValueError("Invalid matmul workload") - ret.append(make_workload_key_func(matmul_nkkm, - (N, M, K, in_type, out_type, tensor_core_support))) - elif name.startswith('dense-'): # e.g. dense-1-512-1024, dense-16-512-512 - name_split = name.split('-') - assert len(name_split) == 4 - batch = int(name_split[1]) - in_dim = int(name_split[2]) - out_dim = int(name_split[3]) - ret.append(make_workload_key_func(dense_layer, (batch, in_dim, out_dim))) - elif name.startswith('min-'): # e.g. min-4096 - name_split = name.split('-') - if len(name_split) == 2: - M = 64 - N = int(name_split[1]) - elif len(name_split) == 3: - M = int(name_split[1]) - N = int(name_split[2]) - else: - raise ValueError("Invalid min workload") - ret.append(make_workload_key_func(min_mn, (M, N))) - elif name.startswith('argmin-'): # e.g. argmin-4096 - name_split = name.split('-') - if len(name_split) == 2: - M = 64 - N = int(name_split[1]) - elif len(name_split) == 3: - M = int(name_split[1]) - N = int(name_split[2]) - else: - raise ValueError("Invalid argmin workload") - ret.append(make_workload_key_func(argmin_mn, (M, N))) - elif name.startswith('softmax-'): # e.g. softmax-4096 - name_split = name.split('-') - if len(name_split) == 2: - M = 64 - N = int(name_split[1]) - elif len(name_split) == 3: - M = int(name_split[1]) - N = int(name_split[2]) - else: - raise ValueError("Invalid softmax workload") - ret.append(make_workload_key_func(softmax_mn, (M, N))) - elif name.startswith('add-min-relu'): # e.g. add-min-relu-4096 - name_split = name.split('-') - if len(name_split) == 4: - M = 64 - N = int(name_split[3]) - elif len(name_split) == 5: - M = int(name_split[3]) - N = int(name_split[4]) - else: - raise ValueError("Invalid workload") - ret.append(make_workload_key_func(add_min_relu, (M, N))) - elif name.startswith('add-'): # e.g. add-4096 - name_split = name.split('-') - if len(name_split) == 2: - N = M = int(name_split[1]) - elif len(name_split) == 3: - M = int(name_split[1]) - N = int(name_split[2]) - else: - raise ValueError("Invalid add workload") - ret.append(make_workload_key_func(add_mn, (M, N))) - elif name.startswith('norm-'): # e.g. norm-4096 - name_split = name.split('-') - B = 2 - if len(name_split) == 2: - N = M = int(name_split[1]) - elif len(name_split) == 3: - M = int(name_split[1]) - N = int(name_split[2]) - else: - raise ValueError("Invalid norm workload") - ret.append(make_workload_key_func(norm_bmn, (B, M, N))) - elif name.startswith('nhwc-resnet-'): # e.g. nhwc-resnet-50.C1.B2 - res = re.match(r'nhwc-resnet-(\d+).C(\d+).B(\d+)', name) - n_layers = res.group(1) - idx = int(res.group(2)) - batch_size = 1 if res.group(3) is None else int(res.group(3)) - args = list(resnet_conv2d_configs[n_layers][idx]) - args[0] = batch_size - ret.append(make_workload_key_func(conv2d_nhwc_bias, args)) - elif name.startswith('resnet-'): # e.g. resnet-50.C1.B2 - res = re.match(r'resnet-(\d+).C(\d+).B(\d+)', name) - n_layers = res.group(1) - idx = int(res.group(2)) - batch_size = 1 if res.group(3) is None else int(res.group(3)) - args = list(resnet_conv2d_configs[n_layers][idx]) - args[0] = batch_size - ret.append(make_workload_key_func(conv2d_nchw_bias, args)) - elif name == 'max-pool-2d': - return [make_workload_key_func(max_pool_2d_nchw, (2, 512, 7, 7))] - elif name == 'conv2d-bn-relu': - return [make_workload_key_func(conv2d_nhwc_bn_relu, - (1, 7, 7, 512, 512, 3, 1, 1, 1)) ] - elif name == 'conv2d-rewrite': - return [ make_workload_key_func(conv2d_nhwc_bias_with_rewrite, - (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)))] - elif name == 'depthwise-conv2d-rewrite': - return [ make_workload_key_func(depthwise_conv2d_nhwc_bias_with_rewrite, - (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)))] - elif name == 'conv2d-relu-softmax-min': - return [make_workload_key_func(conv2d_relu_softmax_min, - (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)))] - else: - raise ValueError("Invalid workload " + name) - - return ret - - -def get_workload_weights(name: str) -> List[float]: - """Return weights for workload name""" - if name.startswith('resnet-'): - res = re.match(r'resnet-(\d+).C+', name) - n_layers = res.group(1) - return np.array(resnet_conv2d_weights[n_layers]) - else: - return np.ones(len(get_workload_keys(name))) - - -############################################################ -###################### Measure Tools #################### -############################################################ - - -def measure_schedule(s, - bufs, - target, - target_host=None, - remote=None, - ndk_cc=None, - number=10, - repeat=3, - min_repeat_ms=500): - """Measure the time cost of a schedule""" - func = tvm.build(s, bufs, target=target, target_host=target_host) - if remote: - ctx = remote.context(str(target), 0) - temp = util.tempdir() - remote_path = temp.relpath("tmp_deploy_lib.so") - os.environ['TVM_NDK_CC'] = ndk_cc - func.export_library(remote_path, ndk.create_shared) - remote.upload(remote_path) - func = remote.load_module("tmp_deploy_lib.so") - else: - ctx = tvm.context(str(target), 0) - - if os.environ.get('TVM_AUTO_CACHE_FLUSH', '0') == '1': - min_repeat_ms = 0 - number = 1 - - time_f = func.time_evaluator(func.entry_name, - ctx, - number=number, - repeat=repeat, - min_repeat_ms=min_repeat_ms) - - np_args = [np.ones(topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs] - args = [tvm.nd.array(x, ctx=ctx) for x in np_args] - ctx.sync() - - costs = time_f(*args).results - - return costs - -def check_correctness(s, bufs, s_ref, buf_ref, target, target_host=None, remote=None, ndk_cc=None): - """Check the correctness of a schedule against a reference schedule""" - func = tvm.build(s, bufs, target=target, target_host=target_host) - func_ref = tvm.build(s_ref, buf_ref, target='llvm') - - if remote: - raise NotImplemented - else: - ctx = tvm.context(str(target), 0) - ctx_ref = tvm.cpu() - - np_args = [np.ones(topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs] - args = [tvm.nd.array(x, ctx=ctx) for x in np_args] - args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args] - ctx.sync() - - func(*args) - func_ref(*args_ref) - - for arr, arr_ref in zip(args, args_ref): - np.testing.assert_allclose(arr.asnumpy(), arr_ref.asnumpy()) - - -############################################################ -##################### Other Utilities #################### -############################################################ - - -def geomean(xs): - """Compute geometric mean""" - return math.exp(math.fsum(math.log(x) for x in xs) / len(xs)) - - -def str2bool(v): - if isinstance(v, bool): - return v - if v.lower() in ('yes', 'true', 't', 'y', '1'): - return True - elif v.lower() in ('no', 'false', 'f', 'n', '0'): - return False - else: - raise argparse.ArgumentTypeError('Boolean value expected.') - - -global last_tic -last_tic = None - - -def PRINT_TIME(msg): - """Print time interval between differnt calls. This is for debug so we make the name letters capital""" - global last_tic - now = time.time() - - if last_tic is None: - last_tic = now - - print(msg, now - last_tic) - last_tic = now - - -############################################################ -###################### I/O Utilities ##################### -############################################################ - -# The format for a line in resulst file -BenchmarkRecord = namedtuple("BenchmarkRecord", [ - 'device', 'backend', 'workload_type', 'workload_name', 'library', 'algorithm', 'value', - 'time_stamp' -]) - - -class BaselineDatabase: - """A class for query records in baseline database""" - def __init__(self, filename): - self.filename = filename - - self.lines = [] - for line in open(filename): - if line.startswith('#') or line.isspace(): - continue - self.lines.append(line.split('\t')) - - def filter_records(self, devices=None, backends=None, wkl_names=None, libraries=None): - ret = [] - for line in self.lines: - line = BenchmarkRecord(*line) - - if devices is not None and line.device not in devices: - continue - if backends is not None and line.backend not in backends: - continue - if wkl_names is not None and line.workload_name not in wkl_names: - continue - if libraries is not None and line.library not in libraries: - continue - - ret.append(line) - return ret - - def get_data_dict(self, device, target, wkl_names) -> Tuple[Dict, List]: - """Return a data dict s.t. data[wkl][library] = cost""" - data = defaultdict(lambda: defaultdict(lambda: 1e10)) - - all_libraries = set() - - if "cpu" in target.keys: - backends = ['cpu'] - elif "gpu" in target.keys: - backends = ['gpu'] - else: - raise ValueError("Invalid target: " + target) - - # Read costs for baselines - records = self.filter_records(devices=[device], backends=backends, wkl_names=wkl_names) - for record in records: - # use min over (possible) multiple algorithms - all_libraries.add(record.library) - data[record.workload_name][record.library] = \ - min(data[record.workload_name][record.library], - np.mean(eval(record.value)['costs'])) - - return data, list(all_libraries) - - -class LogFileDatabase: - """A class for indexing best records in a log file""" - def __init__(self, filename: str, n_lines: int = -1): - inputs, results = LogReader(filename).read_lines(n_lines) - - # best records, search by (target_key, workload_key). e.g. ('gpu', 'conv2d...') - self.best_by_targetkey = {} - - # best according to (model, workload_key). e.g. ('1080ti', 'conv2d...')) - self.best_by_model = {} - - # find best records and build the index - for inp, res in zip(inputs, results): - if res.error_no != 0: - continue - - # use target keys in tvm target system as key to build best map - for target_key in inp.task.target.keys: - key = (target_key, inp.task.workload_key) - if key not in self.best_by_targetkey: - self.best_by_targetkey[key] = (inp, res) - else: - _, other_res = self.best_by_targetkey[key] - if np.mean([x.value for x in other_res.costs]) > \ - np.mean([x.value for x in res.costs]): - self.best_by_targetkey[key] = (inp, res) - - # use model as key to build best map - key = (inp.task.target.model, inp.task.workload_key) - if key not in self.best_by_model: - if inp.task.target.model != 'unknown': - self.best_by_model[key] = (inp, res) - else: - _, other_res = self.best_by_model[key] - if np.mean([x.value for x in other_res.costs]) > \ - np.mean([x.value for x in res.costs]): - self.best_by_model[key] = (inp, res) - - def write_best(self, filename: str): - best_records = list(self.best_by_targetkey.values()) - inputs = [x[0] for x in best_records] - results = [x[1] for x in best_records] - write_measure_records_to_file(filename, inputs, results) - - -############################################################ -###################### Plot Utilities #################### -############################################################ - -def max_curve(raw_curve): - """Return b[i] = max(a[:i]) """ - ret = [] - cur_max = -np.inf - for x in raw_curve: - cur_max = max(cur_max, x) - ret.append(cur_max) - return ret - -def min_curve(raw_curve): - """Return b[i] = min(a[:i]) """ - ret = [] - cur_min = np.inf - for x in raw_curve: - cur_min = min(cur_min, x) - ret.append(cur_min) - return ret - -def mean_curve(raw_curve, window_size=None): - """Return b[i] = mean(a[:i]) """ - ret = [] - mean = 0 - if window_size is None: - for i, x in enumerate(raw_curve): - mean = (mean * i + x) / (i + 1) - ret.append(mean) - else: - for i, x in enumerate(raw_curve): - if i >= window_size: - mean = (mean * window_size + x - raw_curve[i - window_size]) / window_size - else: - mean = (mean * i + x) / (i + 1) - ret.append(mean) - return ret - - -def enhance_color(color, h=1, l=1, s=1): - """Make color looks better for pyplot""" - import matplotlib.colors as mc - import colorsys - try: - c = mc.cnames[color] - except: - c = color - c = np.array(colorsys.rgb_to_hls(*mc.to_rgb(c))) - - h, l, s = h * c[0], l * c[1], s * c[2] - h, l, s = [max(min(x, 1), 0) for x in [h, l, s]] - - return colorsys.hls_to_rgb(h, l, s) - - -method_color_dict = { - 'ours': 'C0', - 'AutoTVM': 'C1', - - 'tensorflow': 'C2', - 'tensorflow-tensorrt': 'C9', - 'tflite': 'C2', - - 'pytorch': enhance_color('C3', l=1.1, s=0.9), - - 'FlexTensor': enhance_color('C5'), - 'halide': enhance_color('teal', l=1.25), - - 'Limit space': 'C7', - 'No fine-tuning': 'C8', - 'No task scheduler': 'C1', -} - -def method2color(method): - if '-batch-' in method: - method, batch_size = method.split('-batch-') - #return enhance_color(method_color_dict[method], s=1.1, l=1.5) - return method_color_dict[method] - else: - return method_color_dict[method] - -method_order_list = [ - 'pytorch', 'tensorflow', 'tensorflow-xla', 'tensorflow-tensorrt', - 'tflite', 'halide', 'FlexTensor', 'AutoTVM', - - 'Limit space', 'No fine-tuning', - 'ours', -] - -def method2order(method): - if '-batch-' in method: - method, batch_size = method.split('-batch-') - batch_size = int(batch_size) - return method_order_list.index(method) + batch_size / 100 - else: - return method_order_list.index(method) - -show_name_replace_dict = { - 'pytorch': "PyTorch", - 'tensorflow-tensorrt': 'TensorRT-TF', - 'tensorflow': 'TensorFlow', - 'tflite': 'TensorFlow Lite', - 'halide': 'Halide', - - 'ours': 'Ansor (ours)', - 'batch-16': 'batch', - - 'resnet_50': 'ResNet-50', - 'mobilenet_v2': 'Mobilenet V2', - 'resnet_18_3d': '3D-ResNet', - 'dcgan': 'DCGAN', - 'dqn': 'DQN', - 'bert': 'BERT', -} - -def show_name(name): - # if name.startswith('resnet-'): - # return name.split('.')[1] - for key, value in show_name_replace_dict.items(): - name = name.replace(key, value) - - return name - -def draw_grouped_bar_chart(data, baseline='pytorch', output='out.png', - yscale_log=False, yticks=None, y_max=None, - legend_bbox_to_anchor=None, legend_nrow=None, - figure_size=None, figax=None, draw_ylabel=True, draw_legend=True): - width = 1 - gap = 1.5 - fontsize = 19 - xticks_font_size = fontsize - 2 - - figure_size = figure_size or (11, 4) - legend_bbox_to_anchor = legend_bbox_to_anchor or (0.45, 1.35) - - all_methods = set() - legend_set = {} - - if figax is None: - fig, ax = plt.subplots() - axes = [] - axes.append(ax) - else: - ax = figax - - x0 = 0 - xticks = [] - xlabels = [] - - workloads = list(data.keys()) - for wkl in workloads: - ys = [] - colors = [] - - methods = list(data[wkl].keys()) - - if baseline in data[wkl]: - baseline_cost = data[wkl][baseline] - else: - # normalize to best library - baseline_cost = 1e10 - for method in methods: - if data[wkl][method] < baseline_cost: - baseline_cost = data[wkl][method] - - methods.sort(key=lambda x: method2order(x)) - for method in methods: - relative_speedup = baseline_cost / data[wkl][method] - if yticks is None: - ys.append(relative_speedup) - else: - ys.append(max(relative_speedup, yticks[0] * 1.1)) - colors.append(method2color(method)) - - # draw the bars - xs = np.arange(x0, x0 + len(ys)) - bars = ax.bar(xs, ys, width=width, color=colors) - - for method, bar_obj in zip(methods, bars): - all_methods.add(method) - if method not in legend_set: - legend_set[method] = bar_obj - - # tick and label - x0 += len(ys) + gap - - xticks.append(x0 - gap - len(ys)*width/2.0 - width/2.0) - xlabels.append(show_name(wkl)) - - ax.set_xticks(xticks) - ax.set_xticklabels(xlabels, fontsize=xticks_font_size) - plt.tick_params(axis='x', which='both', bottom='off', top='off') - - if draw_ylabel is True: - ax.set_ylabel('Relative Speedup', fontsize=fontsize) - elif isinstance(draw_ylabel, str): - ax.set_ylabel(draw_ylabel, fontsize=fontsize) - - if yscale_log: - ax.set_yscale('log', basey=2) - if yticks is not None: - ax.set_yticks(yticks) - if y_max: - ax.set_ylim(top=y_max) - - from matplotlib.ticker import FormatStrFormatter - ax.set_yticklabels(ax.get_yticks(), fontsize=fontsize) - ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f')) - ax.yaxis.grid(linewidth=0.4, linestyle='dotted') # draw grid line - ax.set_axisbelow(True) # grid lines are behind the rest - ax.tick_params(bottom=False, top=False, right=False) - - # put legend outside the plot - all_methods = list(all_methods) - all_methods.sort(key=lambda x : method2order(x)) - - if draw_legend: - legend_nrow = legend_nrow or 2 - ncol = (len(all_methods) + legend_nrow - 1)// legend_nrow - ax.legend([legend_set[x] for x in all_methods], - [show_name(x) for x in all_methods], - fontsize=fontsize-1, - loc='upper center', - bbox_to_anchor=legend_bbox_to_anchor, - ncol=ncol, - handlelength=1.0, - handletextpad=0.5, - columnspacing=1.1) - - if figax is None: - fig.set_size_inches(figure_size) - fig.savefig(output, bbox_inches='tight') - print("Output the plot to %s" % output) - - -def to_str_round(x, decimal=6): - if isinstance(x, str): - return x - if isinstance(x, (list, tuple)) or isinstance(x, np.ndarray): - return "[" + ", ".join([to_str_round(y, decimal=decimal) - for y in x]) + "]" - if isinstance(x, dict): - return str({k: eval(to_str_round(v)) for k, v in x.items()}) - if isinstance(x, int): - return str(x) - if isinstance(x, float): - format_str = "%%.%df" % decimal - return format_str % x - raise ValueError("Invalid value: " + str(x)) - diff --git a/scripts/shape_configs.py b/scripts/shape_configs.py deleted file mode 100644 index 244638f5b29c3..0000000000000 --- a/scripts/shape_configs.py +++ /dev/null @@ -1,230 +0,0 @@ -""" Shape configurations for single operator / subgraph evaluation -This file is shared by tune_op_subgraph.py and scripts in scripts/baseline/ -""" - -matmul_shapes = [ - (1, 128, 128, 128), - (1, 512, 32, 512), - (1, 512, 512, 512), - (1, 1024, 1024, 1024), -] - -conv1d_shapes = [ - # derived from conv2d_shapes - (1, 256, 64, 128, 3, 2, 1), -# (1, 256, 64, 128, 1, 2, 0), -# (1, 256, 64, 64, 1, 1, 0), -# (1, 128, 128, 256, 3, 2, 1), - (1, 128, 128, 256, 1, 2, 0), -# (1, 128, 128, 128, 3, 1, 1), -# (1, 64, 256, 512, 3, 2, 1), -# (1, 64, 256, 512, 1, 2, 0), - (1, 64, 256, 256, 5, 1, 2), - (1, 32, 512, 512, 3, 1, 1), -] - -conv2d_shapes = [ - # all conv2d layers in resnet-18 - (1, 224, 224, 3, 64, 7, 2, 3), -# (1, 56, 56, 64, 128, 3, 2, 1), -# (1, 56, 56, 64, 128, 1, 2, 0), -# (1, 56, 56, 64, 64, 3, 1, 1), - (1, 56, 56, 64, 64, 1, 1, 0), -# (1, 28, 28, 128, 256, 3, 2, 1), -# (1, 28, 28, 128, 256, 1, 2, 0), -# (1, 28, 28, 128, 128, 3, 1, 1), -# (1, 14, 14, 256, 512, 3, 2, 1), -# (1, 14, 14, 256, 512, 1, 2, 0), - (1, 14, 14, 256, 256, 3, 1, 1), - (1, 7, 7, 512, 512, 3, 1, 1), -] - -conv3d_shapes = [ - # Derived from cnov2d_shapes. Use depth=16 for all configurations - (1, 16, 224, 224, 3, 64, 7, 2, 3), -# (1, 16, 56, 56, 64, 128, 3, 2, 1), -# (1, 16, 56, 56, 64, 128, 1, 2, 0), -# (1, 16, 56, 56, 64, 64, 3, 1, 1), - (1, 16, 56, 56, 64, 64, 1, 1, 0), -# (1, 16, 28, 28, 128, 256, 3, 2, 1), -# (1, 16, 28, 28, 128, 256, 1, 2, 0), -# (1, 16, 28, 28, 128, 128, 3, 1, 1), -# (1, 16, 14, 14, 256, 512, 3, 2, 1), -# (1, 16, 14, 14, 256, 512, 1, 2, 0), - (1, 16, 14, 14, 256, 256, 3, 1, 1), - (1, 16, 7, 7, 512, 512, 3, 1, 1), -] - -group_conv2d_shapes = [ - # Derived from cnov2d_shapes. Use group=4 for all configurations - (1, 56, 56, 64, 128, 3, 2, 1 , 1, 4), -# (1, 56, 56, 64, 128, 1, 2, 0 , 1, 4), -# (1, 56, 56, 64, 64, 3, 1, 1 , 1, 4), - (1, 56, 56, 64, 64, 1, 1, 0 , 1, 4), -# (1, 28, 28, 128, 256, 3, 2, 1, 1, 4), -# (1, 28, 28, 128, 256, 1, 2, 0, 1, 4), -# (1, 28, 28, 128, 128, 3, 1, 1, 1, 4), -# (1, 14, 14, 256, 512, 3, 2, 1, 1, 4), -# (1, 14, 14, 256, 512, 1, 2, 0, 1, 4), - (1, 14, 14, 256, 256, 3, 1, 1, 1, 4), - (1, 7, 7, 512, 512, 3, 1, 1 , 1, 4), -] - -dilation_conv2d_shapes = [ - # Derived from cnov2d_shapes. Use dilation=2 for all configurations - (1, 224, 224, 3, 64, 7, 2, 3 , 2), -# (1, 56, 56, 64, 128, 3, 2, 1 , 2), -# (1, 56, 56, 64, 128, 1, 2, 0 , 2), -# (1, 56, 56, 64, 64, 3, 1, 1 , 2), - (1, 56, 56, 64, 64, 1, 1, 0 , 2), -# (1, 28, 28, 128, 256, 3, 2, 1, 2), -# (1, 28, 28, 128, 256, 1, 2, 0, 2), -# (1, 28, 28, 128, 128, 3, 1, 1, 2), -# (1, 14, 14, 256, 512, 3, 2, 1, 2), -# (1, 14, 14, 256, 512, 1, 2, 0, 2), - (1, 14, 14, 256, 256, 3, 1, 1, 2), - (1, 7, 7, 512, 512, 3, 1, 1 , 2), -] - -depthwise_conv2d_shapes = [ - # all depthwise conv2d layers in mobilenet - (1, 112, 112, 32, 3, 1, 1), - (1, 112, 112, 64, 3, 2, 1), -# (1, 56, 56, 128, 3, 1, 1), -# (1, 56, 56, 128, 3, 2, 1), -# (1, 28, 28, 256, 3, 1, 1), -# (1, 28, 28, 256, 3, 2, 1), -# (1, 14, 14, 512, 3, 1, 1), - (1, 14, 14, 512, 3, 2, 1), - (1, 7, 7, 1024, 3, 1, 1), -] - -conv2d_transpose_shapes = [ - # all conv2d tranpose layers in DCGAN - (1, 4, 4, 512, 256, 4, 2, 1), - (1, 8, 8, 256, 128, 4, 2, 1), - (1, 16, 16, 128, 64, 4, 2, 1), - (1, 32, 32, 64, 3, 4, 2, 1), -] - -conv2d_capsule_shapes = [ - # all conv2d capsule layers in matrix capsules withemrouting (ICLR 2018) - (1, 16, 16, 32, 32, 3, 2, 1), - (1, 8, 8, 32, 32, 3, 1, 1), - (1, 16, 16, 8, 16, 3, 2, 1), - (1, 8, 8, 16, 16, 3, 1, 1), -] - -conv2d_winograd_nhwc_shapes = [ - (1, 56, 56, 64, 64, 3, 1, 1), - (1, 28, 28, 128, 128, 3, 1, 1), - (1, 14, 14, 256, 256, 3, 1, 1), - (1, 7, 7, 512, 512, 3, 1, 1), -] - -conv2d_winograd_nchw_shapes = [ - (1, 64, 56, 56, 64, 3, 1, 1), - (1, 128, 28, 28, 128, 3, 1, 1), - (1, 256, 14, 14, 256, 3, 1, 1), - (1, 512, 7, 7, 512, 3, 1, 1), -] - -matmul_tensor_core_shapes = [ - (16, 512, 512, 'float16', 'float32', True), - (32, 512, 512, 'float16', 'float32', True), - (512, 512, 512, 'float16', 'float32', True), -] - -norm_shapes = [ - (1, 256, 256), - (1, 512, 512), - (1, 1024, 1024), - (1, 4096, 1024), -] - -single_op_shape_dict = { - 'C1D': conv1d_shapes, - 'C2D': conv2d_shapes, - 'C3D': conv3d_shapes, - 'GMM': matmul_shapes, - 'GRP': group_conv2d_shapes, - 'DIL': dilation_conv2d_shapes, - 'DEP': depthwise_conv2d_shapes, - 'T2D': conv2d_transpose_shapes, - 'CAP': conv2d_capsule_shapes, - 'NRM': norm_shapes, - -# The following workloads are not in our sinle op evaluation plan. -# They should be moved to `common.py` and be used by `tune_wkl.py`. -# 'C2D_NCHW': conv2d_nchw_shapes, -# 'C2DWG_NHWC': conv2d_winograd_nhwc_shapes, -# 'C2DWG_NCHW': conv2d_winograd_nchw_shapes, -# 'GMM_TC': matmul_tensor_core_shapes, -} - -conv2d_bn_relu_shapes = [ - (1, 224, 224, 3, 64, 7, 2, 3), - (1, 56, 56, 64, 128, 3, 2, 1), - (1, 28, 28, 128, 256, 1, 2, 0), - (1, 7, 7, 512, 512, 3, 1, 1, 1), - (16, 224, 224, 3, 64, 7, 2, 3), - (16, 56, 56, 64, 128, 3, 2, 1), - (16, 28, 28, 128, 256, 1, 2, 0), - (16, 7, 7, 512, 512, 3, 1, 1, 1), -] - -transpose_batch_matmul_shapes = [ - (1, 128, 12, 64), - (1, 128, 16, 64), - (1, 64, 12, 128), - (1, 128, 12, 128), - (16, 128, 12, 64), - (16, 128, 16, 64), - (16, 64, 12, 128), - (16, 128, 12, 128), -] - -subgraph_shape_dict = { - "conv2d_bn_relu": conv2d_bn_relu_shapes, - "transpose_batch_matmul": transpose_batch_matmul_shapes, -} - -resnet_shapes = [ - (1, ), - (16, ), -] - -mobilenet_v2_shapes = [ - (1, ), - (16, ), -] - -dcgan_shapes = [ - (1, ), - (16, ), -] - -dqn_shapes = [ - (1, ), - (16, ), -] - -bert_shapes = [ - (1, ), - (16, ), -] - -resnet18_3d_shapes = [ - (1, ), - (16, ), -] - -network_shape_dict = { - 'resnet_50': resnet_shapes, - 'mobilenet_v2': mobilenet_v2_shapes, - 'dcgan': dcgan_shapes, - 'dqn': dqn_shapes, - 'bert': bert_shapes, - 'resnet_18_3d': resnet18_3d_shapes, -} - diff --git a/scripts/tune_network.py b/scripts/tune_network.py deleted file mode 100644 index 1905d81320039..0000000000000 --- a/scripts/tune_network.py +++ /dev/null @@ -1,388 +0,0 @@ -"""Tune a whole neural network""" -import argparse -import logging -import random -import os -import numpy as np - -import tvm -from tvm import ansor, relay -import tvm.contrib.graph_runtime as runtime -from tvm.contrib.debugger import debug_runtime -from tvm.contrib import util, ndk -from tvm.relay import testing -from tvm.ansor.utils import request_remote -#from baseline.utils import log_line, BenchmarkRecord - -from common import str2bool -from tune_test import create_tune_option - -dtype = "float32" - -def get_network(name, network_path, batch_size, layout): - """Get the relay module and random weights for a network""" - input_shape = (batch_size, 3, 224, 224) - output_shape = (batch_size, 1000) - input_name = 'data' - - if name.startswith("resnet3d"): - n_layer = int(name.split('-')[1]) - layout = "NDHWC" - image_shape = (16, 112, 112, 3) - input_shape = (batch_size, *image_shape) - mod, params = relay.testing.resnet3d.get_workload(num_layers=n_layer, batch_size=batch_size, image_shape=image_shape, dtype=dtype, layout=layout) - elif name.startswith("resnet"): - n_layer = int(name.split('-')[1]) - image_shape = (224, 224, 3) if layout == 'NHWC' else (3, 224, 224) - input_shape = (batch_size, *image_shape) - mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, layout=layout, image_shape=image_shape, dtype=dtype) - elif "lstm" in name: - mod, params = relay.testing.lstm.get_workload(iterations=10, num_hidden=512, batch_size=batch_size, dtype=dtype) - elif "mlp" in name: - input_shape = (batch_size, 1, 28, 28) - mod, params = relay.testing.mlp.get_workload(batch_size=batch_size, dtype=dtype) - elif "vgg" in name: - n_layer = int(name.split('-')[1]) - mod, params = relay.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype) - elif name == 'dcgan': - input_shape = (batch_size, 100) - mod, params = relay.testing.dcgan.get_workload(batch_size=batch_size) - elif name == 'dqn': - layout = "NHWC" - image_shape = (84, 84, 4) - input_shape = (batch_size, *image_shape) - mod, params = relay.testing.dqn.get_workload(batch_size=batch_size, image_shape=image_shape, dtype=dtype, layout=layout) - elif name == 'mobilenet': - image_shape = (224, 224, 3) if layout == 'NHWC' else (3, 224, 224) - input_shape = (batch_size, *image_shape) - mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, layout=layout, image_shape=image_shape, dtype=dtype) - elif name == 'r3d_18': - import torch - import torchvision - - model = getattr(torchvision.models.video, name)(pretrained=False) - model = model.eval() - - # We grab the TorchScripted model via tracing - input_shape = [batch_size, 3, 16, 112, 112] - input_data = torch.randn(input_shape) - scripted_model = torch.jit.trace(model, input_data).eval() - - input_name = 'input0' # only one input, set it to this name - shape_list = {input_name: input_shape} - mod, params = relay.frontend.from_pytorch(scripted_model, - shape_list) - elif name == 'squeezenet_v1.1': - mod, params = relay.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1', dtype=dtype) - elif name == 'inception_v3': - input_shape = (batch_size, 3, 299, 299) - mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) - elif name == 'mxnet': - # an example for mxnet model - from mxnet.gluon.model_zoo.vision import get_model - block = get_model('resnet18_v1', pretrained=True) - mod, params = relay.frontend.from_mxnet(block, shape={"input_name": input_shape}, dtype=dtype) - net = mod["main"] - net = relay.Function(net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs) - mod = relay.Module.from_expr(net) - elif name == 'tflite-mobilenet-v2' or name == 'tflite-resnet-v2-50': - try: - import tflite.Model - except ImportError: - raise ImportError("The tflite package must be installed") - input_name = "input" - input_shape = (1, 224, 224, 3) - output_shape = (1, 1001) - input_dtype = "float32" - tflite_model_buf = open(network_path, "rb").read() - tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0) - mod, params = relay.frontend.from_tflite(tflite_model, - shape_dict={input_name: input_shape}, - dtype_dict={input_name: input_dtype}) - elif name == 'pytorch-mobilenet-v2': - import torch - - model = torch.hub.load('pytorch/vision:v0.5.0', 'mobilenet_v2', pretrained=False) - model.eval() - - input_shape = [batch_size, 3, 224, 224] - input_data = torch.randn(input_shape) - scripted_model = torch.jit.trace(model, input_data).eval() - - input_name = 'input0' - shape_list = {input_name: input_shape} - mod, params = relay.frontend.from_pytorch(scripted_model, - shape_list) - elif name == 'bert': - import tensorflow as tf - - bert_pb = './baseline/tensorflow/tf_models/bert/bert-B%d.pb' % batch_size - try: - with tf.compat.v1.gfile.GFile(bert_pb, 'rb') as f: - graph_def = tf.compat.v1.GraphDef() - graph_def.ParseFromString(f.read()) - except: - raise ValueError("Need to run ./baseline/tensorflow/bert/generate_bert_pb.py to get model first") - - input_shape = (batch_size, 128) - input_name = ['input'] - shape_dict = { - 'input': input_shape - } - out_names = [ - 'bert/pooler/dense/Tanh' - ] - - mod, params = relay.frontend.from_tensorflow(graph_def, - shape=shape_dict, - outputs=out_names) - else: - raise ValueError("Unsupported network: " + name) - - return mod, params, input_name, input_shape, output_shape - - -def create_module(data_shape, graph, lib, target, input_name, params, debug_profile, - local_measure, ndk_cc, rpc_device_key, rpc_host, rpc_port, rpc_num_threads, seed=43): - if local_measure: - if target.target_name == "cuda": - ctx = tvm.gpu() - else: - ctx = tvm.cpu() - else: - print("=============== Request Remote ===============") - if 'TVM_NDK_CC' not in os.environ: - os.environ['TVM_NDK_CC'] = ndk_cc - remote = request_remote(rpc_device_key, rpc_host, rpc_port) - - print("=============== Export ===============") - ctx = remote.cpu() - temp = util.tempdir() - path_lib = temp.relpath("deploy_lib.so") - lib.export_library(path_lib, ndk.create_shared) - - print("=============== Upload ===============") - remote.upload(path_lib) - - print("=============== Load ===============") - lib = remote.load_module("deploy_lib.so") - - if rpc_num_threads: - config_threadpool = remote.get_function('runtime.config_threadpool') - config_threadpool(0, rpc_num_threads) - - np.random.seed(seed) - data_tvm = tvm.nd.array(100 * (np.random.uniform(size=data_shape)).astype(dtype), ctx=ctx) - if debug_profile: - module = debug_runtime.create(graph, lib, ctx) - else: - module = runtime.create(graph, lib, ctx) - - if type(input_name) == list: - for name in input_name: - module.set_input(name, data_tvm) - else: - module.set_input(input_name, data_tvm) - for k, v in params.items(): - module.set_input(k, v) - - return module, ctx - - -def tune_and_evaluate(network_arguments, target, target_host, - search_policy, task_scheduler_arguments, tune_option_arguments, - tune, debug_profile, check_correctness, log_n_lines): - # Extract tasks from relay program - mod, params, input_name, data_shape, out_shape = get_network(**network_arguments) - - # Tune all - if tune: - print("=============== Extract Workloads ===============") - workloads, wkl_weights = ansor.extract_from_program(mod, target=target, params=params) - print("Extract %d workloads in total" % (len(workloads))) - - # Tune workloads with auto scheduler - print("=============== Tune ===============") - tasks = [] - for i, wkl_key in enumerate(workloads): - dag = ansor.workload_key_to_dag(wkl_key) - print("[========= Task %d =========]\n" % i, dag) - tasks.append(ansor.SearchTask(dag, wkl_key, target, target_host)) - - tuner = ansor.SimpleTaskScheduler(tasks, - lambda costs: sum(c * w for c, w in zip(costs, wkl_weights)), - **task_scheduler_arguments) - tune_option, measure_ctx = create_tune_option(target, **tune_option_arguments) - - if tune_option_arguments['local_measure'] and target.target_name != 'cuda': - os.environ['TVM_BIND_MASTER_CORE_0'] = "1" - tuner.tune(tune_option, search_policy) - - if measure_ctx: - del measure_ctx - - kernel_layout_rewrite = True - - # Compile graph with best states found by auto-scheduler - print("=============== Compile ===============") - with ansor.apply_history_best(tune_option_arguments['log_file'], log_n_lines): - os.environ['TVM_AUTO_CACHE_FLUSH'] = "0" - - if kernel_layout_rewrite: - ansor.prepare_layout_rewrite(mod, target=target, params=params) - else: - # disable layout rewrite - ansor.LayoutRewriteLevel.BOTH_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE - ansor.LayoutRewriteLevel.COMPUTE_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE - - with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): - graph, lib, opt_params = relay.build_module.build( - mod, target=target, params=params) - - ansor.finish_layout_rewrite() - print("=============== Compile Finish ===============") - - module, ctx = create_module(data_shape, graph, lib, target, input_name, - opt_params, debug_profile, **common_measure_parameters) - - # Evaluate - print("========== Evaluate ==========") - ftimer = module.module.time_evaluator("run", ctx, number=10, repeat=3) - prof_res = np.array(ftimer().results) - - # display profile information - if debug_profile or check_correctness: - module.run() - if check_correctness: - actual_output = module.get_output(0).asnumpy() - print(actual_output) - - print("Mean inference time (std dev): %.2f ms (%.2f ms)" % - (np.mean(prof_res) * 1000, np.std(prof_res) * 1000)) - #log_line(BenchmarkRecord(target.target_name, 'gpu' if target.target_name == 'cuda' else 'cpu', 'network', - # "%s.B%d" % (network_name, batch_size), 'AutoSchedule', layout, - # {"costs": prof_res}, time.time()), record_file) - - if check_correctness: - print("========== Check Correctness ==========") - # clean relay cache - relay.backend.compile_engine.get().clear() - - # disable layout rewrite - ansor.LayoutRewriteLevel.BOTH_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE - ansor.LayoutRewriteLevel.COMPUTE_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE - target = tvm.target.create('llvm') - with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): - graph, lib, opt_params = relay.build_module.build( - mod, target=target, params=params) - - module, _ = create_module(data_shape, graph, lib, target, input_name, - opt_params, debug_profile, **common_measure_parameters) - module.run() - - expected_output = module.get_output(0).asnumpy() - np.testing.assert_allclose(actual_output, expected_output, rtol=1e-3, atol=1e-3) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - # Search task related arguments - parser.add_argument("--network", type=str, required=True) - parser.add_argument("--network-path", type=str, default=None, help="The path of tflite model") - parser.add_argument("--batch-size", type=int, default=1) - parser.add_argument("--layout", type=str, default='NHWC') - parser.add_argument("--target", type=str, default='llvm -mcpu=core-avx2') - parser.add_argument("--target-host", type=str, default=None) - parser.add_argument("--check-correctness", type=str2bool, nargs='?', const=True, default=False) - parser.add_argument("--debug-profile", type=str2bool, nargs='?', const=True, default=False) - parser.add_argument("--tune", type=str2bool, nargs='?', const=True, default=True) - - # Search strategy related arguments - parser.add_argument("--n-trials", type=int, default=1000) - parser.add_argument("--policy", type=str, choices=['sketch'], default='sketch') - parser.add_argument("--model-type", type=str, choices=['xgb', 'random', 'no-share'], default='xgb') - parser.add_argument("--task-scheduler", type=str, default='gradient', - choices=['no', 'gradient', 'round-robin'], - help='The strategy of task scheduler') - parser.add_argument("--seed", type=int, default=0, help='random seed') - - # Log file related arguments - parser.add_argument("--log-file", type=str, help="Write measurement records to this log file") - parser.add_argument("--load-log", type=str, help="Load history log to resume the status of search") - parser.add_argument("--log-n-lines", type=int, help="Only load the first n lines for history log") - parser.add_argument("--load-model", type=str, help="Load pre trained cost model file") - - # Measurement related and other arguments - parser.add_argument("--num-measure-per-iter", type=int, default=48, - help="The number of programs to be measured at each iteration") - parser.add_argument("--build-timeout", type=int, default=10) - parser.add_argument("--run-timeout", type=int, default=10) - parser.add_argument("--early-stopping", type=int, default=-1) - parser.add_argument("--verbose", type=int, default=1) - parser.add_argument("--local-measure", type=str2bool, nargs='?', const=True, default=True) - parser.add_argument("--rpc-device-key", type=str, default=None) - parser.add_argument("--rpc-host", type=str, default='0.0.0.0') - parser.add_argument("--rpc-port", type=int, default=9190) - parser.add_argument("--rpc-num-threads", type=int, default=None) - parser.add_argument("--n-parallel", type=int, default=1) - parser.add_argument("--ndk-cc", type=str, default=None) - args = parser.parse_args() - - np.random.seed(args.seed) - random.seed(args.seed) - logging.basicConfig() - logging.getLogger('ansor').setLevel(logging.DEBUG) - os.environ["TOPHUB_LOCATION"] = "NONE" # disable autotvm - - target = tvm.target.create(args.target) - log_file = args.log_file or "%s-B%d-%s.json" % (args.network, args.batch_size, - target.target_name) - load_log_file = args.load_log or log_file - search_policy = "%s.%s" % (args.policy, args.model_type) - if args.layout: - layout = args.layout - elif target.target_name == "cuda": - layout = "NCHW" - else: - layout = "NHWC" - - network_arguments = { - 'name': args.network, - 'network_path': args.network_path, - 'batch_size': args.batch_size, - 'layout': layout - } - - task_scheduler_parameters = { - 'strategy': args.task_scheduler, - 'load_log_file': load_log_file, - 'load_model_file': args.load_model, - 'verbose': args.verbose, - } - - common_measure_parameters = { - 'local_measure': args.local_measure, - 'rpc_device_key': args.rpc_device_key, - 'rpc_host': args.rpc_host, - 'rpc_port': args.rpc_port, - 'rpc_num_threads': args.rpc_num_threads, - 'ndk_cc': args.ndk_cc, - } - - tune_option_arguments = { - 'log_file': log_file, - 'n_trials': args.n_trials, - 'num_measure_per_iter': args.num_measure_per_iter, - 'verbose': args.verbose, - 'n_parallel': args.n_parallel, - 'build_timeout': args.build_timeout, - 'run_timeout': args.run_timeout, - 'early_stopping': args.early_stopping, - **common_measure_parameters - } - - tune_and_evaluate(network_arguments, target, args.target_host, - search_policy, task_scheduler_parameters, tune_option_arguments, - args.tune, args.debug_profile, args.check_correctness, - args.log_n_lines) diff --git a/scripts/tune_op_subgraph.py b/scripts/tune_op_subgraph.py deleted file mode 100644 index 6574bb77e510b..0000000000000 --- a/scripts/tune_op_subgraph.py +++ /dev/null @@ -1,585 +0,0 @@ -"""Tune all workloads for single op & subgraph evaluation""" -import argparse -import logging -import random - -import numpy as np - -import tvm -from tvm import te, ansor -import topi -from topi.nn.winograd_util import winograd_transform_matrices -from topi.util import get_const_tuple - -from common import measure_schedule, str2bool, norm_bmn, conv2d_nhwc_bn_relu, conv2d_nchw_bn_relu -from shape_configs import single_op_shape_dict, subgraph_shape_dict -from tune_test import tune_workloads_jointly, replay_workload, create_tune_option - -# ========================== Single Ops ========================== - -@ansor.register_workload_func -def batch_matmul_nkkm(B, N, M, K): - X = te.placeholder((B, N, K), name='A') - Y = te.placeholder((B, K, M), name='B') - k = te.reduce_axis((0, K), name='k') - Z = te.compute((B, N, M), lambda b, i, j: te.sum(X[b][i][k] * Y[b][k][j], axis=[k]), name='C') - return [X, Y, Z] - -@ansor.register_workload_func -def conv1d_nlc(N, L, CI, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1): - inputs = te.placeholder((N, L, CI), name='inputs') - weight = te.placeholder((kernel_size, CI//groups, CO), name='weight') - - batch_size, in_len, in_channel = inputs.shape - k_len, channel_per_group, out_channel = weight.shape - out_channel_per_group = out_channel // groups - out_len = (in_len + 2 * padding - dilation * (k_len - 1) - 1) // stride + 1 - rc = te.reduce_axis((0, channel_per_group), name='rc') - rl = te.reduce_axis((0, k_len), name='rl') - - padded = topi.nn.pad(inputs, [0, padding, 0]) - output = te.compute( - (batch_size, out_len, out_channel), - lambda n, l, co: te.sum( - (padded[n, l * stride + rl * dilation, co // out_channel_per_group * channel_per_group + rc] * - weight[rl, rc, co]), axis=[rl, rc]), - name='conv1d_nlc' - ) - return [inputs, weight, output] - -@ansor.register_workload_func -def conv2d_nhwc(N, H, W, CI, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1): - inputs = te.placeholder((N, H, W, CI), name='inputs') - weight = te.placeholder((kernel_size, kernel_size, CI//groups, CO), name='weight') - batch_size, in_h, in_w, in_channel = inputs.shape - k_h, k_w, channel_per_group, out_channel = weight.shape - out_channel_per_group = out_channel // groups - - out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1 - out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1 - rh = te.reduce_axis((0, k_h), name="rh") - rw = te.reduce_axis((0, k_w), name="rw") - rc = te.reduce_axis((0, channel_per_group), name="rc") - - padded = topi.nn.pad(inputs, [0, padding, padding, 0]) - output = te.compute( - (batch_size, out_h, out_w, out_channel), - lambda n, h, w, co: te.sum( - (padded[n, h * stride + rh * dilation, w * stride + rw * dilation, - co // out_channel_per_group * channel_per_group + rc] - * weight[rh, rw, rc, co]), axis=[rh, rw, rc] - ), - name='conv2d_nhwc' - ) - return [inputs, weight, output] - -@ansor.register_workload_func -def conv2d_nchw(N, CI, H, W, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1): - inputs = te.placeholder((N, CI, H, W), name='inputs') - weight = te.placeholder((CO, CI//groups, kernel_size, kernel_size), name='weight') - batch_size, in_channel, in_h, in_w = inputs.shape - out_channel, channel_per_group, k_h, k_w, = weight.shape - out_channel_per_group = out_channel // groups - - out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1 - out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1 - rc = te.reduce_axis((0, channel_per_group), name="rc") - rh = te.reduce_axis((0, k_h), name="rh") - rw = te.reduce_axis((0, k_w), name="rw") - - padded = topi.nn.pad(inputs, [0, 0, padding, padding]) - output = te.compute( - (batch_size, out_channel, out_h, out_w), - lambda n, co, h, w: te.sum( - (padded[n, co // out_channel_per_group * channel_per_group + rc, - h * stride + rh * dilation, w * stride + rw * dilation] - * weight[co, rc, rh, rw]), axis=[rc, rh, rw] - ), - name='conv2d_nchw' - ) - return [inputs, weight, output] - -@ansor.register_workload_func -def conv3d_ndhwc(N, D, H, W, CI, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1): - inputs = te.placeholder((N, D, H, W, CI)) - weight = te.placeholder((kernel_size, kernel_size, kernel_size, CI//groups, CO)) - batch_size, in_d, in_h, in_w, in_channel = inputs.shape - k_d, k_h, k_w, channel_per_group, out_channel = weight.shape - out_channel_per_group = out_channel // groups - - out_d = (in_d + 2 * padding - dilation * (k_d - 1) - 1) // stride + 1 - out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1 - out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1 - rd = te.reduce_axis((0, k_d), name='rd') - rh = te.reduce_axis((0, k_h), name='rh') - rw = te.reduce_axis((0, k_w), name='rw') - rc = te.reduce_axis((0, channel_per_group), name='rc') - - padded = topi.nn.pad(inputs, [0, padding, padding, padding, 0]) - output = te.compute( - (batch_size, out_d, out_h, out_w, out_channel), - lambda n, d, h, w, co: te.sum( - (padded[n, d * stride + rd * dilation, - h * stride + rh * dilation, w * stride + rw * dilation, - co // out_channel_per_group * channel_per_group + rc] - * weight[rd, rh, rw, rc, co]), - axis=[rd, rh, rw, rc] - ), - name='conv3d_ndhwc' - ) - return [inputs, weight, output] - -@ansor.register_workload_func -def depthwise_conv2d_nhwc(N, H, W, C, kernel_size, stride=1, padding=0, dilation=1, factor=1): - inputs = te.placeholder((N, H, W, C)) - weight = te.placeholder((factor, kernel_size, kernel_size, C)) - - batch_size, in_h, in_w, in_channel = inputs.shape - factor, k_h, k_w, in_channel = weight.shape - out_channel = in_channel * factor - - assert factor.value == 1, "Not optimized for factor != 1" - - out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1 - out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1 - rh = te.reduce_axis((0, k_h), name='rh') - rw = te.reduce_axis((0, k_w), name='rw') - - padded = topi.nn.pad(inputs, [0, padding, padding, 0]) - output = te.compute( - (batch_size, out_h, out_w, out_channel), - lambda n, h, w, c: te.sum( - (padded[n, h * stride + rh * dilation, w * stride + rw * dilation, c // factor] - * weight[c % factor, rh, rw, c // factor]), - axis=[rh, rw] - ), - name="depth_conv2d_nhwc" - ) - return [inputs, weight, output] - -@ansor.register_workload_func -def conv2d_transpose_nhwc(N, H, W, CI, CO, kernel_size, stride=1, padding=0): - inputs = te.placeholder((N, H, W, CI), name='inputs') - weight = te.placeholder((kernel_size, kernel_size, CI, CO), name='weight') - - batch, in_h, in_w, in_c = inputs.shape - filter_h, filter_w, in_c, out_c = weight.shape - stride_h, stride_w = (stride, stride) - - # compute padding - fpad_top, fpad_left, fpad_bottom, fpad_right = topi.nn.get_pad_tuple(padding, (filter_h, filter_w)) - bpad_top = filter_h - 1 - fpad_top - bpad_bottom = filter_h - 1 - fpad_bottom - bpad_left = filter_w - 1 - fpad_left - bpad_right = filter_w - 1 - fpad_right - - # padding stage - padded = topi.nn.pad(inputs, - [0, (bpad_top + stride_h - 1) // stride_h, - (bpad_left + stride_w - 1) // stride_w, 0], - [0, (bpad_bottom + stride_h - 1) // stride_h, - (bpad_right + stride_w - 1) // stride_w, 0]) - - # remove extra padding introduced by dilatation - idxdiv = te.indexdiv - idxmod = te.indexmod - border_h = idxmod(stride_h - idxmod(bpad_top, stride_h), stride_h) - border_w = idxmod(stride_w - idxmod(bpad_left, stride_w), stride_w) - - # dilation stage - strides = [1, stride_h, stride_w, 1] - n = len(padded.shape) - - # We should embed this dilation directly into te.compute rather than creating a new te.compute. - # Only in this way can we use unroll to eliminate the multiplication of zeros. - def _dilate(*indices): - not_zero = [] - index_tuple = [] - for i in range(n): - if not strides[i] == 1: - index_tuple.append(idxdiv(indices[i], strides[i])) - not_zero.append(idxmod(indices[i], strides[i]).equal(0)) - else: - index_tuple.append(indices[i]) - if not_zero: - not_zero = te.all(*not_zero) - return te.if_then_else(not_zero, padded(*index_tuple), tvm.tir.const(0.0, padded.dtype)) - return padded(*index_tuple) - - # convolution stage - out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h - out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w - rc = te.reduce_axis((0, in_c), name='rc') - rh = te.reduce_axis((0, filter_h), name='rh') - rw = te.reduce_axis((0, filter_w), name='rw') - - output = te.compute( - (batch, out_h, out_w, out_c), - lambda n, h, w, co: te.sum( - _dilate(n, h + rh + border_h, w + rw + border_w, rc) * - weight[filter_h - 1 - rh, filter_w - 1 - rw, rc, co], - axis=[rh, rw, rc]), - name="conv2d_transpose_nhwc", - attrs={"ansor_always_unroll_inner": ["h", "w", "rh", "rw", "h_c", "w_c"]}) - # todo(lmzheng): add constraints on the tile size of h and w - - return [inputs, weight, output] - -@ansor.register_workload_func -def conv2d_capsule_nhwijc(N, H, W, CI, CO, kernel_size, stride=1, padding=0, capsule_size=4): - inputs = te.placeholder((N, H, W, capsule_size, capsule_size, CI), name='inputs') - weight = te.placeholder((kernel_size, kernel_size, capsule_size, capsule_size, CI, CO), name='weight') - batch_size, in_h, in_w, _, _, in_channel = inputs.shape - k_h, k_w, _, _, _, out_channel = weight.shape - - out_h = (in_h + 2 * padding - kernel_size) // stride + 1 - out_w = (in_w + 2 * padding - kernel_size) // stride + 1 - - rh = te.reduce_axis((0, k_h), name="rh") - rw = te.reduce_axis((0, k_w), name="rw") - cap_k = te.reduce_axis((0, capsule_size), name='cap_k') - rc = te.reduce_axis((0, in_channel), name="rc") - - padded = topi.nn.pad(inputs, [0, padding, padding, 0, 0, 0]) - output = te.compute( - (batch_size, out_h, out_w, capsule_size, capsule_size, out_channel), - lambda n, h, w, cap_i, cap_j, co: te.sum( - (padded[n, h * stride + rh, w * stride + rw, cap_i, cap_k, rc] - * weight[rh, rw, cap_k, cap_j, rc, co]), axis=[rh, rw, cap_k, rc] - ), - name='conv2d_capsule_nhwijc' - ) - return [inputs, weight, output] - - -@ansor.register_workload_func -def conv2d_winograd_nhwc(N, H, W, CI, CO, kernel_size=3, stride=1, padding=0, dilation=1): - # TODO: implement tile_size - tile_size = 4 #_infer_tile_size(data, kernel) - inputs = te.placeholder((N, H, W, CI), name='inputs') - #weight = te.placeholder((kernel_size, kernel_size, CI, CO), name='weight') - N, H, W, CI = get_const_tuple(inputs.shape) - if isinstance(dilation, int): - dilation_h = dilation_w = dilation - else: - dilation_h, dilation_w = dilation - # if dilation_h != 1 or dilation_w != 1: - # weight = topi.nn.dilate(weight, (1, 1, dilation_h, dilation_w)) - KH = KW = kernel_size - HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW)) - HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride - assert HSTR == 1 and WSTR == 1 and KH == KW - - data_pad = topi.nn.pad(inputs, (0, HPAD, WPAD, 0), (0, HPAD, WPAD, 0), name="data_pad") - - r = KW - m = tile_size - alpha = m + r - 1 - A, B, G = winograd_transform_matrices(m, r, 'float32') - - H = (H + 2 * HPAD - KH) // HSTR + 1 - W = (W + 2 * WPAD - KW) // WSTR + 1 - nH, nW = (H + m - 1) // m, (W + m - 1) // m - P = N * nH * nW - r_kh = te.reduce_axis((0, KH), name='r_kh') - r_kw = te.reduce_axis((0, KW), name='r_kw') - # kernel_pack = te.compute((alpha, alpha, CO, CI), lambda eps, nu, co, ci: - # weight[0][0][0][0], - # name='kernel_pack') - kshape = (alpha, alpha, CO, CI) - kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight") - - idxdiv = te.indexdiv - idxmod = te.indexmod - # pack input tile - input_tile = te.compute((alpha, alpha, P, CI), lambda eps, nu, p, ci: - data_pad[idxdiv(p, (nH * nW))][idxmod(idxdiv(p, nW), nH) * m + eps] - [idxmod(p, nW) * m + nu][ci], name='input_tile',) - - # transform data - r_a = te.reduce_axis((0, alpha), 'r_a') - r_b = te.reduce_axis((0, alpha), 'r_b') - data_pack = te.compute((alpha, alpha, P, CI), lambda eps, nu, p, ci: - te.sum(input_tile[r_a][r_b][p][ci] * B[r_a][eps] * B[r_b][nu], - axis=[r_a, r_b]), name='data_pack', - attrs={"ansor_no_split_at_inner": ["eps", "nu", "r_a", "r_b"], - "ansor_last_split_is_one": ["ci", "p"], - "ansor_always_unroll": ["eps", "nu", "r_a", "r_b"], - "ansor_no_cache_write": "True", - }) - - # do batch gemm - ci = te.reduce_axis((0, CI), name='ci') - bgemm = te.compute((alpha, alpha, P, CO), lambda eps, nu, p, co: - te.sum(data_pack[eps][nu][p][ci] * - kernel_pack[eps][nu][co][ci], - axis=[ci]), name='bgemm') - - # inverse transform - r_a = te.reduce_axis((0, alpha), 'r_a') - r_b = te.reduce_axis((0, alpha), 'r_b') - inverse = te.compute((m, m, P, CO), lambda vh, vw, p, co: - te.sum(bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw], - axis=[r_a, r_b]), name='inverse', - attrs={"ansor_no_split_at_inner": ["vh", "vw", "r_a", "r_b"], - "ansor_always_unroll": ["vh", "vw", "r_a", "r_b"], - "ansor_last_split_is_one": ["co", "p"], - "ansor_no_cache_write": "True", - }) - - # output - output = te.compute((N, H, W, CO), lambda n, h, w, co: - inverse[idxmod(h, m), - idxmod(w, m), - n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), - co], - name='conv2d_winograd', - tag='conv2d_winograd_nhwc', - attrs={"ansor_no_split_at_outer": ["n", "h", "w", "co"],}) - return [inputs, kernel_pack, output] - -@ansor.register_workload_func -def conv2d_winograd_nchw(N, CI, H, W, CO, kernel_size=3, stride=1, padding=0, dilation=1, precompute=False): - # TODO: implement tile_size - tile_size = 4 #_infer_tile_size(data, kernel) - inputs = te.placeholder((N, CI, H, W), name='inputs') - #weight = te.placeholder((CO, CI, kernel_size, kernel_size), name='weight') - N, CI, H, W = get_const_tuple(inputs.shape) - # if isinstance(dilation, int): - # dilation_h = dilation_w = dilation - # else: - # dilation_h, dilation_w = dilation - # if dilation_h != 1 or dilation_w != 1: - # weight = topi.nn.dilate(weight, (1, 1, dilation_h, dilation_w)) - KH = KW = kernel_size - HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW)) - HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride - assert HSTR == 1 and WSTR == 1 and KH == KW - - data_pad = topi.nn.pad(inputs, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD), name="data_pad") - - r = KW - m = tile_size - alpha = m + r - 1 - A, B, G = winograd_transform_matrices(m, r, 'float32') - - H = (H + 2 * HPAD - KH) // HSTR + 1 - W = (W + 2 * WPAD - KW) // WSTR + 1 - nH, nW = (H + m - 1) // m, (W + m - 1) // m - P = N * nH * nW - r_kh = te.reduce_axis((0, KH), name='r_kh') - r_kw = te.reduce_axis((0, KW), name='r_kw') - # kernel_pack = te.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co: - # weight[0][0][0][0], - # name='kernel_pack') - kshape = (alpha, alpha, CI, CO) - kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight") - - idxdiv = te.indexdiv - idxmod = te.indexmod - # pack input tile - input_tile = te.compute((CI, P, alpha, alpha), lambda ci, p, eps, nu: - data_pad[idxdiv(p, (nH * nW))][ci][idxmod(idxdiv(p, nW), nH) * m + eps] - [idxmod(p, nW) * m + nu], name='input_tile') - - # transform data - r_a = te.reduce_axis((0, alpha), 'r_a') - r_b = te.reduce_axis((0, alpha), 'r_b') - data_pack = te.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p: - te.sum(input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu], - axis=[r_a, r_b]), name='data_pack', - attrs={"ansor_no_split_at_inner": ["eps", "nu", "r_a", "r_b"], - "ansor_no_split_at_outer": ["ci", "p"], - "ansor_always_unroll": ["eps", "nu", "r_a", "r_b"], - "ansor_no_cache_write": "True", - }) - - # do batch gemm - ci = te.reduce_axis((0, CI), name='ci') - bgemm = te.compute((alpha, alpha, CO, P), lambda eps, nu, co, p: - te.sum(data_pack[eps][nu][ci][p] * - kernel_pack[eps][nu][ci][co], - axis=[ci]), name='bgemm') - - # inverse transform - r_a = te.reduce_axis((0, alpha), 'r_a') - r_b = te.reduce_axis((0, alpha), 'r_b') - inverse = te.compute((CO, P, m, m), lambda co, p, vh, vw: - te.sum(bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw], - axis=[r_a, r_b]), name='inverse', - attrs={"ansor_no_split_at_outer": ["co", "p", "vh", "vw", "r_a", "r_b"], - "ansor_always_unroll": ["vh", "vw", "r_a", "r_b"], - "ansor_no_cache_write": "True"}) - - # output - output = te.compute((N, CO, H, W), lambda n, co, h, w: - inverse[co, n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m), - idxmod(h, m), - idxmod(w, m)], - name='conv2d_winograd', - attrs={"ansor_no_split_at_outer": ["n", "co", "h", "w"],}) - return [inputs, kernel_pack, output] - -# ========================== Subgraphs ========================== - -@ansor.register_workload_func -def transpose_batch_matmul(batch, seq_len, n_head, n_dim): - query = te.placeholder((batch, seq_len, n_head, n_dim), name='query') - value = te.placeholder((batch, seq_len, n_head, n_dim), name='value') - query_T = te.compute((batch, n_head, seq_len, n_dim), - lambda b, h, l, d: query[b, l, h, d], name="query_T") - value_T = te.compute((batch, n_head, n_dim, seq_len), - lambda b, h, d, l: value[b, l, h, d], name="value_T") - k = te.reduce_axis((0, n_dim), name='k') - out = te.compute((batch, n_head, seq_len, seq_len), - lambda b, h, i, j: te.sum(query_T[b][h][i][k] * value_T[b][h][k][j], axis=[k]), - name='C') - return [query, value, out] - -# ========================== Tune function & Task dicts ========================== - -def tune_wkl(task_func_dict, shape_dict, wkl_type, args): - target = tvm.target.create(args.target) - - for wkl_meta_name, func in task_func_dict.items(): - if not args.wkl in ["all", wkl_type, wkl_meta_name]: - continue - - log_file = args.log_file or wkl_meta_name + ".json" - wkl_keys = [] - for shape in shape_dict[wkl_meta_name]: - if shape[0] == 1: - shape = list(shape) - shape[0] = args.batch_size - - wkl_key = ansor.make_workload_key_func(func, shape) - wkl_keys.append(wkl_key) - if args.fast_check: - break - - if not args.tune: - cost, gflops = replay_workload( - wkl_key, target, args.target_host, log_file, - args.local_measure, args.rpc_device_key, args.rpc_host, - args.rpc_port, args.rpc_num_threads, args.ndk_cc, False) - # log_line(BenchmarkRecord(target.name, 'gpu' if target.name == 'cuda' else 'cpu', 'subgraph', - # workload_name, "AutoSchedule", "default", - # {"costs": [cost]}, time.time()), args.out_file) - - if args.tune: - print("========== Tune for %s (%d shapes) ========== " % (wkl_meta_name, len(wkl_keys))) - - load_log_file = args.load_log or log_file - n_trials = args.n_trials_per_shape * len(wkl_keys) - - tune_option, measure_ctx = create_tune_option(target, log_file, - n_trials, args.num_measure_per_iter, args.verbose, - args.n_parallel, args.build_timeout, args.local_measure, - args.rpc_device_key, args.rpc_host, args.rpc_port, - args.rpc_num_threads, args.ndk_cc) - - # tune workloads jointly using JointTuner - tune_workloads_jointly(wkl_keys, np.ones(len(wkl_keys)), args.task_scheduler, - target, args.target_host, args.policy, args.model_type, - args.load_model, load_log_file, tune_option) - - if measure_ctx: - del measure_ctx - - -single_op_task_func_dict = { - 'GMM': batch_matmul_nkkm, - 'C1D': conv1d_nlc, - 'C2D': conv2d_nhwc, - 'C3D': conv3d_ndhwc, - 'GRP': conv2d_nhwc, - 'DIL': conv2d_nhwc, - 'DEP': depthwise_conv2d_nhwc, - 'T2D': conv2d_transpose_nhwc, - 'CAP': conv2d_capsule_nhwijc, - 'NRM': norm_bmn, - #'SMX': softmax_mn, - -# The following workloads are not in our sinle op evaluation plan. -# They should be moved to `common.py` and be used by `tune_wkl.py`. -# 'C2D_NCHW': conv2d_nchw, -# 'C2DWG_NHWC': conv2d_winograd_nhwc, -# 'C2DWG_NCHW': conv2d_winograd_nchw, -# 'GMM_TC': matmul_nkkm, -} - -subgraph_task_func_dict = { - 'conv2d_bn_relu': conv2d_nhwc_bn_relu, - #'conv2d_bn_relu': conv2d_nchw_bn_relu, # some old log uses conv2d_nchw_bn_relu - 'transpose_batch_matmul': transpose_batch_matmul, -} - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Search task related arguments - parser.add_argument("--wkl", type=str, required=True, - help="all - Tune all workloads; \ - op - Tune all single ops; \ - subgraph - Tune all subgraphs; \ - specific wkl name - Tune a specific workload") - parser.add_argument("--batch-size", type=int, default=1) - parser.add_argument("--target", type=str, default='llvm -mcpu=core-avx2') - parser.add_argument("--target-host", type=str, default=None) - parser.add_argument("--tune", type=str2bool, nargs='?', const=True, default=True) - parser.add_argument("--fast-check", action='store_true', - help='Only run one shape for each workload. This is used for fast checking') - - # Search strategy related arguments - parser.add_argument("--n-trials-per-shape", type=int, default=1000) - parser.add_argument("--policy", type=str, choices=['sketch', 'beam-search'], default='sketch') - parser.add_argument("--model-type", type=str, choices=['xgb', 'random', 'no-share'], default='xgb') - parser.add_argument("--task-scheduler", type=str, default='round-robin', - choices=['no', 'gradient', 'round-robin'], help='The strategy of task scheduler') - parser.add_argument("--seed", type=int, default=0, help='random seed') - - # Log file related arguments - parser.add_argument("--log-file", type=str, help="Write measurement records to this log file") - parser.add_argument("--load-log", type=str, help="Load history log to resume the status of search") - parser.add_argument("--load-model", type=str, help="Load pre-trained cost model from this file") - - # Measurement related and other arguments - parser.add_argument("--num-measure-per-iter", type=int, default=48, - help="The number of programs to be measured at each iteration") - parser.add_argument("--build-timeout", type=int, default=10) - parser.add_argument("--run-timeout", type=int, default=60) - parser.add_argument("--verbose", type=int, default=1) - parser.add_argument("--local-measure", type=str2bool, nargs='?', const=True, default=True) - parser.add_argument("--rpc-device-key", type=str, default=None) - parser.add_argument("--rpc-host", type=str, default='0.0.0.0') - parser.add_argument("--rpc-port", type=int, default=9190) - parser.add_argument("--rpc-num-threads", type=int, default=None) - parser.add_argument("--n-parallel", type=int, default=1) - parser.add_argument("--ndk-cc", type=str, default=None) - args = parser.parse_args() - - np.random.seed(args.seed) - random.seed(args.seed) - logging.basicConfig() - logging.getLogger('ansor').setLevel(logging.DEBUG) - - # compute the number of tasks - num_tasks = 0 - for wkl_meta_name in single_op_task_func_dict: - if not args.wkl in ["all", "op", wkl_meta_name]: - continue - if args.fast_check: - num_tasks += 1 - else: - num_tasks += len(single_op_shape_dict[wkl_meta_name]) - for wkl_meta_name in subgraph_task_func_dict: - if not args.wkl in ["all", "subgraph", wkl_meta_name]: - continue - if args.fast_check: - num_tasks += 1 - else: - num_tasks += len(subgraph_shape_dict[wkl_meta_name]) - print("Number of tasks: %d\tTotal trials: %d" % (num_tasks, num_tasks * args.n_trials_per_shape)) - - # tune for tasks - tune_wkl(single_op_task_func_dict, single_op_shape_dict, "op", args) - tune_wkl(subgraph_task_func_dict, subgraph_shape_dict, "subgraph", args) diff --git a/scripts/tune_test.py b/scripts/tune_test.py deleted file mode 100644 index 67c0526dd6247..0000000000000 --- a/scripts/tune_test.py +++ /dev/null @@ -1,212 +0,0 @@ -"""Use auto scheduler to tune workloads""" -import argparse -import logging -import os -import random - -import numpy as np - -import tvm -from tvm import ansor -from tvm.ansor.utils import request_remote - -from common import get_workload_keys, get_workload_weights, measure_schedule, str2bool - -def create_tune_option(target, log_file, n_trials, num_measure_per_iter, verbose, - n_parallel, build_timeout, local_measure, rpc_device_key, rpc_host, - rpc_port, rpc_num_threads, ndk_cc, early_stopping=-1, run_timeout=10): - builder = runner = measure_ctx = None - if local_measure: - builder = ansor.LocalBuilder(timeout=build_timeout) - if target.target_name == "cuda": - measure_ctx = ansor.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400) - runner = measure_ctx.runner - else: - os.environ['TVM_AUTO_CACHE_FLUSH'] = "1" - runner = ansor.LocalRunner(repeat=10, number=1, min_repeat_ms=0, timeout=run_timeout) - else: - os.environ['TVM_NDK_CC'] = ndk_cc - builder = ansor.LocalBuilder(timeout=build_timeout, build_func='ndk') - runner = ansor.RPCRunner(key=rpc_device_key, host=rpc_host, port=rpc_port, - timeout=run_timeout, n_parallel=n_parallel, - repeat=1, min_repeat_ms=200) - remote = request_remote(rpc_device_key, rpc_host, rpc_port) - if rpc_num_threads: - config_threadpool = remote.get_function('runtime.config_threadpool') - config_threadpool(0, rpc_num_threads) - - tune_option = ansor.TuneOption(n_trials=n_trials, early_stopping=early_stopping, - num_measure_per_iter=num_measure_per_iter, - verbose=verbose, - builder=builder, - runner=runner, - measure_callbacks=[ansor.LogToFile(log_file)], - pre_search_callbacks=[ansor.PreloadMeasuredStates(log_file)]) - - return tune_option, measure_ctx - - -def replay_workload(wkl_key, target, target_host, log_file, - local_measure=True, rpc_device_key=None, rpc_host="0.0.0.0", - rpc_port=9190, rpc_num_threads=None, ndk_cc=None, - show_lower_result=True): - cost = gflops = None - - inp, res = ansor.best_measure_pair_in_file(log_file, wkl_key, target) - if inp is None: - print("Cannot find log for: %s" % wkl_key) - else: - dag = ansor.workload_key_to_dag(inp.task.workload_key) - print("Found schedule for: %s" % wkl_key) - - s, bufs = dag.apply_steps_from_state(inp.state) - if show_lower_result: - print(tvm.lower(s, bufs, simple_mode=True)) - - if local_measure: - remote = None - else: - remote = request_remote(rpc_device_key, rpc_host, rpc_port) - if rpc_num_threads: - config_threadpool = remote.get_function('runtime.config_threadpool') - config_threadpool(0, rpc_num_threads) - - cost = np.mean((measure_schedule(s, bufs, target, target_host, - remote=remote, ndk_cc=ndk_cc))) - gflops = ansor.ComputeDAG(bufs).flop_ct / cost / 1e9 - print("Best schedule: %.2f GFLOPS\tcost: %.3f ms" % (gflops, cost * 1e3)) - - return cost, gflops - - -def tune_workload(wkl_key, target, target_host, policy, model_type, - load_model_file, load_log_file, tune_option): - """Tune a workload""" - - if False: - # Debug info. Print static analysis results from the access analyzer - dag = ansor.workload_key_to_dag(wkl_key) - print(dag.access_analyzer) - exit() - - if model_type == 'xgb': - model = ansor.XGBModel() - if load_model_file: - print("Load pretrained model...") - model.load(load_model_file) - elif load_log_file: - model.load_log_file(load_log_file) - elif model_type == "random": - model = ansor.RandomModel() - else: - raise ValueError("Invalid model: " + model_type) - - if policy == 'sketch': - policy = ansor.SketchSearchPolicy(program_cost_model=model) - elif policy == 'beam-search': - policy = ansor.SketchSearchPolicy(program_cost_model=model, - params={'use_beam_search': 1}) - else: - raise ValueError("Invalid search policy: " + policy) - - s, bufs = ansor.auto_schedule(wkl_key, - target=target, target_host=target_host, - search_policy=policy, - tune_option=tune_option) - -def tune_workloads_jointly(wkl_keys, weights, task_scheduler, target, target_host, - search_policy, model_type, load_model_file, load_log_file, - tune_option): - """Tune for multiple workloads together with TaksScheduler""" - tasks = [] - for wkl_key in wkl_keys: - dag = ansor.workload_key_to_dag(wkl_key) - tasks.append(ansor.SearchTask(dag, wkl_key, target, target_host)) - - def objective_func(costs): - return sum(c * w for c, w in zip(costs, weights)) - - tuner = ansor.SimpleTaskScheduler(tasks, objective_func, strategy=task_scheduler, - load_log_file=load_log_file, load_model_file=load_model_file) - search_policy = "%s.%s" % (search_policy, model_type) - tuner.tune(tune_option, search_policy) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - # Search task related arguments - parser.add_argument("--wkl", type=str, required=True) - parser.add_argument("--target", type=str, default='llvm -mcpu=core-avx2') - parser.add_argument("--target-host", type=str, default=None) - parser.add_argument("--tune", type=str2bool, nargs='?', const=True, default=True) - - # Search strategy related arguments - parser.add_argument("--n-trials", type=int, default=1000) - parser.add_argument("--policy", type=str, choices=['sketch', 'beam-search'], default='sketch') - parser.add_argument("--model-type", type=str, choices=['xgb', 'random', 'no-share'], default='xgb') - parser.add_argument("--task-scheduler", type=str, default='no', - choices=['no', 'gradient', 'round-robin'], - help='The strategy of task scheduler') - parser.add_argument("--seed", type=int, default=0, help='random seed') - - # Log file related arguments - parser.add_argument("--log-file", type=str, help="Write measurement records to this log file") - parser.add_argument("--load-log", type=str, help="Load history log to resume the status of search") - parser.add_argument("--load-model", type=str, help="Load pre-trained cost model from this file") - - # Measurement related and other arguments - parser.add_argument("--num-measure-per-iter", type=int, default=48, - help="The number of programs to be measured at each iteration") - parser.add_argument("--build-timeout", type=int, default=10) - parser.add_argument("--run-timeout", type=int, default=60) - parser.add_argument("--verbose", type=int, default=1) - parser.add_argument("--local-measure", type=str2bool, nargs='?', const=True, default=True) - parser.add_argument("--rpc-device-key", type=str, default=None) - parser.add_argument("--rpc-host", type=str, default='0.0.0.0') - parser.add_argument("--rpc-port", type=int, default=9190) - parser.add_argument("--rpc-num-threads", type=int, default=None) - parser.add_argument("--n-parallel", type=int, default=1) - parser.add_argument("--ndk-cc", type=str, default=None) - args = parser.parse_args() - - np.random.seed(args.seed) - random.seed(args.seed) - logging.basicConfig() - logging.getLogger('ansor').setLevel(logging.DEBUG) - - wkl_keys = get_workload_keys(args.wkl) - target = tvm.target.create(args.target) - log_file = args.log_file or args.wkl + ".json" - - # Tune workloads - if args.tune: - load_log_file = args.load_log or log_file - weights = get_workload_weights(args.wkl) - - tune_option, measure_ctx = create_tune_option(target, log_file, - args.n_trials, args.num_measure_per_iter, args.verbose, - args.n_parallel, args.build_timeout, args.local_measure, - args.rpc_device_key, args.rpc_host, args.rpc_port, args.rpc_num_threads, - args.ndk_cc) - - if args.task_scheduler == 'no': - # tune workloads one by one - for wkl_key in wkl_keys: - tune_workload(wkl_key, target, args.target_host, args.policy, - args.model_type, args.load_model, load_log_file, - tune_option) - else: - # tune workloads jointly with TaskScheduler - tune_workloads_jointly(wkl_keys, weights, args.task_scheduler, - target, args.target_host, args.policy, - args.model_type, args.load_model, load_log_file, - tune_option) - if measure_ctx: - del measure_ctx - - # Replay the best found schedule - if len(wkl_keys) == 1 or not args.tune: - for wkl_key in wkl_keys: - replay_workload(wkl_key, target, args.target_host, log_file, - args.local_measure, args.rpc_device_key, args.rpc_host, - args.rpc_port, args.rpc_num_threads, args.ndk_cc) diff --git a/src/ansor/auto_schedule.cc b/src/ansor/auto_schedule.cc deleted file mode 100644 index 05cb95c2c4514..0000000000000 --- a/src/ansor/auto_schedule.cc +++ /dev/null @@ -1,113 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file ansor/auto_schedule.cc - * \brief The user interface of the auto-scheduler - */ - -#include "auto_schedule.h" -#include -#include -#include -#include "search_policy/sketch_search_policy.h" - -namespace tvm { -namespace ansor { - -TVM_REGISTER_NODE_TYPE(TuneOptionNode); - -TuneOption::TuneOption(int n_trials, int early_stopping, - int num_measure_per_iter, int verbose, Builder builder, - Runner runner, Array measure_callbacks, - Array pre_search_callbacks) { - auto node = make_object(); - node->n_trials = n_trials; - node->early_stopping = early_stopping; - node->num_measure_per_iter = num_measure_per_iter; - node->verbose = verbose; - node->builder = std::move(builder); - node->runner = std::move(runner); - node->measure_callbacks = std::move(measure_callbacks); - node->pre_search_callbacks = std::move(pre_search_callbacks); - data_ = std::move(node); -} - -std::pair > AutoSchedule(SearchTask task, - SearchPolicy search_policy, TuneOption tune_option) { - // Search for the best schedule - ProgramMeasurer measurer = - ProgramMeasurer(tune_option->builder, tune_option->runner, - tune_option->measure_callbacks, - tune_option->verbose); - - State state = search_policy->Search( - task, tune_option->n_trials, tune_option->early_stopping, - tune_option->num_measure_per_iter, tune_option->verbose, measurer, - tune_option->pre_search_callbacks); - - return task->compute_dag.ApplySteps(state->transform_steps); -} - -std::pair > AutoSchedule( - std::string workload_key, Target target, Target target_host, - SearchPolicy search_policy, HardwareParams hardware_params, - TuneOption tune_option) { - ComputeDAG dag = ComputeDAG(workload_key); - SearchTask task = SearchTask( - std::move(dag), std::move(workload_key), std::move(target), - std::move(target_host), std::move(hardware_params)); - return AutoSchedule(std::move(task), std::move(search_policy), - std::move(tune_option)); -} - -TVM_REGISTER_GLOBAL("ansor.TuneOption") -.set_body_typed([](int n_trials, int early_stopping, - int num_measure_per_iter, int verbose, Builder builder, - Runner runner, Array measure_callbacks, - Array pre_search_callbacks) { - return TuneOption(n_trials, early_stopping, num_measure_per_iter, verbose, - builder, runner, measure_callbacks, pre_search_callbacks); -}); - -TVM_REGISTER_GLOBAL("ansor.AutoScheduleBySearchTask") -.set_body_typed([](SearchTask task, SearchPolicy search_policy, - TuneOption tune_option) { - te::Schedule sch; - Array return_tensors; - std::tie(sch, return_tensors) = AutoSchedule(task, search_policy, tune_option); - - return Array{sch, return_tensors}; -}); - -TVM_REGISTER_GLOBAL("ansor.AutoScheduleByWorkloadKey") -.set_body_typed([](std::string workload_key, Target target, - Target target_host, SearchPolicy search_policy, - HardwareParams hardware_params, TuneOption tune_option) { - te::Schedule sch; - Array return_tensors; - std::tie(sch, return_tensors) = - AutoSchedule(workload_key, target, target_host, search_policy, - hardware_params, tune_option); - - return Array{sch, return_tensors}; -}); - -} // namespace ansor -} // namespace tvm diff --git a/src/ansor/auto_schedule.h b/src/ansor/auto_schedule.h deleted file mode 100644 index f17c043cfadd3..0000000000000 --- a/src/ansor/auto_schedule.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file ansor/auto_schedule.h - * \brief The user interface of the auto-scheduler - */ - -#ifndef TVM_ANSOR_AUTO_SCHEDULE_H_ -#define TVM_ANSOR_AUTO_SCHEDULE_H_ - -#include -#include -#include "measure.h" -#include "search_policy/search_policy.h" - -namespace tvm { -namespace ansor { - -/*! \brief Tuning and measurement options */ -class TuneOptionNode : public Object { - public: - int n_trials; // Number of total measurement trials - int early_stopping; // Stops early the tuning if no improvement after n - // measurements - int num_measure_per_iter; // The number of programs to be measured at each - // iteration - int verbose; // Verbosity level. 0 means silent. - Builder builder; // Builder which builds the program - Runner runner; // Runner which runs the program and measure time - // costs - Array measure_callbacks; // MeasureCallback functions - Array pre_search_callbacks; // SearchCallback functions - // run before search - - void VisitAttrs(tvm::AttrVisitor* v) { - v->Visit("n_trials", &n_trials); - v->Visit("early_stopping", &early_stopping); - v->Visit("num_measure_per_iter", &num_measure_per_iter); - v->Visit("verbose", &verbose); - v->Visit("builder", &builder); - v->Visit("runner", &runner); - v->Visit("measure_callbacks", &measure_callbacks); - v->Visit("pre_search_callbacks", &pre_search_callbacks); - } - - static constexpr const char* _type_key = "ansor.TuneOption"; - TVM_DECLARE_FINAL_OBJECT_INFO(TuneOptionNode, Object); -}; - -/*! - * \brief Managed reference to TuneOptionNode. - * \sa TuneOptionNode - */ -class TuneOption : public ObjectRef { - public: - TuneOption(int n_trials, int early_stopping, int num_measure_per_iter, - int verbose, Builder builder, Runner runner, - Array measure_callbacks, - Array pre_search_callbacks); - - TVM_DEFINE_OBJECT_REF_METHODS(TuneOption, ObjectRef, TuneOptionNode); - TVM_DEFINE_OBJECT_REF_COW_METHOD(TuneOptionNode); -}; - -/*! \brief Auto schedule for a compute declaration */ -std::pair > AutoSchedule( - SearchTask task, SearchPolicy search_policy, TuneOption tune_option); - -std::pair > AutoSchedule( - std::string workload_key, Target target, Target target_host, - SearchPolicy search_policy, HardwareParams hardware_params, - TuneOption tune_option); - -} // namespace ansor -} // namespace tvm - -#endif // TVM_ANSOR_AUTO_SCHEDULE_H_ diff --git a/src/ansor/compute_dag.cc b/src/ansor/compute_dag.cc index 13f64b2bdc89d..6c89c55a5ceec 100644 --- a/src/ansor/compute_dag.cc +++ b/src/ansor/compute_dag.cc @@ -37,8 +37,6 @@ #include #include #include "transform_step.h" -#include "search_policy/utils.h" -#include "../relay/transforms/kernel_layout_transform.h" namespace tvm { namespace ansor { @@ -473,6 +471,24 @@ bool AccessAnalyzer::ElementWiseMatch(const te::Operation& op, return true; } +// Extract primitive iterators from a nested fused or splitted iterator's name +inline void ExtractOriginalIterators(const std::string& name, std::set* rets) { + size_t last_pos = 0; + for (size_t i = 0; i < name.size(); ++i) { + if (name[i] == '@' || name[i] == '.') { // '@' for fuse and '.' for split + if (!isdigit(name[last_pos]) && name[last_pos] != '@' && name[last_pos] != '.') { + rets->insert(name.substr(last_pos, i - last_pos)); + } + last_pos = i + 1; + } + } + + if (last_pos < name.size() && !isdigit(name[last_pos]) && + name[last_pos] != '@' && name[last_pos] != '.') { + rets->insert(name.substr(last_pos, name.size() - last_pos)); + } +} + // Estimate number of float operations in an expression class FlopEstimator: public ExprFunctor { public: @@ -788,7 +804,7 @@ void ComputeDAG::RewriteLayout( CHECK_EQ(placeholder_axis_names.size(), placeholder->shape.size()); std::string ori_layout = os.str(); os.str(""); - ::tvm::relay::KernelLayoutVisitor::global_ori_layouts_queue.push_back(ori_layout); + // ::tvm::relay::KernelLayoutVisitor::global_ori_layouts_queue.push_back(ori_layout); } } @@ -851,7 +867,7 @@ void ComputeDAG::RewriteLayout( } std::string new_layout = os.str(); os.str(""); - ::tvm::relay::KernelLayoutVisitor::global_new_layouts_queue.push_back(new_layout); + // ::tvm::relay::KernelLayoutVisitor::global_new_layouts_queue.push_back(new_layout); placeholder_new_names[placeholder_op] = new_names; placeholder_new_shapes[placeholder_op] = new_shape; diff --git a/src/ansor/cost_model/cost_model.cc b/src/ansor/cost_model/cost_model.cc deleted file mode 100644 index ee7bf8b260532..0000000000000 --- a/src/ansor/cost_model/cost_model.cc +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file ansor/cost_model.h - * \brief Cost model that estimates the performance of programs - */ - -#include "cost_model.h" - -#include -#include - -#include - -namespace tvm { -namespace ansor { - -using ::tvm::runtime::NDArray; - -TVM_REGISTER_OBJECT_TYPE(CostModelNode); -TVM_REGISTER_OBJECT_TYPE(RandomModelNode); -TVM_REGISTER_OBJECT_TYPE(MeasureModelNode); -TVM_REGISTER_OBJECT_TYPE(PythonBasedModelNode); - -void RandomNumber(TVMArgs args, TVMRetValue* rv) { - int n = args[0]; - void* data = args[1]; - float* fdata = reinterpret_cast(data); - for (int i = 0; i < n; i++) { - fdata[i] = static_cast(rand_r(nullptr)) / (static_cast(RAND_MAX)); - } -} - -RandomModel::RandomModel() { - ObjectPtr node = make_object(); - node->random_number_func = - runtime::Registry::Get("ansor.cost_model.random_number"); - if (node->random_number_func == nullptr) { - LOG(WARNING) << "ansor.cost_model.random_number is not registered, " - << "use C++ default random_number func instead."; - static PackedFunc cost_model_random_number(RandomNumber); - node->random_number_func = &cost_model_random_number; - } - data_ = std::move(node); -} - -void RandomModelNode::Update(const Array& inputs, - const Array& results) {} - -void RandomModelNode::Predict(const SearchTask& task, - const std::vector& states, - std::vector* scores) { - scores->resize(states.size()); - (*random_number_func)(states.size(), static_cast(scores->data())); -} - -MeasureModel::MeasureModel(Builder builder, Runner runner) { - ObjectPtr node = make_object(); - node->measurer = ProgramMeasurer(std::move(builder), std::move(runner), - Array(), 0); - data_ = std::move(node); -} - -void MeasureModelNode::Update(const Array& inputs, - const Array& results) {} - -void MeasureModelNode::Predict(const SearchTask& task, - const std::vector& states, - std::vector* scores) { - std::vector inputs; - std::vector results; - - inputs.clear(); - inputs.reserve(states.size()); - for (const auto& state : states) { - inputs.push_back(MeasureInput(task, state)); - } - measurer->SilentMeasure(task, inputs, &results); - - scores->clear(); - scores->reserve(results.size()); - for (const auto& res : results) { - scores->push_back(1.0 / FloatArrayMean(res->costs)); - } -} - -PythonBasedModel::PythonBasedModel(PackedFunc update_func, - PackedFunc predict_func, - PackedFunc predict_stage_func) { - auto node = make_object(); - node->update_func = std::move(update_func); - node->predict_func = std::move(predict_func); - node->predict_stage_func = std::move(predict_stage_func); - data_ = std::move(node); -} - -void PythonBasedModelNode::Update(const Array& inputs, - const Array& results) { - update_func(inputs, results); -} - -void PythonBasedModelNode::Predict(const SearchTask& task, - const std::vector& states, - std::vector* scores) { - scores->resize(states.size()); - predict_func(task, Array(states.begin(), states.end()), - static_cast(scores->data())); -} - -void PythonBasedModelNode::PredictStages(const SearchTask& task, - const std::vector& states, std::vector* state_scores, - std::vector>* stage_scores) { - int n_states = states.size(); - int n_stages = task->compute_dag.GetInitState()->stages.size(); - std::vector flatten_scores; - // Allocate sufficient spaces. - flatten_scores.resize(n_states * n_stages * 2); - predict_stage_func(task, Array(states.begin(), states.end()), - static_cast(flatten_scores.data())); - - // Unpack flatten scores. - state_scores->clear(); - stage_scores->clear(); - - // Score of each states. - for (int i = 0; i < n_states; ++i) { - state_scores->push_back(flatten_scores[i]); - } - - // Score of each stage in each states. - size_t idx = n_states; - for (int i = 0; i < n_states; ++i) { - CHECK_LE(idx, flatten_scores.size()); - - // Number of scored stages of this state. - int s_length = static_cast(flatten_scores[idx++]); - - if (s_length > 0) { - std::vector scores; - int offset = 0; - - if ((*state_scores)[i] > -INFINITY) { - // If the score is valid. Copy scored stages and assign 0 to placeholder - // and inlined stages. If the score is 0, meaning this state failed to - // be lowered. Just bypass to update offset. - for (const Stage& stage : states[i]->stages) { - if (stage->op_type == kPlaceholder) { - scores.push_back(0); - continue; - } - if (stage->compute_at == kInlined) { - scores.push_back(0); - continue; - } - scores.push_back(flatten_scores[idx + offset]); - offset++; - } - CHECK_EQ(offset, s_length); - stage_scores->push_back(std::move(scores)); - } - idx += s_length; - } else { - // Cost model does not provide any stage score details. - stage_scores->push_back({}); - } - } -} - -TVM_REGISTER_GLOBAL("ansor.RandomModel").set_body_typed([]() { - return RandomModel(); -}); - -TVM_REGISTER_GLOBAL("ansor.PythonBasedModel") -.set_body_typed([](PackedFunc update_func, PackedFunc predict_func, - PackedFunc predict_stage_func) { - return PythonBasedModel(update_func, predict_func, - predict_stage_func); -}); - -} // namespace ansor -} // namespace tvm diff --git a/src/ansor/cost_model/cost_model.h b/src/ansor/cost_model/cost_model.h deleted file mode 100644 index f38624a3572c1..0000000000000 --- a/src/ansor/cost_model/cost_model.h +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file ansor/cost_model.h - * \brief Cost model that estimates the performance of programs -*/ - -#ifndef TVM_ANSOR_COST_MODEL_COST_MODEL_H_ -#define TVM_ANSOR_COST_MODEL_COST_MODEL_H_ - -#include -#include -#include -#include -#include "../measure.h" - -namespace tvm { -namespace ansor { - -using runtime::PackedFunc; - -/*! \brief The base class for cost model */ -class CostModelNode: public Object { - public: - // Update the cost model according to new measurement pairs - virtual void Update(const Array& inputs, - const Array& results) = 0; - - // Predict the scores of states - virtual void Predict(const SearchTask& task, const std::vector& states, - std::vector* scores) = 0; - - // Predict the scores of all stages in states - virtual void PredictStages(const SearchTask& task, - const std::vector& states, - std::vector* state_scores, - std::vector>* stage_scores) { - LOG(FATAL) << "Not Implemented"; - } - - static constexpr const char *_type_key = "ansor.CostModel"; - TVM_DECLARE_BASE_OBJECT_INFO(CostModelNode, Object); -}; -TVM_DEFINE_MUTABLE_OBJECT_REF(CostModel, CostModelNode); - -/*! \brief The cost model returns random value for all predictions */ -class RandomModelNode: public CostModelNode { - public: - const PackedFunc* random_number_func; - - void Update(const Array& inputs, - const Array& results) final; - void Predict(const SearchTask& task, const std::vector& states, - std::vector* scores) final; - - static constexpr const char *_type_key = "ansor.RandomModel"; - TVM_DECLARE_FINAL_OBJECT_INFO(RandomModelNode, CostModelNode); -}; - -/*! - * \brief Managed reference to RandomModelNode. - * \sa RandomModelNode - */ -class RandomModel : public CostModel { - public: - RandomModel(); - explicit RandomModel(::tvm::runtime::ObjectPtr<::tvm::runtime::Object> n) - : CostModel(n) {} - - RandomModelNode* operator->() const { - return static_cast(data_.get()); - } - - TVM_DEFINE_DEFAULT_COPY_MOVE_AND_ASSIGN(RandomModel); - using ContainerType = RandomModelNode; -}; - -/*! \brief The cost model returns actual cost by measurement */ -class MeasureModelNode : public CostModelNode { - public: - ProgramMeasurer measurer; - - void Update(const Array& inputs, - const Array& results) final; - void Predict(const SearchTask& task, const std::vector& states, - std::vector* scores) final; - - static constexpr const char* _type_key = "ansor.MeasureModel"; - TVM_DECLARE_FINAL_OBJECT_INFO(MeasureModelNode, CostModelNode); -}; - -/*! - * \brief Managed reference to MeasureModelNode. - * \sa MeasureModelNode - */ -class MeasureModel : public CostModel { - public: - MeasureModel(Builder builder, Runner runner); - - TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(MeasureModel, CostModel, - MeasureModelNode); -}; - -/*! \brief A wrapper for cost model defined by python code - * This class will call python's function */ -class PythonBasedModelNode: public CostModelNode { - public: - PackedFunc update_func; - PackedFunc predict_func; - PackedFunc predict_stage_func; - - void Update(const Array& inputs, - const Array& results) final; - void Predict(const SearchTask& task, const std::vector& states, - std::vector* scores) final; - void PredictStages(const SearchTask& task, const std::vector& states, - std::vector* state_scores, - std::vector>* stage_scores) final; - - static constexpr const char *_type_key = "ansor.PythonBasedModel"; - TVM_DECLARE_FINAL_OBJECT_INFO(PythonBasedModelNode, CostModelNode); -}; - -/*! - * \brief Managed reference to PythonBasedModelNode. - * \sa PythonBasedModelNode - */ -class PythonBasedModel : public CostModel { - public: - PythonBasedModel(PackedFunc update_func, PackedFunc predict_func, - PackedFunc predict_stage_func); - - TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PythonBasedModel, CostModel, - PythonBasedModelNode); -}; - -} // namespace ansor -} // namespace tvm - -#endif // TVM_ANSOR_COST_MODEL_COST_MODEL_H_ diff --git a/src/ansor/search_policy/search_policy.cc b/src/ansor/search_policy/search_policy.cc deleted file mode 100644 index 51a48780813a2..0000000000000 --- a/src/ansor/search_policy/search_policy.cc +++ /dev/null @@ -1,128 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file ansor/search_policy/search_policy.cc - * \brief The base class for search policy - */ - -#include "search_policy.h" -#include -#include "../serialization.h" - -namespace tvm { -namespace ansor { - -TVM_REGISTER_OBJECT_TYPE(SearchPolicyNode); -TVM_REGISTER_OBJECT_TYPE(PreloadMeasuredStatesNode); - -void SearchPolicyNode::PreloadMeasuredStates(const std::string& log_file) { - LogReader reader = LogReader(log_file); - const auto& res = reader->ReadLines(-1); - size_t log_size = res.first.size(); - CHECK_EQ(log_size, res.second.size()); - if (log_size) { - std::vector measured_states; - std::vector measured_throughputs; - for (size_t i = 0; i < log_size; i++) { - const auto& inp = res.first[i]; - if (inp->task->workload_key == cur_task->workload_key && - inp->task->target->target_name.compare( - cur_task->target->target_name) == 0) { - State state = cur_task->compute_dag.GetInitState(); - state.CopyOnWrite()->transform_steps = inp->state->transform_steps; - state.DoSteps(inp->state->transform_steps, cur_task->compute_dag); - measured_states.emplace_back(std::move(state)); - measured_throughputs.push_back(res.second[i]->error_no == 0 ? - (1.0 / FloatArrayMean(res.second[i]->costs)) : 0.0); - } - } - cur_task->compute_dag.InferBound(&measured_states); - for (size_t i = 0; i < measured_states.size(); i ++) { - auto& state = measured_states[i]; - const auto& state_str = state.ToStr(); - if (!measured_states_set_.count(state_str)) { - measured_states_set_.insert(state_str); - if (measured_throughputs[i] != 0.0) { - measured_states_vector_.emplace_back(std::move(state)); - measured_states_throughputs_.emplace_back(measured_throughputs[i]); - } - } - } - - StdCout(verbose) << "Successfully load " << measured_states_set_.size() - << " measurement records from " << log_file - << " for " << cur_task->workload_key << std::endl; - } else { - StdCout(verbose) << "No measurement records found in " - << log_file << " for " << cur_task->workload_key << std::endl; - } -} - -void SearchPolicyNode::RunCallbacks(const Array& callbacks) { - if (callbacks.defined() && callbacks.size()) { - PrintTitle("Call search callbacks", verbose); - for (const auto& callback : callbacks) { - callback->callback(this); - } - } -} - -PreloadMeasuredStates::PreloadMeasuredStates(std::string filename) { - auto node = make_object(); - node->filename = std::move(filename); - data_ = std::move(node); -} - -void PreloadMeasuredStatesNode::callback(SearchPolicyNode* policy) { - policy->PreloadMeasuredStates(filename); -} - -// Search Policy -TVM_REGISTER_GLOBAL("ansor.SearchPolicyContinueSearchOneRound") -.set_body_typed([](SearchPolicy policy, SearchTask task, int num_measure, - int verbose, ProgramMeasurer measurer) { - Array inputs; - Array results; - std::tie(inputs, results) = policy->ContinueSearchOneRound(task, num_measure, verbose, measurer); - return Array{inputs, results}; -}); - -TVM_REGISTER_GLOBAL("ansor.SearchPolicyRunCallbacks") -.set_body_typed([](SearchPolicy policy, Array callbacks) { - policy->RunCallbacks(callbacks); -}); - -TVM_REGISTER_GLOBAL("ansor.SearchPolicySetTask") -.set_body_typed([](SearchPolicy policy, SearchTask task) { - policy->cur_task = task; -}); - -TVM_REGISTER_GLOBAL("ansor.SearchPolicySetVerbose") -.set_body_typed([](SearchPolicy policy, int verbose) { - policy->verbose = verbose; -}); - -TVM_REGISTER_GLOBAL("ansor.PreloadMeasuredStates") -.set_body_typed([](std::string filename) { - return PreloadMeasuredStates(filename); -}); - -} // namespace ansor -} // namespace tvm diff --git a/src/ansor/search_policy/search_policy.h b/src/ansor/search_policy/search_policy.h deleted file mode 100644 index 03e7c3f025dfc..0000000000000 --- a/src/ansor/search_policy/search_policy.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file ansor/search_policy/search_policy.h - * \brief The base class for search policy - */ - -#ifndef TVM_ANSOR_SEARCH_POLICY_SEARCH_POLICY_H_ -#define TVM_ANSOR_SEARCH_POLICY_SEARCH_POLICY_H_ - -#include "../search_task.h" -#include -#include -#include -#include -#include -#include "../measure.h" - -namespace tvm { -namespace ansor { - -class SearchPolicyNode; - -/*! \brief Callback function to be called before or after the search process */ -class SearchCallbackNode : public Object { - public: - virtual void callback(SearchPolicyNode* policy) = 0; - - static constexpr const char *_type_key = "ansor.SearchCallback"; - TVM_DECLARE_BASE_OBJECT_INFO(SearchCallbackNode, Object); -}; -TVM_DEFINE_MUTABLE_OBJECT_REF(SearchCallback, SearchCallbackNode); - -/*! \brief Preload measured states from a log file. - * This can resume the state of the search policy */ -class PreloadMeasuredStatesNode : public SearchCallbackNode { - public: - std::string filename; - - void callback(SearchPolicyNode* policy) final; - - static constexpr const char *_type_key = "ansor.PreloadMeasuredStates"; - TVM_DECLARE_FINAL_OBJECT_INFO(PreloadMeasuredStatesNode, SearchCallbackNode); -}; - -/*! - * \brief Managed reference to PreloadMeasuredStatesNode. - * \sa PreloadMeasuredStatesNode - */ -class PreloadMeasuredStates : public SearchCallback { - public: - explicit PreloadMeasuredStates(std::string filename); - - TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PreloadMeasuredStates, SearchCallback, - PreloadMeasuredStatesNode); -}; - -/*! \brief The base class for search policy */ -class SearchPolicyNode : public Object { - public: - SearchTask cur_task; // The current task - int verbose; // Verbose level (0 means silent) - - void VisitAttrs(AttrVisitor* v) { - v->Visit("cur_task", &cur_task); - v->Visit("verbose", &verbose); - } - - // Search for a task - virtual State Search(SearchTask task, int n_trials, - int early_stopping, int num_measure_per_iter, - int verbose, ProgramMeasurer measurer, - Array pre_search_callbacks) = 0; - - // Continue search one round for a task. - // This is used in the task scheduler for searching for multiple tasks together. - virtual std::pair, Array > ContinueSearchOneRound( - SearchTask task, int num_measure, int verbose, ProgramMeasurer measurer) = 0; - - // Preload measured states from a log file to resume the state of the search policy - void PreloadMeasuredStates(const std::string& log_file); - - // Run a list of callback functions - void RunCallbacks(const Array& callbacks); - - // Dict keys to give hints to the policy - static constexpr const char* always_unroll_inner_key = "ansor_always_unroll_inner"; - static constexpr const char* always_unroll_key = "ansor_always_unroll"; - static constexpr const char* no_split_at_inner_key = "ansor_no_split_at_inner"; - static constexpr const char* no_split_at_outer_key = "ansor_no_split_at_outer"; - static constexpr const char* last_split_is_one_key = "ansor_last_split_is_one"; - // Flag keys to give hints to the policy - static constexpr const char* always_compute_inline_key = "ansor_always_compute_inline"; - static constexpr const char* no_cache_write_key = "ansor_no_cache_write"; - static constexpr const char* no_cache_read_key = "ansor_no_cache_read"; - - static constexpr const char *_type_key = "ansor.SearchPolicy"; - TVM_DECLARE_BASE_OBJECT_INFO(SearchPolicyNode, Object); - - protected: - // The set of the already measured states. - // We store the string format for redundancy check - std::unordered_set measured_states_set_; - // The array of already measured states. - std::vector measured_states_vector_; - // The throughputs of already measured states - std::vector measured_states_throughputs_; -}; -TVM_DEFINE_MUTABLE_OBJECT_REF(SearchPolicy, SearchPolicyNode); - -} // namespace ansor -} // namespace tvm - -#endif // TVM_ANSOR_SEARCH_POLICY_SEARCH_POLICY_H_ diff --git a/src/ansor/search_policy/sketch_search_policy.cc b/src/ansor/search_policy/sketch_search_policy.cc deleted file mode 100644 index 5b2c10c08c815..0000000000000 --- a/src/ansor/search_policy/sketch_search_policy.cc +++ /dev/null @@ -1,1538 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file ansor/search_policy/sketch_search_policy.h - * \brief The search policy that searches in a hierarchical search space defined by sketches. - * The policy randomly samples programs from the space defined by sketches - * and use evolutionary search to fine-tune them. - */ - -#include "sketch_search_policy.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "utils.h" - -#define IS_GPU(task) ((task)->target->device_type == kDLGPU || \ - (task)->target->device_type == kDLOpenCL) - -namespace tvm { -namespace ansor { - -TVM_REGISTER_NODE_TYPE(SketchSearchPolicyNode); -TVM_REGISTER_OBJECT_TYPE(PreloadCustomSketchRuleNode); - -// All possible candidates for auto_unroll -const std::vector SketchSearchPolicyNode::auto_unroll_configs{0, 16, 64, 512, 1024}; - -SketchSearchPolicy::SketchSearchPolicy(CostModel program_cost_model, - Map params, - int seed) { - auto node = make_object(); - node->program_cost_model = std::move(program_cost_model); - node->rand_gen_ = std::mt19937(seed); - node->params = std::move(params); - data_ = std::move(node); -} - -State SketchSearchPolicyNode::Search(SearchTask task, int n_trials, - int early_stopping, int num_measure_per_iter, int verbose, - ProgramMeasurer measurer, Array pre_search_callbacks) { - std::vector best_states, random_states; - this->cur_task = task; - this->verbose = verbose; - num_measure_per_iter_ = num_measure_per_iter; - - RunCallbacks(pre_search_callbacks); - - if (n_trials <= 1) { // no measurement is allowed - SearchOneRound(&best_states, 0, &random_states); - CHECK_GT(best_states.size(), 0); - return best_states[0]; - } else { - std::vector inputs; - std::vector results; - int num_random = static_cast(GetDoubleParam(params, "eps_greedy") * num_measure_per_iter); - - measurer->Reset(); - - early_stopping = early_stopping < 0 ? std::numeric_limits::max() >> 1 : early_stopping; - - int ct = 0; - while (ct < n_trials) { - if (!inputs.empty()) { - // retrain cost models - PrintTitle("Train cost model", verbose); - program_cost_model->Update(inputs, results); - } - - // Search one round to get promising states - PrintTitle("Search", verbose); - SearchOneRound(&best_states, num_random, &random_states); - - // Fill correct bound.This is necessary for computing the correct ToStr() for reduncency check - cur_task->compute_dag.InferBound(&best_states); - cur_task->compute_dag.InferBound(&random_states); - - // Pick `num_measure_per_iter` states to measure, check hash to remove already measured state - // Also pick some random states to do eps-greedy - PickStatesWithEpsGreedy(&inputs, best_states, random_states, n_trials - ct); - - // Have traversed all of search space - if (inputs.empty()) { - StdCout(verbose) << "All candidates in the search space have been measured." << std::endl; - break; - } - - // Measure candidate states - PrintTitle("Measure", verbose); - measurer->Measure(cur_task, GetRef(this), inputs, &results); - ct += inputs.size(); - - if (ct - measurer->best_ct[cur_task->workload_key] > early_stopping) { - StdCout(verbose) << "Meet the early stopping condition." << std::endl; - break; - } - - // Update measured states. These states will join the LocalMutation in later rounds - for (const auto& res : results) { - measured_states_throughputs_.push_back(1.0 / FloatArrayMean(res->costs)); - } - } - PrintTitle("Done", verbose); - - return measurer->best_state[cur_task->workload_key]; - } -} - -std::pair, Array > - SketchSearchPolicyNode::ContinueSearchOneRound( - SearchTask task, int num_measure, int verbose, ProgramMeasurer measurer) { - if (cur_task.defined()) { - CHECK_EQ(cur_task, task); - } else { - cur_task = task; - } - this->verbose = verbose; - num_measure_per_iter_ = num_measure; - - std::vector best_states, random_states; - std::vector inputs; - std::vector results; - int num_random = static_cast(GetDoubleParam(params, "eps_greedy") * num_measure); - - // Search one round to get promising states - PrintTitle("Search", verbose); - SearchOneRound(&best_states, num_random * 2, &random_states); - - // Fill correct bound. This is necessary for computing the correct ToStr() for reduncency check - cur_task->compute_dag.InferBound(&best_states); - cur_task->compute_dag.InferBound(&random_states); - - // Pick `num_measure` states to measure, check hash to remove already measured state - // Also pick some random states to do eps-greedy - PickStatesWithEpsGreedy(&inputs, best_states, random_states, num_measure); - - // Measure candidate states - PrintTitle("Measure", verbose); - measurer->Measure(cur_task, GetRef(this), inputs, &results); - - // Update throughputs of measured states. These states will join the LocalMutation in later rounds - for (const auto& res : results) { - measured_states_throughputs_.push_back(1.0 / FloatArrayMean(res->costs)); - } - - // Update the cost model - Array inputs_arr(std::make_move_iterator(inputs.begin()), - std::make_move_iterator(inputs.end())); - Array results_arr(std::make_move_iterator(results.begin()), - std::make_move_iterator(results.end())); - - PrintTitle("Train cost model", verbose); - program_cost_model->Update(inputs_arr, results_arr); - return std::make_pair(std::move(inputs_arr), std::move(results_arr)); -} - -void SketchSearchPolicyNode::PickStatesWithEpsGreedy( - std::vector* inputs, - const std::vector& best_states, - const std::vector& random_states, - int remaining_n_trials) { - int num_random = static_cast(GetDoubleParam(params, "eps_greedy") * num_measure_per_iter_); - int num_good = num_measure_per_iter_ - num_random; - - inputs->clear(); - size_t offset_best = 0, offset_random = 0; - - while (static_cast(inputs->size()) < std::min(num_measure_per_iter_, remaining_n_trials)) { - const State* pstate; - - bool has_best = offset_best < best_states.size(); - bool has_random = offset_random < random_states.size(); - - if (static_cast(inputs->size()) < num_good) { - // prefer best states - if (has_best) { - pstate = &best_states[offset_best++]; - } else if (has_random) { - pstate = &random_states[offset_random++]; - } else { - break; - } - } else { - // prefer random states - if (has_random) { - pstate = &random_states[offset_random++]; - } else if (has_best) { - pstate = &best_states[offset_best++]; - } else { - break; - } - } - - // Check if it has already been measured - std::string state_str = pstate->ToStr(); - - if (measured_states_set_.count(state_str)) { continue; } - measured_states_set_.insert(state_str); - - inputs->push_back(MeasureInput(cur_task, *pstate)); - measured_states_vector_.push_back(std::move(*pstate)); - } -} - -void SketchSearchPolicyNode::SearchOneRound(std::vector* best_states, - int num_random_states, std::vector* random_states) { - best_states->clear(); - random_states->clear(); - - // Get parameters - int population = GetIntParam(params, "evolutionary_search_population"); - int num_use_measured = std::min(static_cast(measured_states_vector_.size()), - static_cast( - GetDoubleParam(params, "evolutionary_search_use_measured_ratio") * population)); - bool have_cost_model = !program_cost_model->IsInstance(); - - if (!have_cost_model) { - num_use_measured = 0; - } - - // Generate sketches - std::vector sketches; - GenerateSketch(&sketches); - - // PrintAllStates(sketches); - // exit(0); - - // Sample the init population - std::vector init_population; - SampleInitPopulation(sketches, population - num_use_measured, &init_population); - - // PrintAllStates(init_population); - // exit(0); - - if (have_cost_model) { - // Also insert already measured good states to the initial population - std::vector indices; - Argsort(measured_states_throughputs_, &indices); - for (int i = 0; i < num_use_measured; i++) { - init_population.push_back(measured_states_vector_[indices[i]]); - } - - // Perform evolutionary search - EvolutionarySearch(init_population, num_measure_per_iter_ * 2, best_states); - } else { - // If the cost model is useless (i.e. RandomCostModel), skip evolutionary search - RandomSampleStates(init_population, &rand_gen_, num_measure_per_iter_ * 3, best_states); - } - - // Sample some random states for eps-greedy - RandomSampleStates(init_population, &rand_gen_, num_random_states * 10, random_states); -} - -// The baseclass of derivation rules used in sketch generation -class SketchGenerationRule { - public: - enum ConditionEnum { - kPass, kApply, kApplyAndSkipRest - }; - - virtual ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) = 0; - virtual std::vector > Apply(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) = 0; -}; - -static inline bool ShouldBeCacheRead( - const SketchSearchPolicyNode* policy, const State& state, int stage_id) { - const SearchTask& task = policy->cur_task; - const Stage& stage = state->stages[stage_id]; - - if (HasAttrsFlag(state, stage_id, - SearchPolicyNode::no_cache_read_key)) { - return false; - } - - std::unordered_set consumers; - GetConsumers(task, state, stage->op, &consumers); - if (consumers.size() != 1) { - return false; - } - - int target_stage_id = OperationToStage(*consumers.begin(), state); - if (!NeedsMultilevelTiling(task, state, - state->stages[target_stage_id]->op)) { - return false; - } - - std::unordered_set producers; - GetProducers(task, state, state->stages[target_stage_id]->op, &producers); - // Only those directly mapped stages can do CacheRead - if (producers.find(stage->op) == producers.end()) { - return false; - } - - return true; -} - -static inline bool ShouldAlwaysBeInlined( - const SketchSearchPolicyNode* policy, const State& state, int stage_id) { - const SearchTask& task = policy->cur_task; - const Stage& stage = state->stages[stage_id]; - - if (stage->op->IsInstance()) { - return false; - } - - // Inline limitation of TVM - if (!IsOutputOp(task, state, stage->op) && !HasReduceIter(stage)) { - // Always inline condition: - // 1. Has attrs that this must be inlined - // 2. Analyse shows this is strict inlineable - // 3. A GPU stage can be inlined(If it should be cache read, do it first) - if (HasAttrsFlag(state, stage_id, - SearchPolicyNode::always_compute_inline_key) || - IsStrictInlineable(task, state, stage->op) || - (IS_GPU(policy->cur_task) && - !ShouldBeCacheRead(policy, state, stage_id))) { - return true; - } - } - - return false; -} - -// The rule that inlines simple elementwise ops -class RuleAlwaysInline : public SketchGenerationRule { - public: - ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - return ShouldAlwaysBeInlined(policy, state, stage_id) ? - kApplyAndSkipRest : kPass; - } - - std::vector > Apply(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - State tmp_s = state; - tmp_s.compute_inline(stage_id); - return {std::make_pair(std::move(tmp_s), stage_id - 1)}; - } -}; - -// The rule that simply skip the current stage -class RuleSkipStage : public SketchGenerationRule { - public: - ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - const SearchTask& task = policy->cur_task; - const Stage& stage = state->stages[stage_id]; - - const auto& attrs = stage->op->attrs; - if ((attrs.count(SearchPolicyNode::no_split_at_inner_key) || - attrs.count(SearchPolicyNode::no_split_at_outer_key)) && - NeedsMultilevelTiling(task, state, stage->op)) { - // for the transform stages in Winograd - return kPass; - } - - return kApply; - } - - std::vector > Apply(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - return {std::make_pair(state, stage_id - 1)}; - } -}; - -// The rule that performs multi-level tiling -class RuleMultiLevelTiling : public SketchGenerationRule { - public: - ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - const SearchTask& task = policy->cur_task; - const Stage& stage = state->stages[stage_id]; - - return NeedsMultilevelTiling(task, state, stage->op) ? - (IS_GPU(policy->cur_task) ? kApplyAndSkipRest : kApply) : kPass; - } - - std::vector > Apply(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - std::string multi_level_tiling_structure = IS_GPU(policy->cur_task) ? - GetStringParam(policy->params, "gpu_multi_level_tiling_structure") : - GetStringParam(policy->params, "cpu_multi_level_tiling_structure"); - - std::vector spatial_split_step_ids; - State tmp_s = state; - tmp_s = DoMultiLevelTiling(tmp_s, stage_id, multi_level_tiling_structure, - &spatial_split_step_ids); - return {std::make_pair(std::move(tmp_s), stage_id-1)}; - } -}; - -// The rule that performs multi-level tiling and fuses later consumers -class RuleMultiLevelTilingWithFusion : public SketchGenerationRule { - public: - ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - const SearchTask& task = policy->cur_task; - const Stage& stage = state->stages[stage_id]; - - int target_stage_id; - - if (IS_GPU(policy->cur_task)) { - return NeedsMultilevelTiling(task, state, stage->op) && - HasSingleElementwiseMatchedConsumer(task, state, stage, - &target_stage_id) && - (!HasCacheReadStage(state, stage_id) || - HasCacheWriteStage(state, stage_id)) ? - kApplyAndSkipRest : kPass; - } - - return NeedsMultilevelTiling(task, state, stage->op) && - HasSingleElementwiseMatchedConsumer(task, state, stage, - &target_stage_id) ? - kApply : kPass; - } - - std::vector > Apply(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - const SearchTask& task = policy->cur_task; - const Stage& stage = state->stages[stage_id]; - std::string multi_level_tiling_structure = IS_GPU(policy->cur_task) ? - GetStringParam(policy->params, "gpu_multi_level_tiling_structure") : - GetStringParam(policy->params, "cpu_multi_level_tiling_structure"); - - std::vector spatial_split_step_ids; - int target_stage_id; - std::unordered_set consumers; - - GetConsumers(task, state, state->stages[stage_id]->op, &consumers); - CHECK(HasSingleElementwiseMatchedConsumer(task, state, stage, &target_stage_id)); - - State base_state = state; - base_state = DoMultiLevelTiling(base_state, stage_id, - multi_level_tiling_structure, &spatial_split_step_ids); - std::vector follow_tiling_levels; - if (IS_GPU(policy->cur_task)) { - follow_tiling_levels.push_back(3); - } else { - follow_tiling_levels.push_back(1); - follow_tiling_levels.push_back(2); - } - - std::vector > ret; - for (int level : follow_tiling_levels) { - if (tolower(multi_level_tiling_structure[level-1]) != 's') { - continue; - } - State tmp_s = base_state; - tmp_s = FollowTiling(tmp_s, target_stage_id, spatial_split_step_ids, level); - const Iterator &target_iter = tmp_s->stages[target_stage_id]->iters[ - level * spatial_split_step_ids.size() - 1]; - tmp_s.compute_at(stage_id, target_stage_id, target_iter); - - ret.emplace_back(std::move(tmp_s), stage_id - 1); - } - - return ret; - } -}; - -// The rule that adds a cache write stage -class RuleAddCacheWrite : public SketchGenerationRule { - public: - ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - const SearchTask& task = policy->cur_task; - const Stage& stage = state->stages[stage_id]; - - int target_stage_id; - - // Add cache write if a stage needs multi-level tiling, - // but does not have a element-wise matched consumer - return NeedsMultilevelTiling(task, state, stage->op) && - !HasAttrsFlag(state, stage_id, SearchPolicyNode::no_cache_write_key) && - (!HasSingleElementwiseMatchedConsumer(task, state, stage, - &target_stage_id) || - (HasCacheReadStage(state, stage_id) && - !HasCacheWriteStage(state, stage_id))) ? - kApply : kPass; - } - - std::vector > Apply(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - const SearchTask& task = policy->cur_task; - - State tmp_s = state; - tmp_s.cache_write(stage_id, "local", task->compute_dag); - return {std::make_pair(std::move(tmp_s), stage_id)}; - } -}; - -// The rule that adds a cache read stage -// Mainly used for GPU cooperative fetching -// Currently only support 1 to 1 match cache read -class RuleAddCacheRead : public SketchGenerationRule { - public: - ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - return ShouldBeCacheRead(policy, state, stage_id) ? - kApplyAndSkipRest : kPass; - } - - std::vector > Apply(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - const SearchTask& task = policy->cur_task; - const Stage& stage = state->stages[stage_id]; - - std::unordered_set consumers; - GetConsumers(task, state, stage->op, &consumers); - CHECK_EQ(consumers.size(), 1); - int target_stage_id = OperationToStage(*consumers.begin(), state); - State tmp_s = state; - int added_stage_id = tmp_s.cache_read(stage_id, "shared", - {target_stage_id}, - task->compute_dag); - target_stage_id++; - const auto& share_read_pos = GetLastReduceIteratorInOutermostReduceTile( - tmp_s->stages[target_stage_id]); - tmp_s.compute_at(added_stage_id, target_stage_id, share_read_pos); - - return {std::make_pair(std::move(tmp_s), stage_id)}; - } -}; - -// The rule that adds rfactor stage -class RuleAddRfactor : public SketchGenerationRule { - public: - ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - const SearchTask& task = policy->cur_task; - const Stage& stage = state->stages[stage_id]; - - return NeedsRfactor(task, state, stage->op) && - !HasCacheWriteStage(state, stage_id) ? - kApply : kPass; - } - - std::vector > Apply(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - const SearchTask& task = policy->cur_task; - const Stage& stage = state->stages[stage_id]; - - std::vector > ret; - - State tmp_s = state; - - // fuse reduce iters - std::vector space_iters, reduce_iters; - for (const auto &iter : stage->iters) { - if (iter->iter_type == kSpace) { - space_iters.push_back(iter); - } else if (iter->iter_type == kReduce) { - reduce_iters.push_back(iter); - } - } - CHECK(!reduce_iters.empty()); - Iterator fused_reduce_iter; - if (reduce_iters.size() > 1) { - fused_reduce_iter = tmp_s.fuse(stage_id, reduce_iters); - } else { - fused_reduce_iter = reduce_iters[0]; - } - - // split reduce iters - const auto &split_res = tmp_s.split(stage_id, fused_reduce_iter, {1}); - int factor_axis_id = static_cast(space_iters.size()); - State base_state = tmp_s; - for (const auto &split_iter : split_res) { - tmp_s = base_state; - tmp_s.rfactor(stage_id, split_iter, factor_axis_id, task->compute_dag); - - // reorder the space iterator to innermost for vectorization - if (split_iter == split_res[1]) { - std::vector new_order; - for (size_t i = 0; i < tmp_s->stages[stage_id]->iters.size(); ++i) { - if (i != space_iters.size()) { - new_order.push_back(tmp_s->stages[stage_id]->iters[i]); - } - } - new_order.push_back(tmp_s->stages[stage_id]->iters[space_iters.size()]); - tmp_s.reorder(stage_id, new_order); - } - ret.emplace_back(std::move(tmp_s), stage_id - 1); - } - - return ret; - } -}; - -void SketchSearchPolicyNode::GenerateSketch( - std::vector* out_states) { - State init_state = cur_task->compute_dag.GetInitState(); - std::string cpu_multi_level_tiling_structure = - GetStringParam(params, "cpu_multi_level_tiling_structure"); - - // two ping pong buffers to avoid copy - std::vector states_buf1, states_buf2; - std::vector *pnow, *pnext; - pnow = &states_buf1; - pnext = &states_buf2; - pnow->push_back(init_state); - - // A map that maps state to its current working position (stage_id) - std::unordered_map cur_stage_id_map; - cur_stage_id_map[init_state] = static_cast(init_state->stages.size() - 1); - - static RuleSkipStage rule_skip_stage; - static RuleAlwaysInline rule_always_inline; - static RuleMultiLevelTiling rule_multi_level_tiling; - static RuleMultiLevelTilingWithFusion rule_multi_level_tiling_with_fusion; - static RuleAddCacheWrite rule_add_cache_write_stage; - static RuleAddCacheRead rule_add_cache_read_stage; - static RuleAddRfactor rule_add_rfactor; - if (sketch_rules.empty()) { - // We may apply and skip the rest when processing some rules, - // should take care of the rule vector order here - sketch_rules.push_back(&rule_always_inline); - sketch_rules.push_back(&rule_add_cache_write_stage); - sketch_rules.push_back(&rule_multi_level_tiling_with_fusion); - sketch_rules.push_back(&rule_multi_level_tiling); - sketch_rules.push_back(&rule_add_rfactor); - sketch_rules.push_back(&rule_skip_stage); - if (IS_GPU(cur_task)) { - // Try cache read first before cache write - sketch_rules.insert(sketch_rules.begin() + 1, &rule_add_cache_read_stage); - } - // TODO(xian): Add a new rule to try combination of multi-level - // tiling + rfactor - } - - // Derivation rule based synthesizer - while (!pnow->empty()) { - pnext->clear(); - - for (const State& state : *pnow) { - int stage_id = cur_stage_id_map[state]; - - // Reaches to the terminal stage - if (stage_id < 0) { - out_states->push_back(state); - continue; - } - - // Try all derivation rules - for (const auto& rule : sketch_rules) { - auto rule_check = rule->MeetCondition(this, state, stage_id); - if (rule_check > SketchGenerationRule::ConditionEnum::kPass) { - for (const auto& pair : rule->Apply(this, state, stage_id)) { - cur_stage_id_map[pair.first] = pair.second; - pnext->push_back(pair.first); - } - // Skip the reset rules - if (rule_check == SketchGenerationRule::ConditionEnum::kApplyAndSkipRest) { - break; - } - } - } - } - - std::swap(pnow, pnext); - } - - // Hack for rfactor: Replace the split factor for rfactor to the undefined Expr(), - // so later we can sample random value for the split factor. - // Why don't we use Expr() when doing the split for rfactor at the first time? - // Because during ApplySteps, a rfactor with undefined Expr() will crash TVM. - // So rfactor with undefined Expr() will conflict with cache_write, cache_read, rfactor - // in other stages - for (size_t i = 0; i < out_states->size(); ++i) { - auto pstate = (*out_states)[i].CopyOnWrite(); - for (size_t step_id = 0; step_id < pstate->transform_steps.size(); ++step_id) { - if (pstate->transform_steps[step_id]->IsInstance()) { - CHECK_GE(step_id, 1); - int split_step_id = step_id - 1; - auto step = pstate->transform_steps[split_step_id].as(); - CHECK(step != nullptr); - pstate->transform_steps[split_step_id] - = SplitStep(step->stage_id, step->iter_id, step->extent, {PrimExpr()}, - step->inner_to_outer); - } - } - } - - StdCout(verbose) << "Generate Sketches\t\t#s: " << out_states->size() << std::endl; -} - -int InitPopulationFillTileSize(const SketchSearchPolicyNode* policy, - State* state, std::mt19937* rand_gen, - SplitFactorizationMemo* split_memo) { - for (size_t step_id = 0; step_id < (*state)->transform_steps.size(); ++step_id) { - if (auto ps = (*state)->transform_steps[step_id].as()) { - bool defined = true; - for (const PrimExpr& len : ps->lengths) { - if (!len.defined()) { - defined = false; - } - } - - if (defined) { - continue; - } - - int extent = GetIntImm(ps->extent); - const std::vector >& candidate_lens = - split_memo->GetFactorizationSchemes( - extent, ps->lengths.size(), - policy->cur_task->hardware_params->max_innermost_split_factor); - - StateNode* pstate = state->CopyOnWrite(); - pstate->transform_steps[step_id] = SplitStep( - ps->stage_id, ps->iter_id, ps->extent, - candidate_lens[(*rand_gen)() % candidate_lens.size()], - ps->inner_to_outer); - } - } - - return 0; -} - -int InitPopulationThreadBind(const SketchSearchPolicyNode* policy, - State* state) { - for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) { - const Stage& stage = (*state)->stages[stage_id]; - auto pop = stage->op.as(); - - if (stage->compute_at != kRoot || stage->op_type == kPlaceholder) { - continue; - } - - if (HasAnnotationIter(stage, IteratorAnnotation::kThreadX)) { - // Skip if this stage has already done thread bind - continue; - } - - std::vector to_fuse; - - // This stage has not been tiled, but in GPU schedule, we must tile it - // to do thread binding - if (!HasSplitStep(*state, stage_id)) { - for (const auto& it : (*state)->stages[stage_id]->iters) { - if (it->iter_type == kReduce) { - break; - } - to_fuse.push_back(it); - } - const auto& fused_it = state->fuse(stage_id, to_fuse); - // Set default vthread=1 & threadIdx.x=default_warp_size - // EvolutionarySearch will try more possiblity - if (GetExtent(fused_it) <= - policy->cur_task->hardware_params->warp_size) { - state->bind_thread(stage_id, fused_it, kThreadX); - } else { - const auto& split_its = state->split(stage_id, fused_it, - {1, policy->cur_task->hardware_params->warp_size}); - state->bind_thread(stage_id, split_its[0], kBlockX); - state->bind_thread(stage_id, split_its[1], kVThread); - state->bind_thread(stage_id, split_its[2], kThreadX); - } - - continue; - } - - int total_space_extent = 1; - for (const auto& i : pop->root_iter_vars()) { - CHECK(i->dom.defined()); - const auto& pint = i->dom->extent.as(); - CHECK(pint); - total_space_extent *= pint->value; - } - - // TODO(..): Add ThreadBind support for rfactor - if (total_space_extent <= policy->cur_task->hardware_params->warp_size) { - for (const auto& it : (*state)->stages[stage_id]->iters) { - if (it->iter_type == kReduce) { - break; - } - to_fuse.push_back(it); - } - const auto& fused_it = state->fuse(stage_id, to_fuse); - state->bind_thread(stage_id, fused_it, kThreadX); - - continue; - } - - // Fuse the outermost space tile as blockIdx - for (size_t i = 0; i < pop->axis.size(); i++) { - const auto& it = (*state)->stages[stage_id]->iters[i]; - if (!StrEndsWith(it->name, ".0")) { - break; - } - to_fuse.push_back(it); - } - const auto& blockidx_it = state->fuse(stage_id, to_fuse); - state->bind_thread(stage_id, blockidx_it, kBlockX); - - // Fuse the second outermost space tile as vthread - to_fuse.clear(); - for (size_t i = 1; i < pop->axis.size() + 1; i++) { - const auto& it = (*state)->stages[stage_id]->iters[i]; - if (!StrEndsWith(it->name, ".1")) { - break; - } - to_fuse.push_back((*state)->stages[stage_id]->iters[i]); - } - const auto& vthread_it = state->fuse(stage_id, to_fuse); - if (GetExtent(vthread_it) > - policy->cur_task->hardware_params->max_vthread_extent) { - return -1; - } - state->bind_thread(stage_id, vthread_it, kVThread); - - // Fuse the third outermost space tile as threadIdx - to_fuse.clear(); - for (size_t i = 2; i < pop->axis.size() + 2; i++) { - const auto& it = (*state)->stages[stage_id]->iters[i]; - if (!StrEndsWith(it->name, ".2")) { - break; - } - to_fuse.push_back((*state)->stages[stage_id]->iters[i]); - } - const auto& threadidx_it = state->fuse(stage_id, to_fuse); - if (GetExtent(threadidx_it) < - policy->cur_task->hardware_params->warp_size) { - return -1; - } - state->bind_thread(stage_id, threadidx_it, kThreadX); - } - - return 0; -} - -int InitPopulationCooperativeFetching(const SketchSearchPolicyNode* policy, - State* state) { - for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) { - // Do cooperative fetching with cache read stage - // For two stages: A -> B - // 1. A -> A_cache_read -> B - // * - // 2. A -> A_cache_write -> A_cache_read -> B - // * - if ((stage_id > 0 && HasCacheReadStage((*state), stage_id - 1) && - !HasCacheWriteStage((*state), stage_id - 1)) || - (stage_id > 1 && HasCacheReadStage((*state), stage_id - 2) && - HasCacheWriteStage((*state), stage_id - 2))) { - const Stage& target_stage = (*state)->stages[stage_id]; - if (HasAnnotationIter(target_stage, IteratorAnnotation::kThreadX) || - HasAnnotationIter(target_stage, IteratorAnnotation::kTensorized)) { - // Skip if this stage has already done thread bind or has been - // tensorized - continue; - } - // Get spatial_split_step_ids from the root stage - std::unordered_set consumers; - std::vector spatial_split_step_ids; - GetConsumers(policy->cur_task, (*state), target_stage->op, &consumers); - CHECK_EQ(consumers.size(), 1); - int target_stage_id = OperationToStage(*consumers.begin(), (*state)); - GetSpaceSplitStepIds((*state), target_stage_id, &spatial_split_step_ids); - - // Fuse all axis to to do cooperative fetching - Iterator fused = state->fuse(stage_id, - (*state)->stages[stage_id]->iters); - // Left a vectorized cooperative fetching split placeholder - const auto& iters0 = state->split(stage_id, fused, {1}); - state->vectorize(stage_id, iters0[1]); - // Follow split to keep a same thread extent with the root stage - const auto& iters1 = state->follow_fused_split(stage_id, iters0[0], - spatial_split_step_ids, - 1, true); - state->bind_thread(stage_id, iters1[1], kThreadX); - } - } - - return 0; -} - -int InitPopulationChangeComputeLocation(const SketchSearchPolicyNode* policy, - State* state, std::mt19937* rand_gen) { - if(GetIntParam(policy->params, "disable_change_compute_location")) { - return 0; - } - - for (int stage_id = static_cast((*state)->stages.size()) - 1; stage_id >= 0; stage_id--) { - const Stage& stage = (*state)->stages[stage_id]; - - if (stage->op_type == kPlaceholder) { - continue; - } - - if (IsTiled(stage) || stage->compute_at == kInlined) { - continue; - } - - if (NeedsMultilevelTiling(policy->cur_task, (*state), stage->op)) { - continue; - } - - std::unordered_set consumers; - - GetConsumers(policy->cur_task, (*state), stage->op, &consumers); - if (consumers.empty()) { - continue; - } - - int target_stage_id; - if (consumers.size() == 1) { - target_stage_id = OperationToStage(*consumers.begin(), *state); - } else { - // check all consumers share a common root - int common_root_id = -1; - bool mismatch = false; - for (const auto& consumer : consumers) { - int consumer_stage_id = OperationToStage(consumer, *state); - int root_id = -1; - if ((*state)->stages[consumer_stage_id]->compute_at == kRoot) { - root_id = consumer_stage_id; - } else if ((*state)->stages[consumer_stage_id]->compute_at == kIter) { - root_id = (*state)->attach_map->stage_to_attach_iter.at(consumer_stage_id).first; - } else { - LOG(FATAL) << "Invalid case"; - } - - if (common_root_id == -1) { - common_root_id = root_id; - } else { - if (common_root_id != root_id) { - mismatch = true; - break; - } - } - } - - if (mismatch) { - continue; - } - target_stage_id = common_root_id; - } - - const Stage& target_stage = (*state)->stages[target_stage_id]; - std::set to_unroll_name_set; - if (target_stage->op->attrs.count(policy->always_unroll_key)) { - to_unroll_name_set = GetIterNameSetParam(target_stage->op->attrs, - policy->always_unroll_key); - } - - std::vector > candidates; - bool target_compute_at_other = target_stage->compute_at == kIter; - bool target_is_tiled = IsTiled(target_stage); - - bool visited_reduce = false; - // enumerate compute_at location at target_stage - int ct = 0; - for (const auto& target_iter : target_stage->iters) { - if (target_iter->iter_type == kReduce) { - visited_reduce = true; - if (!target_is_tiled) { // do not go into reduce iter - break; - } - } else if (target_iter->iter_type == kSpace) { - if (visited_reduce) { // do not go into inner tile - break; - } - } - - if (to_unroll_name_set.count(target_iter->name)) { - // Do not go into always unroll region - break; - } - - if (GetExtent(target_iter) == 1) { // skip iterators with length of 1 - continue; - } - if (target_compute_at_other && target_iter->iter_type == kSpace && - StrEndsWith(target_iter->name, ".0")) { - // skip the first level iterators if target stage compute_at another stage - // In this case, the lengths of first level iterators are always one - continue; - } - candidates.emplace_back(target_stage_id, target_iter); - - if ((*state)->attach_map->iter_to_attached_stages.count( - std::make_pair(target_stage_id, ct++))) { - break; - } - } - - // if the target_stage is already compute_at another stage X, try also compute_at X - // We call stage X as `target_target_stage` - if (target_compute_at_other) { - int target_target_stage_id; - target_target_stage_id = (*state)->attach_map->stage_to_attach_iter.at( - target_stage_id).first; - const Stage& target_target_stage = (*state)->stages[target_target_stage_id]; - if (target_target_stage->op->attrs.count(policy->always_unroll_key)) { - to_unroll_name_set = GetIterNameSetParam(target_target_stage->op->attrs, - policy->always_unroll_key); - } else { - to_unroll_name_set.clear(); - } - - int ct = 0; - for (const auto& target_target_iter : target_target_stage->iters) { - if (target_target_iter->iter_type == kReduce || - (*state)->attach_map->iter_to_attached_stages.count( - std::make_pair(target_target_stage_id, ct++))) { - break; - } - - if (to_unroll_name_set.count(target_target_iter->name)) { - // Do not go into always unroll region - break; - } - - if (GetExtent(target_target_iter) == 1) { // skip iterators with length of 1 - continue; - } - - candidates.push_back(std::make_pair(target_target_stage_id, target_target_iter)); - } - } - - int choice = (*rand_gen)() % (candidates.size() + 2); - - if (choice == 0) { - if (!HasReduceIter(stage)) { - state->compute_inline(stage_id); - } - } else if (choice == 1) { - state->compute_root(stage_id); - } else { - choice = choice - 2; - state->compute_at(stage_id, candidates[choice].first, candidates[choice].second); - } - } - - return 0; -} - -int InitPopulationParallel(const SketchSearchPolicyNode* policy, - State* state) { - std::function annotate_parallel; - - annotate_parallel = [&annotate_parallel]( - const SketchSearchPolicyNode* policy, State* state, int stage_id, int iter_offset) { - const Stage& stage = (*state)->stages[stage_id]; - - std::vector to_fuse; - int64_t parallel_degree = 1; - - // strategy: try to fuse and parallel the outermost n iterators - // Stop if we meet reduce iterator or we have enough parallel degree - size_t iter_id = iter_offset; - for (; iter_id < stage->iters.size(); ++iter_id) { - const Iterator& it = stage->iters[iter_id]; - if (it->iter_type == kReduce || it->annotation != kNone) { - break; - } - - to_fuse.push_back(it); - parallel_degree *= GetExtent(it); - - if (parallel_degree > policy->cur_task->hardware_params->num_cores * 16) { - break; - } - - if ((*state)->attach_map->iter_to_attached_stages.count( - std::make_pair(stage_id, iter_id))) { - break; - } - } - - if (parallel_degree == 1) { - auto res = (*state)->attach_map->iter_to_attached_stages.find(std::make_pair(stage_id, iter_id)); - if (res != (*state)->attach_map->iter_to_attached_stages.end()) { - for (int attached_stage_id : res->second) { - annotate_parallel(policy, state, attached_stage_id, 0); - } - annotate_parallel(policy, state, stage_id, iter_id + 1); - } - } - - if (!to_fuse.empty()) { - if (to_fuse.size() == 1) { - state->parallel(stage_id, to_fuse[0]); - } else { - Iterator fused_iter = state->fuse(stage_id, to_fuse); - state->parallel(stage_id, fused_iter); - } - } - }; - - for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) { - const Stage& stage = (*state)->stages[stage_id]; - if (stage->compute_at != kRoot || stage->op_type == kPlaceholder) { - continue; - } - - annotate_parallel(policy, state, stage_id, 0); - } - - return 0; -} - -int InitPopulationVectorization(const SketchSearchPolicyNode* policy, - State* state, std::mt19937* rand_gen) { - for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) { - const Stage& stage = (*state)->stages[stage_id]; - - if (stage->op_type == kPlaceholder) { - continue; - } - - // Skip cooperative fetching stage - if (IS_GPU(policy->cur_task) && - HasCacheReadStage((*state), stage_id - 1)) { - continue; - } - - if (HasAnnotationIter(stage, IteratorAnnotation::kTensorized)) { - // Skip if this stage has been tensorized - continue; - } - - // try to fuse and vectorize the space iterators in the inner most tile - int cum_length_prod = 1; - - std::set to_unroll_name_set; - if (stage->op->attrs.count(policy->always_unroll_key)) { - to_unroll_name_set = GetIterNameSetParam(stage->op->attrs, - policy->always_unroll_key); - } - - int num_fusible = 0; - while (num_fusible < static_cast(stage->iters.size())) { - int iter_id = static_cast(stage->iters.size()) - 1 - num_fusible; - if ((*state)->attach_map->iter_to_attached_stages.count( - std::make_pair(stage_id, iter_id))) { - break; - } - - const Iterator& it = stage->iters[iter_id]; - - // Stop if we meet a reduce iterator - if (it->iter_type == kReduce || it->annotation != kNone || - to_unroll_name_set.count(it->name)) { - break; - } - - // Stop if the memory access is not continuous (vectorizable) - // Note: The check is too hard, so we use heuristic here - if (IsTiled(stage) && num_fusible != 0) { - // If the stage is tiled, then the memory access must not be continuous - // for the innermost two iterators - break; - } - - cum_length_prod *= GetExtent(it); - if (cum_length_prod > policy->cur_task->hardware_params->max_unroll_vec) { - break; - } - - num_fusible++; - } - - if (num_fusible > 1) { - num_fusible = 1 + (*rand_gen)() % (num_fusible - 1); // Select a random range to fuse - } - - if (num_fusible == 1) { - state->vectorize(stage_id, stage->iters.back()); - } else if (num_fusible > 1) { - std::vector to_fuse(stage->iters.end() - num_fusible, - stage->iters.end()); - state->vectorize(stage_id, state->fuse(stage_id, to_fuse)); - } - } - - return 0; -} - -int InitPopulationUnroll(const SketchSearchPolicyNode* policy, - State* state, std::mt19937* rand_gen) { - for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) { - const Stage& stage = (*state)->stages[stage_id]; - - if (stage->op_type == kPlaceholder) { - continue; - } - - if (stage->op->attrs.count(policy->always_unroll_inner_key)) { - // Special unroll policy - auto to_unroll_name_set = GetIterNameSetParam(stage->op->attrs, - policy->always_unroll_inner_key); - std::set visited_names; - - // Unroll the space iterators and reduce iterators listed in the attrs - // in the innermost tile - int n = static_cast(stage->iters.size()) - 1; - visited_names.clear(); - while (n >= 0) { - const Iterator& it = stage->iters[n]; - - // If we meet two iterators that come from a same original iterator, - // then we are out of the innermost tile - size_t size_before = visited_names.size(); - ExtractOriginalIterators(it->name, &visited_names); - if (size_before == visited_names.size()) { - break; - } - - std::set name; - ExtractOriginalIterators(it->name, &name); - if (name.size() == 1 && to_unroll_name_set.count(*name.begin())) { - state->unroll(stage_id, it); - } - - n--; - } - } else if (stage->op->attrs.count(policy->always_unroll_key)) { - // Special unroll policy - auto to_unroll_name_set = GetIterNameSetParam(stage->op->attrs, - policy->always_unroll_key); - - // Unroll the space iterators and reduce iterators listed in the attrs - int n = static_cast(stage->iters.size()) - 1; - while (n >= 0) { - const Iterator& it = stage->iters[n]; - if (to_unroll_name_set.count(it->name)) { - state->unroll(stage_id, it); - } - n--; - } - } else if (HasReduceIter(stage)) { - // use auto unroll for multi level tiled stage - int value = policy->auto_unroll_configs[ - (*rand_gen)() % policy->auto_unroll_configs.size()]; - state->pragma(stage_id, (*state)->stages[stage_id]->iters[0], - std::string("auto_unroll_max_step") + "$" + std::to_string(value)); - } - } - - return 0; -} - -void SketchSearchPolicyNode::SampleInitPopulation(const std::vector& sketches, - int out_size, std::vector* out_states) { - std::uniform_real_distribution<> dis(0.0, 1.0); - int continue_count = 0; - - // TODO(...): Maybe try muti thread here - while (static_cast(out_states->size()) < out_size && - continue_count < out_size * 10) { - State tmp_s = sketches[rand_gen_() % sketches.size()]; - - InitPopulationFillTileSize(this, &tmp_s, &rand_gen_, &split_memo_); - - if (IS_GPU(cur_task)) { - tmp_s = cur_task->compute_dag.InferBound(tmp_s); - - if (InitPopulationThreadBind(this, &tmp_s)) { - continue_count++; - if (continue_count == out_size) { - StdCout(verbose) << "Initial Population Sampling..." << std::endl; - } - continue; - } - - InitPopulationCooperativeFetching(this, &tmp_s); - } else { - InitPopulationChangeComputeLocation(this, &tmp_s, &rand_gen_); - - tmp_s = cur_task->compute_dag.InferBound(tmp_s); - - InitPopulationParallel(this, &tmp_s); - } - - InitPopulationVectorization(this, &tmp_s, &rand_gen_); - - InitPopulationUnroll(this, &tmp_s, &rand_gen_); - - out_states->push_back(std::move(tmp_s)); - } - - StdCout(verbose) << "Sample Initial Population\t#s: " - << out_states->size() << std::endl; -} - -void SketchSearchPolicyNode::EvolutionarySearch( - const std::vector& init_population, - int num_best_states, std::vector* best_states) { - auto tic_begin = std::chrono::high_resolution_clock::now(); - - // Set parameters for genetic algorithm - int population = GetIntParam(params, "evolutionary_search_population"); - int num_iters = GetIntParam(params, "evolutionary_search_num_iters"); - double mutation_prob = GetDoubleParam(params, "evolutionary_search_mutation_prob"); - int num_cross_over = static_cast(population * 0.0); // NOT IMPLEMENTED currently - int num_cross_over_trial_upper_bound = num_cross_over * 3; - CostModel cost_model = program_cost_model; - - // Two ping pong buffers to avoid copy - std::vector states_buf1, states_buf2; - std::vector *pnow = &states_buf1, *pnext = &states_buf2; - states_buf1.reserve(population); - states_buf2.reserve(population); - states_buf1.insert(states_buf1.begin(), init_population.begin(), init_population.end()); - - // A heap to keep the best states during evolution - using StateItem = std::pair; - auto cmp = [](const StateItem& left, const StateItem& right) { - return left.second > right.second; - }; - std::vector heap; - std::unordered_set in_heap(measured_states_set_); - heap.reserve(num_best_states); - - // auxiliary global variables - std::vector scores; - std::vector prefix_sum_probs; - double max_score = 0.0; - scores.reserve(population); - prefix_sum_probs.reserve(population); - std::uniform_real_distribution<> dis(0.0, 1.0); - int mutation_fail_ct = 0; - - // Genetic Algorithm - for (int k = 0; k < num_iters + 1; ++k) { - // Maintain the heap - cur_task->compute_dag.InferBound(pnow); - PruneUndefined(pnow); - cost_model->Predict(cur_task, *pnow, &scores); - - for (size_t i = 0; i < pnow->size(); ++i) { - const State& state = (*pnow)[i]; - std::string state_str = state.ToStr(); - - if (in_heap.count(state_str) == 0) { - if (static_cast(heap.size()) < num_best_states) { - heap.emplace_back((*pnow)[i], scores[i]); - std::push_heap(heap.begin(), heap.end(), cmp); - in_heap.insert(state_str); - } else if (scores[i] > heap.front().second) { - std::string old_state_str = heap.front().first.ToStr(); - in_heap.erase(old_state_str); - in_heap.insert(state_str); - - std::pop_heap(heap.begin(), heap.end(), cmp); - heap.back() = StateItem(state, scores[i]); - std::push_heap(heap.begin(), heap.end(), cmp); - } - if (scores[i] > max_score) { - max_score = scores[i]; - } - } - } - - if (k % 5 == 0 || k == num_iters) { - StdCout(verbose) << "GA Iter: " << k << std::fixed << std::setprecision(4) - << "\tMax score: " << max_score - << "\tMin score: " << heap.front().second - << "\tPop size: " << pnow->size() << std::endl; - } - - if (k == num_iters) { - break; - } - - // Compute selection probability - double sum = 0.0; - prefix_sum_probs.resize(scores.size()); - for (size_t i = 0; i < scores.size(); ++i) { - sum += std::max(scores[i], 0.0f); - prefix_sum_probs[i] = sum; - } - for (size_t i = 0; i < scores.size(); ++i) { - prefix_sum_probs[i] = prefix_sum_probs[i] / sum; - } - - // Do cross over - int ct = 0; - while (static_cast(pnext->size()) < num_cross_over - && ct < num_cross_over_trial_upper_bound) { - int p1 = RandomChoose(prefix_sum_probs, &rand_gen_); - int p2 = RandomChoose(prefix_sum_probs, &rand_gen_); - - if (p1 == p2) { - pnext->push_back((*pnow)[p1]); - } else { - State tmp_s = CrossOverState((*pnow)[p1], (*pnow)[p2]); - if (tmp_s.defined()) { - pnext->push_back(std::move(tmp_s)); - } - } - ct++; - } - - // Do mutation - mutation_fail_ct = 0; - while (static_cast(pnext->size()) < population) { - int id = RandomChoose(prefix_sum_probs, &rand_gen_); - - if (dis(rand_gen_) < mutation_prob) { - const std::vector rule_prefix_sum_probs{0.9, 1.0}; - - int rule_id = RandomChoose(rule_prefix_sum_probs, &rand_gen_); - - if (rule_id == 0) { - // Mutate Tile Size - State tmp_s = RandomMutateTileSize((*pnow)[id], &split_memo_, &rand_gen_, - cur_task->hardware_params->max_innermost_split_factor); - if (tmp_s.defined()) { - pnext->push_back(std::move(tmp_s)); - } else { - mutation_fail_ct++; - } - } else if (rule_id == 1) { - // Mutate auto-unroll max step. - State tmp_s = RandomMutateMaxUnrollStep((*pnow)[id], &rand_gen_, auto_unroll_configs); - if (tmp_s.defined()) { - pnext->push_back(std::move(tmp_s)); - } else { - mutation_fail_ct++; - } - } - } else { - pnext->push_back((*pnow)[id]); - } - } - - std::swap(pnext, pnow); pnext->clear(); - } - - // Copy best states in the heap to out_states - std::sort(heap.begin(), heap.end(), cmp); - best_states->clear(); - for (auto& item : heap) { - best_states->push_back(std::move(item.first)); - } - - double duration = std::chrono::duration_cast >( - std::chrono::high_resolution_clock::now()- tic_begin).count(); - StdCout(verbose) << "EvolutionarySearch\t\t#s: " << best_states->size() - << "\tTime elapsed: " - << std::fixed << std::setprecision(2) << duration << std::endl; -} - -class RuleCustomSketch : public SketchGenerationRule { - public: - RuleCustomSketch(PackedFunc meet_condition_func, PackedFunc apply_func) : - meet_condition_func_(meet_condition_func), apply_func_(apply_func) {} - - inline ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - auto ret = meet_condition_func_( - tvm::runtime::GetRef(policy), state, stage_id); - if (ret.type_code() == 0) { - return ConditionEnum(static_cast(ret)); - } else { - return kApplyAndSkipRest; - } - } - - inline std::vector > Apply( - const SketchSearchPolicyNode* policy, - const State& state, int stage_id) final { - std::vector > ret; - - Array> apply_ret = apply_func_( - tvm::runtime::GetRef(policy), state, stage_id); - - for (const auto& item : apply_ret) { - CHECK_EQ(item.size(), 2); - State state = Downcast(item[0]); - auto next = item[1].as(); - ret.emplace_back(state, next->value); - } - return ret; - } - - private: - PackedFunc meet_condition_func_; - PackedFunc apply_func_; -}; - -PreloadCustomSketchRule::PreloadCustomSketchRule(PackedFunc meet_condition_func, - PackedFunc apply_func) { - auto node = make_object(); - node->meet_condition_func = meet_condition_func; - node->apply_func = apply_func; - data_ = std::move(node); -} - -void PreloadCustomSketchRuleNode::callback(SearchPolicyNode* policy) { - CHECK(policy->IsInstance()); - auto sketch_policy = dynamic_cast(policy); - sketch_policy->sketch_rules.emplace_back( - new RuleCustomSketch(meet_condition_func, apply_func)); - StdCout(policy->verbose) << "Custom sketch rule added." << std::endl; -} - -TVM_REGISTER_GLOBAL("ansor.SketchSearchPolicy") -.set_body_typed([](CostModel program_cost_model, Map params, - int seed){ - return SketchSearchPolicy(program_cost_model, params, seed); -}); - -TVM_REGISTER_GLOBAL("ansor.PreloadCustomSketchRule") -.set_body_typed([](PackedFunc meet_condition_func, PackedFunc apply_func) { - return PreloadCustomSketchRule(meet_condition_func, apply_func); -}); - -} // namespace ansor -} // namespace tvm diff --git a/src/ansor/search_policy/sketch_search_policy.h b/src/ansor/search_policy/sketch_search_policy.h deleted file mode 100644 index 54a5cdd1fa4ee..0000000000000 --- a/src/ansor/search_policy/sketch_search_policy.h +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file ansor/search_policy/sketch_search_policy.h - * \brief The search policy that searches in a hierarchical search space defined by sketches. - * The policy randomly samples programs from the space defined by sketches - * and use evolutionary search to fine-tune them. - */ - -#ifndef TVM_ANSOR_SEARCH_POLICY_SKETCH_SEARCH_POLICY_H_ -#define TVM_ANSOR_SEARCH_POLICY_SKETCH_SEARCH_POLICY_H_ - -#include -#include -#include -#include -#include -#include "search_policy.h" -#include "../cost_model/cost_model.h" -#include "../utils.h" - - -namespace tvm { -namespace ansor { - -class SketchGenerationRule; - -/*! - * \brief The search policy that searches in a hierarchical search space defined by sketches. - * The policy randomly samples programs from the space defined by sketches - * and use evolutionary search to fine-tune them. - */ -class SketchSearchPolicyNode: public SearchPolicyNode { - public: - /*! \brief The cost model for complete programs */ - CostModel program_cost_model; - /*! \brief Random generator */ - std::mt19937 rand_gen_; - /*! \brief The parameters for search. It stores the following parameters: - * int evolutionary_search_population // The population size for evolutionary search - * int evolutionary_search_mutation_prob // The probability of mutation for evolutionary search - * int evolutionary_search_num_iters; // The number of iterations for evolutionary search - * double local_mutation_use_measured_ratio; // The maximum percentage of measured states in the initial - * // population for evolutionary search - * double eps_greedy; // Always allocate this percentage of measurements to random sampled states - * str cpu_multi_level_tiling_structure // The structure of multi-level tiling for CPU - * str gpu_multi_level_tiling_structure // The structure of multi-level tiling for GPU - */ - Map params; - /*! \brief The rules to generate sketches */ - std::vector sketch_rules; - - /*! \brief Search and make n_trails measurements. - * \returns the best state */ - State Search(SearchTask task, int n_trials, - int early_stopping, int num_measure_per_iter, - int verbose, ProgramMeasurer measurer, - Array pre_search_callbacks) final; - - /*! \brief Continue search for one round. This is used by JointTuner - * \returns the measurement pairs */ - std::pair, Array > ContinueSearchOneRound( - SearchTask task, int num_measure, int verbose, ProgramMeasurer measurer) final; - - static constexpr const char *_type_key = "ansor.SketchSearchPolicy"; - static const std::vector auto_unroll_configs; - - TVM_DECLARE_FINAL_OBJECT_INFO(SketchSearchPolicyNode, SearchPolicyNode); - - protected: - /*! \brief Pick states from best states and random states with eps-greedy policy */ - void PickStatesWithEpsGreedy(std::vector* inputs, - const std::vector& best_states, - const std::vector& random_states, - int remaining_n_trials); - - private: - // Run one round of the search pipeline - void SearchOneRound(std::vector* best_states, - int num_random_states, std::vector* random_states); - - // Generate sketches without tile size - void GenerateSketch(std::vector* out_states); - - // Sample init population - void SampleInitPopulation(const std::vector& sketches, - int out_size, std::vector* out_states); - - // Perform evolutionary search - void EvolutionarySearch(const std::vector& init_population, - int num_best_states, std::vector* best_states); - - SplitFactorizationMemo split_memo_; // Memorize split space for Split - int num_measure_per_iter_; // The number of states to measure per iteration -}; - -/*! - * \brief Managed reference to SketchSearchPolicyNode. - * \sa SketchSearchPolicyNode - */ -class SketchSearchPolicy : public SearchPolicy { - public: - SketchSearchPolicy(CostModel program_cost_model, - Map params, - int seed); - - TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(SketchSearchPolicy, SearchPolicy, - SketchSearchPolicyNode); -}; - -/*! \brief Pre-search callback function to load custom rules for sketch generation */ -class PreloadCustomSketchRuleNode : public SearchCallbackNode { - public: - // TODO(jcf94): Use tvm::runtime::TypedPackedFunc? - PackedFunc meet_condition_func; - PackedFunc apply_func; - - void callback(SearchPolicyNode* policy) final; - - static constexpr const char *_type_key = "ansor.PreloadCustomSketchRule"; - TVM_DECLARE_FINAL_OBJECT_INFO(PreloadCustomSketchRuleNode, SearchCallbackNode); -}; - -/*! - * \brief Managed reference to PreloadCustomSketchRuleNode. - * \sa PreloadCustomSketchRuleNode - */ -class PreloadCustomSketchRule : public SearchCallback { - public: - PreloadCustomSketchRule(PackedFunc meet_condition_func, - PackedFunc apply_func); - - TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PreloadCustomSketchRule, SearchCallback, - PreloadCustomSketchRuleNode); -}; - -} // namespace ansor -} // namespace tvm - -#endif // TVM_ANSOR_SEARCH_POLICY_SKETCH_SEARCH_POLICY_H_ diff --git a/src/ansor/search_policy/utils.cc b/src/ansor/search_policy/utils.cc deleted file mode 100644 index 412d0afcca98d..0000000000000 --- a/src/ansor/search_policy/utils.cc +++ /dev/null @@ -1,429 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file ansor/search_policy/utils.cc - * \brief Common utilities for search policies - */ - -#include "utils.h" -#include "search_policy.h" - -namespace tvm { -namespace ansor { - -void GetSpaceSplitStepIds(const State& s, int stage_id, std::vector* spatial_split_step_ids) { - auto pop = s->stages[stage_id]->op.as(); - CHECK(pop != nullptr); - - auto no_split_name_pair = QueryNoSplitAxis(s->stages[stage_id]); - std::set no_split_at_inner_name_set = no_split_name_pair.first; - std::set no_split_at_outer_name_set = no_split_name_pair.second; - size_t reduce_count = 0; - for (const auto axis : pop->reduce_axis) { - if (!no_split_at_inner_name_set.count(axis->var->name_hint) && - !no_split_at_outer_name_set.count(axis->var->name_hint)) { - reduce_count++; - } - } - - for (int i = static_cast(s->transform_steps.size()) - 1; i >= 0; --i) { - if (s->transform_steps[i]->IsInstance() || - s->transform_steps[i]->IsInstance() || - s->transform_steps[i]->IsInstance()) { - if (stage_id > s->transform_steps[i]->stage_id) { - stage_id--; - } - } else if (auto ps = s->transform_steps[i].as()) { - if (stage_id == ps->stage_id) { - if (reduce_count) { - reduce_count--; - } else { - spatial_split_step_ids->push_back(i); - } - } - } - } -} - -State DoMultiLevelTiling(const State& state, int stage_id, const std::string& format, - std::vector* spatial_split_step_ids) { - std::vector > space_levels; - std::vector > reduce_levels; - std::vector space_outer, space_inner, reduce_outer, reduce_inner; - std::vector split_res; - - for (const auto c : format) { - if (tolower(c) == 's') { - space_levels.emplace_back(); - } else if (tolower(c) == 'r') { - reduce_levels.emplace_back(); - } else { - LOG(FATAL) << "Invalid multi level tiling format: " << format; - } - } - size_t n_space = space_levels.size(); - size_t n_reduce = reduce_levels.size(); - - spatial_split_step_ids->clear(); - - State tmp_s = state; - const Stage& stage = state->stages[stage_id]; - auto no_split_name_pair = QueryNoSplitAxis(stage); // handle special split strategy - auto last_split_is_one_name_set = QueryLastSplitIsOneAxis(stage); - std::set no_split_at_inner_name_set = no_split_name_pair.first; - std::set no_split_at_outer_name_set = no_split_name_pair.second; - - for (const auto& iter : state->stages[stage_id]->iters) { - if (iter->iter_type == kSpace) { - if (!no_split_at_inner_name_set.count(iter->name) && - !no_split_at_outer_name_set.count(iter->name)) { - CHECK_GE(n_space, 1); - int tmp_n_space = n_space; - - if (last_split_is_one_name_set.count(iter->name)) { - tmp_n_space--; - } - - if (tmp_n_space == 1) { - space_levels[0].push_back(iter); - } else { - split_res = tmp_s.split(stage_id, iter, std::vector(tmp_n_space - 1)); - for (int i = 0; i < tmp_n_space; i++) { - space_levels[i].push_back(std::move(split_res[i])); - } - spatial_split_step_ids->push_back(tmp_s->transform_steps.size() - 1); - } - } else { - if (no_split_at_inner_name_set.count(iter->name)) { - space_inner.push_back(iter); - } - if (no_split_at_outer_name_set.count(iter->name)) { - space_outer.push_back(iter); - } - } - } else if (iter->iter_type == kReduce) { - // for reduce iterator, split it into two iterators - if (!no_split_at_inner_name_set.count(iter->name) && - !no_split_at_outer_name_set.count(iter->name)) { - CHECK_GE(n_reduce, 1); - if (n_reduce == 1) { - reduce_levels[0].push_back(iter); - } else { - split_res = tmp_s.split(stage_id, iter, std::vector(n_reduce - 1)); - for (size_t i = 0; i < n_reduce; i++) { - reduce_levels[i].push_back(std::move(split_res[i])); - } - } - } else { - if (no_split_at_inner_name_set.count(iter->name)) { - reduce_inner.push_back(iter); - } - if (no_split_at_outer_name_set.count(iter->name)) { - reduce_outer.push_back(iter); - } - } - } else { - LOG(FATAL) << "Invalid iter type: " << iter->iter_type; - } - } - - if (!space_outer.empty()) { - CHECK(!space_levels.empty()); - space_levels.front().insert(space_levels.front().begin(), - space_outer.begin(), space_outer.end()); - } - if (!space_inner.empty()) { - CHECK(!space_levels.empty()); - space_levels.back().insert(space_levels.back().begin(), - space_inner.begin(), space_inner.end()); - } - - if (!reduce_outer.empty()) { - CHECK(!reduce_levels.empty()); - reduce_levels.front().insert(reduce_levels.front().begin(), - reduce_outer.begin(), reduce_outer.end()); - } - if (!reduce_inner.empty()) { - CHECK(!reduce_levels.empty()); - reduce_levels.back().insert(reduce_levels.back().begin(), - reduce_inner.begin(), reduce_inner.end()); - } - - std::vector order; - int space_ct = 0, reduce_ct = 0; - for (const auto c : format) { - if (tolower(c) == 's') { - order.insert(order.end(), std::make_move_iterator(space_levels[space_ct].begin()), - std::make_move_iterator(space_levels[space_ct].end())); - space_ct++; - } else if (tolower(c) == 'r') { - order.insert(order.end(), std::make_move_iterator(reduce_levels[reduce_ct].begin()), - std::make_move_iterator(reduce_levels[reduce_ct].end())); - reduce_ct++; - } else { - LOG(FATAL) << "Invalid multi level tiling format: " << format; - } - } - - tmp_s.reorder(stage_id, order); - return tmp_s; -} - -State FollowTiling(const State& state, int stage_id, - const std::vector& split_step_ids, int n_split) { - if (n_split < 1 || n_split > 3) { - LOG(FATAL) << "Invalid split parts, currently only support 1, 2 and 3"; - } - // Apply up to three-level tiling structure: space_L0, space_L1, space_L2 - std::vector space_0, space_1, space_2, space_3; - std::vector split_res, tmp_order; - - auto pop = state->stages[stage_id]->op.as(); - CHECK(pop != nullptr); - const Stage& stage = state->stages[stage_id]; - auto no_split_name_pair = QueryNoSplitAxis(stage); // handle special split strategy - const std::set& no_split_at_inner_name_set = no_split_name_pair.first; - const std::set& no_split_at_outer_name_set = no_split_name_pair.second; - int no_split_at_inner_name_in_stage_cnt = 0; - int no_split_at_outer_name_in_stage_cnt = 0; - for (const auto& iter : state->stages[stage_id]->iters) { - no_split_at_inner_name_in_stage_cnt += no_split_at_inner_name_set.count(iter->name); - no_split_at_outer_name_in_stage_cnt += no_split_at_outer_name_set.count(iter->name); - } - - CHECK_EQ(state->stages[stage_id]->iters.size() - - no_split_at_inner_name_in_stage_cnt - - no_split_at_outer_name_in_stage_cnt, - split_step_ids.size()); - - State tmp_s = state; - int ct = 0; - for (const auto& iter : state->stages[stage_id]->iters) { - if (iter->iter_type == kSpace) { - // For spatial iterator, split it into multi iterators - if (!no_split_at_inner_name_set.count(iter->name) && - !no_split_at_outer_name_set.count(iter->name)) { - IteratorAnnotation ann_type = iter->annotation; - split_res = tmp_s.follow_split(stage_id, iter, split_step_ids[ct], - n_split); - // Restore annotation. Move unroll and vectorize to inner, move parallel - // to outer - switch (ann_type) { - case kUnroll: - split_res[n_split] = tmp_s.unroll(stage_id, split_res[n_split]); - break; - case kVectorize: - split_res[n_split] = tmp_s.vectorize(stage_id, split_res[n_split]); - break; - case kParallel: - split_res[0] = tmp_s.parallel(stage_id, split_res[0]); break; - default: - break; - } - - space_0.push_back(std::move(split_res[0])); - space_1.push_back(std::move(split_res[1])); - if (n_split >= 2) { - space_2.push_back(std::move(split_res[2])); - if (n_split == 3) { - space_3.push_back(std::move(split_res[3])); - } - } - ct++; - } else { - if (no_split_at_outer_name_set.count(iter->name)) { - space_0.push_back(iter); - } - if (no_split_at_inner_name_set.count(iter->name)) { - if (n_split == 1) { - space_1.push_back(iter); - } else if (n_split == 2) { - space_2.push_back(iter); - } else { - CHECK_EQ(n_split, 3); - space_3.push_back(iter); - } - } - } - } else { - LOG(FATAL) << "Invalid iter type: " << iter->iter_type; - } - } - if (n_split == 3) { - ConcatenateMove(&tmp_order, &space_0, &space_1, &space_2, &space_3); - } else if (n_split == 2) { - ConcatenateMove(&tmp_order, &space_0, &space_1, &space_2); - } else { - ConcatenateMove(&tmp_order, &space_0, &space_1); - } - tmp_s.reorder(stage_id, tmp_order); - return tmp_s; -} - -State RandomMutateTileSize(const State& old_state, SplitFactorizationMemo* split_memo, - std::mt19937* random_gen, int max_innermost_split_factor) { - State tmp_s = old_state; - - // Extract all SplitStep - std::vector split_step_ids; - for (size_t i = 0; i < tmp_s->transform_steps.size(); ++i) { - if (auto ps = tmp_s->transform_steps[i].as()) { - if (ps->extent.defined() && ps->extent->IsInstance() && - GetIntImm(ps->lengths.back()) <= max_innermost_split_factor) { - split_step_ids.push_back(i); - } - } - } - if (split_step_ids.empty()) { - return State(); - } - - // Find a SplitStep with extent != 1 - int retry_ct = 0; - int64_t extent = 1; - int step_id; - const SplitStepNode* ps; - - do { - step_id = split_step_ids[(*random_gen)() % split_step_ids.size()]; - ps = tmp_s->transform_steps[step_id].as(); - CHECK(ps != nullptr); - extent = GetIntImm(ps->extent); - retry_ct += 1; - } while (retry_ct < static_cast(split_step_ids.size()) << 2 && - (extent == 1 || extent == 0)); - - if (extent == 0 || extent == 1) { - return State(); - } - - // Mutate tile size - std::vector lengths(ps->lengths.size() + 1, 1); - for (int i = 0; i < static_cast(ps->lengths.size()); ++i) { - lengths[i + 1] = GetIntImm(ps->lengths[i]); - } - lengths[0] = extent / ElementProduct(lengths); - - std::vector random_perm; - RandomPermutation(lengths.size(), &random_perm, random_gen); - - for (size_t i = 0; i < random_perm.size(); ++i) { - size_t src_idx = random_perm[i]; - int length = lengths[src_idx]; - - if (length == 1) { - continue; - } - - // Divide one factor from lengths[src_idx] and multiply it to lengths[dst_idx] - size_t dst_idx = random_perm[(i + 1) % random_perm.size()]; - - const std::vector& factors = split_memo->GetFactors(length); - CHECK_GE(factors.size(), 1); - - int divide_factor; - if (dst_idx == lengths.size() - 1) { - // Maintain the restriction of hardware_params.max_innermost_split_factor - int max_factor_index = static_cast(factors.size()) - 1; - for (; max_factor_index >= 1; max_factor_index--) { - if (factors[max_factor_index] * lengths[dst_idx] <= max_innermost_split_factor) { - break; - } - } - if (max_factor_index == 0) { - // failed on this dst_idx, try next one - continue; - } - divide_factor = factors[1 + (*random_gen)() % (max_factor_index)]; - } else { - divide_factor = factors[1 + (*random_gen)() % (factors.size() - 1)]; - } - - std::vector new_lengths; - for (size_t j = 1; j < lengths.size(); ++j) { - if (j == src_idx) { - new_lengths.emplace_back(lengths[j] / divide_factor); - } else if (j == dst_idx) { - new_lengths.emplace_back(lengths[j] * divide_factor); - } else { - new_lengths.emplace_back(lengths[j]); - } - } - - CHECK_LE(GetIntImm(new_lengths.back()), max_innermost_split_factor); - - auto pstate = tmp_s.CopyOnWrite(); - pstate->transform_steps[step_id] = - SplitStep(ps->stage_id, ps->iter_id, ps->extent, new_lengths, ps->inner_to_outer); - return tmp_s; - } - - return State(); -} - -State RandomMutateMaxUnrollStep(const State& old_state, std::mt19937* random_gen, - const std::vector& auto_unroll_configs) { - State tmp_s = old_state; - - // Extract all auto_unroll_max_step pragma steps. - std::vector annotate_steps; - for (size_t i = 0; i < old_state->transform_steps.size(); ++i) { - if (auto ps = tmp_s->transform_steps[i].as()) { - if (ps->pragma_type.find("auto_unroll_max_step") != std::string::npos) { - annotate_steps.push_back(i); - } - } - } - if (annotate_steps.empty()) { - return State(); - } - - // Randomly pick one step. - auto step_id = annotate_steps[(*random_gen)() % annotate_steps.size()]; - auto ps = tmp_s->transform_steps[step_id].as(); - auto val = std::to_string(auto_unroll_configs[(*random_gen)() % auto_unroll_configs.size()]); - - auto pstate = tmp_s.CopyOnWrite(); - pstate->transform_steps[step_id] = PragmaStep( - ps->stage_id, ps->iter_id, std::string("auto_unroll_max_step") + "$" + val); - return tmp_s; -} - -void PruneUndefined(std::vector* states) { - size_t pt = 0; - for (size_t i = 0; i < states->size(); ++i) { - if (!(*states)[i].defined()) { - continue; - } - (*states)[pt++] = std::move((*states)[i]); - } - - if (pt == 0) { - LOG(FATAL) << "All states are undefined."; - } else { - states->resize(pt); - } -} - -State CrossOverState(const State& p1, const State& p2) { return State(); } - -} // namespace ansor -} // namespace tvm - diff --git a/src/ansor/search_policy/utils.h b/src/ansor/search_policy/utils.h deleted file mode 100644 index 5f15397e7e905..0000000000000 --- a/src/ansor/search_policy/utils.h +++ /dev/null @@ -1,476 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file ansor/search_policy/utils.cc - * \brief Common utilities for search policies - */ - -#ifndef TVM_ANSOR_SEARCH_POLICY_UTILS_H_ -#define TVM_ANSOR_SEARCH_POLICY_UTILS_H_ - -#include -#include -#include -#include -#include -#include -#include "../cost_model/cost_model.h" -#include "../utils.h" -#include "../loop_state.h" -#include "../transform_step.h" -#include "search_policy.h" - -namespace tvm { -namespace ansor { - -// Get an integer from a tvm str Map -inline int GetIntParam(const Map& attr_dict, - const std::string& key) { - CHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict; - auto pint = attr_dict[key].as(); - CHECK(pint != nullptr); - return pint->value; -} - -// Get a double from a tvm str Map -inline double GetDoubleParam(const Map& attr_dict, - const std::string& key) { - CHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict; - auto pdouble = attr_dict[key].as(); - CHECK(pdouble != nullptr); - return pdouble->value; -} - -// Get a string from a tvm str Map -inline std::string GetStringParam(const Map& attr_dict, - const std::string& key) { - CHECK_GT(attr_dict.count(key), 0) - << "Cannot find key: \"" << key << "\" in " << attr_dict; - const auto& target = attr_dict[key]; - if (auto pstr = target.as()) { - return pstr->value; - } - auto pstr = target.as(); - CHECK(pstr != nullptr); - return pstr->data; -} - -// Get a iterator name set from a tvm str Map -inline std::set GetIterNameSetParam(const Map& attr_dict, - const std::string& key) { - std::set ret; - CHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict; - auto names = attr_dict[key].as(); - CHECK(names != nullptr); - for (auto name = names->begin(); name != names->end(); name++) { - ret.insert(name->as()->value); - } - return ret; -} - -// Convert operation to stage id -inline int OperationToStage(const te::Operation& op, const State& state) { - for (size_t i = 0; i < state->stages.size(); ++i) { - if (op == state->stages[i]->op) { - return i; - } - } - LOG(FATAL) << "Cannot find op: " << op; - return -1; -} - -// Return the extent of an iterator -inline int64_t GetExtent(const Iterator& it) { - if (it->range.defined()) { - if (auto pint = it->range->extent.as()) { - return pint->value; - } - } - return -1; -} - -// Return whether an op is strict inlineable -inline bool IsStrictInlineable(const SearchTask& task, - const State& state, const te::Operation& op) { - if (state->task_dag.defined()) { - return state->task_dag->access_analyzer.IsStrictInlineable(op); - } else { - return task->compute_dag->access_analyzer.IsStrictInlineable(op); - } -} - -// Return whether an op is an output op -inline bool IsOutputOp(const SearchTask& task, const State& state, const te::Operation& op) { - if (state->task_dag.defined()) { - return state->task_dag->access_analyzer.IsOutput(op); - } else { - return task->compute_dag->access_analyzer.IsOutput(op); - } -} - -// Return whether the stage has an attribute flag -inline bool HasAttrsFlag(const State& state, int stage_id, const char* target) { - if (state->stages[stage_id]->op->attrs.count(target)) { - return GetStringParam(state->stages[stage_id]->op->attrs, target) == "True"; - } - return false; -} - -// Return whether the stage has reduce iterators -inline bool HasReduceIter(const Stage& stage) { - for (const auto& iter : stage->iters) { - if (iter->iter_type != kSpace) { - return true; - } - } - return false; -} - -// Return whether the stage has specific annotated iterators -inline bool HasAnnotationIter(const Stage& stage, IteratorAnnotation type) { - for (const auto& iter : stage->iters) { - if (iter->annotation == type) { - return true; - } - } - return false; -} - -// Return whether an op needs multi level tiling -inline bool NeedsMultilevelTiling(const SearchTask& task, - const State& state, const te::Operation& op) { - if (state->task_dag.defined()) { - return state->task_dag->access_analyzer.NeedsMultiLevelTiling(op); - } else { - return task->compute_dag->access_analyzer.NeedsMultiLevelTiling(op); - } -} - -// Get all consumers for an op. This will take inline into consideration -inline void GetConsumers(const SearchTask& task, const State& state, const te::Operation& op, - std::unordered_set* consumers) { - if (state->task_dag.defined()) { - state->task_dag->access_analyzer.GetConsumers(state, op, consumers); - } else { - task->compute_dag->access_analyzer.GetConsumers(state, op, consumers); - } -} - -inline void GetProducers(const SearchTask& task, const State& state, const te::Operation& op, - std::unordered_set* producers) { - if (state->task_dag.defined()) { - state->task_dag->access_analyzer.GetProducers(state, op, producers); - } else { - task->compute_dag->access_analyzer.GetProducers(state, op, producers); - } -} - -// Return whether two ops are elementwise-matched -inline bool ElementwiseMatch(const SearchTask& task, const State& state, const te::Operation& op, - const te::Operation& target_op) { - if (state->task_dag.defined()) { - return state->task_dag->access_analyzer.ElementWiseMatch(op, target_op); - } else { - return task->compute_dag->access_analyzer.ElementWiseMatch(op, target_op); - } -} - -// Return whether the stage has only one consumer and they are elementwise-matched -inline bool HasSingleElementwiseMatchedConsumer(const SearchTask& task, - const State& state, const Stage& stage, int* target_stage_id) { - std::unordered_set consumers; - - GetConsumers(task, state, stage->op, &consumers); - if (consumers.size() == 1) { - *target_stage_id = OperationToStage(*consumers.begin(), state); - const Stage& target_stage = state->stages[*target_stage_id]; - if (ElementwiseMatch(task, state, stage->op, target_stage->op) && - (!(HasReduceIter(stage) && HasReduceIter(target_stage)))) { - return true; - } - } - return false; -} - -// Return whether this stage needs rfactor -inline bool NeedsRfactor(const SearchTask& task, const State& state, const te::Operation& op) { - if (op->IsInstance()) { - // Compute the product of lengths of all space iters and all reduce iters - int64_t cum_space_len = 1, cum_reduce_len = 1; - int stage_id = OperationToStage(op, state); - for (const auto& iter : state->stages[stage_id]->iters) { - if (iter->iter_type == kSpace) { - cum_space_len *= GetExtent(iter); - } else if (iter->iter_type == kReduce) { - cum_reduce_len *= GetExtent(iter); - } - } - - if (NeedsMultilevelTiling(task, state, op)) { - // Do not use rfactor if we have enough parallelism on space iters - if (cum_space_len > cum_reduce_len || - cum_space_len > task->hardware_params->num_cores * 16) { - return false; - } else { - return true; - } - } else if (cum_reduce_len > 1) { - // Always try rfactor for reduction ops - return true; - } - } - - return false; -} - -// Return whether the state did cache_write for stage_id -inline bool HasCacheWriteStage(const State& s, int stage_id) { - for (int i = static_cast(s->transform_steps.size()) - 1; i >= 0; --i) { - if (auto ps = s->transform_steps[i].as()) { - if (stage_id > ps->stage_id) { - stage_id--; - } else if (stage_id == ps->stage_id) { - return true; - } - } else if (auto ps = s->transform_steps[i].as()) { - if (stage_id > ps->stage_id) { - stage_id--; - } - } else if (auto ps = s->transform_steps[i].as()) { - if (stage_id > ps->stage_id) { - stage_id--; - } - } - } - return false; -} - -// Return whether the state did cache_read for stage_id -inline bool HasCacheReadStage(const State& s, int stage_id) { - for (int i = static_cast(s->transform_steps.size()) - 1; i >= 0; --i) { - if (auto ps = s->transform_steps[i].as()) { - if (stage_id > ps->stage_id) { - stage_id--; - } - } else if (auto ps = s->transform_steps[i].as()) { - if (stage_id > ps->stage_id) { - stage_id--; - } else if (stage_id == ps->stage_id) { - return true; - } - } else if (auto ps = s->transform_steps[i].as()) { - if (stage_id > ps->stage_id) { - stage_id--; - } - } - } - return false; -} - -// Get all split step on spatial iterators -void GetSpaceSplitStepIds(const State& s, int stage_id, std::vector* spatial_split_step_ids); - -// Return whether the state did split/follow_split/follow_fused_split in stage_id -inline bool HasSplitStep(const State& s, int stage_id) { - for (int i = static_cast(s->transform_steps.size()) - 1; i >= 0; --i) { - if (s->transform_steps[i]->IsInstance() || - s->transform_steps[i]->IsInstance() || - s->transform_steps[i]->IsInstance()) { - if (stage_id > s->transform_steps[i]->stage_id) { - stage_id--; - } - } else if (s->transform_steps[i]->IsInstance() || - s->transform_steps[i]->IsInstance() || - s->transform_steps[i]->IsInstance()) { - if (stage_id == s->transform_steps[i]->stage_id) { - return true; - } - } - } - return false; -} - -// Return whether the stage has been tiled already -inline bool IsTiled(const Stage& stage) { - auto op = stage->op.as(); - CHECK(op != nullptr); - return stage->iters.size() != op->axis.size() + op->reduce_axis.size(); -} - -// Query axes that should not be splitted according to the attribute from tvm.compute -inline std::pair, std::set > QueryNoSplitAxis( - const Stage& stage) { - std::pair, std::set > ret; - if (stage->op->attrs.count(SearchPolicyNode::no_split_at_inner_key)) { - ret.first = GetIterNameSetParam(stage->op->attrs, SearchPolicyNode::no_split_at_inner_key); - } - if (stage->op->attrs.count(SearchPolicyNode::no_split_at_outer_key)) { - ret.second = GetIterNameSetParam(stage->op->attrs, SearchPolicyNode::no_split_at_outer_key); - } - return ret; -} - -// Query axes that last split is one -inline std::set QueryLastSplitIsOneAxis(const Stage& stage) { - std::set ret; - if (stage->op->attrs.count(SearchPolicyNode::last_split_is_one_key)) { - ret = GetIterNameSetParam(stage->op->attrs, SearchPolicyNode::last_split_is_one_key); - } - return ret; -} - -// Extract primitive iterators from a nested fused or splitted iterator's name -inline void ExtractOriginalIterators(const std::string& name, std::set* rets) { - size_t last_pos = 0; - for (size_t i = 0; i < name.size(); ++i) { - if (name[i] == '@' || name[i] == '.') { // '@' for fuse and '.' for split - if (!isdigit(name[last_pos]) && name[last_pos] != '@' && name[last_pos] != '.') { - rets->insert(name.substr(last_pos, i - last_pos)); - } - last_pos = i + 1; - } - } - - if (last_pos < name.size() && !isdigit(name[last_pos]) && - name[last_pos] != '@' && name[last_pos] != '.') { - rets->insert(name.substr(last_pos, name.size() - last_pos)); - } -} - -// Get the last space iterator in the outer most tile -inline const Iterator& GetLastSpaceIteratorInOutermostTile(const Stage& stage) { - auto pop = stage->op.as(); - CHECK(pop != nullptr); - std::set original_names; - - for (const auto& iter : stage->iters) { - ExtractOriginalIterators(iter->name, &original_names); - if (original_names.size() == pop->axis.size()) { - return iter; - } - } - - LOG(FATAL) << "Cannot find the iterator."; - return stage->iters[0]; -} - -// Get the last reduce iterator in the outermost reduce tile -inline const Iterator& GetLastReduceIteratorInOutermostReduceTile(const Stage& stage) { - auto pop = stage->op.as(); - CHECK(pop != nullptr); - std::set original_names; - - auto no_split_name_pair = QueryNoSplitAxis(stage); - std::set no_split_at_inner_name_set = no_split_name_pair.first; - size_t axis_size = 0; - for (const auto axis : pop->axis) { - if (!no_split_at_inner_name_set.count(axis->var->name_hint)) { - axis_size++; - } - } - size_t reduce_axis_size = 0; - for (const auto axis : pop->reduce_axis) { - if (!no_split_at_inner_name_set.count(axis->var->name_hint)) { - reduce_axis_size++; - } - } - - if (reduce_axis_size) { - for (const auto& iter : stage->iters) { - ExtractOriginalIterators(iter->name, &original_names); - if (original_names.size() == axis_size + reduce_axis_size) { - return iter; - } - } - } else { - for (size_t i = 0; i < stage->iters.size(); i++) { - ExtractOriginalIterators(stage->iters[i]->name, &original_names); - if (original_names.size() == axis_size + 1) { - return stage->iters[i-1]; - } - } - } - - LOG(FATAL) << "Cannot find the iterator."; - return stage->iters[0]; -} - -// Random sample states -inline void RandomSampleStates(const std::vector& in_states, std::mt19937* random_gen, - size_t out_size, std::vector* out_states) { - out_states->clear(); - for (size_t i = 0; i < out_size; i++) { - out_states->push_back(in_states[(*random_gen)() % in_states.size()]); - } -} - -// Random choose an index according to a prefix sum probability -inline int RandomChoose(const std::vector& prefix_sum_probs, std::mt19937* random_gen) { - std::uniform_real_distribution<> dis(0.0, 1.0); - double x = dis(*random_gen); - - CHECK(!prefix_sum_probs.empty()); - - return std::lower_bound(prefix_sum_probs.begin(), prefix_sum_probs.end(), x) - - prefix_sum_probs.begin(); -} - -// Print all states -inline void PrintAllStates(const std::vector& states) { - for (size_t i = 0; i < states.size(); ++i) { - std::cerr << i << std::endl; - std::cerr << states[i]; - std::cerr << "==============================================" << std::endl; - } -} - -// Apply multi-level tiling structure according to a string format, -// where "S" stands a space level, "R" stands for a reudciton level. -// For example, if the format is "SSRSRS", the we will -// use tiling structure: space_L0, space_L1, reduce_L0, space_L2, reduce_L1, space_L3 -// For example, if apply "SSRSRS" to matrix multiplication, -// we have space iterators i and j, reduce iterator k. -// Then the tiling structure is : i0, j0, i1, j1, k0, i2, j2, k1, i3, j3 -State DoMultiLevelTiling(const State& state, int stage_id, const std::string& format, - std::vector* spatial_split_step_ids); - -// Apply tiling structure: space, space -// But use tile sizes from other SplitStep -State FollowTiling(const State& state, int stage_id, - const std::vector& split_step_ids, int n_split); - -// Randomly mutate the tile size of one SplitStep -State RandomMutateTileSize(const State& old_state, SplitFactorizationMemo* split_memo, - std::mt19937* random_gen, int max_innermost_split_factor); - -// Randomly mutate the value of one auto_unroll_max_step PragmaStep -State RandomMutateMaxUnrollStep(const State& old_state, std::mt19937* random_gen, - const std::vector& auto_unroll_configs); - -// GA: Crossover two states -State CrossOverState(const State& p1, const State& p2); - -// Prune undefined states. -void PruneUndefined(std::vector* states); - -} // namespace ansor -} // namespace tvm - -#endif // TVM_ANSOR_SEARCH_POLICY_UTILS_H_ diff --git a/src/relay/analysis/type_solver.cc b/src/relay/analysis/type_solver.cc index 5b063eca43375..a192002825e65 100644 --- a/src/relay/analysis/type_solver.cc +++ b/src/relay/analysis/type_solver.cc @@ -219,7 +219,6 @@ class TypeSolver::Unifier : public TypeFunctor { return Type(nullptr); } - tt1 = tt2; tvm::Array shape; if (tt1->shape.size() != tt2->shape.size()) { this->solver_->ReportError(ErrorBuilder() << "tensor type `" << PrettyPrint(tt1) << "` has " diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index a8cd1d3c24626..b6cd9e2c6b772 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -335,18 +335,6 @@ class RelayBuildModule : public runtime::ModuleNode { // Fuse the operations if it is needed. relay_module = transform::FuseOps()(relay_module); - - if (targets.size() == 1) { - pass_seqs.push_back(transform::KernelLayoutTransform()); - pass_seqs.push_back(transform::DeFuseOps()); - pass_seqs.push_back(transform::FoldConstant()); - transform::Pass seq = transform::Sequential(pass_seqs); - const auto& it = targets.begin(); - With tctx((*it).second); - relay_module = seq(relay_module); - relay_module = transform::FuseOps()(relay_module); - } - relay_module = transform::InferType()(relay_module); // Inline the functions that have been lifted by the module scope. // diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index fde880b10f1d0..2aae8546248fa 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -68,11 +68,6 @@ CCacheKey::CCacheKey(Function source_func, Target target) { auto n = make_object(); n->source_func = std::move(source_func); n->target = std::move(target); - n->disabled = false; - char* envar = getenv("TVM_RELAY_DISABLE_BUILD_CACHE"); - if (envar != nullptr && strcmp(envar, "true") == 0) { - n->disabled = true; - } data_ = std::move(n); } diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h index b290462a4b22e..a5f3f6359f893 100644 --- a/src/relay/backend/compile_engine.h +++ b/src/relay/backend/compile_engine.h @@ -115,8 +115,6 @@ class CCacheKeyNode : public Object { /*! \brief The hardware target.*/ Target target; - bool disabled; - void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("source_func", &source_func); v->Visit("target", &target); @@ -261,7 +259,6 @@ inline size_t CCacheKeyNode::Hash() const { } inline bool CCacheKeyNode::Equal(const CCacheKeyNode* other) const { - if (disabled) return false; if (Hash() != other->Hash()) return false; return this->target->str() == other->target->str() && tvm::StructuralEqual()(this->source_func, other->source_func); diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 18ace14a0b75e..ee5e291e3d532 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -2455,62 +2455,6 @@ the input array by output[n, c, h, w, C] = data[n, C*16+c, h, w] .set_support_level(5) .set_attr("FTVMCompute", LayoutTransformCompute); -// relay.kernel_layout_transform -TVM_REGISTER_NODE_TYPE(KernelLayoutTransformAttrs); - -Array KernelLayoutTransformCompute(const Attrs& attrs, - const Array& inputs, - const Type& out_type) { - //const Target& target) { - const auto* param = attrs.as(); - CHECK(param != nullptr); - return Array{ - topi::kernel_layout_transform(inputs[0], param->src_layout, param->dst_layout) - }; -} - -bool KernelLayoutTransformRel(const Array& types, - int num_inputs, - const Attrs& attrs, - const TypeReporter& reporter) { - - const auto* data = types[0].as(); - CHECK(data != nullptr); - const KernelLayoutTransformAttrs* params = attrs.as(); - - Array dst_shape; - std::vector dst_axes; - - topi::parse_kernel_layout(params->dst_layout, &dst_shape, &dst_axes); - - reporter->Assign(types[1], TensorType(dst_shape, data->dtype)); - return true; -} - -Expr MakeKernelLayoutTransform(Expr data, - String src_layout, - String dst_layout) { - auto attrs = make_object(); - attrs->src_layout = std::move(src_layout); - attrs->dst_layout = std::move(dst_layout); - static const Op& op = Op::Get("kernel_layout_transform"); - return Call(op, {data}, Attrs(attrs), {}); -} - -TVM_REGISTER_GLOBAL("relay.op._make.kernel_layout_transform") -.set_body_typed(MakeKernelLayoutTransform); - -RELAY_REGISTER_OP("kernel_layout_transform") - .describe(R"code(Transform the input kernel layout. -)code" TVM_ADD_FILELINE) - .set_attrs_type() - .set_num_inputs(1) - .add_argument("data", "Tensor", "The input tensor.") - .add_type_rel("kernel_layout_transform", KernelLayoutTransformRel) - .set_support_level(5) - .set_attr("FTVMCompute", KernelLayoutTransformCompute); - - /* relay._contrib_reverse_reshape */ Expr MakeReverseReshape(Expr data, Array newshape) { auto attrs = make_object(); diff --git a/src/relay/transforms/defuse_ops.cc b/src/relay/transforms/defuse_ops.cc deleted file mode 100644 index f7c9037df6875..0000000000000 --- a/src/relay/transforms/defuse_ops.cc +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "pattern_util.h" - -namespace tvm { -namespace relay { - -class DefuseOpsMutator : public ExprMutator { - public: - - class FuncBodyMutator : public ExprMutator { - public: - Array args_; - - FuncBodyMutator(const Array& args) : ExprMutator() { - args_ = args; - } - - Expr VisitExpr_(const VarNode* n) { - const std::string& name = n->name_hint(); - CHECK_EQ(name[0], 'p'); - std::string id_str = name.substr(1); - int id = atoi(id_str.c_str()); - CHECK(id >= 0 && size_t(id) < args_.size()); - return args_[id]; - } - }; - - Expr VisitExpr_(const CallNode* n) { - auto new_n = ExprMutator::VisitExpr_(n); - - const auto* call = new_n.as(); - if (call) { - const auto* func = call->op.as(); - if (func) { - const auto& func_call = func->body.as(); - if (func_call) { - return FuncBodyMutator(call->args).Mutate(func->body); - } - } - } - return new_n; - } -}; - -Expr DeFuseOps(const Expr& expr) { - return DefuseOpsMutator().Mutate(expr); -} - -namespace transform { - -Pass DeFuseOps() { - runtime::TypedPackedFunc pass_func = - [=](Function f, IRModule m, PassContext pc) { - return Downcast(relay::DeFuseOps(f)); - }; - return CreateFunctionPass(pass_func, 3, "DeFuseOps", - {"InferType"}); -} - -TVM_REGISTER_GLOBAL("relay._transform.DeFuseOps") -.set_body_typed(DeFuseOps); - -} // namespace transform - -} // namespace relay -} // namespace tvm diff --git a/src/relay/transforms/kernel_layout_transform.cc b/src/relay/transforms/kernel_layout_transform.cc deleted file mode 100644 index 681785c8123c9..0000000000000 --- a/src/relay/transforms/kernel_layout_transform.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include -#include -#include -#include -#include -#include -#include "kernel_layout_transform.h" - -namespace tvm { -namespace relay { - -// Todo: do not use global variables -std::deque KernelLayoutVisitor::global_ori_layouts_queue; -std::deque KernelLayoutVisitor::global_new_layouts_queue; - -Expr KernelLayoutTransform(const Expr& expr) { - KernelLayoutVisitor visitor; - - // Do a pre-order DFS to gather the optimal kernel layouts for all conv2d nodes. - // These layouts were written to global static variables in python function `prepare_layout_rewrite` - visitor.VisitExpr(expr); - - // Do a post-order DSF to mutate layout for all conv2d nodes - return KernelLayoutTransformer(&visitor).Mutate(expr); -} - -namespace transform { - -Pass KernelLayoutTransform() { - runtime::TypedPackedFunc pass_func = - [=](Function f, IRModule m, PassContext pc) { - return Downcast(relay::KernelLayoutTransform(f)); - }; - return CreateFunctionPass(pass_func, 3, "KernelLayoutTransform", - {"InferType"}); -} - -TVM_REGISTER_GLOBAL("relay._transform.KernelLayoutTransform") -.set_body_typed(KernelLayoutTransform); - -} // namespace transform - -} // namespace relay -} // namespace tvm diff --git a/src/relay/transforms/kernel_layout_transform.h b/src/relay/transforms/kernel_layout_transform.h deleted file mode 100644 index c82a96b306122..0000000000000 --- a/src/relay/transforms/kernel_layout_transform.h +++ /dev/null @@ -1,76 +0,0 @@ -#include -#include -#include -#include - -#include "pattern_util.h" - -#include "../../ansor/compute_dag.h" - -namespace tvm { -namespace relay { - -/*! \brief A visitor to gather the optimal kernel layout for all conv2d nodes. */ -class KernelLayoutVisitor : public ExprVisitor { - public: - void VisitExpr_(const CallNode *n) { - if (n && n->op.as() && - (std::find(op_white_lists.begin(), op_white_lists.end(), n->op.as()->name) != - op_white_lists.end()) && n->args[1]->type_as()->shape[3].as()->value > 1 && - !global_ori_layouts_queue.empty() && !global_new_layouts_queue.empty()) { - ori_layouts_map[n] = global_ori_layouts_queue.front(); - new_layouts_map[n] = global_new_layouts_queue.front(); - // std::cout << "ori_layout " << global_ori_layouts_queue.front() - // << " Filter_shape " << n->args[1]->type_as()->shape << std::endl; - global_ori_layouts_queue.pop_front(); - global_new_layouts_queue.pop_front(); - } - ExprVisitor::VisitExpr_(n); - } - - std::unordered_map ori_layouts_map; - std::unordered_map new_layouts_map; - std::vector op_white_lists {"nn.contrib_conv2d_winograd_without_weight_transform", - "nn.conv2d", "nn.conv3d"}; - - static std::deque global_ori_layouts_queue; - static std::deque global_new_layouts_queue; -}; - - -/*! \brief A mutator to rewrite kernel layout for all conv2d nodes */ -class KernelLayoutTransformer : public ExprMutator { - public: - KernelLayoutTransformer(KernelLayoutVisitor* visitor): ExprMutator(), visitor_(visitor) {} - - Expr VisitExpr_(const CallNode* n) { - auto new_n = ExprMutator::VisitExpr_(n); - - const auto* call = new_n.as(); - std::vector op_white_lists {"nn.contrib_conv2d_winograd_without_weight_transform", - "nn.conv2d", "nn.conv3d"}; - if (call && call->op.as() && - (std::find(op_white_lists.begin(), op_white_lists.end(), n->op.as()->name) != - op_white_lists.end() && n->args[1]->type_as()->shape[3].as()->value > 1)) { - auto ori_layout_iter = visitor_->ori_layouts_map.find(n); - auto new_layout_iter = visitor_->new_layouts_map.find(n); - if (ori_layout_iter != visitor_->ori_layouts_map.end() && - new_layout_iter != visitor_->new_layouts_map.end()) { - const std::string& ori_layout = ori_layout_iter->second; - const std::string& new_layout = new_layout_iter->second; - Expr updated_kernel = MakeKernelLayoutTransform(call->args[1], ori_layout, new_layout); - Array updated_args = {call->args[0], updated_kernel}; - new_n = Call(call->op, updated_args, - call->attrs); - } - } - return new_n; - } - - private: - KernelLayoutVisitor* visitor_; -}; - - -} // namespace relay -} // namespace tvm diff --git a/src/relay/transforms/pattern_util.h b/src/relay/transforms/pattern_util.h index a9d3b5168e474..7518eb9ac81a1 100644 --- a/src/relay/transforms/pattern_util.h +++ b/src/relay/transforms/pattern_util.h @@ -685,8 +685,6 @@ Expr MakeExpandDims(Expr data, int axis, int num_newaxis); Expr MakeLayoutTransform(Expr data, String src_layout, String dst_layout); -Expr MakeKernelLayoutTransform(Expr data, String src_layout, String dst_layout); - Expr StopFusion(Expr data); Expr CastHint(Expr data, DataType dtype); diff --git a/tests/python/unittest/test_ansor_relay_integration.py b/tests/python/unittest/test_ansor_relay_integration.py deleted file mode 100644 index 1ad507e2f3715..0000000000000 --- a/tests/python/unittest/test_ansor_relay_integration.py +++ /dev/null @@ -1,114 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" Test Relay Integration """ - -import tempfile -import numpy as np - -import tvm -from tvm import ansor, relay -import tvm.contrib.graph_runtime as runtime -from tvm.relay.testing import dqn - -def test_tune_dense_graph(): - def dense_graph(N, dtype="float32"): - ori_data = relay.var("data", shape=(N, N), dtype=dtype) - weight = relay.var("weight", shape=(N, N), dtype=dtype) - data = relay.multiply(ori_data, relay.const(2, dtype=dtype)) - dense = relay.nn.dense(data, weight, out_dtype=dtype) - dense = relay.add(dense, weight) - dense = relay.nn.dense(dense, weight, out_dtype=dtype) - return ori_data, weight, dense - - N = 128 - data, weight, dense = dense_graph(N) - mod = relay.Function([data, weight], dense) - mod = tvm.IRModule.from_expr(mod) - - ctx = tvm.context("llvm") - target = tvm.target.create("llvm") - d = tvm.nd.array(np.random.uniform(size=(N, N)).astype(data.type_annotation.dtype), ctx) - w = tvm.nd.array(np.random.uniform(size=(N, N)).astype(weight.type_annotation.dtype), ctx) - wkl_keys, wkl_weights = ansor.extract_from_program(mod, {}, target=target) - - assert len(wkl_keys) == 2 - assert len(wkl_weights) == 2 - - tasks = [] - for wkl_key in wkl_keys: - dag = ansor.workload_key_to_dag(wkl_key) - tasks.append(ansor.SearchTask(dag, wkl_key, target)) - - tuner = ansor.SimpleTaskScheduler(tasks) - measure_ctx = ansor.LocalRPCMeasureContext() - with tempfile.NamedTemporaryFile() as fp: - tuner.tune(ansor.TuneOption(n_trials=2, runner=measure_ctx.runner, - measure_callbacks=[ansor.LogToFile(fp.name)])) - with ansor.apply_history_best(fp.name): - with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): - graph, lib, opt_params = relay.build_module.build( - mod, target=target) - - m = runtime.create(graph, lib, ctx) - m.set_input('data', d) - m.set_input('weight', w) - m.run() - res = m.get_output(0) - - del measure_ctx - - d = d.asnumpy() - d = d * 2 - w = w.asnumpy() - d = np.dot(d, np.transpose(w)) - d = d + w - d = np.dot(d, np.transpose(w)) - - tvm.testing.assert_allclose(res.asnumpy(), d, rtol=1e-5) - - -def test_tune_dqn(): - mod, params = dqn.get_workload(1, image_shape=(84, 84, 4), layout='NHWC') - target = tvm.target.create('llvm') - - wkl_keys, wkl_weights = ansor.extract_from_program(mod, params, target) - - tasks = [] - for wkl_key in wkl_keys: - dag = ansor.workload_key_to_dag(wkl_key) - tasks.append(ansor.SearchTask(dag, wkl_key, target)) - - assert len(tasks) == 5 - - tuner = ansor.SimpleTaskScheduler(tasks) - measure_ctx = ansor.LocalRPCMeasureContext() - with tempfile.NamedTemporaryFile() as fp: - tuner.tune(ansor.TuneOption(n_trials=len(tasks), runner=measure_ctx.runner, - measure_callbacks=[ansor.LogToFile('tmp.json')]), - search_policy='sketch.random') - with ansor.apply_history_best('tmp.json'): - ansor.prepare_layout_rewrite(mod, params, target) - with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}): - graph, lib, opt_params = relay.build_module.build(mod, target=target) - ansor.finish_layout_rewrite() - - del measure_ctx - -if __name__ == "__main__": - test_tune_dense_graph() - test_tune_dqn() - diff --git a/tests/python/unittest/test_ansor_search_policy.py b/tests/python/unittest/test_ansor_search_policy.py deleted file mode 100644 index deff561a4547d..0000000000000 --- a/tests/python/unittest/test_ansor_search_policy.py +++ /dev/null @@ -1,168 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Test search policy""" - -import random -import numpy as np -import tempfile -import threading - -import tvm -from tvm import ansor - -from test_ansor_common import matmul_ansor_test - -def search_common(target="llvm", seed=random.randint(1, 1 << 30), runner='local', - cost_model=ansor.RandomModel(), n_trials=2, params=None, - pre_search_callbacks=None): - print("Test %s schedule search with the default search policy" % (target)) - - random.seed(seed) - N = 128 - workload_key = ansor.make_workload_key_func(matmul_ansor_test, (N, N, N)) - dag = ansor.workload_key_to_dag(workload_key) - target = tvm.target.create(target) - task = ansor.SearchTask(dag, workload_key, target) - - with tempfile.NamedTemporaryFile() as fp: - log_file = fp.name - - search_policy = ansor.SketchSearchPolicy(cost_model, params=params, seed=seed) - tune_option = ansor.TuneOption(n_trials=n_trials, runner=runner, - measure_callbacks=[ansor.LogToFile(log_file)], - pre_search_callbacks=pre_search_callbacks) - sch, args = ansor.auto_schedule(task, search_policy=search_policy, - tune_option=tune_option) - inp, res = ansor.best_measure_pair_in_file(log_file, workload_key, target) - - print("==== Python Code ====") - print(dag.print_python_code_from_state(inp.state)) - - try: - print("==== Lowered Stmt ====") - print(tvm.lower(sch, args, simple_mode=True)) - mod = tvm.build(sch, args, target) - - ctx = tvm.context(str(target), 0) - dtype = dag.tensors[0].dtype - a = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx) - c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx) - mod(a, b, c) - tvm.testing.assert_allclose(c.asnumpy(), np.dot( - a.asnumpy(), b.asnumpy()), rtol=1e-5) - print("==== Verification passed ====") - except Exception: - raise Exception("Error encountered with seed: %d" % (seed)) - print() - - -def test_search_basic(): - # wrap the search in a new thread to avoid the conflict - # between python's multiprocessing and tvm's thread pool - t = threading.Thread(target=search_common, kwargs={'seed': 944563397}) - t.start() - t.join() - - -def test_search_xgb_model_rpc_runner(): - measure_ctx = ansor.LocalRPCMeasureContext() - search_common(seed=456787236, cost_model=ansor.XGBModel(), - runner=measure_ctx.runner) - - -def test_search_opencl(): - if tvm.context("opencl", 0).exist: - measure_ctx = ansor.LocalRPCMeasureContext() - search_common("opencl", 380344973, measure_ctx.runner) - else: - print("OpenCL device not found, skip this test.") - - -def test_search_cuda(): - if tvm.context("cuda", 0).exist: - measure_ctx = ansor.LocalRPCMeasureContext() - search_common("cuda", 903667810, measure_ctx.runner) - else: - print("CUDA device not found, skip this test.") - - -def test_search_custom_sketch_rule(): - def meet_condition_func(meta_policy, state, stage_id): - # Apply and Skip the Rest if this function does not return - pass - - # Expecting: - # i.0 - # i.1 - # i.2 - # j.0 - # j.1 - # ax0 - # ax1 - # B.global - # j.2 - # k - # C - def apply_func1(meta_policy, state, stage_id): - # Stage by stage way - ret = [] - if stage_id == 2: - state = ansor.loop_state.State(state, meta_policy.cur_task.compute_dag) - state.split(2, state.stages[2].iters[0], [4, 4]) - state.split(2, state.stages[2].iters[3], [4, 4]) - ret.append([state.state_object, stage_id - 1]) - elif stage_id == 1: - state = ansor.loop_state.State(state, meta_policy.cur_task.compute_dag) - state.cache_read(1, "global", [2]) - state.compute_at(2, 3, state.stages[3].iters[4]) - ret.append([state.state_object, stage_id - 1]) - else: - ret.append([state, stage_id - 1]) - return ret - - def apply_func2(meta_policy, state, stage_id): - # More template like way - ret = [] - state = ansor.loop_state.State(state, meta_policy.cur_task.compute_dag) - - state.split(2, state.stages[2].iters[0], [4, 4]) - state.split(2, state.stages[2].iters[3], [4, 4]) - state.cache_read(1, "global", [2]) - state.compute_at(2, 3, state.stages[3].iters[4]) - - ret.append([state.state_object, -1]) - return ret - - measure_ctx = ansor.LocalRPCMeasureContext() - search_common(seed=887823438, runner=measure_ctx.runner, - pre_search_callbacks=[ansor.PreloadCustomSketchRule( - meet_condition_func, apply_func1)], - params={'disable_change_compute_location': 1}) - search_common(seed=887823438, runner=measure_ctx.runner, - pre_search_callbacks=[ansor.PreloadCustomSketchRule( - meet_condition_func, apply_func2)], - params={'disable_change_compute_location': 1}) - - -if __name__ == "__main__": - test_search_basic() - test_search_xgb_model_rpc_runner() - test_search_opencl() - test_search_cuda() - test_search_custom_sketch_rule() diff --git a/tests/python/unittest/test_ansor_task_scheduler.py b/tests/python/unittest/test_ansor_task_scheduler.py deleted file mode 100644 index 53cf2059c1f3f..0000000000000 --- a/tests/python/unittest/test_ansor_task_scheduler.py +++ /dev/null @@ -1,52 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -"""Test the task scheduler """ - -import threading - -import tvm -from tvm import ansor - -from test_ansor_common import matmul_ansor_test - -def test_task_scheduler_basic(): - N = 128 - A, B, C = matmul_ansor_test(N, N, N) - dag = ansor.ComputeDAG([A, B, C]) - tgt = tvm.target.create("llvm") - task1 = ansor.SearchTask(dag, "test", tgt) - task2 = ansor.SearchTask(dag, "test", tgt) - - def basic_test_func(task1, task2): - def objective(costs): - return sum(costs) - - task_scheduler = ansor.SimpleTaskScheduler([task1, task2], objective) - tune_option = ansor.TuneOption(n_trials=3, runner='local') - task_scheduler.tune(tune_option) - - # Ansor search process with local runner has some modification on thread - # binding, wrap this to a subprocess to eliminate the impacts to other tests - t = threading.Thread(target=basic_test_func, - kwargs={'task1': task1, 'task2': task2}) - t.start() - t.join() - - -if __name__ == "__main__": - test_task_scheduler_basic() diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h index 7dd782f5b6228..e0e4556678894 100644 --- a/topi/include/topi/transform.h +++ b/topi/include/topi/transform.h @@ -1295,75 +1295,6 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout, name, tag); } -/*! - * \brief utility function for kernel_layout_transform - */ -inline void parse_kernel_layout(const String& layout, - Array* shape, - std::vector* axes) { - int32_t factor = 0; - std::string axis = ""; - for (char c : std::string(layout)) { - if (c >= 'A' && c <= 'z') { - axis += c; - if (factor != 0) { - shape->push_back(factor); - factor = 0; - } - } else if (c >= '0' && c <= '9') { - factor = factor * 10 + c - '0'; - if (!axis.empty()) { - axes->push_back(axis); - axis = ""; - } - } else { - LOG(FATAL) << "Invalid layout " << layout; - } - } - if (!axis.empty()) { - axes->push_back(axis); - } -} - -/*! - * \brief Transform the kernel layout according to \p src_layout and \p dst_layout - * \param src the source input. - * \param src_layout the source layout. - * \param dst_layout the destination layout. - * \param name output tensor name. - * \param tag output tensor tag. - * \return A tensor with shape in \p dst_layout - */ -inline Tensor kernel_layout_transform(const Tensor& src, - const String& src_layout, - const String& dst_layout, - const String name = "T_kernel_layout_trans", - const String tag = kInjective) { - Array src_shape; - std::vector src_axes; - Array dst_shape; - std::vector dst_axes; - - parse_kernel_layout(src_layout, &src_shape, &src_axes); - parse_kernel_layout(dst_layout, &dst_shape, &dst_axes); - return compute( - dst_shape, [&](const Array& dst_indices) { - Array dst_indices_expr(dst_indices.begin(), dst_indices.end()); - Array src_indices; - for (const std::string& src_axis : src_axes) { - PrimExpr src_index = 0; - CHECK_EQ(dst_indices_expr.size(), dst_axes.size()); - for (size_t i = 0; i < dst_axes.size(); ++i) { - if (dst_axes[i] == src_axis) { - src_index = src_index * dst_shape[i] + dst_indices_expr[i]; - } - } - src_indices.push_back(src_index); - } - return src(src_indices); - }, name, tag); -} - /*! * \brief Get the shape of input tensor. * \param src the input tensor. diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py index 6800129c12aab..4c7941b49692c 100644 --- a/topi/python/topi/nn/conv2d.py +++ b/topi/python/topi/nn/conv2d.py @@ -20,7 +20,7 @@ from __future__ import absolute_import as _abs from collections import namedtuple import tvm -from tvm import te, ansor +from tvm import te from .pad import pad from .util import get_pad_tuple @@ -342,37 +342,7 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): dilation_h, dilation_w = dilation batch, in_height, in_width, in_channel = Input.shape - if ansor.GLOBAL_SCOPE.topi_in_compute_rewrite_mode: - # infer shape for the rewritten layout - if len(Filter.shape) >= 10: - # For cpu tile structure SSRSRS - base = len(Filter.shape) - 10 - kernel_h = Filter.shape[2 + base] * Filter.shape[6 + base] - kernel_w = Filter.shape[3 + base] * Filter.shape[7 + base] - channel = Filter.shape[4 + base] * Filter.shape[8 + base] - num_filter = Filter.shape[5 + base] * Filter.shape[9 + base] - for i in range(base + 2): - num_filter *= Filter.shape[i] - elif len(Filter.shape) == 6: - # For cpu tile structure SRS - num_filter = Filter.shape[0] * Filter.shape[1] * Filter.shape[5] - kernel_h = Filter.shape[2] - kernel_w = Filter.shape[3] - channel = Filter.shape[4] - elif len(Filter.shape) == 5: - # For cpu tile structure SRS - num_filter = Filter.shape[0] * Filter.shape[4] - kernel_h = Filter.shape[1] - kernel_w = Filter.shape[2] - channel = Filter.shape[3] - elif len(Filter.shape) == 4: - num_filter, kernel_h, kernel_w, channel = Filter.shape - else: - raise ValueError("Don't know how to infer layout for filter shape: %s. " \ - "You can add a new branch for it to fix this." % str(Filter)) - else: - kernel_h, kernel_w, channel, num_filter = Filter.shape - + kernel_h, kernel_w, channel, num_filter = Filter.shape # compute the output shape dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 @@ -392,9 +362,8 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'): lambda nn, yy, xx, ff: te.sum( PaddedInput[nn, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc].astype(out_dtype) * - Filter[ry, rx, rc, ff].astype(out_dtype) - , axis=[ry, rx, rc]), - name="Conv2dOutput", tag="conv2d_nhwc", attrs={"layout_free_placeholders": [Filter]}) + Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]), + name="Conv2dOutput", tag="conv2d_nhwc") return Output diff --git a/tutorials/ansor/README.txt b/tutorials/ansor/README.txt deleted file mode 100644 index 85b6ba401daec..0000000000000 --- a/tutorials/ansor/README.txt +++ /dev/null @@ -1,4 +0,0 @@ -.. _tutorial-ansor-auto-schedule: - -Ansor: Template Free Auto Scheduling ------------------------------------- diff --git a/tutorials/ansor/tune_conv2d_cuda.py b/tutorials/ansor/tune_conv2d_cuda.py deleted file mode 100644 index 03f1b24a768ee..0000000000000 --- a/tutorials/ansor/tune_conv2d_cuda.py +++ /dev/null @@ -1,179 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Auto-scheduling High Performance Convolution on NVIDIA GPUs -=========================================================== -**Author**: `Lianmin Zheng `_, \ - `Chengfan Jia `_, \ - `Minmin Sun `_, \ - `Zhao Wu `_ - -This is an tutorial for searching high performance schedule for NVIDIA GPU using -Ansor auto-scheduler. By running Ansor on this template, we can outperform the -vendor provided library CuDNN in many cases. -""" - -###################################################################### -# Install dependencies -# -------------------- -# To use autotvm package in tvm, we need to install some extra dependencies. -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost tornado -# -# To make TVM run faster in tuning, it is recommended to use cython -# as FFI of tvm. In the root directory of tvm, execute -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Import packages. - -import random -import sys - -import numpy as np -import tvm -import topi -from topi.testing import conv2d_nchw_python -from tvm import te - -# the module is called `ansor` -from tvm import ansor - -###################################################################### -# Step 1: Define the search task -# ------------------------------- -# There are plenty of useful schedule primitives in tvm. You can also find -# some tutorials that describe them in more details, such as -# (1). :ref:`opt-conv-gpu` -# (2). `Optimizing DepthwiseConv on NVIDIA GPU `_ -# -# It's usually a hard job if one wants to get a high performance schedule for a -# specific workload. Even writing an AutoTVM tunable template needs user to have -# expertises on how each schedule primitive works as well as how they finally -# reflect on the hardward architecture. -# -# However, with Ansor this will be quite simple. Firstly, define the target workload. -# Both :code:`tvm.te` API or topi op API are fine to be used. -# -# We can use the retuned :code:`Tensors` to create a ComputeDAG just like what we do -# in :ref:`ansor-simple-subgraph`, while the way to use workload registry is more -# recommended. - -# Use an extra function decorator to regist this workload -@ansor.register_workload_func -def conv2d_nchw(N, H, W, CO, CI, KH, KW, stride, padding): - data = te.placeholder((N, CI, H, W), name='data') - kernel = te.placeholder((CO, CI, KH, KW), name='kernel') - conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32') - - return [data, kernel, conv] - -###################################################################### -# Step 2: Search through the schedule space -# ------------------------------------------ -# We pick the last layer on resnet as test case. -# Since our space is very large, :code:`XGBModel` is most suitable -# for our case. Here we only do 20 trials for demonstration. -# In practice, making 1000 trials usually can find some good kernels -# for this workload. - -tgt = tvm.target.cuda() - -# The last layer in resnet -N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1) -# Generate workload key with the ansor API -wkl_key = ansor.make_workload_key_func(conv2d_nchw, (N, H, W, CO, CI, KH, KW, strides, padding)) -# Generate ComputeDAG using the workload key -dag = ansor.workload_key_to_dag(wkl_key) -task = ansor.SearchTask(dag, wkl_key, target=tgt) - -log_file = "conv2d_nchw.json" -seed = 0 -random.seed(seed) -cost_model = ansor.XGBModel(seed=seed) -search_policy = ansor.SketchSearchPolicy(cost_model, seed=seed) - -######################################################################### -# The :code:`ansor.LocalRPCMeasureContext` is used to create a RPC runner environment. -# -# Use local gpu, measure 10 times for every schedule to reduce variance. The timeout -# for each running is set to 4 seconds. -# -# During the searching process, we may generate several invalid schedules and they -# will be filtered out. It's fine to see "Encountered errors during feature extraction." -# in the tuning logs. -# :code:`ansor.LogToFile` callback will log the tuning results into a -# log file, which can be used to get the best config later. -# :code:`ansor.PreloadMeasuredStates` callback will load measured states -# from history log before schedule search, we can add this callback to make -# sure a same schedule will never be measured for multiple times. - -measure_ctx = ansor.LocalRPCMeasureContext(repeat=3, min_repeat_ms=100, timeout=4) -tune_option = ansor.TuneOption(n_trials=20, - runner=measure_ctx.runner, - measure_callbacks=[ansor.LogToFile(log_file)], - pre_search_callbacks=[ansor.PreloadMeasuredStates(log_file)]) -s, arg_bufs = ansor.auto_schedule(task, search_policy=search_policy, tune_option=tune_option) - -print("==== Get Lowered Stmt ====") -print(tvm.lower(s, arg_bufs, simple_mode=True)) - -# Release the RPC runner environment -del measure_ctx - -######################################################################### -# From the example lower result showed above, we can see that Ansor has tried -# techniques such as `Shared Memory Cooperative Fetching`, `Kernel Fusion`, -# `Axis unroll`, `Axis Vectorize` and so on. There is no need for users to care -# about the details, and Ansor will catch them well. -# -# Finally we can directly use the returned result to get the generated schedule, -# while in the following tutorial we'll show how to inspect the best config from -# log file, check correctness, and measure running time. - -# Get history best from log file -inp, res = ansor.best_measure_pair_in_file(log_file) -# Get the task ComputeDAG from log result -dag = ansor.workload_key_to_dag(inp.task.workload_key) -# Apply log result to TVM schedule -s, arg_bufs = dag.apply_steps_from_state(inp.state) -func = tvm.build(s, arg_bufs, target=tgt) - -# check correctness -a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32) -w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32) -c_np = conv2d_nchw_python(a_np, w_np, strides, padding) - -ctx = tvm.gpu() -a_tvm = tvm.nd.array(a_np, ctx=ctx) -w_tvm = tvm.nd.array(w_np, ctx=ctx) -c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx) -func(a_tvm, w_tvm, c_tvm) - -tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2) - -# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise -# and the overhead of kernel launch. You can also use nvprof to validate the result. -evaluator = func.time_evaluator(func.entry_name, ctx, number=400) -print('Time cost of this operator: %f s' % evaluator(a_tvm, w_tvm, c_tvm).mean) - diff --git a/tutorials/ansor/tune_simple_subgraph.py b/tutorials/ansor/tune_simple_subgraph.py deleted file mode 100644 index 00bef82cf855c..0000000000000 --- a/tutorials/ansor/tune_simple_subgraph.py +++ /dev/null @@ -1,193 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _ansor-simple-subgraph: - -Writing compute expression and Using Ansor auto-scheduler -========================================================= -**Author**: `Lianmin Zheng `_, \ - `Chengfan Jia `_, \ - `Minmin Sun `_, \ - `Zhao Wu `_ - -This is an introduction tutorial to the auto-scheduler module in TVM. - -There are two steps in auto-scheduling. -The first step is defining the target task. -The second step is running a search algorithm to auto explore the schedule. -In this tutorial, you can learn how to perform these two steps in TVM. -The whole workflow is illustrated by a matrix multiplication with bias add example. -""" - -###################################################################### -# Install dependencies -# -------------------- -# To use Ansor package in TVM, we need to install some extra dependencies. -# This step (installing xgboost) can be skipped as it doesn't need XGBoost -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user psutil xgboost -# -# To make TVM run faster in tuning, it is recommended to use cython -# as FFI of TVM. In the root directory of TVM, execute -# (change "3" to "2" if you use python2): -# -# .. code-block:: bash -# -# pip3 install --user cython -# sudo make cython3 -# -# Now return to python code. Import packages. - -import random -import sys - -import numpy as np -import tvm -from tvm import te - -# the module is called `ansor` -from tvm import ansor - -###################################################################### -# Step 1: Define the target compute subgraph -# ------------------------------------------- -# In this section, we will write a deterministic TVM compute expression code -# to a compute subgraph. -# -# .. note:: Comparing to :ref:`tutorials-autotvm-sec` -# -# In Ansor, we do not need users to provide a schedule template, the only input -# is the compute expression writing by :code:`tvm.te` API or topi op API. -# -# Here is how we implement a matrix multiplication subgraph in TVM. - -# Matmul with bias add -def matmul_add(N, L, M, dtype): - A = te.placeholder((N, L), name='A', dtype=dtype) - B = te.placeholder((L, M), name='B', dtype=dtype) - C = te.placeholder((N, M), name='C', dtype=dtype) - - k = te.reduce_axis((0, L), name='k') - mul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), - name='Mul') - D = te.compute((N, M), lambda i, j: C[i, j] + mul[i, j], name='D') - - return [A, B, C, D] - -###################################################################### -# Step 2: Search through the schedule space -# ------------------------------------------ -# In step 1, we build the compute subgraph. -# The next step is to pick a cost model as well as a search policy and explore the -# possible schedule. -# -# Auto-scheduler in TVM -# ^^^^^^^^^^^^^^^^^^^^^ -# The job for the Ansor auto-scheduler can be described by following pseudo code -# -# .. code-block:: c -# -# ct = 0 -# while ct < max_number_of_trials: -# auto generate a batch of schedules -# measure this batch of schedules on real hardware and get results -# ct += batch_size -# -# When proposing the next batch of schedules, Ansor can take different cost models to -# guide the schedule generating process. -# -# * :code:`RandomModel`: Generate and take new schedule randomly -# * :code:`XGBModel`: Use XGBoost model to estimate the performance of potential schedules, try to pick schedules with better performance in each step -# -# XGBModel can explore more efficiently and find better schedules. - -################################################################ -# Begin tuning -# ^^^^^^^^^^^^ -# Here we continue our matrix multiplication example. -# -# The :code:`ansor.ComputeDAG` takes the Tensor list as input, and generates -# a dag structure. During which process, :code:`ansor.ComputeDAG` will -# do some analyzes with the target subgraph and the results will be used in -# search policy later. -# -# Then we create the :code:`tvm.target` and a tuning task. - -N, L, M = 128, 128, 128 -A, B, C, D = matmul_add(N, L, M, 'float32') -dag = ansor.ComputeDAG([A, B, C, D]) - -print(dag) -print(dag.access_analyzer) - -tgt = tvm.target.create("llvm") -task = ansor.SearchTask(dag, "test", tgt) - -################################################################ -# Next, we choose random model and create a default search policy: -# :code:`ansor.SketchSearchPolicy`. -# -# We only make 5 trials in this tutorial for demonstration. In practice, -# you can do more trials according to your time budget. -# :code:`ansor.LogToFile` callback will log the tuning results into a -# log file, which can be used to get the best config later. -# :code:`ansor.PreloadMeasuredStates` callback will load measured states -# from history log before schedule search, we can add this callback to make -# sure a same schedule will never be measured for multiple times. - -log_file = "matmul_add.json" - -seed = 0 -random.seed(seed) -cost_model = ansor.RandomModel() -search_policy = ansor.SketchSearchPolicy(cost_model, seed=seed) - -tune_option = ansor.TuneOption(n_trials=5, - measure_callbacks=[ansor.LogToFile(log_file)], - pre_search_callbacks=[ansor.PreloadMeasuredStates(log_file)]) - -################################################################ -# Then just call :code:`ansor.auto_schedule` and Ansor will try to find a high -# performance schedule for the target subgraph automatically. -# -# The returned result will be a :code:`te.schedule` and a list of :code:`te.Tensor`, -# which can be used as the input of :code:`tvm.lower` or :code:`tvm.build`. - -s, arg_bufs = ansor.auto_schedule(task, search_policy=search_policy, - tune_option=tune_option) - -print("==== Get Lowered Stmt ====") -print(tvm.lower(s, arg_bufs, simple_mode=True)) - -######################################################################### -# Check the correctness to make sure we generate a right schedule. - -func = tvm.build(s, arg_bufs) - -# check correctness -a_np = np.random.uniform(size=(N, L)).astype(np.float32) -b_np = np.random.uniform(size=(L, M)).astype(np.float32) -c_np = np.random.uniform(size=(N, M)).astype(np.float32) -d_np = a_np.dot(b_np) + c_np - -d_tvm = tvm.nd.empty(d_np.shape) -func(tvm.nd.array(a_np), tvm.nd.array(b_np), tvm.nd.array(c_np), d_tvm) - -tvm.testing.assert_allclose(d_np, d_tvm.asnumpy(), rtol=1e-2) diff --git a/tutorials/autotvm/README.txt b/tutorials/autotvm/README.txt index 4ad36c000e3c2..38e3b3343f4ea 100644 --- a/tutorials/autotvm/README.txt +++ b/tutorials/autotvm/README.txt @@ -1,4 +1,4 @@ .. _tutorials-autotvm-sec: -AutoTVM: Template Based Auto Tuning ------------------------------------ +Auto tuning +-----------