diff --git a/docs/conf.py b/docs/conf.py
index 5826526d55b02..7ece63bd7aa86 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -198,7 +198,6 @@
      '../tutorials/language',
      '../tutorials/optimize',
      '../tutorials/autotvm',
-     '../tutorials/ansor',
      '../tutorials/dev',
      '../tutorials/topi',
      '../tutorials/deployment',
diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h
index 95476ed61bdd6..750a8a43163c3 100644
--- a/include/tvm/relay/attrs/transform.h
+++ b/include/tvm/relay/attrs/transform.h
@@ -296,19 +296,6 @@ struct LayoutTransformAttrs : public tvm::AttrsNode<LayoutTransformAttrs> {
   }
 };
 
-/*! \brief Attributes for KernelLayoutTransform operator */
-struct KernelLayoutTransformAttrs : public tvm::AttrsNode<KernelLayoutTransformAttrs> {
-  std::string src_layout;
-  std::string dst_layout;
-
-  TVM_DECLARE_ATTRS(KernelLayoutTransformAttrs, "relay.attrs.KernelLayoutTransformAttrs") {
-    TVM_ATTR_FIELD(src_layout)
-        .describe("The source layout of the tensor. (e.g. 1N32C112H112W)");
-    TVM_ATTR_FIELD(dst_layout)
-        .describe("The destination layout of the tensor. (e.g. 1N2C112H112W16c)");
-  }
-};
-
 /*! \brief Attributes for ShapeOf operator */
 struct ShapeOfAttrs : public tvm::AttrsNode<ShapeOfAttrs> {
   DataType dtype;
diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h
index 5f5d9b643633e..1b8b31aee5d10 100644
--- a/include/tvm/relay/transform.h
+++ b/include/tvm/relay/transform.h
@@ -277,20 +277,6 @@ TVM_DLL Pass CanonicalizeOps();
  */
 TVM_DLL Pass AlterOpLayout();
 
-/*!
- * \brief Alternate the layouts of kernels.
- *
- * \return The pass.
- */
-TVM_DLL Pass KernelLayoutTransform();
-
-/*!
- * \brief The reverse of FuseOps.
- *
- * \return The pass.
- */
-TVM_DLL Pass DeFuseOps();
-
 /*!
  * \brief Given a dest layout, this pass transforms the expr such that most of the ops input data
  * layout is changed to the dest layout. In ideal situation, there are only 2 layout transforms, one
diff --git a/python/tvm/ansor/__init__.py b/python/tvm/ansor/__init__.py
index c629c1049a87b..8b8c03a142b34 100644
--- a/python/tvm/ansor/__init__.py
+++ b/python/tvm/ansor/__init__.py
@@ -21,26 +21,14 @@
 from . import measure
 from . import serialization
 from . import loop_state
-from . import auto_schedule
 from . import utils
 from . import feature
-from . import workload_registry
-from . import task_scheduler
 
 # Shortcut
 from .compute_dag import ComputeDAG, LayoutRewriteLevel
-from .auto_schedule import SearchTask, SketchSearchPolicy, TuneOption, HardwareParams, \
-    PreloadMeasuredStates, PreloadCustomSketchRule, auto_schedule
+from .auto_schedule import SearchTask, HardwareParams
 from .measure import MeasureInput, LocalBuilder, LocalRunner, RPCRunner, LocalRPCMeasureContext
-from .cost_model import RandomModel
-from .cost_model.xgb_model import XGBModel
 from .serialization import LogToFile, LogReader, best_measure_pair_in_file, \
     load_from_file, write_measure_records_to_file
 from .workload_registry import register_workload_func, \
     workload_key_to_dag, make_workload_key_func
-from .task_scheduler import TaskScheduler, SimpleTaskScheduler
-from .dispatcher import DispatchContext, ApplyConfig, ApplyHistoryBest as apply_history_best, \
-    FallbackContext
-from .relay_integration import extract_from_program, extract_from_multiple_program, \
-    finish_layout_rewrite, prepare_layout_rewrite, auto_schedule_topi
-from .env import GLOBAL_SCOPE
diff --git a/python/tvm/ansor/auto_schedule.py b/python/tvm/ansor/auto_schedule.py
index a03d9fdacbc2b..41891872b76ec 100644
--- a/python/tvm/ansor/auto_schedule.py
+++ b/python/tvm/ansor/auto_schedule.py
@@ -22,7 +22,6 @@
 import tvm._ffi
 from tvm.runtime import Object
 from .measure import LocalBuilder, LocalRunner
-from .cost_model import RandomModel, XGBModel
 from . import _ffi_api
 
 
@@ -64,206 +63,3 @@ def __init__(self, dag, workload_key, target, target_host=None,
         self.__init_handle_by_constructor__(_ffi_api.SearchTask, dag,
                                             workload_key, target, target_host,
                                             hardware_params)
-
-
-@tvm._ffi.register_object("ansor.SearchPolicy")
-class SearchPolicy(Object):
-    """ The base class for search policy  """
-    def continue_search(self, task, num_measure, verbose, measurer):
-        return _ffi_api.SearchPolicyContinueSearchOneRound(self, task,
-                                                           num_measure, verbose, measurer)
-
-    def set_task(self, task):
-        _ffi_api.SearchPolicySetTask(self, task)
-
-    def set_verbose(self, verbose):
-        _ffi_api.SearchPolicySetVerbose(self, verbose)
-
-    def run_callbacks(self, callbacks):
-        _ffi_api.SearchPolicyRunCallbacks(self, callbacks)
-
-
-@tvm._ffi.register_object("ansor.SketchSearchPolicy")
-class SketchSearchPolicy(SearchPolicy):
-    """  The search policy that searches in a hierarchical search space defined by sketches.
-    The policy randomly samples programs from the space defined by sketches
-    and use evolutionary search to fine-tune them.
-
-    Parameters
-    ----------
-    program_cost_model: CostModel
-        Cost model for programs
-    params: int
-        Parameters of the search policy. See `src/ansor/search_policy/sketch_search_policy.h`
-        to find the definitions. See code below to find the default values
-    seed: int
-        Random seed
-    """
-    def __init__(self,
-                 program_cost_model,
-                 params=None,
-                 seed=None):
-        # set default parameters
-        default_params = {
-            "eps_greedy": 0.05,
-
-            'evolutionary_search_population': 2048,
-            'evolutionary_search_num_iters': 15,
-            "evolutionary_search_mutation_prob": 0.85,
-            "evolutionary_search_use_measured_ratio": 0.2,
-
-            'cpu_multi_level_tiling_structure': 'SSRSRS',
-            'gpu_multi_level_tiling_structure': 'SSSRRSRS',
-
-            'disable_change_compute_location': 0,
-        }
-
-        if params is None:
-            params = default_params
-        else:
-            for key, value in default_params.items():
-                if key not in params:
-                    params[key] = value
-
-        self.__init_handle_by_constructor__(
-            _ffi_api.SketchSearchPolicy, program_cost_model, params,
-            seed or random.randint(1, 1 << 30))
-
-
-@tvm._ffi.register_object("ansor.SearchCallback")
-class SearchCallback(Object):
-    """Callback function before or after search process"""
-    pass
-
-
-@tvm._ffi.register_object("ansor.PreloadMeasuredStates")
-class PreloadMeasuredStates(SearchCallback):
-    """ A SearchCallback to load measured states from the log file for a search policy.
-    This can resume the state of the search policy.
-
-    Parameters
-    ----------
-    filename: str
-    """
-    def __init__(self, filename: str):
-        self.__init_handle_by_constructor__(
-            _ffi_api.PreloadMeasuredStates, filename)
-
-
-@tvm._ffi.register_object("ansor.PreloadCustomSketchRule")
-class PreloadCustomSketchRule(SearchCallback):
-    """
-    A SearchCallback for SketchSearchPolicy that allowing users to add
-    custom sketch rule.
-
-    Notes
-    -----
-    This is an advanced feature. Make sure you're clear how it
-    works and this should only be used in SketchSearchPolicy.
-
-    Parameters
-    ----------
-    meet_condition_func: Function
-        A function with `(policy, state, stage_id) -> int`
-    apply_func: Function
-        A function with `(policy, state, stage_id) -> [[State, int], ...]`
-    """
-    def __init__(self, meet_condition_func, apply_func):
-        self.__init_handle_by_constructor__(
-            _ffi_api.PreloadCustomSketchRule, meet_condition_func, apply_func)
-
-
-@tvm._ffi.register_object("ansor.TuneOption")
-class TuneOption(Object):
-    """ The options for tuning
-
-    Parameters
-    ----------
-    n_trials: int
-      Number of total measurement trials
-    early_stopping: int
-      Stops early the tuning if no improvement after n measurements
-    num_measure_per_iter: int
-      The number of programs to be measured at each iteration
-    verbose: int
-      Verbosity level. 0 means silent.
-    builder: Builder
-      Builder which builds the program
-    runner: Runner
-      Runner which runs the program and measure time costs
-    measure_callbacks: List[MeasureCallback]
-      Callback functions called after each measure
-      Candidates:
-        - ansor.LogToFile
-    pre_search_callbacks: List[SearchCallback]
-      Callback functions called before the search process
-      Candidates:
-        - ansor.PreloadMeasuredStates
-        - ansor.PreloadCustomSketchRule
-    """
-    def __init__(self, n_trials=0, early_stopping=-1, num_measure_per_iter=64,
-                 verbose=1, builder='local', runner='local', measure_callbacks=None,
-                 pre_search_callbacks=None):
-        if isinstance(builder, str):
-            if builder == 'local':
-                builder = LocalBuilder()
-            else:
-                raise ValueError("Invalid builder: " + builder)
-
-        if isinstance(runner, str):
-            if runner == 'local':
-                runner = LocalRunner()
-            else:
-                raise ValueError("Invalid builder: " + runner)
-
-        if measure_callbacks is None:
-            measure_callbacks = []
-
-        if pre_search_callbacks is None:
-            pre_search_callbacks = []
-
-        self.__init_handle_by_constructor__(
-            _ffi_api.TuneOption, n_trials, early_stopping, num_measure_per_iter,
-            verbose, builder, runner, measure_callbacks, pre_search_callbacks)
-
-
-def auto_schedule(workload, target=None,
-                  target_host=None, search_policy='default',
-                  hardware_params=None, tune_option=None):
-    """ Do auto scheduling for a computation declaration.
-
-    The workload parameter can be a `string` as workload_key, or directly
-    passing a `SearchTask` as input.
-
-    Parameters
-    ----------
-    workload : Union[SearchTask, str]
-    target : Target
-    target_host : Target = None
-    search_policy : Union[SearchPolicy, str]
-    hardware_params : HardwareParams
-    tune_option : TuneOption
-
-    Returns
-    -------
-    sch : tvm.Schedule
-    tensors : List[Tensor]
-    """
-    if isinstance(search_policy, str):
-        if search_policy == 'default':
-            search_policy = SketchSearchPolicy(RandomModel())
-        else:
-            raise ValueError("Invalid search policy: " + search_policy)
-
-    if tune_option is None:
-        tune_option = TuneOption(n_trials=0)
-
-    if isinstance(workload, str):
-        sch, tensors = _ffi_api.AutoScheduleByWorkloadKey(
-            workload, target, target_host, search_policy, hardware_params, tune_option)
-        return sch, tensors
-    elif isinstance(workload, SearchTask):
-        sch, tensors = _ffi_api.AutoScheduleBySearchTask(workload, search_policy, tune_option)
-        return sch, tensors
-    else:
-        raise ValueError("Invalid workload: " + workload + ". Expect a string or SearchTask")
diff --git a/python/tvm/ansor/cost_model/__init__.py b/python/tvm/ansor/cost_model/__init__.py
deleted file mode 100644
index 56e4a5f9128b3..0000000000000
--- a/python/tvm/ansor/cost_model/__init__.py
+++ /dev/null
@@ -1,21 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-import, redefined-builtin
-""" Cost model that estimates the performance of programs """
-
-from .cost_model import RandomModel
-from .xgb_model import XGBModel
diff --git a/python/tvm/ansor/cost_model/cost_model.py b/python/tvm/ansor/cost_model/cost_model.py
deleted file mode 100644
index 57cc53853b2e1..0000000000000
--- a/python/tvm/ansor/cost_model/cost_model.py
+++ /dev/null
@@ -1,78 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" Cost model that estimates the performance of programs """
-import ctypes
-import numpy as np
-
-import tvm._ffi
-from tvm.runtime import Object
-from .. import _ffi_api
-
-
-@tvm._ffi.register_object("ansor.CostModel")
-class CostModel(Object):
-    """The base class for cost model"""
-    pass
-
-
-@tvm._ffi.register_object("ansor.RandomModel")
-class RandomModel(Object):
-    """A model returns random estimation for all inputs"""
-    def __init__(self):
-        self.__init_handle_by_constructor__(_ffi_api.RandomModel)
-
-
-@tvm._ffi.register_func("ansor.cost_model.random_number")
-def random_number(n, return_ptr):
-    """ A random number generator func for c++'s RandomModel """
-    if n == 0:
-        return
-    return_ptr = ctypes.cast(return_ptr, ctypes.POINTER(ctypes.c_float))
-    array_wrapper = np.ctypeslib.as_array(return_ptr, shape=(n,))
-    array_wrapper[:] = np.random.uniform(0, 1, (n,))
-
-
-@tvm._ffi.register_object("ansor.PythonBasedModel")
-class PythonBasedModel(CostModel):
-    """Base class for cost models implemented in python"""
-    def __init__(self):
-        def update_func(inputs, results):
-            self.update(inputs, results)
-
-        def predict_func(task, states, return_ptr):
-            return_ptr = ctypes.cast(return_ptr, ctypes.POINTER(ctypes.c_float))
-            array_wrapper = np.ctypeslib.as_array(return_ptr, shape=(len(states),))
-            array_wrapper[:] = self.predict(task, states)
-
-        def predict_stage_func(task, states, return_ptr):
-            ret = self.predict_stages(task, states)
-            return_ptr = ctypes.cast(return_ptr, ctypes.POINTER(ctypes.c_float))
-            array_wrapper = np.ctypeslib.as_array(return_ptr, shape=ret.shape)
-            array_wrapper[:] = ret
-
-        self.__init_handle_by_constructor__(_ffi_api.PythonBasedModel, update_func,
-                                            predict_func, predict_stage_func)
-
-    def update(self, inputs, results):
-        raise NotImplementedError
-
-    def predict(self, task, states):
-        raise NotImplementedError
-
-    def predict_stages(self, task, states):
-        raise NotImplementedError
diff --git a/python/tvm/ansor/cost_model/xgb_model.py b/python/tvm/ansor/cost_model/xgb_model.py
deleted file mode 100644
index 42af17daae2c6..0000000000000
--- a/python/tvm/ansor/cost_model/xgb_model.py
+++ /dev/null
@@ -1,474 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Cost model based on xgboost"""
-import multiprocessing
-import logging
-from collections import defaultdict
-
-import numpy as np
-import xgboost as xgb
-
-from tvm.autotvm.tuner.xgboost_cost_model import get_rank, recall_curve, max_curve
-from .cost_model import PythonBasedModel
-from ..feature import get_per_stmt_features_from_measure_pairs, get_per_stmt_features_from_states
-from ..serialization import LogReader
-
-logger = logging.getLogger('ansor')
-
-class XGBDMatrixContext:
-    """Context to hold additional attributes of xgb.DMatrix"""
-    def __init__(self):
-        self.context_dict = defaultdict(dict)
-
-    def get(self, key, matrix, default=None):
-        return self.context_dict[key].get(matrix.handle.value, default)
-
-    def put(self, key, matrix, value):
-        self.context_dict[key][matrix.handle.value] = value
-
-dmatrix_context = XGBDMatrixContext()
-
-class XGBModel(PythonBasedModel):
-    """Train a XGBoost model to predict the runtime cost of a program.
-    The cost of a program = the sum of the costs of all stages in this program.
-    i.e. Cost(p) = cost_s0 + cost_s1 + ... + cost_sn, where cost_si is the cost of Stage i
-
-    The xgboost model makes prediction per stage, then we sum them up.
-    The final predction made by this class is normalized throughtput (from 0 to 1, larger is better)
-
-    To support this stage decomposition, we have to implement a custom loss function for
-    XGBoost, which is the `pack_sum` in the code below.
-    """
-    def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None):
-        self.xgb_params = {
-            'max_depth': 10,
-            'gamma': 0.001,
-            'min_child_weight': 0,
-            'eta': 0.2,
-            # todo(lmzheng): automatically decrease learning rate when the loss is too large
-
-            'n_gpus': 0,
-            'nthread': multiprocessing.cpu_count() // 2,
-            'verbosity': 0,
-            'seed': seed or 43,
-            'disable_default_eval_metric': 1
-        }
-        self.bst = None
-        self.plan_size = 32
-        self.num_warmup_sample = num_warmup_sample
-        self.verbose_eval = verbose_eval
-
-        super().__init__()
-
-        # measurement input/result pairs
-        self.inputs = []
-        self.results = []
-        self.inputs_feature_cache = []
-
-    def update(self, inputs, results):
-        if len(inputs) <= 0:
-            return
-
-        self.inputs.extend(inputs)
-        self.results.extend(results)
-
-        # extract feature
-        n_cached = len(self.inputs_feature_cache)
-        features, normalized_throughputs, task_ids = \
-            get_per_stmt_features_from_measure_pairs(self.inputs, self.results,
-                                                     skip_first_n_feature_extraction=n_cached)
-        if n_cached > 0:
-            features = list(features)
-            features[:n_cached] = self.inputs_feature_cache
-            features = np.array(features)
-        self.inputs_feature_cache = features
-        dtrain = pack_sum_xgbmatrix(features, normalized_throughputs,
-                                    task_ids, normalized_throughputs)
-
-        # train xgb model
-        self.bst = xgb.train(self.xgb_params, dtrain,
-                             num_boost_round=10000,
-                             obj=pack_sum_square_error,
-                             callbacks=[custom_callback(
-                                 stopping_rounds=50,
-                                 metric='tr-p-rmse',
-                                 fevals=[
-                                     pack_sum_rmse, pack_sum_average_peak_score(self.plan_size),
-                                 ],
-                                 evals=[(dtrain, 'tr')],
-                                 maximize=False,
-                                 verbose_eval=self.verbose_eval)])
-
-    def predict(self, task, states):
-        features = get_per_stmt_features_from_states(states, task)
-        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
-            dtest, pack_ids = pack_sum_xgbmatrix_for_prediction(features)
-            raw_preds = self.bst.predict(dtest)
-            ret = pack_sum_predict_throughput(raw_preds, pack_ids)
-        else:
-            ret = np.random.uniform(0, 1, (len(states),))
-
-        # Predict 0 for invalid states that failed to be lowered.
-        for idx, feature in enumerate(features):
-            if feature.min() == feature.max() == 0:
-                ret[idx] = float('-inf')
-
-        return ret
-
-    def predict_stages(self, task, states):
-        # Format: (s0 score, ..., sN score, s0 n_stage, s0 stage 0, ..., s1 n_stage, s1 stage 0,)
-        features = get_per_stmt_features_from_states(states, task)
-        if self.bst is not None and len(self.inputs) > self.num_warmup_sample:
-            dtest, pack_ids = pack_sum_xgbmatrix_for_prediction(features)
-            raw_preds = self.bst.predict(dtest)
-            breakdown = pack_sum_predict_throughput(raw_preds, pack_ids)
-            stage_scores = [[] for _ in range(len(states))]
-            for pred, pack_id in zip(raw_preds, pack_ids):
-                stage_scores[pack_id].append(pred)
-            for idx, stage_score in enumerate(stage_scores):
-                breakdown = np.append(breakdown, len(stage_score))
-                breakdown = np.concatenate((breakdown, -np.array(stage_score)))
-        else:
-            breakdown = np.concatenate(
-                (np.random.uniform(0, 1, (len(states), )), np.zeros(len(states), )))
-
-        # Predict 0 for invalid states that failed to be lowered.
-        for idx, feature in enumerate(features):
-            if feature.min() == feature.max() == 0:
-                breakdown[idx] = float('-inf')
-
-        return breakdown
-
-    def load_log_file(self, file_name, n_lines=-1):
-        inputs, results = LogReader(file_name).read_lines(n_lines)
-        logger.info("XGBModel: Loaded %s lines of history log from %s", len(inputs), file_name)
-        self.update(inputs, results)
-
-    def save(self, file_name: str):
-        self.bst.save_model(file_name)
-
-    def load(self, file_name: str):
-        if self.bst is None:
-            self.bst = xgb.Booster(self.xgb_params)
-        self.bst.load_model(file_name)
-        self.num_warmup_sample = -1
-
-
-def pack_sum_xgbmatrix_for_prediction(xs):
-    x_flatten = []
-    pack_ids = []
-
-    for ct, x in enumerate(xs):
-        for row in x:
-            x_flatten.append(row)
-            pack_ids.append(ct)
-
-    return xgb.DMatrix(np.array(x_flatten)), pack_ids
-
-
-def pack_sum_xgbmatrix(xs, ys, gids=None, weights=None):
-    if gids is not None:
-        # sort by group
-        indices = gids.argsort()
-        xs, ys = xs[indices], ys[indices]
-        group_sizes = np.bincount(gids)
-        if weights is not None:
-            weights = weights[indices]
-    else:
-        # assume it has only one group
-        group_sizes = [len(xs)]
-
-    x_flatten = []
-    y_flatten = []
-    weights_flatten = []
-    pack_ids = []
-
-    if weights is not None:
-        for ct, (x, y, w) in enumerate(zip(xs, ys, weights)):
-            for row in x:
-                x_flatten.append(row)
-                y_flatten.append(y)
-                weights_flatten.append(w)
-                pack_ids.append(ct)
-    else:
-        for ct, (x, y) in enumerate(zip(xs, ys)):
-            for row in x:
-                x_flatten.append(row)
-                y_flatten.append(y)
-                pack_ids.append(ct)
-
-    ret = xgb.DMatrix(np.array(x_flatten), y_flatten)
-    if weights is not None:
-        ret.set_weight(weights_flatten)
-    dmatrix_context.put('pack_ids', ret, np.array(pack_ids))
-    dmatrix_context.put('group_sizes', ret, group_sizes)
-    return ret
-
-LOSS_TYPE = 3
-
-# Type 0
-# The model predicts cost. Use square error of throughput as loss
-# loss = 1/2 * (1 / sum(x_i) - y) ^ 2
-#
-# Type 1
-# The model predicts cost. Use square error of cost as loss
-# loss = 1/2 * (sum(x_i) - 1 / y) ^ 2
-#
-# Type 2
-# The model predicts throughput. Use square error of throughput as loss.
-# loss = 1/2 * (1 / sum(1 / x_i) - y) ^ 2
-#
-# Type 3
-# The model predicts throughput. Use square error of throughput as loss.
-# But approximate 1 / (1 / a_1 + 1 / a_2 + ... + 1 / a_n) with -(b_1 + b_2 + b_3)
-# loss = 1/2 * (-sum(x_i) - y) ^ 2
-#
-# Type 4
-# The model predicts throughput. Use square error of throughput as loss.
-# But approximate 1 / (1 / a_1 + 1 / a_2 + ... + 1 / a_n) with -(b_1 + b_2 + b_3)
-# Also add a sigmoid to force the prediction to be within the range of (0, 1)
-# loss = 1/2 * (sigmoid(-sum(x_i)) - y) ^ 2
-#
-
-def pack_sum_predict_throughput(raw_preds, pack_ids):
-    if LOSS_TYPE == 0:
-        sum_pred = np.bincount(pack_ids, weights=raw_preds)
-        return 1 / sum_pred
-    elif LOSS_TYPE == 1:
-        sum_pred = np.bincount(pack_ids, weights=raw_preds)
-        return 1 / sum_pred
-    elif LOSS_TYPE == 2:
-        sum_inverse_preds = np.bincount(pack_ids, weights=1 / raw_preds)
-        return 1 / sum_inverse_preds
-    elif LOSS_TYPE == 3:
-        sum_pred = np.bincount(pack_ids, weights=raw_preds)
-        return - sum_pred # pylint: disable=invalid-unary-operand-type
-    elif LOSS_TYPE == 4:
-        sum_pred = np.bincount(pack_ids, weights=raw_preds)
-        return 1 / (1 + np.exp(sum_pred))
-    else:
-        raise ValueError("Invalid loss type: " + LOSS_TYPE)
-
-def pack_sum_square_error(preds, dtrain):
-    pack_ids = dmatrix_context.get("pack_ids", dtrain)
-    weight = dtrain.get_weight()
-
-    if LOSS_TYPE == 0:
-        sum_pred = np.bincount(pack_ids, weights=preds)
-        x = sum_pred[pack_ids]
-        y = dtrain.get_label()
-        gradient = (x * y - 1) / np.power(x, 3)
-        hessian = (3 - 2 * x * y) / np.power(x, 4)
-    elif LOSS_TYPE == 1:
-        sum_pred = np.bincount(pack_ids, weights=preds)
-        x = sum_pred[pack_ids]
-        y = dtrain.get_label()
-        gradient = x - 1 / np.minimum(y, 1e6)
-        hessian = np.ones_like(gradient)
-    elif LOSS_TYPE == 2:
-        sum_inverse_preds = np.bincount(pack_ids, weights=1 / preds)[pack_ids]
-        y = dtrain.get_label()
-        gradient = (1 / sum_inverse_preds - y) / (np.power(preds * sum_inverse_preds, 2))
-        hessian = (2 * preds * y * np.power(sum_inverse_preds, 2) - 2 * y * sum_inverse_preds - 2 * preds * sum_inverse_preds + 3) / (np.power(preds * sum_inverse_preds, 4))
-    elif LOSS_TYPE == 3:
-        sum_pred = np.bincount(pack_ids, weights=preds)
-        x = sum_pred[pack_ids]
-        y = dtrain.get_label()
-        gradient = x + y
-        hessian = np.ones_like(gradient)
-    elif LOSS_TYPE == 4:
-        sum_pred = np.bincount(pack_ids, weights=preds)
-        exp_x = np.exp(sum_pred[pack_ids])
-        exp_2x = np.power(exp_x, 2)
-        y = dtrain.get_label()
-        gradient = exp_x * (exp_x * y + y - 1) / np.power(exp_x + 1, 3)
-        hessian = exp_x * (-exp_2x * y + 2 * exp_x + y - 1) / np.power(exp_x + 1, 4)
-    else:
-        raise ValueError("Invalid loss type: " + LOSS_TYPE)
-
-    if len(weight) == 0:
-        return gradient, hessian
-    else:
-        return gradient * weight, hessian * weight
-
-def pack_sum_rmse(raw_preds, dtrain):
-    pack_ids = dmatrix_context.get("pack_ids", dtrain)
-    preds = pack_sum_predict_throughput(raw_preds, pack_ids)[pack_ids]
-    return 'p-rmse', np.sqrt(np.mean(np.square((preds - dtrain.get_label()))))
-
-def pack_sum_average_peak_score(N):
-    """Evaluate pack sum average peak score for xgb"""
-
-    def feval(preds, labels):
-        group_sizes = dmatrix_context.get('group_sizes', labels, [len(preds)])
-        pack_ids = dmatrix_context.get("pack_ids", labels)
-
-        preds = pack_sum_predict_throughput(preds, pack_ids)
-        labels = (np.bincount(pack_ids, weights=labels.get_label())
-                  / np.unique(pack_ids, return_counts=True)[1])
-
-        scores = []
-        offset = 0
-        for size in group_sizes:
-            preds_group = preds[offset:offset + size]
-            labels_group = labels[offset:offset + size]
-            offset += size
-
-            trials = np.argsort(preds_group)[::-1][:N]
-            trial_scores = labels_group[trials]
-            curve = max_curve(trial_scores) / np.max(labels_group)
-            scores.append(np.mean(curve))
-        return "a-peak@%d" % N, np.mean(scores)
-    return feval
-
-def pack_sum_average_recall_score(N):
-    """Evaluate average recall score for xgb"""
-
-    def feval(preds, labels):
-        group_sizes = dmatrix_context.get('group_sizes', labels, [len(preds)])
-        pack_ids = dmatrix_context.get("pack_ids", labels)
-
-        preds = pack_sum_predict_throughput(preds, pack_ids)
-        labels = (np.bincount(pack_ids, weights=labels.get_label())
-                  / np.unique(pack_ids, return_counts=True)[1])
-
-        scores = []
-        offset = 0
-        for size in group_sizes:
-            preds_group = preds[offset:offset + size]
-            labels_group = labels[offset:offset + size]
-            offset += size
-
-            trials = np.argsort(preds_group)[::-1]
-            ranks = get_rank(labels_group[trials])[:N]
-            curve = recall_curve(ranks)
-            scores.append(np.mean(curve))
-        return "a-recall@%d" % N, np.mean(scores)
-    return feval
-
-
-def custom_callback(stopping_rounds, metric, fevals, evals=(), log_file=None,
-                    maximize=False, verbose_eval=True, skip_every=2):
-    """Callback function for xgboost to support multiple custom evaluation functions"""
-    from xgboost.core import EarlyStopException
-    from xgboost.callback import _fmt_metric
-    from xgboost.training import aggcv
-
-    state = {}
-    metric_shortname = metric.split("-")[1]
-
-    def init(env):
-        """internal function"""
-        bst = env.model
-
-        state['maximize_score'] = maximize
-        state['best_iteration'] = 0
-        if maximize:
-            state['best_score'] = float('-inf')
-        else:
-            state['best_score'] = float('inf')
-
-        if bst is not None:
-            if bst.attr('best_score') is not None:
-                state['best_score'] = float(bst.attr('best_score'))
-                state['best_iteration'] = int(bst.attr('best_iteration'))
-                state['best_msg'] = bst.attr('best_msg')
-            else:
-                bst.set_attr(best_iteration=str(state['best_iteration']))
-                bst.set_attr(best_score=str(state['best_score']))
-        else:
-            assert env.cvfolds is not None
-
-    def callback(env):
-        """internal function"""
-        if not state:
-            init(env)
-
-        bst = env.model
-        i = env.iteration
-        cvfolds = env.cvfolds
-
-        res_dict = {}
-
-        if i % skip_every == 1:
-            return
-
-        ##### evaluation #####
-        if cvfolds is not None:
-            for feval in fevals:
-                tmp = aggcv([f.eval(i, feval) for f in cvfolds])
-                for k, mean, std in tmp:
-                    res_dict[k] = [mean, std]
-        else:
-            for feval in fevals:
-                bst_eval = bst.eval_set(evals, i, feval)
-                res = [x.split(':') for x in bst_eval.split()]
-                for kv in res[1:]:
-                    res_dict[kv[0]] = [float(kv[1])]
-
-        eval_res = []
-        keys = list(res_dict.keys())
-        keys.sort(key=lambda x: x if metric_shortname not in x else "a" + x)
-        for key in keys:
-            v = res_dict[key]
-            eval_res.append([key] + v)
-
-        ##### print eval result #####
-        if not isinstance(verbose_eval, bool) and verbose_eval and i % verbose_eval == 0:
-            infos = ["XGB iter: %3d" % i]
-            for item in eval_res:
-                if 'null' in item[0]:
-                    continue
-                infos.append("%s: %.6f" % (item[0], item[1]))
-
-            logger.debug("\t".join(infos))
-            if log_file:
-                with open(log_file, "a") as fout:
-                    fout.write("\t".join(infos) + '\n')
-
-        ##### choose score and do early stopping #####
-        score = None
-        for item in eval_res:
-            if item[0] == metric:
-                score = item[1]
-                break
-        assert score is not None
-
-        best_score = state['best_score']
-        best_iteration = state['best_iteration']
-        maximize_score = state['maximize_score']
-        if (maximize_score and score > best_score) or \
-                (not maximize_score and score < best_score):
-            msg = '[%d] %s' % (
-                env.iteration,
-                '\t'.join([_fmt_metric(x) for x in eval_res]))
-            state['best_msg'] = msg
-            state['best_score'] = score
-            state['best_iteration'] = env.iteration
-            # save the property to attributes, so they will occur in checkpoint.
-            if env.model is not None:
-                env.model.set_attr(best_score=str(state['best_score']),
-                                   best_iteration=str(state['best_iteration']),
-                                   best_msg=state['best_msg'])
-        elif env.iteration - best_iteration >= stopping_rounds:
-            best_msg = state['best_msg']
-            if verbose_eval and env.rank == 0:
-                logger.debug("XGB stopped. Best iteration: %s ", best_msg)
-            raise EarlyStopException(best_iteration)
-
-    return callback
diff --git a/python/tvm/ansor/dispatcher.py b/python/tvm/ansor/dispatcher.py
deleted file mode 100644
index 0c07fd141bd2c..0000000000000
--- a/python/tvm/ansor/dispatcher.py
+++ /dev/null
@@ -1,299 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-The global context that dispatches best configurations to workloads
-"""
-# pylint: disable=invalid-name
-
-from __future__ import absolute_import as _abs
-
-import logging
-
-import numpy as np
-
-from tvm.tir.expr import FloatImm
-
-logger = logging.getLogger('auto_scheduler')
-
-
-class DispatchContext(object):
-    """
-    Base class of dispatch context.
-    """
-    current = None 
-
-    def __init__(self):
-        self._old_ctx = DispatchContext.current
-
-    def query(self, target, workload):
-        """
-        Query the context to get the specific config for a workload.
-        If cannot find the result inside this context, this function will query it
-        from the upper contexts.
-
-        Parameters
-        ----------
-        target: Target
-            The current target
-        workload : str
-            The current workload
-
-        Returns
-        -------
-        cfg : State
-            The schedule configuration for the workload
-        """
-        ret = self._query_inside(target, workload)
-        return ret
-
-    def update(self, target, workload, cfg):
-        """
-        Update the config for a workload
-
-        Parameters
-        ----------
-        target: Target
-            The current target
-        workload : Workload
-            The current workload.
-        cfg : State
-            The schedule configuration for the workload
-        """
-        raise NotImplementedError()
-
-    def _query_inside(self, target, workload):
-        """
-        Query the context to get the specific config for a workload.
-        This function only query config inside this context.
-
-        Parameters
-        ----------
-        target: Target
-            The current target
-        workload : Workload
-            The current workload.
-
-        Returns
-        -------
-        cfg : State or str
-            The schedule configuration for the workload
-        """
-        raise NotImplementedError()
-
-    def __enter__(self):
-        self._old_ctx = DispatchContext.current
-        DispatchContext.current = self
-        return self
-
-    def __exit__(self, ptype, value, trace):
-        DispatchContext.current = self._old_ctx
-
-
-class ApplyConfig(DispatchContext):
-    """Apply a deterministic config for all queries.
-
-    Parameters
-    ----------
-    config : State
-        The schedule configuration
-    """
-    def __init__(self, config):
-        super(ApplyConfig, self).__init__()
-        self._config = config
-        self.workload = None
-
-    def _query_inside(self, target, workload):
-        """Override query"""
-        self.workload = workload
-        return self._config
-
-    def update(self, target, workload, cfg):
-        """Override update"""
-        self.workload = workload
-        self._config = cfg
-
-
-class ApplyHistoryBest(DispatchContext):
-    """
-    Apply the history best config
-
-    Parameters
-    ----------
-    records : str or iterator of (MeasureInput, MeasureResult)
-        Collection of tuning records.
-        If is str, then it should be the filename of a records log file.
-                   Each row of this file is an encoded record pair.
-        Otherwise, it is an iterator.
-    n_lines: int (optional)
-        if it is not None, only load the first `n_lines` lines of log
-    """
-    def __init__(self, records, n_lines=None):
-        super(ApplyHistoryBest, self).__init__()
-
-        self.best_by_targetkey = {}
-        self.best_by_model = {}
-        self._best_user_defined = {}
-
-        if records:
-            self.load(records, n_lines)
-
-    def load(self, records, n_lines=None):
-        """Load records to this dispatch context
-
-        Parameters
-        ----------
-        records : str or iterator of (MeasureInput, MeasureResult)
-            Collection of tuning records.
-            If is str, then it should be the filename of a records log file.
-                       Each row of this file is an encoded record pair.
-            Otherwise, it is an iterator.
-        n_lines: int (optional)
-            if it is not None, only load the first `n_lines` lines of log
-        """
-        from pathlib import Path
-        from . import load_from_file
-
-        if isinstance(records, Path):
-            records = str(records)
-
-        if isinstance(records, str):
-            records = load_from_file(records)
-        if not records:
-            return
-
-        best_by_targetkey = self.best_by_targetkey
-        best_by_model = self.best_by_model
-
-        counter = 0
-        for inp, res in records:
-            if n_lines is not None and counter >= n_lines:
-                break
-            counter += 1
-            if res.error_no != 0:
-                continue
-
-            # use target keys in tvm target system as key to build best map
-            for k in inp.task.target.keys:
-                key = (k, inp.task.workload_key)
-                if key not in best_by_targetkey:
-                    best_by_targetkey[key] = (inp, res)
-                else:
-                    _, other_res = best_by_targetkey[key]
-                    other_costs = [x.value for x in other_res.costs if isinstance(x, FloatImm)]
-                    costs = [x.value for x in res.costs if isinstance(x, FloatImm)]
-                    if np.mean(other_costs) > np.mean(costs):
-                        best_by_targetkey[key] = (inp, res)
-
-            # use model as key to build best map
-            key = (inp.task.target.model, inp.task.workload_key)
-            if key not in best_by_model:
-                if inp.task.target.model != 'unknown':
-                    best_by_model[key] = (inp, res)
-            else:
-                _, other_res = best_by_model[key]
-                other_costs = [x.value for x in other_res.costs if isinstance(x, FloatImm)]
-                costs = [x.value for x in res.costs if isinstance(x, FloatImm)]
-                if np.mean(other_costs) > np.mean(costs):
-                    best_by_model[key] = (inp, res)
-
-        logger.debug("Finish loading %d records", counter)
-
-    def _query_inside(self, target, workload):
-        if target is None:
-            raise RuntimeError("Need a target context to find the history best. "
-                               "Hint: If your target is llvm, use `with tvm.target.create('llvm'):`"
-                               " above the dispatcher call. So does other target. ")
-
-        # first try matching by model
-        key = (target.model, workload)
-        if key in self._best_user_defined:
-            return self._best_user_defined[key]
-        if key in self.best_by_model:
-            return self.best_by_model[key][0].state
-
-        # then try matching by target key
-        for k in target.keys:
-            key = (k, workload)
-            if key in self._best_user_defined:
-                return self._best_user_defined[key]
-            if key in self.best_by_targetkey:
-                return self.best_by_targetkey[key][0].state
-
-        return None
-
-    def update(self, target, workload, state):
-        model = target.model
-        key = (model, workload)
-        self._best_user_defined[key] = state
-
-        for k in target.keys:
-            key = (k, workload)
-            self._best_user_defined[key] = state
-
-
-class FallbackContext(DispatchContext):
-    """
-    A fallback dispatch context.
-    This is used as the root context.
-    """
-
-    def __init__(self):
-        super(FallbackContext, self).__init__()
-        self.memory = {}
-        self.silent = False
-
-        # a set to prevent print duplicated message
-        self.messages = set()
-
-    def _query_inside(self, target, workload):
-        key = (str(target), workload)
-        if key in self.memory:
-            return self.memory[key]
-
-        if not self.silent:
-            msg = "Cannot find config for target=%s, workload=%s. A fallback configuration "\
-                  "is used, which may bring great performance regression." % (target, workload)
-            if msg not in self.messages:
-                self.messages.add(msg)
-                logger.warning(msg)
-        cfg = None
-
-        # cache this config to avoid duplicated warning message
-        self.memory[key] = cfg
-        return cfg
-
-    def clear_cache(self, target, workload):
-        """Clear fallback cache. Pass the same argument as _query_inside to this function
-        to clean the cache.
-
-        Parameters
-        ----------
-        target: Target
-            The current target
-        workload : Workload
-            The current workload.
-        """
-        key = (str(target), workload)
-        if key in self.memory:
-            del self.memory[key]
-
-    def update(self, target, workload, cfg):
-        key = (str(target), workload)
-        self.memory[key] = cfg
-
-
-DispatchContext.current = FallbackContext()
diff --git a/python/tvm/ansor/env.py b/python/tvm/ansor/env.py
deleted file mode 100644
index 0f35f92acbbc6..0000000000000
--- a/python/tvm/ansor/env.py
+++ /dev/null
@@ -1,26 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-""" The scope to store global variables in ansor """
-
-
-class AutoschedulerGlobalScope(object):
-    def __init__(self):
-        self.topi_in_compute_rewrite_mode = False
-
-GLOBAL_SCOPE = AutoschedulerGlobalScope()
-
diff --git a/python/tvm/ansor/relay_integration.py b/python/tvm/ansor/relay_integration.py
deleted file mode 100644
index 3c2eabd3dfacb..0000000000000
--- a/python/tvm/ansor/relay_integration.py
+++ /dev/null
@@ -1,240 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-# pylint: disable=unused-variable,invalid-name
-
-"""
-Integrate ansor into relay. It implements the following items:
-1. Extract search tasks from a relay program
-2. Provide auto-scheduling for all TOPI compute functions
-"""
-import os
-import json
-import threading
-
-from tvm import target, te, transform
-from tvm.te.tensor import PlaceholderOp, ComputeOp
-from .dispatcher import DispatchContext
-from .workload_registry import register_workload_bufs, compute_dag_hash
-from .compute_dag import ComputeDAG, LayoutRewriteLevel
-from .env import GLOBAL_SCOPE
-
-def call_all_topi_funcs(mod, target, params):
-    """Call all TOPI compute + schedule to extract tasks in a relay program"""
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-
-    with transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
-        bld_mod = relay.build_module.BuildModule()
-        bld_mod.call_all_topi_funcs(mod, target=target, params=params)
-
-def extract_from_program(mod, params, target, target_host=None):
-    """ Extract tuning tasks from a relay program.
-
-    This function is the single program version of extract_from_multiple_program.
-
-    Parameters
-    ----------
-    mod : relay.Module
-        The module to extract.
-    params: dict of str to numpy array
-        The associated parameters of the program
-    ops: List of relay op
-        List of relay ops to be tuned
-    target: tvm.target.Target
-        The compilation target
-    target_host: tvm.target.Target
-        The host compilation target
-
-    Returns
-    -------
-    workloads: Array of Tuple(wkl_key, target)
-    """
-    return extract_from_multiple_program([mod], [params], target, target_host)
-
-def extract_from_multiple_program(mods, params, target, target_host=None):
-    """ Extract tuning tasks from multiple relay programs.
-
-    Parameters
-    ----------
-    mods : List of relay.Module
-        The modules to extract.
-    params: List of dict of str to numpy array
-        The associated parameters of the programs
-    ops: List of relay op
-        List of relay ops to be tuned
-    target: tvm.target.Target
-        The compilation target
-    target_host: tvm.target.Target
-        The host compilation target
-
-    Returns
-    -------
-    workloads: Array of Tuple(wkl_key, target)
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-
-    env = TracingEnvironment(TracingMode.EXTRACT_TASK)
-    with env:
-        # run compiler to collect all TOPI calls during compilation
-        for mod, param in zip(mods, params):
-            # wrap build call in a new thread to avoid the conflict
-            # between python's multiprocessing and tvm's thread pool
-            build_thread = threading.Thread(target=call_all_topi_funcs,
-                                            args=(mod, target, param))
-        build_thread.start()
-        build_thread.join()
-        relay.backend.compile_engine.get().clear()
-
-    # create tasks for target
-    wkl_keys = []
-    wkl_weights = []
-    for wkl_key, wkl_weight in env.wkl_key_collection.items():
-        wkl_keys.append(wkl_key)
-        wkl_weights.append(wkl_weight)
-
-    return wkl_keys, wkl_weights
-
-
-def prepare_layout_rewrite(mod, params, target):
-    """
-    Prepare for kernel layout rewrite. This function will write layout infos to a global static variable.
-    Then these layout info will be used by a relay pass `kernel_layout_transform`.
-    """
-    # pylint: disable=import-outside-toplevel
-    from tvm import relay
-
-    env = TracingEnvironment(TracingMode.PREPARE_LAYOUT_REWRITE)
-    with env:
-        # wrap build call in a new thread to avoid the conflict
-        # between python's multiprocessing and tvm's thread pool
-        build_thread = threading.Thread(target=call_all_topi_funcs,
-                                        args=(mod, target, params))
-        build_thread.start()
-        build_thread.join()
-        relay.backend.compile_engine.get().clear()
-
-    if env.layout_rewrite_success_ct > 0:
-        GLOBAL_SCOPE.topi_in_compute_rewrite_mode = True
-
-def finish_layout_rewrite():
-    """Clear the global flag for layout rewrite"""
-    GLOBAL_SCOPE.topi_in_compute_rewrite_mode = False
-
-
-class TracingMode:
-    """Two modes for tracing"""
-    EXTRACT_TASK = 0            # trace all topi calls to extract tasks
-    PREPARE_LAYOUT_REWRITE = 1  # trace all topi calls to prepare layout rewrite
-
-class TracingEnvironment:
-    """Global environment for tracing all topi function calls"""
-    current = None
-
-    def __init__(self, tracing_mode):
-        self.tracing_mode = tracing_mode
-        self.relay_disable_build_cache = "false"
-        self.layout_rewrite_success_ct = 0
-        self.wkl_key_collection = {}
-
-    def __enter__(self):
-        self.relay_disable_build_cache = os.environ.get("TVM_RELAY_DISABLE_BUILD_CACHE", "false")
-        os.environ["TVM_RELAY_DISABLE_BUILD_CACHE"] = "true"
-        TracingEnvironment.current = self
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        os.environ["TVM_RELAY_DISABLE_BUILD_CACHE"] = self.relay_disable_build_cache
-        TracingEnvironment.current = None
-
-    def add_workload_key(self, key):
-        """Add the workload key of an Ansor search task
-
-        Parameters
-        ----------
-        key: str
-        """
-        if key in self.wkl_key_collection:
-            self.wkl_key_collection[key] += 1
-        else:
-            self.wkl_key_collection[key] = 1
-
-
-def traverse_to_get_io_tensors(outs):
-    """Traverse from a list of output tensors to get a whole computational DAG"""
-    layout_free_ops = []
-    inputs = []
-
-    visited = set()
-
-    def traverse(t):
-        if t in visited:
-            return
-        if isinstance(t.op, PlaceholderOp):
-            inputs.append(t)
-        elif isinstance(t.op, ComputeOp):
-            if "layout_free_placeholders" in t.op.attrs:
-                layout_free_ops.append(t.op)
-            for x in t.op.input_tensors:
-                traverse(x)
-        visited.add(t)
-
-    for t in outs:
-        traverse(t)
-
-    has_layout_free = (len(layout_free_ops) > 0)
-    return inputs + [t for t in outs], has_layout_free
-
-
-def auto_schedule_topi(outs):
-    """ Use ansor to auto-schedule a topi compute declaration """
-    io_tensors, has_layout_free = traverse_to_get_io_tensors(outs)
-    key = register_workload_bufs(io_tensors)
-
-    env = TracingEnvironment.current
-    if env is None:  # in the final build mode
-        state = DispatchContext.current.query(target.Target.current(), key)
-        if state is None:
-            return te.create_schedule([x.op for x in outs])
-
-        dag = ComputeDAG(io_tensors)
-        # Only update compute body, layout_rewrite_level = LayoutRewriteLevel.COMPUTE_REWRITE,
-        # Since kernel layout has already been rewritten in relay pass
-        schedule, _ = dag.apply_steps_from_state(state,
-             layout_rewrite_level=LayoutRewriteLevel.COMPUTE_REWRITE)
-        return schedule
-    elif env.tracing_mode == TracingMode.EXTRACT_TASK:  # in the task extraction mode
-        env.add_workload_key(key)
-        return te.create_schedule([x.op for x in outs])
-    elif env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE:
-        # in prepare_layout_rewrite mode
-        if has_layout_free:
-            # Rewrite the DAG and update the transform history for
-            # the new dag in DispatchContext
-            dispatch_ctx = DispatchContext.current
-            tgt = target.Target.current()
-            state = dispatch_ctx.query(tgt, key)
-            assert state is not None
-            dag = ComputeDAG(outs)
-            new_dag = dag.rewrite_layout_from_state(state)
-            new_key = json.dumps((compute_dag_hash(new_dag),))
-            dispatch_ctx.update(tgt, new_key, state)
-            if new_key != key:
-                env.layout_rewrite_success_ct += 1
-        return te.create_schedule([x.op for x in outs])
-    else:
-        raise ValueError("Invalid tracing mode: " + env.tracing_mode)
diff --git a/python/tvm/ansor/task_scheduler.py b/python/tvm/ansor/task_scheduler.py
deleted file mode 100644
index 587fe3121e883..0000000000000
--- a/python/tvm/ansor/task_scheduler.py
+++ /dev/null
@@ -1,293 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""TaskScheduler that allocates the time resources when tuning multiple tasks together"""
-from typing import List, Union, Callable
-import time
-
-import numpy as np
-
-from .auto_schedule import SearchTask, SearchPolicy, SketchSearchPolicy, TuneOption
-from .cost_model import RandomModel, XGBModel
-from .measure import ProgramMeasurer
-from .utils import array_mean, to_str_round
-
-
-class TaskScheduler:
-    """Allocate the time resources when tuning multiple tasks together"""
-    def __init__(self,
-                 tasks: List[SearchTask],
-                 objective_func: Callable = None):
-        self.tasks = tasks
-        self.objective_func = objective_func or sum
-
-    def compute_score(self, costs: List[float]) -> float:
-        return self.objective_func(costs)
-
-
-def get_search_policies(search_policy: Union[str, List[SearchPolicy]], tasks: List[SearchTask],
-                        num_measure_per_iter, load_model_file=None, load_log_file=None):
-    if search_policy == 'default':
-        search_policy = 'sketch.xgb'
-
-    if isinstance(search_policy, str):
-        policy_type, model_type = search_policy.split('.')
-        if model_type == 'xgb':
-            cost_model = XGBModel(num_warmup_sample=len(tasks) * num_measure_per_iter)
-            if load_model_file:
-                print("Load pretrained model...")
-                cost_model.load(load_model_file)
-            elif load_log_file:
-                cost_model.load_log_file(load_log_file)
-        elif model_type == 'random':
-            cost_model = RandomModel()
-        else:
-            raise ValueError("Invalid search policy: " + search_policy)
-
-        if policy_type == 'sketch':
-            search_policies = [SketchSearchPolicy(cost_model) for _ in range(len(tasks))]
-        elif policy_type == 'limit-space':
-            search_policies = [SketchSearchPolicy(cost_model,
-                                                  params={'cpu_multi_level_tiling_structure': 'SRS',
-                                                          'disable_change_compute_location': 1})
-                               for _ in range(len(tasks))]
-        elif policy_type == 'beam-search':
-            search_policies = [SketchSearchPolicy(cost_model,
-                                                  params={'use_beam_search': 1})
-                               for _ in range(len(tasks))]
-        else:
-            raise ValueError("Invalid search policy: " + search_policy)
-    else:
-        # check type
-        assert isinstance(search_policy, (tuple, list))
-        for item in search_policy:
-            assert isinstance(item, SearchPolicy)
-        search_policies = search_policy
-
-    return search_policies
-
-
-class SimpleTaskScheduler(TaskScheduler):
-    """The default task scheduler with several strategies
-
-    Parameters
-    ----------
-    tasks: List[SearchTask]
-        All workloads to tune
-    weights: List[float]
-        Weights of tasks   (i.e. the number of occurrence of a task in the whole network)
-    strategy: str
-        The joint tuning strategy.
-        "sequential" : Tune tasks sequentially. Divide n_trials equally to every task.
-        "round-robin": Tune tasks in round robin order.
-        "gradient" : Tune tasks with gradient descent.
-    load_log_file: str
-        Load history log file to pre-train cost model
-    eps-random: float
-        Always allocate this percent of n_trials to select tasks randomly. This is for encouraging exploration.
-    verbose: int
-        The level of verbosity. 0 means silent.
-    alpha: float
-        The parameter used for 'gradient' strategy
-    beta: float
-        The parameter used for 'gradient' strategy
-    backward_window_size: int
-        The parameter used for 'gradient' strategy
-    """
-    def __init__(self,
-                 tasks: List[SearchTask],
-                 objective_func: Callable = None,
-                 strategy: str = 'gradient',
-                 load_log_file: str = None,
-                 load_model_file: str = None,
-                 eps_random: float = 0.05,
-                 verbose: int = 1,
-                 alpha: float = 0.2,
-                 beta: float = 2,
-                 gamma: float = 0.5,
-                 backward_window_size: int = 3,
-                 use_debug_measurement_simulator=None):
-        super().__init__(tasks, objective_func)
-        self.strategy = strategy
-        self.eps_random = eps_random
-        self.verbose = verbose
-        self.load_log_file = load_log_file
-        self.load_model_file = load_model_file
-        self.alpha = alpha
-        self.beta = beta
-        self.gamma = gamma
-        self.backward_window_size = backward_window_size
-        self.use_debug_measurement_simulator = use_debug_measurement_simulator
-
-        assert self.strategy in ['round-robin', 'gradient']
-
-        self.task_cts = []
-        self.task_costs_history = []
-        self.best_costs = self.cur_score = None
-        self.tune_option = self.measurer = self.search_policies = self.ct = self.tic = None
-        self.num_measure_per_iter = None
-        self.dead_tasks = set()
-        self.sequential_now_task_idx = 0
-        self.sequential_now_task_begin_ct = 0
-
-    def tune(self, tune_option: TuneOption, search_policy: Union[str, List[SearchPolicy]] = 'default'):
-        """ Tune tasks.
-
-        Notice: This method does not have return value, make sure to set `LogToFile`
-        measure callback in `tune_option`.
-
-        Parameters
-        ----------
-        tune_option: TuneOption
-        search_policy: Str or List[SearchPolicy]
-        """
-        # init members
-        self.task_cts = [0 for _ in range(len(self.tasks))]
-        self.task_costs_history = [[] for _ in range(len(self.tasks))]
-        self.best_costs = 1e10 * np.ones(len(self.tasks))
-        self.cur_score = self.compute_score(self.best_costs)
-        self.tune_option = tune_option
-        if self.use_debug_measurement_simulator is None:
-            self.measurer = ProgramMeasurer(tune_option.builder, tune_option.runner,
-                                            tune_option.measure_callbacks, tune_option.verbose)
-        self.ct = 0
-        self.tic = time.time()
-        # reset num_measure_per_iter to make sure every task is tuned at least once
-        self.num_measure_per_iter = min(tune_option.num_measure_per_iter,
-                                        tune_option.n_trials // len(self.tasks))
-        self.search_policies = get_search_policies(search_policy, self.tasks,
-                                                   self.num_measure_per_iter,
-                                                   self.load_model_file,
-                                                   self.load_log_file)
-        self.dead_tasks = set()
-        self.sequential_now_task_idx = 0
-        self.sequential_now_task_begin_ct = 0
-
-        for i in range(len(self.tasks)):
-            search_policy = self.search_policies[i]
-            task = self.tasks[i]
-            search_policy.set_task(task)
-            search_policy.set_verbose(tune_option.verbose)
-            search_policy.run_callbacks(tune_option.pre_search_callbacks)
-
-        # do a round robin first
-        if self.strategy != 'sequential':
-            for i in range(len(self.tasks)):
-                self.tune_task(i)
-
-        # use the specific strategy to choose workload to tune
-        task_idx = -1
-        while self.ct < tune_option.n_trials and len(self.dead_tasks) < len(self.tasks):
-            if self.strategy == 'sequential':
-                allocated_total_ct = ((tune_option.n_trials - self.sequential_now_task_begin_ct)
-                                      / (len(self.tasks) - self.sequential_now_task_idx))
-                used_ct = self.ct - self.sequential_now_task_begin_ct
-
-                if self.sequential_now_task_idx in self.dead_tasks or used_ct >= allocated_total_ct:
-                    self.sequential_now_task_idx += 1
-                    self.sequential_now_task_begin_ct = self.ct
-                task_idx = self.sequential_now_task_idx
-                if task_idx >= len(self.tasks):
-                    break
-            elif self.strategy == 'round-robin':
-                task_idx = (task_idx + 1) % len(self.tasks)
-                while task_idx in self.dead_tasks:
-                    task_idx = (task_idx + 1) % len(self.tasks)
-            elif self.strategy == 'gradient':
-                gradients = []
-                for i in range(len(self.tasks)):
-                    if i in self.dead_tasks:
-                        gradients.append(0)
-                        continue
-
-                    # compute gradient from chain rule : (delta f / delta g_i)
-                    delta = 1e-7
-                    new_costs = list(self.best_costs)
-                    new_costs[i] -= delta
-                    chain_grad = (self.compute_score(self.best_costs) - self.compute_score(new_costs)) / delta
-
-                    # compute (g_i(t_i) - g(t_i - \Delta t)) / (\Delta t)
-                    if self.task_cts[i] - 1 - self.backward_window_size >= 0:
-                        backward_grad = (self.task_costs_history[i][self.task_cts[i] - 1]
-                                         - self.task_costs_history[i][self.task_cts[i] - 1 - self.backward_window_size]) \
-                                        / self.backward_window_size
-                    else:
-                        backward_grad = 0
-
-                    # compute (g_i(t_i + \Delta t) - g(t_i)) / (\Delta t)
-                    g_next_1 = self.best_costs[i] - (self.best_costs[i] / self.task_cts[i])
-                    # todo(lmzheng): this needs adding attribute to topi.compute for similarity check
-                    g_next_2 = self.beta * 1e20
-                    g_next = min(g_next_1, g_next_2)
-                    forward_grad = g_next - self.best_costs[i]
-
-                    # combine all grads
-                    grad = chain_grad * (self.alpha * backward_grad + (1 - self.alpha) * forward_grad)
-                    assert grad <= 0
-                    gradients.append(grad)
-
-                if max(gradients) == min(gradients):
-                    task_idx = np.random.choice(len(gradients))
-                else:
-                    task_idx = np.argmin(gradients)
-            else:
-                raise ValueError("Invalid strategy: " + self.strategy)
-
-            if self.verbose >= 1:
-                print("Next tuning task: %d" % task_idx)
-            self.tune_task(task_idx)
-
-    def tune_task(self, task_idx):
-        if self.use_debug_measurement_simulator is not None:
-            measure_inputs, measure_results = \
-                self.use_debug_measurement_simulator.get_next_batch(
-                    self.tasks[task_idx],
-                    self.num_measure_per_iter,
-                )
-        else:
-            measure_inputs, measure_results = \
-                self.search_policies[task_idx].continue_search(
-                    self.tasks[task_idx],
-                    self.num_measure_per_iter,
-                    self.tune_option.verbose,
-                    self.measurer)
-
-        for inp, res in zip(measure_inputs, measure_results):
-            cost = array_mean(res.costs)
-            if cost < self.best_costs[task_idx]:
-                self.best_costs[task_idx] = cost
-
-        if len(measure_inputs) == 0:
-            self.dead_tasks.add(task_idx)
-
-        self.task_cts[task_idx] += 1
-        self.task_costs_history[task_idx].append(self.best_costs[task_idx])
-
-        self.ct += len(measure_inputs)
-        self.cur_score = self.compute_score(self.best_costs)
-
-        if self.verbose >= 1:
-            print(("TaskScheduler\tct: %d\testimated cost (ms): %.3f\ttime elapsed: %.2f\t" +
-                  "best_costs (ms): %s\ttask_ct: %s") %
-                  (self.ct, self.cur_score * 1e3, time.time() - self.tic,
-                   to_str_round(self.best_costs * 1e3, decimal=3),
-                   self.task_cts))
-
-    def remove_dead_task(self, prob):
-        for idx in self.dead_tasks:
-            prob[idx] = 0
-        return prob / prob.sum()
diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py
index 41bd10cabe3ef..d104c1b1c2f8b 100644
--- a/python/tvm/relay/op/_transform.py
+++ b/python/tvm/relay/op/_transform.py
@@ -74,8 +74,6 @@ def compute_strided_set(attrs, inputs, output_type):
 # layout_transform
 _reg.register_injective_schedule("layout_transform")
 _reg.register_pattern("layout_transform", OpPattern.INJECTIVE)
-_reg.register_injective_schedule("kernel_layout_transform")
-_reg.register_pattern("kernel_layout_transform", OpPattern.INJECTIVE)
 
 # argwhere
 @_reg.register_compute("argwhere")
diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py
index 58b9269a4c48c..486d63c36ff0f 100644
--- a/python/tvm/relay/op/op_attrs.py
+++ b/python/tvm/relay/op/op_attrs.py
@@ -261,9 +261,6 @@ class ClipAttrs(Attrs):
 class LayoutTransformAttrs(Attrs):
     """Attributes for transform.layout_transform"""
 
-@tvm._ffi.register_object("relay.attrs.KernelLayoutTransformAttrs")
-class KernelLayoutTransformAttrs(Attrs):
-    """Attributes for transform.kernel_layout_transform"""
 
 @tvm._ffi.register_object("relay.attrs.ShapeOfAttrs")
 class ShapeOfAttrs(Attrs):
diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index 2a0ddd1329b57..b02db416bdc85 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -16,16 +16,14 @@
 # under the License.
 """Definition of x86 operator strategy."""
 # pylint: disable=invalid-name,unused-argument,wildcard-import,unused-wildcard-import
+import logging
 
-import os
+import re
+import topi
 from tvm.te import SpecializedCondition
-from tvm import ansor
 from .generic import *
 from .. import op as _op
 
-# Set the priority level to use the Ansor auto-scheduler
-ansor_plevel = 11
-
 logger = logging.getLogger('strategy')
 
 _NCHWc_matcher = re.compile("^NCHW[0-9]+c$")
@@ -41,7 +39,7 @@ def schedule_injective_cpu(attrs, outs, target):
 def schedule_reduce_cpu(attrs, outs, target):
     """schedule reduction ops for x86"""
     with target:
-        return ansor.auto_schedule_topi(outs)
+        return topi.x86.schedule_reduce(outs)
 
 @schedule_concatenate.register("cpu")
 def schedule_concatenate_cpu(attrs, outs, target):
@@ -53,13 +51,13 @@ def schedule_concatenate_cpu(attrs, outs, target):
 def schedule_pool_cpu(attrs, outs, target):
     """schedule pooling ops for x86"""
     with target:
-        return ansor.auto_schedule_topi(outs)
+        return topi.x86.schedule_pool(outs, attrs.layout)
 
 @schedule_adaptive_pool.register("cpu")
 def schedule_adaptive_pool_cpu(attrs, outs, target):
     """schedule adaptive pooling ops for x86"""
     with target:
-        return ansor.auto_schedule_topi(outs)
+        return topi.x86.schedule_adaptive_pool(outs)
 
 @softmax_strategy.register("cpu")
 def softmax_strategy_cpu(attrs, inputs, out_type, target):
@@ -67,15 +65,15 @@ def softmax_strategy_cpu(attrs, inputs, out_type, target):
     strategy = _op.OpStrategy()
     strategy.add_implementation(
         wrap_compute_softmax(topi.nn.softmax),
-        wrap_topi_schedule(ansor.auto_schedule_topi),
-        name="ansor")
+        wrap_topi_schedule(topi.x86.schedule_softmax),
+        name="softmax.x86")
     return strategy
 
 @schedule_log_softmax.register("cpu")
 def schedule_log_softmax_cpu(attrs, outs, target):
     """schedule log_softmax op for x86"""
     with target:
-        return ansor.auto_schedule_topi(outs)
+        return topi.x86.schedule_softmax(outs)
 
 @conv2d_strategy.register("cpu")
 def conv2d_strategy_cpu(attrs, inputs, out_type, target):
@@ -107,18 +105,18 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
             return conv2d_NCHWc_strategy_cpu(attrs, inputs, out_type, target)
         elif layout == "NHWC":
             assert kernel_layout == "HWIO"
-            #logger.warning("For x86 target, NCHW layout is recommended for conv2d.")
+            logger.warning("For x86 target, NCHW layout is recommended for conv2d.")
             strategy.add_implementation(
                 wrap_compute_conv2d(topi.nn.conv2d_nhwc),
-                wrap_topi_schedule(ansor.auto_schedule_topi),
-                name="ansor")
+                wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc),
+                name="conv2d_nhwc.x86")
         elif layout == "HWCN":
             assert kernel_layout == "HWIO"
-            #logger.warning("conv2d HWCN layout is not optimized for x86.")
+            logger.warning("conv2d HWCN layout is not optimized for x86.")
             strategy.add_implementation(
                 wrap_compute_conv2d(topi.nn.conv2d_hwcn),
-                wrap_topi_schedule(ansor.auto_schedule_topi),
-                name="ansor")
+                wrap_topi_schedule(topi.generic.schedule_conv2d_hwcn),
+                name="conv2d_hwcn.generic")
         else:
             raise RuntimeError("Unsupported conv2d layout {} for x86".format(layout))
     elif is_depthwise_conv2d(data.shape, layout, kernel.shape, kernel_layout, groups):
@@ -145,8 +143,8 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
             logger.warning("depthwise_conv2d NHWC layout is not optimized for x86.")
             strategy.add_implementation(
                 wrap_compute_conv2d(topi.nn.depthwise_conv2d_nhwc),
-                wrap_topi_schedule(ansor.auto_schedule_topi),
-                name="ansor")
+                wrap_topi_schedule(topi.generic.schedule_depthwise_conv2d_nhwc),
+                name="depthwise_conv2d_nhwc.generic")
         else:
             raise RuntimeError("Unsupported depthwise_conv2d layout {}".format(layout))
     else: # group_conv2d
@@ -155,8 +153,8 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target):
             logger.warning("group_conv2d is not optimized for x86.")
             strategy.add_implementation(
                 wrap_compute_conv2d(topi.nn.group_conv2d_nchw, has_groups=True),
-                wrap_topi_schedule(ansor.auto_schedule_topi),
-                name="ansor")
+                wrap_topi_schedule(topi.generic.schedule_group_conv2d_nchw),
+                name="group_conv2d_nchw.generic")
         else:
             raise RuntimeError("Unsupported group_conv2d layout {}".format(layout))
     return strategy
@@ -233,8 +231,8 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target):
                                     name="conv3d_ncdhw.x86")
     elif layout == "NDHWC":
         strategy.add_implementation(wrap_compute_conv3d(topi.x86.conv3d_ndhwc),
-                                    wrap_topi_schedule(ansor.auto_schedule_topi),
-                                    name="ansor")
+                                    wrap_topi_schedule(topi.x86.schedule_conv3d_ndhwc),
+                                    name="conv3d_ndhwc.x86")
     else:
         raise ValueError("Not support this layout {} yet".format(layout))
     return strategy
@@ -253,8 +251,8 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target):
                                     name="conv1d_ncw.x86")
     elif layout == "NWC":
         strategy.add_implementation(wrap_compute_conv1d(topi.nn.conv1d_nwc),
-                                    wrap_topi_schedule(ansor.auto_schedule_topi),
-                                    name="ansor")
+                                    wrap_topi_schedule(topi.x86.schedule_conv1d_nwc),
+                                    name="conv1d_nwc.x86")
     else:
         raise ValueError("Unsupported conv1d layout {}".format(layout))
     return strategy
@@ -263,23 +261,16 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target):
 def dense_strategy_cpu(attrs, inputs, out_type, target):
     """dense x86 strategy"""
     strategy = _op.OpStrategy()
-
-    strategy.add_implementation(wrap_compute_dense(topi.nn.dense),
-                                wrap_topi_schedule(ansor.auto_schedule_topi),
-                                name='ansor',
-                                plevel=ansor_plevel)
-
+    m, _ = inputs[0].shape
     strategy.add_implementation(wrap_compute_dense(topi.x86.dense_nopack),
                                 wrap_topi_schedule(topi.x86.schedule_dense_nopack),
                                 name="dense_nopack.x86",
                                 plevel=10)
-
     if "cblas" in target.libs:
         strategy.add_implementation(wrap_compute_dense(topi.x86.dense_cblas),
                                     wrap_topi_schedule(topi.x86.schedule_dense_cblas),
                                     name="dense_cblas.x86",
                                     plevel=15)
-    m, _ = inputs[0].shape
     with SpecializedCondition(m >= 16):
         # this implementation may not be well-optimized, so use plevel=8 for now.
         strategy.add_implementation(wrap_compute_dense(topi.x86.dense_pack),
@@ -292,12 +283,6 @@ def dense_strategy_cpu(attrs, inputs, out_type, target):
 def batch_matmul_strategy_cpu(attrs, inputs, out_type, target):
     """batch_matmul x86 strategy"""
     strategy = _op.OpStrategy()
-
-    strategy.add_implementation(wrap_compute_dense(topi.nn.batch_matmul),
-                                wrap_topi_schedule(ansor.auto_schedule_topi),
-                                name='ansor',
-                                plevel=ansor_plevel)
-
     strategy.add_implementation(wrap_compute_batch_matmul(topi.x86.batch_matmul),
                                 wrap_topi_schedule(topi.x86.schedule_batch_matmul),
                                 name="batch_matmul.x86",
diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py
index f2fa2b5f5b901..a37226ea4f586 100644
--- a/python/tvm/relay/op/transform.py
+++ b/python/tvm/relay/op/transform.py
@@ -815,27 +815,6 @@ def layout_transform(data, src_layout, dst_layout):
     """
     return _make.layout_transform(data, src_layout, dst_layout)
 
-def kernel_layout_transform(data, src_layout, dst_layout):
-    """Transform the layout of a kernel
-
-    Parameters
-    ----------
-    data : relay.Expr
-        The source tensor to be transformed
-
-    src_layout: str
-        The source layout.  (e.g 1N32C112H112W)
-
-    dst_layout: str
-        The destination layout.  (e.g. 1N2C112H112W16c)
-
-    Returns
-    -------
-    ret : relay.Expr
-        The transformed tensor.
-    """
-    return _make.kernel_layout_transform(data, src_layout, dst_layout)
-
 
 def reverse_reshape(data, newshape):
     """Reshapes the input array where the special values are inferred from
diff --git a/python/tvm/te/tensor.py b/python/tvm/te/tensor.py
index 6539aabaa48f8..7d73bf42ab7d1 100644
--- a/python/tvm/te/tensor.py
+++ b/python/tvm/te/tensor.py
@@ -57,10 +57,8 @@ class Tensor(DataProducer, _expr.ExprOp):
 
     def __call__(self, *indices):
         ndim = self.ndim
-        # After ansor kernel layout rewrite, len(indices) <= ndim,
-        # and the indices will get modified by Ansor during schedule generation. 
-        # if len(indices) != ndim:
-        #     raise ValueError("Need to provide %d index in tensor slice" % ndim)
+        if len(indices) != ndim:
+            raise ValueError("Need to provide %d index in tensor slice" % ndim)
         indices = convert_to_object(indices)
         args = []
         for x in indices:
diff --git a/scripts/common.py b/scripts/common.py
deleted file mode 100644
index 8f4fbec09dd0f..0000000000000
--- a/scripts/common.py
+++ /dev/null
@@ -1,1017 +0,0 @@
-"""Common utility for scripts"""
-import argparse
-import math
-import os
-import re
-import time
-from collections import defaultdict, namedtuple
-from typing import Dict, List, Tuple
-
-import numpy as np
-import matplotlib.pyplot as plt
-
-import topi
-import tvm
-from tvm import te
-from tvm.ansor import (LogReader, make_workload_key_func,
-                       register_workload_func,
-                       write_measure_records_to_file)
-from tvm.contrib import ndk, util
-
-############################################################
-######################  Test Workloads  ####################
-############################################################
-
-@register_workload_func
-def min_mn(M, N):
-    A = te.placeholder((M, N), name='A')
-    B = topi.min(A, axis=1)
-
-    return [A, B]
-
-@register_workload_func
-def argmin_mn(M, N):
-    A = te.placeholder((M, N), name='A')
-    B = topi.argmin(A, axis=1)
-
-    return [A, B]
-
-@register_workload_func
-def softmax_mn(M, N):
-    A = te.placeholder((M, N), name='A')
-    B = topi.nn.softmax(A, axis=1)
-
-    return [A, B]
-
-@register_workload_func
-def norm_bmn(B, M, N):
-    A = te.placeholder((B, M, N), name='A')
-    i = te.reduce_axis((0, M))
-    j = te.reduce_axis((0, N))
-    C = te.compute((B,), lambda b: te.sum(A[b][i][j] * A[b][i][j], axis=[i, j]), name='C')
-    D = te.compute((B,), lambda b: te.sqrt(C[b]), name='D')
-
-    return [A, D]
-
-@register_workload_func
-def add_mn(M, N):
-    A = te.placeholder((M, N), name='A')
-    B = te.placeholder((M, N), name='B')
-    C = te.compute((M, N), lambda i, j: A[i][j] + B[i][j], name='C')
-
-    return [A, B, C]
-
-@register_workload_func
-def matmul_nkkm(N, M, K, in_type='float32', out_type='float32',
-                tensor_core_support=False):
-    A = te.placeholder((N, K), name='A', dtype=in_type)
-    B = te.placeholder((K, M), name='B', dtype=in_type)
-    k = te.reduce_axis((0, K), name='k')
-    if in_type == out_type:
-        if not (in_type == 'float16' and out_type == 'float16'):
-            tensor_core_support = False
-        C = te.compute((N, M),
-                        lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]),
-                        name='C',
-                        attrs={"ansor_tensor_core_support": "True" if tensor_core_support else "False"})
-    else:
-        if not ((in_type == 'float16' and out_type == 'float32') or \
-                (in_type == 'int8' and out_type == 'int32')):
-            tensor_core_support = False
-        C = te.compute((N, M),
-                        lambda i, j: te.sum(A[i][k].astype(out_type) * B[k][j].astype(out_type),
-                                             axis=[k]),
-                        name='C',
-                        attrs={"ansor_tensor_core_support": "True" if tensor_core_support else "False"})
-
-    return [A, B, C]
-
-@register_workload_func
-def dense_layer(batch, in_dim, out_dim):
-    A = te.placeholder((batch, in_dim), name='A')
-    B = te.placeholder((out_dim, in_dim), name='B')
-    k = te.reduce_axis((0, in_dim), name='k')
-    C = te.compute((batch, out_dim), lambda i, j: te.sum(A[i][k] * B[j][k], axis=[k]), name='C')
-
-    return [A, B, C]
-
-@register_workload_func
-def max_pool_2d_nchw(N, C, H, W):
-    data = te.placeholder((N, C, H, W), name='data')
-    out = topi.nn.pool(data, (2, 2), (1, 1), (0, 0, 0, 0), pool_type='max', ceil_mode=True,
-                       layout="NCHW", count_include_pad=True)
-
-    return [data, out]
-
-@register_workload_func
-def add_min_relu(M, N):
-    A = te.placeholder((M, N), name='A')
-    B = te.placeholder((M, N), name='B')
-    C = topi.add(A, B)
-    D = topi.min(C, axis=1)
-    out = topi.nn.relu(D)
-    return [A, B, out]
-
-@register_workload_func
-def conv2d_relu_softmax_min(N, H, W, CI, CO, KH, KW, strides, padding, dilation):
-    data = te.placeholder((N, CI, H, W), name='data')
-    kernel = te.placeholder((CO, CI, KH, KW), name='kernel')
-    conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation)
-    relu = topi.nn.relu(conv)
-    softmax = topi.nn.softmax(relu, axis=1)
-    out = topi.min(softmax, axis=1)
-
-    return [data, kernel, out]
-
-@register_workload_func
-def conv2d_nchw_bias(N, H, W, CI, CO, KH, KW, strides, padding, dilation):
-    data = te.placeholder((N, CI, H, W), name='data')
-    kernel = te.placeholder((CO, CI, KH, KW), name='kernel')
-    bias = te.placeholder((CO, 1, 1), name='bias')
-    conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation)
-    #out = topi.nn.relu(conv)
-    out = topi.add(conv, bias)
-    return [data, kernel, bias, out]
-
-def conv2d_nhwc_without_layout_rewrite(Input, Filter, stride, padding, dilation, out_dtype='float32'):
-    """A copy of `topi.nn.conv2d_nhwc` but without the 'layout_free` attribute.
-    We use this in single op and subgraph evaluation because we don't want to introduce graph level optimization.
-    """
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    batch, in_height, in_width, in_channel = Input.shape
-    if len(Filter.shape) == 10:
-        kernel_h = Filter.shape[2] * Filter.shape[6]
-        kernel_w = Filter.shape[3] * Filter.shape[7]
-        channel = Filter.shape[4] * Filter.shape[8]
-        num_filter = Filter.shape[0] * Filter.shape[1] * Filter.shape[5] * Filter.shape[9]
-        #Filter = te.placeholder([kernel_h, kernel_w, channel, num_filter], Filter.dtype, Filter.name)
-    elif len(Filter.shape) == 11:
-        kernel_h = Filter.shape[3] * Filter.shape[7]
-        kernel_w = Filter.shape[4] * Filter.shape[8]
-        channel = Filter.shape[5] * Filter.shape[9]
-        num_filter = Filter.shape[0] * Filter.shape[1] * Filter.shape[2] * Filter.shape[6] * Filter.shape[10]
-    else:
-        kernel_h, kernel_w, channel, num_filter = Filter.shape
-
-    # compute the output shape
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    pad_top, pad_left, pad_down, pad_right = topi.nn.get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w))
-    out_channel = num_filter
-    out_height = topi.util.simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = topi.util.simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
-    pad_before = [0, pad_top, pad_left, 0]
-    pad_after = [0, pad_down, pad_right, 0]
-    PaddedInput = topi.nn.pad(Input, pad_before, pad_after, name="PaddedInput")
-    rc = te.reduce_axis((0, in_channel), name='rc')
-    ry = te.reduce_axis((0, kernel_h), name='ry')
-    rx = te.reduce_axis((0, kernel_w), name='rx')
-    Output = te.compute(
-        (batch, out_height, out_width, out_channel),
-        lambda nn, yy, xx, ff: te.sum(
-            PaddedInput[nn, yy * stride_h + ry * dilation_h,
-                        xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
-            Filter[ry, rx, rc, ff].astype(out_dtype)
-            , axis=[ry, rx, rc]),
-        name="Conv2dOutput", tag="conv2d_nhwc")
-    return Output
-
-
-@register_workload_func
-def conv2d_nhwc_bias_with_rewrite(N, H, W, CI, CO, KH, KW, strides, padding, dilation):
-    data = te.placeholder((N, H, W, CI), name='data')
-    kernel = te.placeholder((KH, KW, CI, CO), name='kernel')
-    bias = te.placeholder((CO, ), name='bias')
-    conv = topi.nn.conv2d_nhwc(data, kernel, strides, padding, dilation)
-    out = topi.add(conv, bias)
-    return [data, kernel, bias, out]
-
-@register_workload_func
-def depthwise_conv2d_nhwc_bias_with_rewrite(N, H, W, CI, CO, KH, KW, strides, padding, dilation):
-    data = te.placeholder((N, H, W, CI), name='data')
-    kernel = te.placeholder((KH, KW, CI, 1), name='kernel')
-    bias = te.placeholder((CO, ), name='bias')
-    conv = topi.nn.depthwise_conv2d_nhwc(data, kernel, strides, padding, dilation)
-    out = topi.add(conv, bias)
-    return [data, kernel, bias, out]
-
-@register_workload_func
-def conv2d_nhwc_bias(N, H, W, CI, CO, KH, KW, strides, padding, dilation):
-    data = te.placeholder((N, H, W, CI), name='data')
-    kernel = te.placeholder((KH, KW, CI, CO), name='kernel')
-    bias = te.placeholder((CO, ), name='bias')
-    conv = conv2d_nhwc_without_layout_rewrite(data, kernel, strides, padding, dilation)
-    out = topi.add(conv, bias)
-    return [data, kernel, bias, out]
-
-
-@register_workload_func
-def conv2d_nchw_bn_relu(N, H, W, CI, CO, kernel_size, strides, padding, dilation=1):
-    data = te.placeholder((N, CI, H, W), name='data')
-    kernel = te.placeholder((CO, CI, kernel_size, kernel_size), name='kernel')
-    bias = te.placeholder((CO, 1, 1), name='bias')
-    bn_scale = te.placeholder((CO, 1, 1), name='bn_scale')
-    bn_offset = te.placeholder((CO, 1, 1), name='bn_offset')
-
-    OH = (H + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1
-    OW = (W + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1
-
-    conv = topi.nn.conv2d_nchw(data, kernel, strides, padding, dilation)
-    conv = te.compute((N, CO, OH, OW),
-                       lambda i, j, k, l: conv[i, j, k, l] + bias[j, 0, 0],
-                       name='bias_add')
-    conv = te.compute((N, CO, OH, OW),
-                       lambda i, j, k, l: conv[i, j, k, l] * bn_scale[j, 0, 0],
-                       name='bn_mul')
-    conv = te.compute((N, CO, OH, OW),
-                       lambda i, j, k, l: conv[i, j, k, l] + bn_offset[j, 0, 0],
-                       name='bn_add')
-    out = topi.nn.relu(conv)
-
-    return [data, kernel, bias, bn_offset, bn_scale, out]
-
-@register_workload_func
-def conv2d_nhwc_bn_relu(N, H, W, CI, CO, kernel_size, strides, padding, dilation=1):
-    data = te.placeholder((N, H, W, CI), name='data')
-    kernel = te.placeholder((kernel_size, kernel_size, CI, CO), name='kernel')
-    bias = te.placeholder((CO,), name='bias')
-    bn_scale = te.placeholder((CO,), name='bn_scale')
-    bn_offset = te.placeholder((CO,), name='bn_offset')
-
-    OH = (H + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1
-    OW = (W + 2 * padding - (kernel_size - 1) * dilation - 1) // strides + 1
-
-    conv = conv2d_nhwc_without_layout_rewrite(data, kernel, strides, padding, dilation)
-    conv = te.compute((N, OH, OW, CO),
-                       lambda i, j, k, l: conv[i, j, k, l] + bias[l],
-                       name='bias_add')
-    conv = te.compute((N, OH, OW, CO),
-                       lambda i, j, k, l: conv[i, j, k, l] * bn_scale[l],
-                       name='bn_mul')
-    conv = te.compute((N, OH, OW, CO),
-                       lambda i, j, k, l: conv[i, j, k, l] + bn_offset[l],
-                       name='bn_add')
-    out = topi.nn.relu(conv)
-
-    return [data, kernel, bias, bn_offset, bn_scale, out]
-
-resnet_conv2d_configs = {
-    # format : N, H, W, CI, CO, KH, KW, strides, padding, dilation
-    '18': [
-        (1, 224, 224, 3, 64, 7, 7, (2, 2), (3, 3), (1, 1)),
-        (1, 56, 56, 64, 128, 3, 3, (2, 2), (1, 1), (1, 1)),
-        (1, 56, 56, 64, 128, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 56, 56, 64, 64, 3, 3, (1, 1), (1, 1), (1, 1)),
-        (1, 56, 56, 64, 64, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 28, 28, 128, 256, 3, 3, (2, 2), (1, 1), (1, 1)),
-        (1, 28, 28, 128, 256, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 28, 28, 128, 128, 3, 3, (1, 1), (1, 1), (1, 1)),
-        (1, 14, 14, 256, 512, 3, 3, (2, 2), (1, 1), (1, 1)),
-        (1, 14, 14, 256, 512, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 14, 14, 256, 256, 3, 3, (1, 1), (1, 1), (1, 1)),
-        (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)),
-    ],
-    '50': [
-        (1, 224, 224, 3, 64, 7, 7, (2, 2), (3, 3), (1, 1)),
-        (1, 56, 56, 256, 512, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 56, 56, 256, 128, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 56, 56, 256, 64, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 56, 56, 64, 256, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 56, 56, 64, 64, 3, 3, (1, 1), (1, 1), (1, 1)),
-        (1, 56, 56, 64, 64, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 28, 28, 512, 1024, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 28, 28, 512, 256, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 28, 28, 512, 128, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 28, 28, 128, 512, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 28, 28, 128, 128, 3, 3, (1, 1), (1, 1), (1, 1)),
-        (1, 14, 14, 1024, 2048, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 14, 14, 1024, 512, 1, 1, (2, 2), (0, 0), (1, 1)),
-        (1, 14, 14, 1024, 256, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 14, 14, 256, 1024, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 14, 14, 256, 256, 3, 3, (1, 1), (1, 1), (1, 1)),
-        (1, 7, 7, 2048, 512, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 7, 7, 512, 2048, 1, 1, (1, 1), (0, 0), (1, 1)),
-        (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)),
-    ],
-}
-
-# number of appearance for all conv2ds in resnet
-resnet_conv2d_weights = {
-    '18': [1, 1, 1, 4, 1, 1, 1, 3, 1, 1, 3, 3],
-    '50': [1, 1, 1, 2, 4, 3, 1, 1, 1, 3, 4, 4, 1, 1, 5, 6, 6, 2, 3, 3],
-}
-
-
-def parse_workload_name(name: str) -> List[str]:
-    """Parse workload name with wildcard character and abbreviation to standard names"""
-    if name.startswith('matmul-'):  # e.g. matmul-512, matmul-1024, matmul-+
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = [256, 512, 1024]
-        else:
-            cfg_list = [N]
-        return ["matmul-%s" % x for x in cfg_list]
-    elif name.startswith('dense-'):  # e.g. dense-1-512-1024, dense-16-512-512
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = ["1-512-512", "16-512-512"]
-        else:
-            cfg_list = [N]
-        return ["dense-%s" % x for x in cfg_list]
-    elif name.startswith('min-'):  # e.g. min-4096
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = [4096, 8192, 16384]
-        else:
-            cfg_list = [N]
-        return ["min-%s" % x for x in cfg_list]
-    elif name.startswith('argmin-'):  # e.g. argmin-4096
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = [4096, 8192, 16384]
-        else:
-            cfg_list = [N]
-        return ["argmin-%s" % x for x in cfg_list]
-    elif name.startswith('softmax-'):  # e.g. softmax-4096
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = [4096, 8192, 16384]
-        else:
-            cfg_list = [N]
-        return ["softmax-%s" % x for x in cfg_list]
-    elif name.startswith('add-'):  # e.g. add-4096
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = [4096, 8192, 16384]
-        else:
-            cfg_list = [N]
-        return ["add-%s" % x for x in cfg_list]
-    elif name.startswith('norm-'):  # e.g. norm-1024
-        N = name.split('-', maxsplit=1)[1]
-        if N == '+':
-            cfg_list = [4096, 8192, 16384]
-        else:
-            cfg_list = [N]
-        return ["norm-%s" % x for x in cfg_list]
-    elif name.startswith('add-min-relu'):  # e.g. add-min-relu-4096
-        N = name.split('-', maxsplit=3)[3]
-        if N == '+':
-            cfg_list = [4096, 8192, 16384]
-        else:
-            cfg_list = [N]
-        return ["add-min-relu-%s" % x for x in cfg_list]
-    elif name.startswith('nhwc-resnet-'):  # e.g.  nhwc-resnet-50.C1
-        res = re.match(r'nhwc-resnet-(\d+).C([\d\+]+)(.B(\d+))?', name)
-        n_layers = res.group(1)
-        if res.group(2) == '+':
-            idx_list = range(len(resnet_conv2d_configs[n_layers]))
-        else:
-            idx_list = [int(res.group(2))]
-
-        batch_size = 1 if res.group(4) is None else int(res.group(4))
-        return ['nhwc-resnet-%s.C%d.B%d' % (n_layers, i, batch_size) for i in idx_list]
-    elif name.startswith('resnet-'):  # e.g.  resnet-50.C1, resnet-50.C1.B2, resnet-50.C+.B2
-        res = re.match(r'resnet-(\d+).C([\d\+]+)(.B(\d+))?', name)
-        n_layers = res.group(1)
-        if res.group(2) == '+':
-            idx_list = range(len(resnet_conv2d_configs[n_layers]))
-        else:
-            idx_list = [int(res.group(2))]
-
-        batch_size = 1 if res.group(4) is None else int(res.group(4))
-        return ['resnet-%s.C%d.B%d' % (n_layers, i, batch_size) for i in idx_list]
-    elif name in ['conv2d-bn-relu', 'conv2d-relu-softmax-min', 'max-pool-2d', 'conv2d-rewrite', 'depthwise-conv2d-rewrite']:
-        return [name]
-    else:
-        raise ValueError("Invalid workload " + name)
-
-
-def get_workload_keys(name: str) -> List[str]:
-    """Parse workload name and return the workload keys"""
-    normalized_names = parse_workload_name(name)
-
-    ret = []
-    for name in normalized_names:
-        if name.startswith('matmul-'):
-            name_split = name.split('-')
-            in_type = out_type = 'float32'
-            tensor_core_support = False
-            if len(name_split) == 2:    # e.g. matmul-512
-                N = K = M = int(name_split[1])
-            elif len(name_split) == 4:  # e.g. matmul-32-256-512
-                N = int(name_split[1])
-                K = int(name_split[2])
-                M = int(name_split[3])
-            elif len(name_split) == 6:  # e.g. matmul-32-512-512-float16-float32
-                N = int(name_split[1])
-                K = int(name_split[2])
-                M = int(name_split[3])
-                in_type = name_split[4]
-                out_type = name_split[5]
-            elif len(name_split) == 7:  # e.g. matmul-32-512-512-float16-float32-tc
-                N = int(name_split[1])
-                K = int(name_split[2])
-                M = int(name_split[3])
-                in_type = name_split[4]
-                out_type = name_split[5]
-                tensor_core_support = name_split[6] == "tc"
-            else:
-                raise ValueError("Invalid matmul workload")
-            ret.append(make_workload_key_func(matmul_nkkm,
-                                              (N, M, K, in_type, out_type, tensor_core_support)))
-        elif name.startswith('dense-'):  # e.g. dense-1-512-1024, dense-16-512-512
-            name_split = name.split('-')
-            assert len(name_split) == 4
-            batch = int(name_split[1])
-            in_dim = int(name_split[2])
-            out_dim = int(name_split[3])
-            ret.append(make_workload_key_func(dense_layer, (batch, in_dim, out_dim)))
-        elif name.startswith('min-'):  # e.g. min-4096
-            name_split = name.split('-')
-            if len(name_split) == 2:
-                M = 64
-                N = int(name_split[1])
-            elif len(name_split) == 3:
-                M = int(name_split[1])
-                N = int(name_split[2])
-            else:
-                raise ValueError("Invalid min workload")
-            ret.append(make_workload_key_func(min_mn, (M, N)))
-        elif name.startswith('argmin-'):  # e.g. argmin-4096
-            name_split = name.split('-')
-            if len(name_split) == 2:
-                M = 64
-                N = int(name_split[1])
-            elif len(name_split) == 3:
-                M = int(name_split[1])
-                N = int(name_split[2])
-            else:
-                raise ValueError("Invalid argmin workload")
-            ret.append(make_workload_key_func(argmin_mn, (M, N)))
-        elif name.startswith('softmax-'):  # e.g. softmax-4096
-            name_split = name.split('-')
-            if len(name_split) == 2:
-                M = 64
-                N = int(name_split[1])
-            elif len(name_split) == 3:
-                M = int(name_split[1])
-                N = int(name_split[2])
-            else:
-                raise ValueError("Invalid softmax workload")
-            ret.append(make_workload_key_func(softmax_mn, (M, N)))
-        elif name.startswith('add-min-relu'):  # e.g. add-min-relu-4096
-            name_split = name.split('-')
-            if len(name_split) == 4:
-                M = 64
-                N = int(name_split[3])
-            elif len(name_split) == 5:
-                M = int(name_split[3])
-                N = int(name_split[4])
-            else:
-                raise ValueError("Invalid workload")
-            ret.append(make_workload_key_func(add_min_relu, (M, N)))
-        elif name.startswith('add-'):  # e.g. add-4096
-            name_split = name.split('-')
-            if len(name_split) == 2:
-                N = M = int(name_split[1])
-            elif len(name_split) == 3:
-                M = int(name_split[1])
-                N = int(name_split[2])
-            else:
-                raise ValueError("Invalid add workload")
-            ret.append(make_workload_key_func(add_mn, (M, N)))
-        elif name.startswith('norm-'):  # e.g. norm-4096
-            name_split = name.split('-')
-            B = 2
-            if len(name_split) == 2:
-                N = M = int(name_split[1])
-            elif len(name_split) == 3:
-                M = int(name_split[1])
-                N = int(name_split[2])
-            else:
-                raise ValueError("Invalid norm workload")
-            ret.append(make_workload_key_func(norm_bmn, (B, M, N)))
-        elif name.startswith('nhwc-resnet-'):  # e.g.  nhwc-resnet-50.C1.B2
-            res = re.match(r'nhwc-resnet-(\d+).C(\d+).B(\d+)', name)
-            n_layers = res.group(1)
-            idx = int(res.group(2))
-            batch_size = 1 if res.group(3) is None else int(res.group(3))
-            args = list(resnet_conv2d_configs[n_layers][idx])
-            args[0] = batch_size
-            ret.append(make_workload_key_func(conv2d_nhwc_bias, args))
-        elif name.startswith('resnet-'):  # e.g.  resnet-50.C1.B2
-            res = re.match(r'resnet-(\d+).C(\d+).B(\d+)', name)
-            n_layers = res.group(1)
-            idx = int(res.group(2))
-            batch_size = 1 if res.group(3) is None else int(res.group(3))
-            args = list(resnet_conv2d_configs[n_layers][idx])
-            args[0] = batch_size
-            ret.append(make_workload_key_func(conv2d_nchw_bias, args))
-        elif name == 'max-pool-2d':
-            return [make_workload_key_func(max_pool_2d_nchw, (2, 512, 7, 7))]
-        elif name == 'conv2d-bn-relu':
-            return [make_workload_key_func(conv2d_nhwc_bn_relu,
-                                           (1, 7, 7, 512, 512, 3, 1, 1, 1)) ]
-        elif name == 'conv2d-rewrite':
-            return [ make_workload_key_func(conv2d_nhwc_bias_with_rewrite,
-                                            (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)))]
-        elif name == 'depthwise-conv2d-rewrite':
-            return [ make_workload_key_func(depthwise_conv2d_nhwc_bias_with_rewrite,
-                                            (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)))]
-        elif name == 'conv2d-relu-softmax-min':
-            return [make_workload_key_func(conv2d_relu_softmax_min,
-                                           (1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1), (1, 1)))]
-        else:
-            raise ValueError("Invalid workload " + name)
-
-    return ret
-
-
-def get_workload_weights(name: str) -> List[float]:
-    """Return weights for workload name"""
-    if name.startswith('resnet-'):
-        res = re.match(r'resnet-(\d+).C+', name)
-        n_layers = res.group(1)
-        return np.array(resnet_conv2d_weights[n_layers])
-    else:
-        return np.ones(len(get_workload_keys(name)))
-
-
-############################################################
-######################  Measure Tools   ####################
-############################################################
-
-
-def measure_schedule(s,
-                     bufs,
-                     target,
-                     target_host=None,
-                     remote=None,
-                     ndk_cc=None,
-                     number=10,
-                     repeat=3,
-                     min_repeat_ms=500):
-    """Measure the time cost of a schedule"""
-    func = tvm.build(s, bufs, target=target, target_host=target_host)
-    if remote:
-        ctx = remote.context(str(target), 0)
-        temp = util.tempdir()
-        remote_path = temp.relpath("tmp_deploy_lib.so")
-        os.environ['TVM_NDK_CC'] = ndk_cc
-        func.export_library(remote_path, ndk.create_shared)
-        remote.upload(remote_path)
-        func = remote.load_module("tmp_deploy_lib.so")
-    else:
-        ctx = tvm.context(str(target), 0)
-
-    if os.environ.get('TVM_AUTO_CACHE_FLUSH', '0') == '1':
-        min_repeat_ms = 0
-        number = 1
-
-    time_f = func.time_evaluator(func.entry_name,
-                                 ctx,
-                                 number=number,
-                                 repeat=repeat,
-                                 min_repeat_ms=min_repeat_ms)
-
-    np_args = [np.ones(topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs]
-    args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
-    ctx.sync()
-
-    costs = time_f(*args).results
-
-    return costs
-
-def check_correctness(s, bufs, s_ref, buf_ref, target, target_host=None, remote=None, ndk_cc=None):
-    """Check the correctness of a schedule against a reference schedule"""
-    func = tvm.build(s, bufs, target=target, target_host=target_host)
-    func_ref = tvm.build(s_ref, buf_ref, target='llvm')
-
-    if remote:
-        raise NotImplemented
-    else:
-        ctx = tvm.context(str(target), 0)
-        ctx_ref = tvm.cpu()
-
-    np_args = [np.ones(topi.get_const_tuple(x.shape)).astype(x.dtype) for x in bufs]
-    args = [tvm.nd.array(x, ctx=ctx) for x in np_args]
-    args_ref = [tvm.nd.array(x, ctx=ctx_ref) for x in np_args]
-    ctx.sync()
-
-    func(*args)
-    func_ref(*args_ref)
-
-    for arr, arr_ref in zip(args, args_ref):
-        np.testing.assert_allclose(arr.asnumpy(), arr_ref.asnumpy())
-
-
-############################################################
-#####################  Other Utilities  ####################
-############################################################
-
-
-def geomean(xs):
-    """Compute geometric mean"""
-    return math.exp(math.fsum(math.log(x) for x in xs) / len(xs))
-
-
-def str2bool(v):
-    if isinstance(v, bool):
-        return v
-    if v.lower() in ('yes', 'true', 't', 'y', '1'):
-        return True
-    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
-        return False
-    else:
-        raise argparse.ArgumentTypeError('Boolean value expected.')
-
-
-global last_tic
-last_tic = None
-
-
-def PRINT_TIME(msg):
-    """Print time interval between differnt calls. This is for debug so we make the name letters capital"""
-    global last_tic
-    now = time.time()
-
-    if last_tic is None:
-        last_tic = now
-
-    print(msg, now - last_tic)
-    last_tic = now
-
-
-############################################################
-######################  I/O Utilities  #####################
-############################################################
-
-# The format for a line in resulst file
-BenchmarkRecord = namedtuple("BenchmarkRecord", [
-    'device', 'backend', 'workload_type', 'workload_name', 'library', 'algorithm', 'value',
-    'time_stamp'
-])
-
-
-class BaselineDatabase:
-    """A class for query records in baseline database"""
-    def __init__(self, filename):
-        self.filename = filename
-
-        self.lines = []
-        for line in open(filename):
-            if line.startswith('#') or line.isspace():
-                continue
-            self.lines.append(line.split('\t'))
-
-    def filter_records(self, devices=None, backends=None, wkl_names=None, libraries=None):
-        ret = []
-        for line in self.lines:
-            line = BenchmarkRecord(*line)
-
-            if devices is not None and line.device not in devices:
-                continue
-            if backends is not None and line.backend not in backends:
-                continue
-            if wkl_names is not None and line.workload_name not in wkl_names:
-                continue
-            if libraries is not None and line.library not in libraries:
-                continue
-
-            ret.append(line)
-        return ret
-
-    def get_data_dict(self, device, target, wkl_names) -> Tuple[Dict, List]:
-        """Return a data dict s.t.  data[wkl][library] = cost"""
-        data = defaultdict(lambda: defaultdict(lambda: 1e10))
-
-        all_libraries = set()
-
-        if "cpu" in target.keys:
-            backends = ['cpu']
-        elif "gpu" in target.keys:
-            backends = ['gpu']
-        else:
-            raise ValueError("Invalid target: " + target)
-
-        # Read costs for baselines
-        records = self.filter_records(devices=[device], backends=backends, wkl_names=wkl_names)
-        for record in records:
-            # use min over (possible) multiple algorithms
-            all_libraries.add(record.library)
-            data[record.workload_name][record.library] = \
-                min(data[record.workload_name][record.library],
-                    np.mean(eval(record.value)['costs']))
-
-        return data, list(all_libraries)
-
-
-class LogFileDatabase:
-    """A class for indexing best records in a log file"""
-    def __init__(self, filename: str, n_lines: int = -1):
-        inputs, results = LogReader(filename).read_lines(n_lines)
-
-        # best records, search by (target_key, workload_key).  e.g. ('gpu', 'conv2d...')
-        self.best_by_targetkey = {}
-
-        # best according to (model, workload_key).  e.g. ('1080ti', 'conv2d...'))
-        self.best_by_model = {}
-
-        # find best records and build the index
-        for inp, res in zip(inputs, results):
-            if res.error_no != 0:
-                continue
-
-            # use target keys in tvm target system as key to build best map
-            for target_key in inp.task.target.keys:
-                key = (target_key, inp.task.workload_key)
-                if key not in self.best_by_targetkey:
-                    self.best_by_targetkey[key] = (inp, res)
-                else:
-                    _, other_res = self.best_by_targetkey[key]
-                    if np.mean([x.value for x in other_res.costs]) > \
-                            np.mean([x.value for x in res.costs]):
-                        self.best_by_targetkey[key] = (inp, res)
-
-            # use model as key to build best map
-            key = (inp.task.target.model, inp.task.workload_key)
-            if key not in self.best_by_model:
-                if inp.task.target.model != 'unknown':
-                    self.best_by_model[key] = (inp, res)
-            else:
-                _, other_res = self.best_by_model[key]
-                if np.mean([x.value for x in other_res.costs]) > \
-                        np.mean([x.value for x in res.costs]):
-                    self.best_by_model[key] = (inp, res)
-
-    def write_best(self, filename: str):
-        best_records = list(self.best_by_targetkey.values())
-        inputs = [x[0] for x in best_records]
-        results = [x[1] for x in best_records]
-        write_measure_records_to_file(filename, inputs, results)
-
-
-############################################################
-######################  Plot Utilities  ####################
-############################################################
-
-def max_curve(raw_curve):
-    """Return b[i] = max(a[:i]) """
-    ret = []
-    cur_max = -np.inf
-    for x in raw_curve:
-        cur_max = max(cur_max, x)
-        ret.append(cur_max)
-    return ret
-
-def min_curve(raw_curve):
-    """Return b[i] = min(a[:i]) """
-    ret = []
-    cur_min = np.inf
-    for x in raw_curve:
-        cur_min = min(cur_min, x)
-        ret.append(cur_min)
-    return ret
-
-def mean_curve(raw_curve, window_size=None):
-    """Return b[i] = mean(a[:i]) """
-    ret = []
-    mean = 0
-    if window_size is None:
-        for i, x in enumerate(raw_curve):
-            mean = (mean * i + x) / (i + 1)
-            ret.append(mean)
-    else:
-        for i, x in enumerate(raw_curve):
-            if i >= window_size:
-                mean = (mean * window_size + x - raw_curve[i - window_size]) / window_size
-            else:
-                mean = (mean * i + x) / (i + 1)
-            ret.append(mean)
-    return ret
-
-
-def enhance_color(color, h=1, l=1, s=1):
-    """Make color looks better for pyplot"""
-    import matplotlib.colors as mc
-    import colorsys
-    try:
-        c = mc.cnames[color]
-    except:
-        c = color
-    c = np.array(colorsys.rgb_to_hls(*mc.to_rgb(c)))
-
-    h, l, s = h * c[0], l * c[1], s * c[2]
-    h, l, s = [max(min(x, 1), 0) for x in [h, l, s]]
-
-    return colorsys.hls_to_rgb(h, l, s)
-
-
-method_color_dict = {
-    'ours': 'C0',
-    'AutoTVM': 'C1',
-
-    'tensorflow': 'C2',
-    'tensorflow-tensorrt': 'C9',
-    'tflite': 'C2',
-
-    'pytorch': enhance_color('C3', l=1.1, s=0.9),
-
-    'FlexTensor': enhance_color('C5'),
-    'halide': enhance_color('teal', l=1.25),
-
-    'Limit space': 'C7',
-    'No fine-tuning': 'C8',
-    'No task scheduler': 'C1',
-}
-
-def method2color(method):
-    if '-batch-' in method:
-        method, batch_size = method.split('-batch-')
-        #return enhance_color(method_color_dict[method], s=1.1, l=1.5)
-        return method_color_dict[method]
-    else:
-        return method_color_dict[method]
-
-method_order_list = [
-    'pytorch', 'tensorflow', 'tensorflow-xla', 'tensorflow-tensorrt',
-    'tflite', 'halide', 'FlexTensor',  'AutoTVM',
-
-    'Limit space', 'No fine-tuning',
-    'ours',
-]
-
-def method2order(method):
-    if '-batch-' in method:
-        method, batch_size = method.split('-batch-')
-        batch_size = int(batch_size)
-        return method_order_list.index(method) + batch_size / 100
-    else:
-        return method_order_list.index(method)
-
-show_name_replace_dict = {
-    'pytorch': "PyTorch",
-    'tensorflow-tensorrt': 'TensorRT-TF',
-    'tensorflow': 'TensorFlow',
-    'tflite': 'TensorFlow Lite',
-    'halide': 'Halide',
-
-    'ours': 'Ansor (ours)',
-    'batch-16': 'batch',
-
-    'resnet_50': 'ResNet-50',
-    'mobilenet_v2': 'Mobilenet V2',
-    'resnet_18_3d': '3D-ResNet',
-    'dcgan': 'DCGAN',
-    'dqn': 'DQN',
-    'bert': 'BERT',
-}
-
-def show_name(name):
-    #    if name.startswith('resnet-'):
-    #        return name.split('.')[1]
-    for key, value in show_name_replace_dict.items():
-        name = name.replace(key, value)
-
-    return name
-
-def draw_grouped_bar_chart(data, baseline='pytorch', output='out.png',
-                           yscale_log=False, yticks=None, y_max=None,
-                           legend_bbox_to_anchor=None, legend_nrow=None,
-                           figure_size=None, figax=None, draw_ylabel=True, draw_legend=True):
-    width = 1
-    gap = 1.5
-    fontsize = 19
-    xticks_font_size = fontsize - 2
-
-    figure_size = figure_size or (11, 4)
-    legend_bbox_to_anchor = legend_bbox_to_anchor or (0.45, 1.35)
-
-    all_methods = set()
-    legend_set = {}
-
-    if figax is None:
-        fig, ax = plt.subplots()
-        axes = []
-        axes.append(ax)
-    else:
-        ax = figax
-
-    x0 = 0
-    xticks = []
-    xlabels = []
-
-    workloads = list(data.keys())
-    for wkl in workloads:
-        ys = []
-        colors = []
-
-        methods = list(data[wkl].keys())
-
-        if baseline in data[wkl]:
-            baseline_cost = data[wkl][baseline]
-        else:
-            # normalize to best library
-            baseline_cost = 1e10
-            for method in methods:
-                if data[wkl][method] < baseline_cost:
-                    baseline_cost = data[wkl][method]
-
-        methods.sort(key=lambda x: method2order(x))
-        for method in methods:
-            relative_speedup = baseline_cost / data[wkl][method]
-            if yticks is None:
-                ys.append(relative_speedup)
-            else:
-                ys.append(max(relative_speedup, yticks[0] * 1.1))
-            colors.append(method2color(method))
-
-        # draw the bars
-        xs = np.arange(x0, x0 + len(ys))
-        bars = ax.bar(xs, ys, width=width, color=colors)
-
-        for method, bar_obj in zip(methods, bars):
-            all_methods.add(method)
-            if method not in legend_set:
-                legend_set[method] = bar_obj
-
-        # tick and label
-        x0 += len(ys) + gap
-
-        xticks.append(x0 - gap - len(ys)*width/2.0 - width/2.0)
-        xlabels.append(show_name(wkl))
-
-        ax.set_xticks(xticks)
-        ax.set_xticklabels(xlabels, fontsize=xticks_font_size)
-        plt.tick_params(axis='x', which='both', bottom='off', top='off')
-
-        if draw_ylabel is True:
-            ax.set_ylabel('Relative Speedup', fontsize=fontsize)
-        elif isinstance(draw_ylabel, str):
-            ax.set_ylabel(draw_ylabel, fontsize=fontsize)
-
-        if yscale_log:
-            ax.set_yscale('log', basey=2)
-        if yticks is not None:
-            ax.set_yticks(yticks)
-        if y_max:
-            ax.set_ylim(top=y_max)
-
-        from matplotlib.ticker import FormatStrFormatter
-        ax.set_yticklabels(ax.get_yticks(), fontsize=fontsize)
-        ax.yaxis.set_major_formatter(FormatStrFormatter('%.1f'))
-        ax.yaxis.grid(linewidth=0.4, linestyle='dotted') # draw grid line
-        ax.set_axisbelow(True)  # grid lines are behind the rest
-        ax.tick_params(bottom=False, top=False, right=False)
-
-    # put legend outside the plot
-    all_methods = list(all_methods)
-    all_methods.sort(key=lambda x : method2order(x))
-
-    if draw_legend:
-        legend_nrow = legend_nrow or 2
-        ncol = (len(all_methods) + legend_nrow - 1)// legend_nrow
-        ax.legend([legend_set[x] for x in all_methods],
-                  [show_name(x) for x in all_methods],
-                  fontsize=fontsize-1,
-                  loc='upper center',
-                  bbox_to_anchor=legend_bbox_to_anchor,
-                  ncol=ncol,
-                  handlelength=1.0,
-                  handletextpad=0.5,
-                  columnspacing=1.1)
-
-    if figax is None:
-        fig.set_size_inches(figure_size)
-        fig.savefig(output, bbox_inches='tight')
-        print("Output the plot to %s" % output)
-
-
-def to_str_round(x, decimal=6):
-    if isinstance(x, str):
-        return x
-    if isinstance(x, (list, tuple)) or isinstance(x, np.ndarray):
-        return "[" + ", ".join([to_str_round(y, decimal=decimal)
-                                for y in x]) + "]"
-    if isinstance(x, dict):
-        return str({k: eval(to_str_round(v)) for k, v in x.items()})
-    if isinstance(x, int):
-        return str(x)
-    if isinstance(x, float):
-        format_str = "%%.%df" % decimal
-        return format_str % x
-    raise ValueError("Invalid value: " + str(x))
-
diff --git a/scripts/shape_configs.py b/scripts/shape_configs.py
deleted file mode 100644
index 244638f5b29c3..0000000000000
--- a/scripts/shape_configs.py
+++ /dev/null
@@ -1,230 +0,0 @@
-""" Shape configurations for single operator / subgraph evaluation
-This file is shared by tune_op_subgraph.py and scripts in scripts/baseline/
-"""
-
-matmul_shapes = [
-    (1, 128, 128, 128),
-    (1, 512, 32, 512),
-    (1, 512, 512, 512),
-    (1, 1024, 1024, 1024),
-]
-
-conv1d_shapes = [
-    # derived from conv2d_shapes
-    (1, 256, 64, 128, 3, 2, 1),
-#    (1, 256, 64, 128, 1, 2, 0),
-#    (1, 256, 64, 64, 1, 1, 0),
-#    (1, 128, 128, 256, 3, 2, 1),
-    (1, 128, 128, 256, 1, 2, 0),
-#    (1, 128, 128, 128, 3, 1, 1),
-#    (1, 64, 256, 512, 3, 2, 1),
-#    (1, 64, 256, 512, 1, 2, 0),
-    (1, 64, 256, 256, 5, 1, 2),
-    (1, 32, 512, 512, 3, 1, 1),
-]
-
-conv2d_shapes = [
-    # all conv2d layers in resnet-18
-    (1, 224, 224, 3, 64, 7, 2, 3),
-#    (1, 56, 56, 64, 128, 3, 2, 1),
-#    (1, 56, 56, 64, 128, 1, 2, 0),
-#    (1, 56, 56, 64, 64, 3, 1, 1),
-    (1, 56, 56, 64, 64, 1, 1, 0),
-#    (1, 28, 28, 128, 256, 3, 2, 1),
-#    (1, 28, 28, 128, 256, 1, 2, 0),
-#    (1, 28, 28, 128, 128, 3, 1, 1),
-#    (1, 14, 14, 256, 512, 3, 2, 1),
-#    (1, 14, 14, 256, 512, 1, 2, 0),
-    (1, 14, 14, 256, 256, 3, 1, 1),
-    (1, 7, 7, 512, 512, 3, 1, 1),
-]
-
-conv3d_shapes = [
-    # Derived from cnov2d_shapes. Use depth=16 for all configurations
-    (1, 16, 224, 224, 3, 64, 7, 2, 3),
-#    (1, 16, 56, 56, 64, 128, 3, 2, 1),
-#    (1, 16, 56, 56, 64, 128, 1, 2, 0),
-#    (1, 16, 56, 56, 64, 64, 3, 1, 1),
-    (1, 16, 56, 56, 64, 64, 1, 1, 0),
-#    (1, 16, 28, 28, 128, 256, 3, 2, 1),
-#    (1, 16, 28, 28, 128, 256, 1, 2, 0),
-#    (1, 16, 28, 28, 128, 128, 3, 1, 1),
-#    (1, 16, 14, 14, 256, 512, 3, 2, 1),
-#    (1, 16, 14, 14, 256, 512, 1, 2, 0),
-    (1, 16, 14, 14, 256, 256, 3, 1, 1),
-    (1, 16, 7, 7, 512, 512, 3, 1, 1),
-]
-
-group_conv2d_shapes = [
-    # Derived from cnov2d_shapes. Use group=4 for all configurations
-    (1, 56, 56, 64, 128, 3, 2, 1 , 1, 4),
-#    (1, 56, 56, 64, 128, 1, 2, 0 , 1, 4),
-#    (1, 56, 56, 64, 64, 3, 1, 1  , 1, 4),
-    (1, 56, 56, 64, 64, 1, 1, 0  , 1, 4),
-#    (1, 28, 28, 128, 256, 3, 2, 1, 1, 4),
-#    (1, 28, 28, 128, 256, 1, 2, 0, 1, 4),
-#    (1, 28, 28, 128, 128, 3, 1, 1, 1, 4),
-#    (1, 14, 14, 256, 512, 3, 2, 1, 1, 4),
-#    (1, 14, 14, 256, 512, 1, 2, 0, 1, 4),
-    (1, 14, 14, 256, 256, 3, 1, 1, 1, 4),
-    (1, 7, 7, 512, 512, 3, 1, 1  , 1, 4),
-]
-
-dilation_conv2d_shapes = [
-    # Derived from cnov2d_shapes. Use dilation=2 for all configurations
-    (1, 224, 224, 3, 64, 7, 2, 3 , 2),
-#    (1, 56, 56, 64, 128, 3, 2, 1 , 2),
-#    (1, 56, 56, 64, 128, 1, 2, 0 , 2),
-#    (1, 56, 56, 64, 64, 3, 1, 1  , 2),
-    (1, 56, 56, 64, 64, 1, 1, 0  , 2),
-#    (1, 28, 28, 128, 256, 3, 2, 1, 2),
-#    (1, 28, 28, 128, 256, 1, 2, 0, 2),
-#    (1, 28, 28, 128, 128, 3, 1, 1, 2),
-#    (1, 14, 14, 256, 512, 3, 2, 1, 2),
-#    (1, 14, 14, 256, 512, 1, 2, 0, 2),
-    (1, 14, 14, 256, 256, 3, 1, 1, 2),
-    (1, 7, 7, 512, 512, 3, 1, 1  , 2),
-]
-
-depthwise_conv2d_shapes = [
-    # all depthwise conv2d layers in mobilenet
-    (1, 112, 112, 32,  3, 1, 1),
-    (1, 112, 112, 64,  3, 2, 1),
-#    (1,  56,  56, 128, 3, 1, 1),
-#    (1,  56,  56, 128, 3, 2, 1),
-#    (1,  28,  28, 256, 3, 1, 1),
-#    (1,  28,  28, 256, 3, 2, 1),
-#    (1,  14,  14, 512, 3, 1, 1),
-    (1,  14,  14, 512, 3, 2, 1),
-    (1,   7,   7, 1024, 3, 1, 1),
-]
-
-conv2d_transpose_shapes = [
-    # all conv2d tranpose layers in DCGAN
-    (1, 4, 4, 512, 256, 4, 2, 1),
-    (1, 8, 8, 256, 128, 4, 2, 1),
-    (1, 16, 16, 128, 64, 4, 2, 1),
-    (1, 32, 32, 64, 3, 4, 2, 1),
-]
-
-conv2d_capsule_shapes = [
-    # all conv2d capsule layers in matrix capsules withemrouting (ICLR 2018)
-    (1, 16, 16, 32, 32, 3, 2, 1),
-    (1,  8,  8, 32, 32, 3, 1, 1),
-    (1, 16, 16,  8, 16, 3, 2, 1),
-    (1,  8,  8, 16, 16, 3, 1, 1),
-]
-
-conv2d_winograd_nhwc_shapes = [
-    (1, 56, 56, 64, 64, 3, 1, 1),
-    (1, 28, 28, 128, 128, 3, 1, 1),
-    (1, 14, 14, 256, 256, 3, 1, 1),
-    (1, 7, 7, 512, 512, 3, 1, 1),
-]
-
-conv2d_winograd_nchw_shapes = [
-    (1, 64, 56, 56, 64, 3, 1, 1),
-    (1, 128, 28, 28, 128, 3, 1, 1),
-    (1, 256, 14, 14, 256, 3, 1, 1),
-    (1, 512, 7, 7, 512, 3, 1, 1),
-]
-
-matmul_tensor_core_shapes = [
-    (16, 512, 512, 'float16', 'float32', True),
-    (32, 512, 512, 'float16', 'float32', True),
-    (512, 512, 512, 'float16', 'float32', True),
-]
-
-norm_shapes = [
-    (1, 256, 256),
-    (1, 512, 512),
-    (1, 1024, 1024),
-    (1, 4096, 1024),
-]
-
-single_op_shape_dict = {
-    'C1D': conv1d_shapes,
-    'C2D': conv2d_shapes,
-    'C3D': conv3d_shapes,
-    'GMM': matmul_shapes,
-    'GRP': group_conv2d_shapes,
-    'DIL': dilation_conv2d_shapes,
-    'DEP': depthwise_conv2d_shapes,
-    'T2D': conv2d_transpose_shapes,
-    'CAP': conv2d_capsule_shapes,
-    'NRM': norm_shapes,
-
-#    The following workloads are not in our sinle op evaluation plan.
-#    They should be moved to `common.py` and be used by `tune_wkl.py`.
-#    'C2D_NCHW': conv2d_nchw_shapes,
-#    'C2DWG_NHWC': conv2d_winograd_nhwc_shapes,
-#    'C2DWG_NCHW': conv2d_winograd_nchw_shapes,
-#    'GMM_TC': matmul_tensor_core_shapes,
-}
-
-conv2d_bn_relu_shapes = [
-    (1, 224, 224, 3, 64, 7, 2, 3),
-    (1, 56, 56, 64, 128, 3, 2, 1),
-    (1, 28, 28, 128, 256, 1, 2, 0),
-    (1, 7, 7, 512, 512, 3, 1, 1, 1),
-    (16, 224, 224, 3, 64, 7, 2, 3),
-    (16, 56, 56, 64, 128, 3, 2, 1),
-    (16, 28, 28, 128, 256, 1, 2, 0),
-    (16, 7, 7, 512, 512, 3, 1, 1, 1),
-]
-
-transpose_batch_matmul_shapes = [
-    (1,   128, 12, 64),
-    (1,   128, 16, 64),
-    (1,   64,  12, 128),
-    (1,   128, 12, 128),
-    (16,  128, 12, 64),
-    (16,  128, 16, 64),
-    (16,  64,  12, 128),
-    (16,  128, 12, 128),
-]
-
-subgraph_shape_dict = {
-    "conv2d_bn_relu": conv2d_bn_relu_shapes,
-    "transpose_batch_matmul": transpose_batch_matmul_shapes,
-}
-
-resnet_shapes = [
-    (1, ),
-    (16, ),
-]
-
-mobilenet_v2_shapes = [
-    (1, ),
-    (16, ),
-]
-
-dcgan_shapes = [
-    (1, ),
-    (16, ),
-]
-
-dqn_shapes = [
-    (1, ),
-    (16, ),
-]
-
-bert_shapes = [
-    (1, ),
-    (16, ),
-]
-
-resnet18_3d_shapes = [
-    (1, ),
-    (16, ),
-]
-
-network_shape_dict = {
-    'resnet_50': resnet_shapes,
-    'mobilenet_v2': mobilenet_v2_shapes,
-    'dcgan': dcgan_shapes,
-    'dqn': dqn_shapes,
-    'bert': bert_shapes,
-    'resnet_18_3d': resnet18_3d_shapes,
-}
-
diff --git a/scripts/tune_network.py b/scripts/tune_network.py
deleted file mode 100644
index 1905d81320039..0000000000000
--- a/scripts/tune_network.py
+++ /dev/null
@@ -1,388 +0,0 @@
-"""Tune a whole neural network"""
-import argparse
-import logging
-import random
-import os
-import numpy as np
-
-import tvm
-from tvm import ansor, relay
-import tvm.contrib.graph_runtime as runtime
-from tvm.contrib.debugger import debug_runtime
-from tvm.contrib import util, ndk
-from tvm.relay import testing
-from tvm.ansor.utils import request_remote
-#from baseline.utils import log_line, BenchmarkRecord
-
-from common import str2bool
-from tune_test import create_tune_option
-
-dtype = "float32"
-
-def get_network(name, network_path, batch_size, layout):
-    """Get the relay module and random weights for a network"""
-    input_shape = (batch_size, 3, 224, 224)
-    output_shape = (batch_size, 1000)
-    input_name = 'data'
-
-    if name.startswith("resnet3d"):
-        n_layer = int(name.split('-')[1])
-        layout = "NDHWC"
-        image_shape = (16, 112, 112, 3)
-        input_shape = (batch_size, *image_shape)
-        mod, params = relay.testing.resnet3d.get_workload(num_layers=n_layer, batch_size=batch_size, image_shape=image_shape, dtype=dtype, layout=layout)
-    elif name.startswith("resnet"):
-        n_layer = int(name.split('-')[1])
-        image_shape = (224, 224, 3) if layout == 'NHWC' else (3, 224, 224)
-        input_shape = (batch_size, *image_shape)
-        mod, params = relay.testing.resnet.get_workload(num_layers=n_layer, batch_size=batch_size, layout=layout, image_shape=image_shape, dtype=dtype)
-    elif "lstm" in name:
-        mod, params = relay.testing.lstm.get_workload(iterations=10, num_hidden=512, batch_size=batch_size, dtype=dtype)
-    elif "mlp" in name:
-        input_shape = (batch_size, 1, 28, 28)
-        mod, params = relay.testing.mlp.get_workload(batch_size=batch_size, dtype=dtype)
-    elif "vgg" in name:
-        n_layer = int(name.split('-')[1])
-        mod, params = relay.testing.vgg.get_workload(num_layers=n_layer, batch_size=batch_size, dtype=dtype)
-    elif name == 'dcgan':
-        input_shape = (batch_size, 100)
-        mod, params = relay.testing.dcgan.get_workload(batch_size=batch_size)
-    elif name == 'dqn':
-        layout = "NHWC"
-        image_shape = (84, 84, 4)
-        input_shape = (batch_size, *image_shape)
-        mod, params = relay.testing.dqn.get_workload(batch_size=batch_size, image_shape=image_shape, dtype=dtype, layout=layout)
-    elif name == 'mobilenet':
-        image_shape = (224, 224, 3) if layout == 'NHWC' else (3, 224, 224)
-        input_shape = (batch_size, *image_shape)
-        mod, params = relay.testing.mobilenet.get_workload(batch_size=batch_size, layout=layout, image_shape=image_shape, dtype=dtype)
-    elif name == 'r3d_18':
-        import torch
-        import torchvision
-
-        model = getattr(torchvision.models.video, name)(pretrained=False)
-        model = model.eval()
-
-        # We grab the TorchScripted model via tracing
-        input_shape = [batch_size, 3, 16, 112, 112]
-        input_data = torch.randn(input_shape)
-        scripted_model = torch.jit.trace(model, input_data).eval()
-
-        input_name = 'input0'  # only one input, set it to this name
-        shape_list = {input_name: input_shape}
-        mod, params = relay.frontend.from_pytorch(scripted_model,
-                                                  shape_list)
-    elif name == 'squeezenet_v1.1':
-        mod, params = relay.testing.squeezenet.get_workload(batch_size=batch_size, version='1.1', dtype=dtype)
-    elif name == 'inception_v3':
-        input_shape = (batch_size, 3, 299, 299)
-        mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype)
-    elif name == 'mxnet':
-        # an example for mxnet model
-        from mxnet.gluon.model_zoo.vision import get_model
-        block = get_model('resnet18_v1', pretrained=True)
-        mod, params = relay.frontend.from_mxnet(block, shape={"input_name": input_shape}, dtype=dtype)
-        net = mod["main"]
-        net = relay.Function(net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs)
-        mod = relay.Module.from_expr(net)
-    elif name == 'tflite-mobilenet-v2' or name == 'tflite-resnet-v2-50':
-        try:
-            import tflite.Model
-        except ImportError:
-            raise ImportError("The tflite package must be installed")
-        input_name = "input"
-        input_shape = (1, 224, 224, 3)
-        output_shape = (1, 1001)
-        input_dtype = "float32"
-        tflite_model_buf = open(network_path, "rb").read()
-        tflite_model = tflite.Model.Model.GetRootAsModel(tflite_model_buf, 0)
-        mod, params = relay.frontend.from_tflite(tflite_model,
-                                                 shape_dict={input_name: input_shape},
-                                                 dtype_dict={input_name: input_dtype})
-    elif name == 'pytorch-mobilenet-v2':
-        import torch
-
-        model = torch.hub.load('pytorch/vision:v0.5.0', 'mobilenet_v2', pretrained=False)
-        model.eval()
-
-        input_shape = [batch_size, 3, 224, 224]
-        input_data = torch.randn(input_shape)
-        scripted_model = torch.jit.trace(model, input_data).eval()
-
-        input_name = 'input0'
-        shape_list = {input_name: input_shape}
-        mod, params = relay.frontend.from_pytorch(scripted_model,
-                                                  shape_list)
-    elif name == 'bert':
-        import tensorflow as tf
-
-        bert_pb = './baseline/tensorflow/tf_models/bert/bert-B%d.pb' % batch_size
-        try:
-            with tf.compat.v1.gfile.GFile(bert_pb, 'rb') as f:
-                graph_def = tf.compat.v1.GraphDef()
-                graph_def.ParseFromString(f.read())
-        except:
-            raise ValueError("Need to run ./baseline/tensorflow/bert/generate_bert_pb.py to get model first")
-
-        input_shape = (batch_size, 128)
-        input_name = ['input']
-        shape_dict = {
-            'input': input_shape
-        }
-        out_names = [
-            'bert/pooler/dense/Tanh'
-        ]
-
-        mod, params = relay.frontend.from_tensorflow(graph_def,
-                                                    shape=shape_dict,
-                                                    outputs=out_names)
-    else:
-        raise ValueError("Unsupported network: " + name)
-
-    return mod, params, input_name, input_shape, output_shape
-
-
-def create_module(data_shape, graph, lib, target, input_name, params, debug_profile,
-        local_measure, ndk_cc, rpc_device_key, rpc_host, rpc_port, rpc_num_threads, seed=43):
-    if local_measure:
-        if target.target_name == "cuda":
-            ctx = tvm.gpu()
-        else:
-            ctx = tvm.cpu()
-    else:
-        print("=============== Request Remote ===============")
-        if 'TVM_NDK_CC' not in os.environ:
-            os.environ['TVM_NDK_CC'] = ndk_cc
-        remote = request_remote(rpc_device_key, rpc_host, rpc_port)
-
-        print("=============== Export ===============")
-        ctx = remote.cpu()
-        temp = util.tempdir()
-        path_lib = temp.relpath("deploy_lib.so")
-        lib.export_library(path_lib, ndk.create_shared)
-
-        print("=============== Upload ===============")
-        remote.upload(path_lib)
-
-        print("=============== Load ===============")
-        lib = remote.load_module("deploy_lib.so")
-
-        if rpc_num_threads:
-            config_threadpool = remote.get_function('runtime.config_threadpool')
-            config_threadpool(0, rpc_num_threads)
-
-    np.random.seed(seed)
-    data_tvm = tvm.nd.array(100 * (np.random.uniform(size=data_shape)).astype(dtype), ctx=ctx)
-    if debug_profile:
-        module = debug_runtime.create(graph, lib, ctx)
-    else:
-        module = runtime.create(graph, lib, ctx)
-
-    if type(input_name) == list:
-        for name in input_name:
-            module.set_input(name, data_tvm)
-    else:
-        module.set_input(input_name, data_tvm)
-    for k, v in params.items():
-        module.set_input(k, v)
-
-    return module, ctx
-
-
-def tune_and_evaluate(network_arguments, target, target_host,
-                      search_policy, task_scheduler_arguments, tune_option_arguments,
-                      tune, debug_profile, check_correctness, log_n_lines):
-    # Extract tasks from relay program
-    mod, params, input_name, data_shape, out_shape = get_network(**network_arguments)
-
-    # Tune all
-    if tune:
-        print("=============== Extract Workloads ===============")
-        workloads, wkl_weights = ansor.extract_from_program(mod, target=target, params=params)
-        print("Extract %d workloads in total" % (len(workloads)))
-
-        # Tune workloads with auto scheduler
-        print("=============== Tune ===============")
-        tasks = []
-        for i, wkl_key in enumerate(workloads):
-            dag = ansor.workload_key_to_dag(wkl_key)
-            print("[========= Task %d =========]\n" % i, dag)
-            tasks.append(ansor.SearchTask(dag, wkl_key, target, target_host))
-
-        tuner = ansor.SimpleTaskScheduler(tasks,
-            lambda costs: sum(c * w for c, w in zip(costs, wkl_weights)),
-            **task_scheduler_arguments)
-        tune_option, measure_ctx = create_tune_option(target, **tune_option_arguments)
-
-        if tune_option_arguments['local_measure'] and target.target_name != 'cuda':
-            os.environ['TVM_BIND_MASTER_CORE_0'] = "1"
-        tuner.tune(tune_option, search_policy)
-
-        if measure_ctx:
-            del measure_ctx
-
-    kernel_layout_rewrite = True
-
-    # Compile graph with best states found by auto-scheduler
-    print("=============== Compile ===============")
-    with ansor.apply_history_best(tune_option_arguments['log_file'], log_n_lines):
-        os.environ['TVM_AUTO_CACHE_FLUSH'] = "0"
-
-        if kernel_layout_rewrite:
-            ansor.prepare_layout_rewrite(mod, target=target, params=params)
-        else:
-            # disable layout rewrite
-            ansor.LayoutRewriteLevel.BOTH_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE
-            ansor.LayoutRewriteLevel.COMPUTE_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE
-
-        with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
-            graph, lib, opt_params = relay.build_module.build(
-                mod, target=target, params=params)
-
-        ansor.finish_layout_rewrite()
-        print("=============== Compile Finish ===============")
-
-        module, ctx = create_module(data_shape, graph, lib, target, input_name,
-                                    opt_params, debug_profile, **common_measure_parameters)
-
-        # Evaluate
-        print("========== Evaluate ==========")
-        ftimer = module.module.time_evaluator("run", ctx, number=10, repeat=3)
-        prof_res = np.array(ftimer().results)
-
-        # display profile information
-        if debug_profile or check_correctness:
-            module.run()
-            if check_correctness:
-                actual_output = module.get_output(0).asnumpy()
-                print(actual_output)
-
-        print("Mean inference time (std dev): %.2f ms (%.2f ms)" %
-              (np.mean(prof_res) * 1000, np.std(prof_res) * 1000))
-        #log_line(BenchmarkRecord(target.target_name, 'gpu' if target.target_name == 'cuda' else 'cpu', 'network',
-        #                         "%s.B%d" % (network_name, batch_size), 'AutoSchedule', layout,
-        #                         {"costs": prof_res}, time.time()), record_file)
-
-    if check_correctness:
-        print("========== Check Correctness ==========")
-        # clean relay cache
-        relay.backend.compile_engine.get().clear()
-
-        # disable layout rewrite
-        ansor.LayoutRewriteLevel.BOTH_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE
-        ansor.LayoutRewriteLevel.COMPUTE_REWRITE = ansor.LayoutRewriteLevel.NO_REWRITE
-        target = tvm.target.create('llvm')
-        with tvm.transform.PassContext(opt_level=3, disabled_pass={"AlterOpLayout"}):
-            graph, lib, opt_params = relay.build_module.build(
-                mod, target=target, params=params)
-
-        module, _ = create_module(data_shape, graph, lib, target, input_name,
-                                  opt_params, debug_profile, **common_measure_parameters)
-        module.run()
-
-        expected_output = module.get_output(0).asnumpy()
-        np.testing.assert_allclose(actual_output, expected_output, rtol=1e-3, atol=1e-3)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-
-    # Search task related arguments
-    parser.add_argument("--network", type=str, required=True)
-    parser.add_argument("--network-path", type=str, default=None, help="The path of tflite model")
-    parser.add_argument("--batch-size", type=int, default=1)
-    parser.add_argument("--layout", type=str, default='NHWC')
-    parser.add_argument("--target", type=str, default='llvm -mcpu=core-avx2')
-    parser.add_argument("--target-host", type=str, default=None)
-    parser.add_argument("--check-correctness", type=str2bool, nargs='?', const=True, default=False)
-    parser.add_argument("--debug-profile", type=str2bool, nargs='?', const=True, default=False)
-    parser.add_argument("--tune", type=str2bool, nargs='?', const=True, default=True)
-
-    # Search strategy related arguments
-    parser.add_argument("--n-trials", type=int, default=1000)
-    parser.add_argument("--policy", type=str, choices=['sketch'], default='sketch')
-    parser.add_argument("--model-type", type=str, choices=['xgb', 'random', 'no-share'], default='xgb')
-    parser.add_argument("--task-scheduler", type=str, default='gradient',
-                        choices=['no', 'gradient', 'round-robin'],
-                        help='The strategy of task scheduler')
-    parser.add_argument("--seed", type=int, default=0, help='random seed')
-
-    # Log file related arguments
-    parser.add_argument("--log-file", type=str, help="Write measurement records to this log file")
-    parser.add_argument("--load-log", type=str, help="Load history log to resume the status of search")
-    parser.add_argument("--log-n-lines", type=int, help="Only load the first n lines for history log")
-    parser.add_argument("--load-model", type=str, help="Load pre trained cost model file")
-
-    # Measurement related and other arguments
-    parser.add_argument("--num-measure-per-iter", type=int, default=48,
-                        help="The number of programs to be measured at each iteration")
-    parser.add_argument("--build-timeout", type=int, default=10)
-    parser.add_argument("--run-timeout", type=int, default=10)
-    parser.add_argument("--early-stopping", type=int, default=-1)
-    parser.add_argument("--verbose", type=int, default=1)
-    parser.add_argument("--local-measure", type=str2bool, nargs='?', const=True, default=True)
-    parser.add_argument("--rpc-device-key", type=str, default=None)
-    parser.add_argument("--rpc-host", type=str, default='0.0.0.0')
-    parser.add_argument("--rpc-port", type=int, default=9190)
-    parser.add_argument("--rpc-num-threads", type=int, default=None)
-    parser.add_argument("--n-parallel", type=int, default=1)
-    parser.add_argument("--ndk-cc", type=str, default=None)
-    args = parser.parse_args()
-
-    np.random.seed(args.seed)
-    random.seed(args.seed)
-    logging.basicConfig()
-    logging.getLogger('ansor').setLevel(logging.DEBUG)
-    os.environ["TOPHUB_LOCATION"] = "NONE"  # disable autotvm
-
-    target = tvm.target.create(args.target)
-    log_file = args.log_file or "%s-B%d-%s.json" % (args.network, args.batch_size,
-                                                    target.target_name)
-    load_log_file = args.load_log or log_file
-    search_policy = "%s.%s" % (args.policy, args.model_type)
-    if args.layout:
-        layout = args.layout
-    elif target.target_name == "cuda":
-        layout = "NCHW"
-    else:
-        layout = "NHWC"
-
-    network_arguments = {
-        'name': args.network,
-        'network_path': args.network_path,
-        'batch_size': args.batch_size,
-        'layout': layout
-    }
-
-    task_scheduler_parameters = {
-        'strategy': args.task_scheduler,
-        'load_log_file': load_log_file,
-        'load_model_file': args.load_model,
-        'verbose': args.verbose,
-    }
-
-    common_measure_parameters = {
-        'local_measure': args.local_measure,
-        'rpc_device_key': args.rpc_device_key,
-        'rpc_host': args.rpc_host,
-        'rpc_port': args.rpc_port,
-        'rpc_num_threads': args.rpc_num_threads,
-        'ndk_cc': args.ndk_cc,
-    }
-
-    tune_option_arguments = {
-        'log_file': log_file,
-        'n_trials': args.n_trials,
-        'num_measure_per_iter': args.num_measure_per_iter,
-        'verbose': args.verbose,
-        'n_parallel': args.n_parallel,
-        'build_timeout': args.build_timeout,
-        'run_timeout': args.run_timeout,
-        'early_stopping': args.early_stopping,
-        **common_measure_parameters
-    }
-
-    tune_and_evaluate(network_arguments, target, args.target_host,
-                      search_policy, task_scheduler_parameters, tune_option_arguments,
-                      args.tune, args.debug_profile, args.check_correctness,
-                      args.log_n_lines)
diff --git a/scripts/tune_op_subgraph.py b/scripts/tune_op_subgraph.py
deleted file mode 100644
index 6574bb77e510b..0000000000000
--- a/scripts/tune_op_subgraph.py
+++ /dev/null
@@ -1,585 +0,0 @@
-"""Tune all workloads for single op & subgraph evaluation"""
-import argparse
-import logging
-import random
-
-import numpy as np
-
-import tvm
-from tvm import te, ansor
-import topi
-from topi.nn.winograd_util import winograd_transform_matrices
-from topi.util import get_const_tuple
-
-from common import measure_schedule, str2bool, norm_bmn, conv2d_nhwc_bn_relu, conv2d_nchw_bn_relu
-from shape_configs import single_op_shape_dict, subgraph_shape_dict
-from tune_test import tune_workloads_jointly, replay_workload, create_tune_option
-
-# ========================== Single Ops ==========================
-
-@ansor.register_workload_func
-def batch_matmul_nkkm(B, N, M, K):
-    X = te.placeholder((B, N, K), name='A')
-    Y = te.placeholder((B, K, M), name='B')
-    k = te.reduce_axis((0, K), name='k')
-    Z = te.compute((B, N, M), lambda b, i, j: te.sum(X[b][i][k] * Y[b][k][j], axis=[k]), name='C')
-    return [X, Y, Z]
-
-@ansor.register_workload_func
-def conv1d_nlc(N, L, CI, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    inputs = te.placeholder((N, L, CI), name='inputs')
-    weight = te.placeholder((kernel_size, CI//groups, CO), name='weight')
-
-    batch_size, in_len, in_channel = inputs.shape
-    k_len, channel_per_group, out_channel = weight.shape
-    out_channel_per_group = out_channel // groups
-    out_len = (in_len + 2 * padding - dilation * (k_len - 1) - 1) // stride + 1
-    rc = te.reduce_axis((0, channel_per_group), name='rc')
-    rl = te.reduce_axis((0, k_len), name='rl')
-
-    padded = topi.nn.pad(inputs, [0, padding, 0])
-    output = te.compute(
-        (batch_size, out_len, out_channel),
-        lambda n, l, co: te.sum(
-            (padded[n, l * stride + rl * dilation, co // out_channel_per_group * channel_per_group + rc] *
-             weight[rl, rc, co]), axis=[rl, rc]),
-        name='conv1d_nlc'
-    )
-    return [inputs, weight, output]
-
-@ansor.register_workload_func
-def conv2d_nhwc(N, H, W, CI, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    inputs = te.placeholder((N, H, W, CI), name='inputs')
-    weight = te.placeholder((kernel_size, kernel_size, CI//groups, CO), name='weight')
-    batch_size, in_h, in_w, in_channel = inputs.shape
-    k_h, k_w, channel_per_group, out_channel = weight.shape
-    out_channel_per_group = out_channel // groups
-
-    out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1
-    out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1
-    rh = te.reduce_axis((0, k_h), name="rh")
-    rw = te.reduce_axis((0, k_w), name="rw")
-    rc = te.reduce_axis((0, channel_per_group), name="rc")
-
-    padded = topi.nn.pad(inputs, [0, padding, padding, 0])
-    output = te.compute(
-        (batch_size, out_h, out_w, out_channel),
-        lambda n, h, w, co: te.sum(
-            (padded[n, h * stride + rh * dilation, w * stride + rw * dilation,
-                    co // out_channel_per_group * channel_per_group + rc]
-             * weight[rh, rw, rc, co]), axis=[rh, rw, rc]
-        ),
-        name='conv2d_nhwc'
-    )
-    return [inputs, weight, output]
-
-@ansor.register_workload_func
-def conv2d_nchw(N, CI, H, W, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    inputs = te.placeholder((N, CI, H, W), name='inputs')
-    weight = te.placeholder((CO, CI//groups, kernel_size, kernel_size), name='weight')
-    batch_size, in_channel, in_h, in_w = inputs.shape
-    out_channel, channel_per_group, k_h, k_w, = weight.shape
-    out_channel_per_group = out_channel // groups
-
-    out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1
-    out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1
-    rc = te.reduce_axis((0, channel_per_group), name="rc")
-    rh = te.reduce_axis((0, k_h), name="rh")
-    rw = te.reduce_axis((0, k_w), name="rw")
-
-    padded = topi.nn.pad(inputs, [0, 0, padding, padding])
-    output = te.compute(
-        (batch_size, out_channel, out_h, out_w),
-        lambda n, co, h, w: te.sum(
-            (padded[n, co // out_channel_per_group * channel_per_group + rc,
-                    h * stride + rh * dilation, w * stride + rw * dilation]
-             * weight[co, rc, rh, rw]), axis=[rc, rh, rw]
-        ),
-        name='conv2d_nchw'
-    )
-    return [inputs, weight, output]
-
-@ansor.register_workload_func
-def conv3d_ndhwc(N, D, H, W, CI, CO, kernel_size, stride=1, padding=0, dilation=1, groups=1):
-    inputs = te.placeholder((N, D, H, W, CI))
-    weight = te.placeholder((kernel_size, kernel_size, kernel_size, CI//groups, CO))
-    batch_size, in_d, in_h, in_w, in_channel = inputs.shape
-    k_d, k_h, k_w, channel_per_group, out_channel = weight.shape
-    out_channel_per_group = out_channel // groups
-
-    out_d = (in_d + 2 * padding - dilation * (k_d - 1) - 1) // stride + 1
-    out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1
-    out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1
-    rd = te.reduce_axis((0, k_d), name='rd')
-    rh = te.reduce_axis((0, k_h), name='rh')
-    rw = te.reduce_axis((0, k_w), name='rw')
-    rc = te.reduce_axis((0, channel_per_group), name='rc')
-
-    padded = topi.nn.pad(inputs, [0, padding, padding, padding, 0])
-    output = te.compute(
-        (batch_size, out_d, out_h, out_w, out_channel),
-        lambda n, d, h, w, co: te.sum(
-            (padded[n, d * stride + rd * dilation,
-                    h * stride + rh * dilation, w * stride + rw * dilation,
-                    co // out_channel_per_group * channel_per_group + rc]
-             * weight[rd, rh, rw, rc, co]),
-            axis=[rd, rh, rw, rc]
-        ),
-        name='conv3d_ndhwc'
-    )
-    return [inputs, weight, output]
-
-@ansor.register_workload_func
-def depthwise_conv2d_nhwc(N, H, W, C, kernel_size, stride=1, padding=0, dilation=1, factor=1):
-    inputs = te.placeholder((N, H, W, C))
-    weight = te.placeholder((factor, kernel_size, kernel_size, C))
-
-    batch_size, in_h, in_w, in_channel = inputs.shape
-    factor, k_h, k_w, in_channel = weight.shape
-    out_channel = in_channel * factor
-
-    assert factor.value == 1, "Not optimized for factor != 1"
-
-    out_h = (in_h + 2 * padding - dilation * (k_h - 1) - 1) // stride + 1
-    out_w = (in_w + 2 * padding - dilation * (k_w - 1) - 1) // stride + 1
-    rh = te.reduce_axis((0, k_h), name='rh')
-    rw = te.reduce_axis((0, k_w), name='rw')
-
-    padded = topi.nn.pad(inputs, [0, padding, padding, 0])
-    output = te.compute(
-        (batch_size, out_h, out_w, out_channel),
-        lambda n, h, w, c: te.sum(
-            (padded[n,  h * stride + rh * dilation, w * stride + rw * dilation, c // factor]
-             * weight[c % factor, rh, rw, c // factor]),
-            axis=[rh, rw]
-        ),
-        name="depth_conv2d_nhwc"
-    )
-    return [inputs, weight, output]
-
-@ansor.register_workload_func
-def conv2d_transpose_nhwc(N, H, W, CI, CO, kernel_size, stride=1, padding=0):
-    inputs = te.placeholder((N, H, W, CI), name='inputs')
-    weight = te.placeholder((kernel_size, kernel_size, CI, CO), name='weight')
-
-    batch, in_h, in_w, in_c = inputs.shape
-    filter_h, filter_w, in_c, out_c = weight.shape
-    stride_h, stride_w = (stride, stride)
-
-    # compute padding
-    fpad_top, fpad_left, fpad_bottom, fpad_right = topi.nn.get_pad_tuple(padding, (filter_h, filter_w))
-    bpad_top = filter_h - 1 - fpad_top
-    bpad_bottom = filter_h - 1 - fpad_bottom
-    bpad_left = filter_w - 1 - fpad_left
-    bpad_right = filter_w - 1 - fpad_right
-
-    # padding stage
-    padded = topi.nn.pad(inputs,
-                         [0, (bpad_top + stride_h - 1) // stride_h,
-                          (bpad_left + stride_w - 1) // stride_w, 0],
-                         [0, (bpad_bottom + stride_h - 1) // stride_h,
-                          (bpad_right + stride_w - 1) // stride_w, 0])
-
-    # remove extra padding introduced by dilatation
-    idxdiv = te.indexdiv
-    idxmod = te.indexmod
-    border_h = idxmod(stride_h - idxmod(bpad_top, stride_h), stride_h)
-    border_w = idxmod(stride_w - idxmod(bpad_left, stride_w), stride_w)
-
-    # dilation stage
-    strides = [1, stride_h, stride_w, 1]
-    n = len(padded.shape)
-
-    # We should embed this dilation directly into te.compute rather than creating a new te.compute.
-    # Only in this way can we use unroll to eliminate the multiplication of zeros.
-    def _dilate(*indices):
-        not_zero = []
-        index_tuple = []
-        for i in range(n):
-            if not strides[i] == 1:
-                index_tuple.append(idxdiv(indices[i], strides[i]))
-                not_zero.append(idxmod(indices[i], strides[i]).equal(0))
-            else:
-                index_tuple.append(indices[i])
-        if not_zero:
-            not_zero = te.all(*not_zero)
-            return te.if_then_else(not_zero, padded(*index_tuple), tvm.tir.const(0.0, padded.dtype))
-        return padded(*index_tuple)
-
-    # convolution stage
-    out_h = (in_h - 1) * stride_h - fpad_top - fpad_bottom + filter_h
-    out_w = (in_w - 1) * stride_w - fpad_left - fpad_right + filter_w
-    rc = te.reduce_axis((0, in_c), name='rc')
-    rh = te.reduce_axis((0, filter_h), name='rh')
-    rw = te.reduce_axis((0, filter_w), name='rw')
-
-    output = te.compute(
-        (batch, out_h, out_w, out_c),
-        lambda n, h, w, co: te.sum(
-            _dilate(n, h + rh + border_h, w + rw + border_w, rc) *
-            weight[filter_h - 1 - rh, filter_w - 1 - rw, rc, co],
-            axis=[rh, rw, rc]),
-        name="conv2d_transpose_nhwc",
-        attrs={"ansor_always_unroll_inner": ["h", "w", "rh", "rw", "h_c", "w_c"]})
-    # todo(lmzheng): add constraints on the tile size of h and w
-
-    return [inputs, weight, output]
-
-@ansor.register_workload_func
-def conv2d_capsule_nhwijc(N, H, W, CI, CO, kernel_size, stride=1, padding=0, capsule_size=4):
-    inputs = te.placeholder((N, H, W, capsule_size, capsule_size, CI), name='inputs')
-    weight = te.placeholder((kernel_size, kernel_size, capsule_size, capsule_size, CI, CO), name='weight')
-    batch_size, in_h, in_w, _, _, in_channel = inputs.shape
-    k_h, k_w, _, _, _, out_channel = weight.shape
-
-    out_h = (in_h + 2 * padding - kernel_size) // stride + 1
-    out_w = (in_w + 2 * padding - kernel_size) // stride + 1
-
-    rh = te.reduce_axis((0, k_h), name="rh")
-    rw = te.reduce_axis((0, k_w), name="rw")
-    cap_k = te.reduce_axis((0, capsule_size), name='cap_k')
-    rc = te.reduce_axis((0, in_channel), name="rc")
-
-    padded = topi.nn.pad(inputs, [0, padding, padding, 0, 0, 0])
-    output = te.compute(
-        (batch_size, out_h, out_w, capsule_size, capsule_size, out_channel),
-        lambda n, h, w, cap_i, cap_j, co: te.sum(
-            (padded[n, h * stride + rh, w * stride + rw, cap_i, cap_k, rc]
-             * weight[rh, rw, cap_k, cap_j, rc, co]), axis=[rh, rw, cap_k, rc]
-        ),
-        name='conv2d_capsule_nhwijc'
-    )
-    return [inputs, weight, output]
-
-
-@ansor.register_workload_func
-def conv2d_winograd_nhwc(N, H, W, CI, CO, kernel_size=3, stride=1, padding=0, dilation=1):
-    # TODO: implement tile_size
-    tile_size = 4 #_infer_tile_size(data, kernel)
-    inputs = te.placeholder((N, H, W, CI), name='inputs')
-    #weight = te.placeholder((kernel_size, kernel_size, CI, CO), name='weight')
-    N, H, W, CI = get_const_tuple(inputs.shape)
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-    # if dilation_h != 1 or dilation_w != 1:
-    #     weight = topi.nn.dilate(weight, (1, 1, dilation_h, dilation_w))
-    KH = KW = kernel_size
-    HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW))
-    HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride
-    assert HSTR == 1 and WSTR == 1 and KH == KW
-
-    data_pad = topi.nn.pad(inputs, (0, HPAD, WPAD, 0), (0, HPAD, WPAD, 0), name="data_pad")
-
-    r = KW
-    m = tile_size
-    alpha = m + r - 1
-    A, B, G = winograd_transform_matrices(m, r, 'float32')
-
-    H = (H + 2 * HPAD - KH) // HSTR + 1
-    W = (W + 2 * WPAD - KW) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-    P = N * nH * nW
-    r_kh = te.reduce_axis((0, KH), name='r_kh')
-    r_kw = te.reduce_axis((0, KW), name='r_kw')
-    # kernel_pack = te.compute((alpha, alpha, CO, CI), lambda eps, nu, co, ci:
-    #                           weight[0][0][0][0],
-    #                           name='kernel_pack')
-    kshape = (alpha, alpha, CO, CI)
-    kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight")
-
-    idxdiv = te.indexdiv
-    idxmod = te.indexmod
-    # pack input tile
-    input_tile = te.compute((alpha, alpha, P, CI), lambda eps, nu, p, ci:
-                             data_pad[idxdiv(p, (nH * nW))][idxmod(idxdiv(p, nW), nH) * m + eps]
-                                     [idxmod(p, nW) * m + nu][ci], name='input_tile',)
-
-    # transform data
-    r_a = te.reduce_axis((0, alpha), 'r_a')
-    r_b = te.reduce_axis((0, alpha), 'r_b')
-    data_pack = te.compute((alpha, alpha, P, CI), lambda eps, nu, p, ci:
-                            te.sum(input_tile[r_a][r_b][p][ci] * B[r_a][eps] * B[r_b][nu],
-                                    axis=[r_a, r_b]), name='data_pack',
-                                    attrs={"ansor_no_split_at_inner": ["eps", "nu", "r_a", "r_b"],
-                                           "ansor_last_split_is_one": ["ci", "p"],
-                                           "ansor_always_unroll": ["eps", "nu", "r_a", "r_b"],
-                                           "ansor_no_cache_write": "True",
-                                           })
-
-    # do batch gemm
-    ci = te.reduce_axis((0, CI), name='ci')
-    bgemm = te.compute((alpha, alpha, P, CO), lambda eps, nu, p, co:
-                        te.sum(data_pack[eps][nu][p][ci] *
-                                kernel_pack[eps][nu][co][ci],
-                                axis=[ci]), name='bgemm')
-
-    # inverse transform
-    r_a = te.reduce_axis((0, alpha), 'r_a')
-    r_b = te.reduce_axis((0, alpha), 'r_b')
-    inverse = te.compute((m, m, P, CO), lambda vh, vw, p, co:
-                          te.sum(bgemm[r_a][r_b][p][co] * A[r_a][vh] * A[r_b][vw],
-                                  axis=[r_a, r_b]), name='inverse',
-                          attrs={"ansor_no_split_at_inner": ["vh", "vw", "r_a", "r_b"],
-                                 "ansor_always_unroll": ["vh", "vw", "r_a", "r_b"],
-                                 "ansor_last_split_is_one": ["co", "p"],
-                                 "ansor_no_cache_write": "True",
-                                 })
-
-    # output
-    output = te.compute((N, H, W, CO), lambda n, h, w, co:
-                         inverse[idxmod(h, m),
-                                 idxmod(w, m),
-                                 n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m),
-                                 co],
-                         name='conv2d_winograd',
-                         tag='conv2d_winograd_nhwc',
-                         attrs={"ansor_no_split_at_outer": ["n", "h", "w", "co"],})
-    return [inputs, kernel_pack, output]
-
-@ansor.register_workload_func
-def conv2d_winograd_nchw(N, CI, H, W, CO, kernel_size=3, stride=1, padding=0, dilation=1, precompute=False):
-    # TODO: implement tile_size
-    tile_size = 4 #_infer_tile_size(data, kernel)
-    inputs = te.placeholder((N, CI, H, W), name='inputs')
-    #weight = te.placeholder((CO, CI, kernel_size, kernel_size), name='weight')
-    N, CI, H, W = get_const_tuple(inputs.shape)
-    # if isinstance(dilation, int):
-    #     dilation_h = dilation_w = dilation
-    # else:
-    #     dilation_h, dilation_w = dilation
-    # if dilation_h != 1 or dilation_w != 1:
-    #     weight = topi.nn.dilate(weight, (1, 1, dilation_h, dilation_w))
-    KH = KW = kernel_size
-    HPAD, WPAD, _, _ = topi.nn.get_pad_tuple(padding, (KH, KW))
-    HSTR, WSTR = (stride, stride) if isinstance(stride, int) else stride
-    assert HSTR == 1 and WSTR == 1 and KH == KW
-
-    data_pad = topi.nn.pad(inputs, (0, 0, HPAD, WPAD), (0, 0, HPAD, WPAD), name="data_pad")
-
-    r = KW
-    m = tile_size
-    alpha = m + r - 1
-    A, B, G = winograd_transform_matrices(m, r, 'float32')
-
-    H = (H + 2 * HPAD - KH) // HSTR + 1
-    W = (W + 2 * WPAD - KW) // WSTR + 1
-    nH, nW = (H + m - 1) // m, (W + m - 1) // m
-    P = N * nH * nW
-    r_kh = te.reduce_axis((0, KH), name='r_kh')
-    r_kw = te.reduce_axis((0, KW), name='r_kw')
-    # kernel_pack = te.compute((alpha, alpha, CI, CO), lambda eps, nu, ci, co:
-    #                           weight[0][0][0][0],
-    #                           name='kernel_pack')
-    kshape = (alpha, alpha, CI, CO)
-    kernel_pack = te.placeholder(kshape, inputs.dtype, name="weight")
-
-    idxdiv = te.indexdiv
-    idxmod = te.indexmod
-    # pack input tile
-    input_tile = te.compute((CI, P, alpha, alpha), lambda ci, p, eps, nu:
-                             data_pad[idxdiv(p, (nH * nW))][ci][idxmod(idxdiv(p, nW), nH) * m + eps]
-                                     [idxmod(p, nW) * m + nu], name='input_tile')
-
-    # transform data
-    r_a = te.reduce_axis((0, alpha), 'r_a')
-    r_b = te.reduce_axis((0, alpha), 'r_b')
-    data_pack = te.compute((alpha, alpha, CI, P), lambda eps, nu, ci, p:
-                            te.sum(input_tile[ci][p][r_a][r_b] * B[r_a][eps] * B[r_b][nu],
-                                    axis=[r_a, r_b]), name='data_pack',
-                                    attrs={"ansor_no_split_at_inner": ["eps", "nu", "r_a", "r_b"],
-                                           "ansor_no_split_at_outer": ["ci", "p"],
-                                           "ansor_always_unroll": ["eps", "nu", "r_a", "r_b"],
-                                           "ansor_no_cache_write": "True",
-                                           })
-
-    # do batch gemm
-    ci = te.reduce_axis((0, CI), name='ci')
-    bgemm = te.compute((alpha, alpha, CO, P), lambda eps, nu, co, p:
-                        te.sum(data_pack[eps][nu][ci][p] *
-                                kernel_pack[eps][nu][ci][co],
-                                axis=[ci]), name='bgemm')
-
-    # inverse transform
-    r_a = te.reduce_axis((0, alpha), 'r_a')
-    r_b = te.reduce_axis((0, alpha), 'r_b')
-    inverse = te.compute((CO, P, m, m), lambda co, p, vh, vw:
-                          te.sum(bgemm[r_a][r_b][co][p] * A[r_a][vh] * A[r_b][vw],
-                                  axis=[r_a, r_b]), name='inverse',
-                          attrs={"ansor_no_split_at_outer": ["co", "p", "vh", "vw", "r_a", "r_b"],
-                                 "ansor_always_unroll": ["vh", "vw", "r_a", "r_b"],
-                                 "ansor_no_cache_write": "True"})
-
-    # output
-    output = te.compute((N, CO, H, W), lambda n, co, h, w:
-                         inverse[co, n * nH * nW + idxdiv(h, m) * nW + idxdiv(w, m),
-                                 idxmod(h, m),
-                                 idxmod(w, m)],
-                         name='conv2d_winograd',
-                         attrs={"ansor_no_split_at_outer": ["n", "co", "h", "w"],})
-    return [inputs, kernel_pack, output]
-
-# ========================== Subgraphs ==========================
-
-@ansor.register_workload_func
-def transpose_batch_matmul(batch, seq_len, n_head, n_dim):
-    query = te.placeholder((batch, seq_len, n_head, n_dim), name='query')
-    value = te.placeholder((batch, seq_len, n_head, n_dim), name='value')
-    query_T = te.compute((batch, n_head, seq_len, n_dim),
-                      lambda b, h, l, d: query[b, l, h, d], name="query_T")
-    value_T = te.compute((batch, n_head, n_dim, seq_len),
-                      lambda b, h, d, l: value[b, l, h, d], name="value_T")
-    k = te.reduce_axis((0, n_dim), name='k')
-    out = te.compute((batch, n_head, seq_len, seq_len),
-                 lambda b, h, i, j: te.sum(query_T[b][h][i][k] * value_T[b][h][k][j], axis=[k]),
-                 name='C')
-    return [query, value, out]
-
-# ========================== Tune function & Task dicts ==========================
-
-def tune_wkl(task_func_dict, shape_dict, wkl_type, args):
-    target = tvm.target.create(args.target)
-
-    for wkl_meta_name, func in task_func_dict.items():
-        if not args.wkl in ["all", wkl_type, wkl_meta_name]:
-            continue
-
-        log_file = args.log_file or wkl_meta_name + ".json"
-        wkl_keys = []
-        for shape in shape_dict[wkl_meta_name]:
-            if shape[0] == 1:
-                shape = list(shape)
-                shape[0] = args.batch_size
-
-            wkl_key = ansor.make_workload_key_func(func, shape)
-            wkl_keys.append(wkl_key)
-            if args.fast_check:
-                break
-
-            if not args.tune:
-                cost, gflops = replay_workload(
-                        wkl_key, target, args.target_host, log_file,
-                        args.local_measure, args.rpc_device_key, args.rpc_host,
-                        args.rpc_port, args.rpc_num_threads, args.ndk_cc, False)
-                # log_line(BenchmarkRecord(target.name, 'gpu' if target.name == 'cuda' else 'cpu', 'subgraph',
-                #                          workload_name, "AutoSchedule", "default",
-                #                          {"costs": [cost]}, time.time()), args.out_file)
-
-        if args.tune:
-            print("========== Tune for %s (%d shapes) ========== " % (wkl_meta_name, len(wkl_keys)))
-
-            load_log_file = args.load_log or log_file
-            n_trials = args.n_trials_per_shape * len(wkl_keys)
-
-            tune_option, measure_ctx = create_tune_option(target, log_file,
-                    n_trials, args.num_measure_per_iter, args.verbose,
-                    args.n_parallel, args.build_timeout, args.local_measure,
-                    args.rpc_device_key, args.rpc_host, args.rpc_port,
-                    args.rpc_num_threads, args.ndk_cc)
-
-            # tune workloads jointly using JointTuner
-            tune_workloads_jointly(wkl_keys, np.ones(len(wkl_keys)), args.task_scheduler,
-                                   target, args.target_host, args.policy, args.model_type,
-                                   args.load_model, load_log_file, tune_option)
-
-            if measure_ctx:
-                del measure_ctx
-
-
-single_op_task_func_dict = {
-    'GMM': batch_matmul_nkkm,
-    'C1D': conv1d_nlc,
-    'C2D': conv2d_nhwc,
-    'C3D': conv3d_ndhwc,
-    'GRP': conv2d_nhwc,
-    'DIL': conv2d_nhwc,
-    'DEP': depthwise_conv2d_nhwc,
-    'T2D': conv2d_transpose_nhwc,
-    'CAP': conv2d_capsule_nhwijc,
-    'NRM': norm_bmn,
-    #'SMX': softmax_mn,
-
-#    The following workloads are not in our sinle op evaluation plan.
-#    They should be moved to `common.py` and be used by `tune_wkl.py`.
-#    'C2D_NCHW': conv2d_nchw,
-#    'C2DWG_NHWC': conv2d_winograd_nhwc,
-#    'C2DWG_NCHW': conv2d_winograd_nchw,
-#    'GMM_TC': matmul_nkkm,
-}
-
-subgraph_task_func_dict = {
-    'conv2d_bn_relu': conv2d_nhwc_bn_relu,
-    #'conv2d_bn_relu': conv2d_nchw_bn_relu,    # some old log uses conv2d_nchw_bn_relu
-    'transpose_batch_matmul': transpose_batch_matmul,
-}
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Search task related arguments
-    parser.add_argument("--wkl", type=str, required=True,
-                        help="all      - Tune all workloads; \
-                              op       - Tune all single ops; \
-                              subgraph - Tune all subgraphs; \
-                              specific wkl name - Tune a specific workload")
-    parser.add_argument("--batch-size", type=int, default=1)
-    parser.add_argument("--target", type=str, default='llvm -mcpu=core-avx2')
-    parser.add_argument("--target-host", type=str, default=None)
-    parser.add_argument("--tune", type=str2bool, nargs='?', const=True, default=True)
-    parser.add_argument("--fast-check", action='store_true',
-                        help='Only run one shape for each workload. This is used for fast checking')
-
-    # Search strategy related arguments
-    parser.add_argument("--n-trials-per-shape", type=int, default=1000)
-    parser.add_argument("--policy", type=str, choices=['sketch', 'beam-search'], default='sketch')
-    parser.add_argument("--model-type", type=str, choices=['xgb', 'random', 'no-share'], default='xgb')
-    parser.add_argument("--task-scheduler", type=str, default='round-robin',
-                        choices=['no', 'gradient', 'round-robin'], help='The strategy of task scheduler')
-    parser.add_argument("--seed", type=int, default=0, help='random seed')
-
-    # Log file related arguments
-    parser.add_argument("--log-file", type=str, help="Write measurement records to this log file")
-    parser.add_argument("--load-log", type=str, help="Load history log to resume the status of search")
-    parser.add_argument("--load-model", type=str, help="Load pre-trained cost model from this file")
-
-    # Measurement related and other arguments
-    parser.add_argument("--num-measure-per-iter", type=int, default=48,
-                        help="The number of programs to be measured at each iteration")
-    parser.add_argument("--build-timeout", type=int, default=10)
-    parser.add_argument("--run-timeout", type=int, default=60)
-    parser.add_argument("--verbose", type=int, default=1)
-    parser.add_argument("--local-measure", type=str2bool, nargs='?', const=True, default=True)
-    parser.add_argument("--rpc-device-key", type=str, default=None)
-    parser.add_argument("--rpc-host", type=str, default='0.0.0.0')
-    parser.add_argument("--rpc-port", type=int, default=9190)
-    parser.add_argument("--rpc-num-threads", type=int, default=None)
-    parser.add_argument("--n-parallel", type=int, default=1)
-    parser.add_argument("--ndk-cc", type=str, default=None)
-    args = parser.parse_args()
-
-    np.random.seed(args.seed)
-    random.seed(args.seed)
-    logging.basicConfig()
-    logging.getLogger('ansor').setLevel(logging.DEBUG)
-
-    # compute the number of tasks
-    num_tasks = 0
-    for wkl_meta_name in single_op_task_func_dict:
-        if not args.wkl in ["all", "op", wkl_meta_name]:
-            continue
-        if args.fast_check:
-            num_tasks += 1
-        else:
-            num_tasks += len(single_op_shape_dict[wkl_meta_name])
-    for wkl_meta_name in subgraph_task_func_dict:
-        if not args.wkl in ["all", "subgraph", wkl_meta_name]:
-            continue
-        if args.fast_check:
-            num_tasks += 1
-        else:
-            num_tasks += len(subgraph_shape_dict[wkl_meta_name])
-    print("Number of tasks: %d\tTotal trials: %d" % (num_tasks, num_tasks * args.n_trials_per_shape))
-
-    # tune for tasks
-    tune_wkl(single_op_task_func_dict, single_op_shape_dict, "op", args)
-    tune_wkl(subgraph_task_func_dict, subgraph_shape_dict, "subgraph", args)
diff --git a/scripts/tune_test.py b/scripts/tune_test.py
deleted file mode 100644
index 67c0526dd6247..0000000000000
--- a/scripts/tune_test.py
+++ /dev/null
@@ -1,212 +0,0 @@
-"""Use auto scheduler to tune workloads"""
-import argparse
-import logging
-import os
-import random
-
-import numpy as np
-
-import tvm
-from tvm import ansor
-from tvm.ansor.utils import request_remote
-
-from common import get_workload_keys, get_workload_weights, measure_schedule, str2bool
-
-def create_tune_option(target, log_file, n_trials, num_measure_per_iter, verbose,
-                       n_parallel, build_timeout, local_measure, rpc_device_key, rpc_host,
-                       rpc_port, rpc_num_threads, ndk_cc, early_stopping=-1, run_timeout=10):
-    builder = runner = measure_ctx = None
-    if local_measure:
-        builder = ansor.LocalBuilder(timeout=build_timeout)
-        if target.target_name == "cuda":
-            measure_ctx = ansor.LocalRPCMeasureContext(repeat=1, min_repeat_ms=400)
-            runner = measure_ctx.runner
-        else:
-            os.environ['TVM_AUTO_CACHE_FLUSH'] = "1"
-            runner = ansor.LocalRunner(repeat=10, number=1, min_repeat_ms=0, timeout=run_timeout)
-    else:
-        os.environ['TVM_NDK_CC'] = ndk_cc
-        builder = ansor.LocalBuilder(timeout=build_timeout, build_func='ndk')
-        runner = ansor.RPCRunner(key=rpc_device_key, host=rpc_host, port=rpc_port,
-                                 timeout=run_timeout, n_parallel=n_parallel,
-                                 repeat=1, min_repeat_ms=200)
-        remote = request_remote(rpc_device_key, rpc_host, rpc_port)
-        if rpc_num_threads:
-            config_threadpool = remote.get_function('runtime.config_threadpool')
-            config_threadpool(0, rpc_num_threads)
-
-    tune_option = ansor.TuneOption(n_trials=n_trials, early_stopping=early_stopping,
-                                   num_measure_per_iter=num_measure_per_iter,
-                                   verbose=verbose,
-                                   builder=builder,
-                                   runner=runner,
-                                   measure_callbacks=[ansor.LogToFile(log_file)],
-                                   pre_search_callbacks=[ansor.PreloadMeasuredStates(log_file)])
-
-    return tune_option, measure_ctx
-
-
-def replay_workload(wkl_key, target, target_host, log_file,
-                    local_measure=True, rpc_device_key=None, rpc_host="0.0.0.0",
-                    rpc_port=9190, rpc_num_threads=None, ndk_cc=None,
-                    show_lower_result=True):
-    cost = gflops = None
-
-    inp, res = ansor.best_measure_pair_in_file(log_file, wkl_key, target)
-    if inp is None:
-        print("Cannot find log for: %s" % wkl_key)
-    else:
-        dag = ansor.workload_key_to_dag(inp.task.workload_key)
-        print("Found schedule for: %s" % wkl_key)
-
-        s, bufs = dag.apply_steps_from_state(inp.state)
-        if show_lower_result:
-            print(tvm.lower(s, bufs, simple_mode=True))
-
-        if local_measure:
-            remote = None
-        else:
-            remote = request_remote(rpc_device_key, rpc_host, rpc_port)
-            if rpc_num_threads:
-                config_threadpool = remote.get_function('runtime.config_threadpool')
-                config_threadpool(0, rpc_num_threads)
-
-        cost = np.mean((measure_schedule(s, bufs, target, target_host,
-                                         remote=remote, ndk_cc=ndk_cc)))
-        gflops = ansor.ComputeDAG(bufs).flop_ct / cost / 1e9
-        print("Best schedule: %.2f GFLOPS\tcost: %.3f ms" % (gflops, cost * 1e3))
-
-    return cost, gflops
-
-
-def tune_workload(wkl_key, target, target_host, policy, model_type,
-                  load_model_file, load_log_file, tune_option):
-    """Tune a workload"""
-
-    if False:
-        # Debug info. Print static analysis results from the access analyzer
-        dag = ansor.workload_key_to_dag(wkl_key)
-        print(dag.access_analyzer)
-        exit()
-
-    if model_type == 'xgb':
-        model = ansor.XGBModel()
-        if load_model_file:
-            print("Load pretrained model...")
-            model.load(load_model_file)
-        elif load_log_file:
-            model.load_log_file(load_log_file)
-        elif model_type == "random":
-            model = ansor.RandomModel()
-        else:
-            raise ValueError("Invalid model: " + model_type)
-
-    if policy == 'sketch':
-        policy = ansor.SketchSearchPolicy(program_cost_model=model)
-    elif policy == 'beam-search':
-        policy = ansor.SketchSearchPolicy(program_cost_model=model,
-                                          params={'use_beam_search': 1})
-    else:
-        raise ValueError("Invalid search policy: " + policy)
-
-    s, bufs = ansor.auto_schedule(wkl_key,
-                                  target=target, target_host=target_host,
-                                  search_policy=policy,
-                                  tune_option=tune_option)
-
-def tune_workloads_jointly(wkl_keys, weights, task_scheduler, target, target_host,
-                           search_policy, model_type, load_model_file, load_log_file,
-                           tune_option):
-    """Tune for multiple workloads together with TaksScheduler"""
-    tasks = []
-    for wkl_key in wkl_keys:
-        dag = ansor.workload_key_to_dag(wkl_key)
-        tasks.append(ansor.SearchTask(dag, wkl_key, target, target_host))
-
-    def objective_func(costs):
-        return sum(c * w for c, w in zip(costs, weights))
-
-    tuner = ansor.SimpleTaskScheduler(tasks, objective_func, strategy=task_scheduler,
-                                      load_log_file=load_log_file, load_model_file=load_model_file)
-    search_policy = "%s.%s" % (search_policy, model_type)
-    tuner.tune(tune_option, search_policy)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    # Search task related arguments
-    parser.add_argument("--wkl", type=str, required=True)
-    parser.add_argument("--target", type=str, default='llvm -mcpu=core-avx2')
-    parser.add_argument("--target-host", type=str, default=None)
-    parser.add_argument("--tune", type=str2bool, nargs='?', const=True, default=True)
-
-    # Search strategy related arguments
-    parser.add_argument("--n-trials", type=int, default=1000)
-    parser.add_argument("--policy", type=str, choices=['sketch', 'beam-search'], default='sketch')
-    parser.add_argument("--model-type", type=str, choices=['xgb', 'random', 'no-share'], default='xgb')
-    parser.add_argument("--task-scheduler", type=str, default='no',
-                        choices=['no', 'gradient', 'round-robin'],
-                        help='The strategy of task scheduler')
-    parser.add_argument("--seed", type=int, default=0, help='random seed')
-
-    # Log file related arguments
-    parser.add_argument("--log-file", type=str, help="Write measurement records to this log file")
-    parser.add_argument("--load-log", type=str, help="Load history log to resume the status of search")
-    parser.add_argument("--load-model", type=str, help="Load pre-trained cost model from this file")
-
-    # Measurement related and other arguments
-    parser.add_argument("--num-measure-per-iter", type=int, default=48,
-                        help="The number of programs to be measured at each iteration")
-    parser.add_argument("--build-timeout", type=int, default=10)
-    parser.add_argument("--run-timeout", type=int, default=60)
-    parser.add_argument("--verbose", type=int, default=1)
-    parser.add_argument("--local-measure", type=str2bool, nargs='?', const=True, default=True)
-    parser.add_argument("--rpc-device-key", type=str, default=None)
-    parser.add_argument("--rpc-host", type=str, default='0.0.0.0')
-    parser.add_argument("--rpc-port", type=int, default=9190)
-    parser.add_argument("--rpc-num-threads", type=int, default=None)
-    parser.add_argument("--n-parallel", type=int, default=1)
-    parser.add_argument("--ndk-cc", type=str, default=None)
-    args = parser.parse_args()
-
-    np.random.seed(args.seed)
-    random.seed(args.seed)
-    logging.basicConfig()
-    logging.getLogger('ansor').setLevel(logging.DEBUG)
-
-    wkl_keys = get_workload_keys(args.wkl)
-    target = tvm.target.create(args.target)
-    log_file = args.log_file or args.wkl + ".json"
-
-    # Tune workloads
-    if args.tune:
-        load_log_file = args.load_log or log_file
-        weights = get_workload_weights(args.wkl)
-
-        tune_option, measure_ctx = create_tune_option(target, log_file,
-            args.n_trials, args.num_measure_per_iter, args.verbose,
-            args.n_parallel, args.build_timeout, args.local_measure,
-            args.rpc_device_key, args.rpc_host, args.rpc_port, args.rpc_num_threads,
-            args.ndk_cc)
-
-        if args.task_scheduler == 'no':
-            # tune workloads one by one
-            for wkl_key in wkl_keys:
-                tune_workload(wkl_key, target, args.target_host, args.policy,
-                              args.model_type, args.load_model, load_log_file,
-                              tune_option)
-        else:
-            # tune workloads jointly with TaskScheduler
-            tune_workloads_jointly(wkl_keys, weights, args.task_scheduler,
-                                   target, args.target_host, args.policy,
-                                   args.model_type, args.load_model, load_log_file,
-                                   tune_option)
-        if measure_ctx:
-            del measure_ctx
-
-    # Replay the best found schedule
-    if len(wkl_keys) == 1 or not args.tune:
-        for wkl_key in wkl_keys:
-            replay_workload(wkl_key, target, args.target_host, log_file,
-                            args.local_measure, args.rpc_device_key, args.rpc_host,
-                            args.rpc_port, args.rpc_num_threads, args.ndk_cc)
diff --git a/src/ansor/auto_schedule.cc b/src/ansor/auto_schedule.cc
deleted file mode 100644
index 05cb95c2c4514..0000000000000
--- a/src/ansor/auto_schedule.cc
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ansor/auto_schedule.cc
- * \brief The user interface of the auto-scheduler
- */
-
-#include "auto_schedule.h"
-#include <tvm/runtime/registry.h>
-#include <string>
-#include <utility>
-#include "search_policy/sketch_search_policy.h"
-
-namespace tvm {
-namespace ansor {
-
-TVM_REGISTER_NODE_TYPE(TuneOptionNode);
-
-TuneOption::TuneOption(int n_trials, int early_stopping,
-                       int num_measure_per_iter, int verbose, Builder builder,
-                       Runner runner, Array<MeasureCallback> measure_callbacks,
-                       Array<SearchCallback> pre_search_callbacks) {
-  auto node = make_object<TuneOptionNode>();
-  node->n_trials = n_trials;
-  node->early_stopping = early_stopping;
-  node->num_measure_per_iter = num_measure_per_iter;
-  node->verbose = verbose;
-  node->builder = std::move(builder);
-  node->runner = std::move(runner);
-  node->measure_callbacks = std::move(measure_callbacks);
-  node->pre_search_callbacks = std::move(pre_search_callbacks);
-  data_ = std::move(node);
-}
-
-std::pair<te::Schedule, Array<te::Tensor> > AutoSchedule(SearchTask task,
-    SearchPolicy search_policy, TuneOption tune_option) {
-  // Search for the best schedule
-  ProgramMeasurer measurer =
-      ProgramMeasurer(tune_option->builder, tune_option->runner,
-                      tune_option->measure_callbacks,
-                      tune_option->verbose);
-
-  State state = search_policy->Search(
-      task, tune_option->n_trials, tune_option->early_stopping,
-      tune_option->num_measure_per_iter, tune_option->verbose, measurer,
-      tune_option->pre_search_callbacks);
-
-  return task->compute_dag.ApplySteps(state->transform_steps);
-}
-
-std::pair<te::Schedule, Array<te::Tensor> > AutoSchedule(
-    std::string workload_key, Target target, Target target_host,
-    SearchPolicy search_policy, HardwareParams hardware_params,
-    TuneOption tune_option) {
-  ComputeDAG dag = ComputeDAG(workload_key);
-  SearchTask task = SearchTask(
-      std::move(dag), std::move(workload_key), std::move(target),
-      std::move(target_host), std::move(hardware_params));
-  return AutoSchedule(std::move(task), std::move(search_policy),
-                      std::move(tune_option));
-}
-
-TVM_REGISTER_GLOBAL("ansor.TuneOption")
-.set_body_typed([](int n_trials, int early_stopping,
-                   int num_measure_per_iter, int verbose, Builder builder,
-                   Runner runner, Array<MeasureCallback> measure_callbacks,
-                   Array<SearchCallback> pre_search_callbacks) {
-  return TuneOption(n_trials, early_stopping, num_measure_per_iter, verbose,
-                    builder, runner, measure_callbacks, pre_search_callbacks);
-});
-
-TVM_REGISTER_GLOBAL("ansor.AutoScheduleBySearchTask")
-.set_body_typed([](SearchTask task, SearchPolicy search_policy,
-                   TuneOption tune_option) {
-  te::Schedule sch;
-  Array<te::Tensor> return_tensors;
-  std::tie(sch, return_tensors) = AutoSchedule(task, search_policy, tune_option);
-
-  return Array<ObjectRef>{sch, return_tensors};
-});
-
-TVM_REGISTER_GLOBAL("ansor.AutoScheduleByWorkloadKey")
-.set_body_typed([](std::string workload_key, Target target,
-                   Target target_host, SearchPolicy search_policy,
-                   HardwareParams hardware_params, TuneOption tune_option) {
-  te::Schedule sch;
-  Array<te::Tensor> return_tensors;
-  std::tie(sch, return_tensors) =
-      AutoSchedule(workload_key, target, target_host, search_policy,
-                   hardware_params, tune_option);
-
-  return Array<ObjectRef>{sch, return_tensors};
-});
-
-}  // namespace ansor
-}  // namespace tvm
diff --git a/src/ansor/auto_schedule.h b/src/ansor/auto_schedule.h
deleted file mode 100644
index f17c043cfadd3..0000000000000
--- a/src/ansor/auto_schedule.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ansor/auto_schedule.h
- * \brief The user interface of the auto-scheduler
- */
-
-#ifndef TVM_ANSOR_AUTO_SCHEDULE_H_
-#define TVM_ANSOR_AUTO_SCHEDULE_H_
-
-#include <utility>
-#include <string>
-#include "measure.h"
-#include "search_policy/search_policy.h"
-
-namespace tvm {
-namespace ansor {
-
-/*! \brief Tuning and measurement options */
-class TuneOptionNode : public Object {
- public:
-  int n_trials;              // Number of total measurement trials
-  int early_stopping;        // Stops early the tuning if no improvement after n
-                             // measurements
-  int num_measure_per_iter;  // The number of programs to be measured at each
-                             // iteration
-  int verbose;               // Verbosity level. 0 means silent.
-  Builder builder;           // Builder which builds the program
-  Runner runner;             // Runner which runs the program and measure time
-                             // costs
-  Array<MeasureCallback> measure_callbacks;    // MeasureCallback functions
-  Array<SearchCallback> pre_search_callbacks;  // SearchCallback functions
-                                               // run before search
-
-  void VisitAttrs(tvm::AttrVisitor* v) {
-    v->Visit("n_trials", &n_trials);
-    v->Visit("early_stopping", &early_stopping);
-    v->Visit("num_measure_per_iter", &num_measure_per_iter);
-    v->Visit("verbose", &verbose);
-    v->Visit("builder", &builder);
-    v->Visit("runner", &runner);
-    v->Visit("measure_callbacks", &measure_callbacks);
-    v->Visit("pre_search_callbacks", &pre_search_callbacks);
-  }
-
-  static constexpr const char* _type_key = "ansor.TuneOption";
-  TVM_DECLARE_FINAL_OBJECT_INFO(TuneOptionNode, Object);
-};
-
-/*!
- * \brief Managed reference to TuneOptionNode.
- * \sa TuneOptionNode
- */
-class TuneOption : public ObjectRef {
- public:
-  TuneOption(int n_trials, int early_stopping, int num_measure_per_iter,
-             int verbose, Builder builder, Runner runner,
-             Array<MeasureCallback> measure_callbacks,
-             Array<SearchCallback> pre_search_callbacks);
-
-  TVM_DEFINE_OBJECT_REF_METHODS(TuneOption, ObjectRef, TuneOptionNode);
-  TVM_DEFINE_OBJECT_REF_COW_METHOD(TuneOptionNode);
-};
-
-/*! \brief Auto schedule for a compute declaration */
-std::pair<te::Schedule, Array<te::Tensor> > AutoSchedule(
-    SearchTask task, SearchPolicy search_policy, TuneOption tune_option);
-
-std::pair<te::Schedule, Array<te::Tensor> > AutoSchedule(
-    std::string workload_key, Target target, Target target_host,
-    SearchPolicy search_policy, HardwareParams hardware_params,
-    TuneOption tune_option);
-
-}  // namespace ansor
-}  // namespace tvm
-
-#endif  // TVM_ANSOR_AUTO_SCHEDULE_H_
diff --git a/src/ansor/compute_dag.cc b/src/ansor/compute_dag.cc
index 13f64b2bdc89d..6c89c55a5ceec 100644
--- a/src/ansor/compute_dag.cc
+++ b/src/ansor/compute_dag.cc
@@ -37,8 +37,6 @@
 #include <set>
 #include <vector>
 #include "transform_step.h"
-#include "search_policy/utils.h"
-#include "../relay/transforms/kernel_layout_transform.h"
 
 namespace tvm {
 namespace ansor {
@@ -473,6 +471,24 @@ bool AccessAnalyzer::ElementWiseMatch(const te::Operation& op,
   return true;
 }
 
+// Collect the primitive iterator names encoded in a fused or split iterator's name.
+// '@' marks a fuse and '.' marks a split; tokens that are empty or begin with a digit
+// are skipped. Matching names are inserted into `rets`, which is never cleared here.
+inline void ExtractOriginalIterators(const std::string& name, std::set<std::string>* rets) {
+  size_t last_pos = 0;  // start of the token currently being scanned
+  // Scan one past the end so the trailing token is handled like the delimited ones
+  for (size_t i = 0; i <= name.size(); ++i) {
+    if (i < name.size() && name[i] != '@' && name[i] != '.') {
+      continue;
+    }
+    // Cast first: passing a negative char value to isdigit is undefined behavior
+    if (i > last_pos && !isdigit(static_cast<unsigned char>(name[last_pos]))) {
+      rets->insert(name.substr(last_pos, i - last_pos));
+    }
+    last_pos = i + 1;
+  }
+}
+
 // Estimate number of float operations in an expression
 class FlopEstimator: public ExprFunctor<double(const PrimExpr& n)> {
  public:
@@ -788,7 +804,7 @@ void ComputeDAG::RewriteLayout(
               CHECK_EQ(placeholder_axis_names.size(), placeholder->shape.size());
               std::string ori_layout = os.str();
               os.str("");
-              ::tvm::relay::KernelLayoutVisitor::global_ori_layouts_queue.push_back(ori_layout);
+              // ::tvm::relay::KernelLayoutVisitor::global_ori_layouts_queue.push_back(ori_layout);
             }
           }
 
@@ -851,7 +867,7 @@ void ComputeDAG::RewriteLayout(
           }
           std::string new_layout = os.str();
           os.str("");
-          ::tvm::relay::KernelLayoutVisitor::global_new_layouts_queue.push_back(new_layout);
+          // ::tvm::relay::KernelLayoutVisitor::global_new_layouts_queue.push_back(new_layout);
           placeholder_new_names[placeholder_op] = new_names;
           placeholder_new_shapes[placeholder_op] = new_shape;
 
diff --git a/src/ansor/cost_model/cost_model.cc b/src/ansor/cost_model/cost_model.cc
deleted file mode 100644
index ee7bf8b260532..0000000000000
--- a/src/ansor/cost_model/cost_model.cc
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ansor/cost_model.h
- * \brief Cost model that estimates the performance of programs
- */
-
-#include "cost_model.h"
-
-#include <tvm/runtime/ndarray.h>
-#include <tvm/runtime/registry.h>
-
-#include <utility>
-
-namespace tvm {
-namespace ansor {
-
-using ::tvm::runtime::NDArray;
-
-TVM_REGISTER_OBJECT_TYPE(CostModelNode);
-TVM_REGISTER_OBJECT_TYPE(RandomModelNode);
-TVM_REGISTER_OBJECT_TYPE(MeasureModelNode);
-TVM_REGISTER_OBJECT_TYPE(PythonBasedModelNode);
-
-void RandomNumber(TVMArgs args, TVMRetValue* rv) {
-  int n = args[0];
-  void* data = args[1];
-  float* fdata = reinterpret_cast<float*>(data);
-  for (int i = 0; i < n; i++) {
-    fdata[i] = static_cast<float>(rand_r(nullptr)) / (static_cast<float>(RAND_MAX));
-  }
-}
-
-RandomModel::RandomModel() {
-  ObjectPtr<RandomModelNode> node = make_object<RandomModelNode>();
-  node->random_number_func =
-      runtime::Registry::Get("ansor.cost_model.random_number");
-  if (node->random_number_func == nullptr) {
-    LOG(WARNING) << "ansor.cost_model.random_number is not registered, "
-                 << "use C++ default random_number func instead.";
-    static PackedFunc cost_model_random_number(RandomNumber);
-    node->random_number_func = &cost_model_random_number;
-  }
-  data_ = std::move(node);
-}
-
-void RandomModelNode::Update(const Array<MeasureInput>& inputs,
-                             const Array<MeasureResult>& results) {}
-
-void RandomModelNode::Predict(const SearchTask& task,
-                              const std::vector<State>& states,
-                              std::vector<float>* scores) {
-  scores->resize(states.size());
-  (*random_number_func)(states.size(), static_cast<void*>(scores->data()));
-}
-
-MeasureModel::MeasureModel(Builder builder, Runner runner) {
-  ObjectPtr<MeasureModelNode> node = make_object<MeasureModelNode>();
-  node->measurer = ProgramMeasurer(std::move(builder), std::move(runner),
-                                   Array<MeasureCallback>(), 0);
-  data_ = std::move(node);
-}
-
-void MeasureModelNode::Update(const Array<MeasureInput>& inputs,
-                              const Array<MeasureResult>& results) {}
-
-void MeasureModelNode::Predict(const SearchTask& task,
-                               const std::vector<State>& states,
-                               std::vector<float>* scores) {
-  std::vector<MeasureInput> inputs;
-  std::vector<MeasureResult> results;
-
-  inputs.clear();
-  inputs.reserve(states.size());
-  for (const auto& state : states) {
-    inputs.push_back(MeasureInput(task, state));
-  }
-  measurer->SilentMeasure(task, inputs, &results);
-
-  scores->clear();
-  scores->reserve(results.size());
-  for (const auto& res : results) {
-    scores->push_back(1.0 / FloatArrayMean(res->costs));
-  }
-}
-
-PythonBasedModel::PythonBasedModel(PackedFunc update_func,
-                                   PackedFunc predict_func,
-                                   PackedFunc predict_stage_func) {
-  auto node = make_object<PythonBasedModelNode>();
-  node->update_func = std::move(update_func);
-  node->predict_func = std::move(predict_func);
-  node->predict_stage_func = std::move(predict_stage_func);
-  data_ = std::move(node);
-}
-
-void PythonBasedModelNode::Update(const Array<MeasureInput>& inputs,
-                                  const Array<MeasureResult>& results) {
-  update_func(inputs, results);
-}
-
-void PythonBasedModelNode::Predict(const SearchTask& task,
-                                   const std::vector<State>& states,
-                                   std::vector<float>* scores) {
-  scores->resize(states.size());
-  predict_func(task, Array<State>(states.begin(), states.end()),
-               static_cast<void*>(scores->data()));
-}
-
-void PythonBasedModelNode::PredictStages(const SearchTask& task,
-    const std::vector<State>& states, std::vector<float>* state_scores,
-    std::vector<std::vector<float>>* stage_scores) {
-  int n_states = states.size();
-  int n_stages = task->compute_dag.GetInitState()->stages.size();
-  std::vector<float> flatten_scores;
-  // Allocate sufficient spaces.
-  flatten_scores.resize(n_states * n_stages * 2);
-  predict_stage_func(task, Array<State>(states.begin(), states.end()),
-                     static_cast<void*>(flatten_scores.data()));
-
-  // Unpack flatten scores.
-  state_scores->clear();
-  stage_scores->clear();
-
-  // Score of each states.
-  for (int i = 0; i < n_states; ++i) {
-    state_scores->push_back(flatten_scores[i]);
-  }
-
-  // Score of each stage in each states.
-  size_t idx = n_states;
-  for (int i = 0; i < n_states; ++i) {
-    CHECK_LE(idx, flatten_scores.size());
-
-    // Number of scored stages of this state.
-    int s_length = static_cast<int>(flatten_scores[idx++]);
-
-    if (s_length > 0) {
-      std::vector<float> scores;
-      int offset = 0;
-
-      if ((*state_scores)[i] > -INFINITY) {
-        // If the score is valid. Copy scored stages and assign 0 to placeholder
-        // and inlined stages. If the score is 0, meaning this state failed to
-        // be lowered. Just bypass to update offset.
-        for (const Stage& stage : states[i]->stages) {
-          if (stage->op_type == kPlaceholder) {
-            scores.push_back(0);
-            continue;
-          }
-          if (stage->compute_at == kInlined) {
-            scores.push_back(0);
-            continue;
-          }
-          scores.push_back(flatten_scores[idx + offset]);
-          offset++;
-        }
-        CHECK_EQ(offset, s_length);
-        stage_scores->push_back(std::move(scores));
-      }
-      idx += s_length;
-    } else {
-      // Cost model does not provide any stage score details.
-      stage_scores->push_back({});
-    }
-  }
-}
-
-TVM_REGISTER_GLOBAL("ansor.RandomModel").set_body_typed([]() {
-  return RandomModel();
-});
-
-TVM_REGISTER_GLOBAL("ansor.PythonBasedModel")
-.set_body_typed([](PackedFunc update_func, PackedFunc predict_func,
-                   PackedFunc predict_stage_func) {
-  return PythonBasedModel(update_func, predict_func,
-                          predict_stage_func);
-});
-
-}  // namespace ansor
-}  // namespace tvm
diff --git a/src/ansor/cost_model/cost_model.h b/src/ansor/cost_model/cost_model.h
deleted file mode 100644
index f38624a3572c1..0000000000000
--- a/src/ansor/cost_model/cost_model.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ansor/cost_model.h
- * \brief Cost model that estimates the performance of programs
-*/
-
-#ifndef TVM_ANSOR_COST_MODEL_COST_MODEL_H_
-#define TVM_ANSOR_COST_MODEL_COST_MODEL_H_
-
-#include <tvm/node/node.h>
-#include <tvm/node/container.h>
-#include <tvm/runtime/packed_func.h>
-#include <vector>
-#include "../measure.h"
-
-namespace tvm {
-namespace ansor {
-
-using runtime::PackedFunc;
-
-/*! \brief The base class for cost model */
-class CostModelNode: public Object {
- public:
-  // Update the cost model according to new measurement pairs
-  virtual void Update(const Array<MeasureInput>& inputs,
-                      const Array<MeasureResult>& results) = 0;
-
-  // Predict the scores of states
-  virtual void Predict(const SearchTask& task, const std::vector<State>& states,
-      std::vector<float>* scores) = 0;
-
-  // Predict the scores of all stages in states
-  virtual void PredictStages(const SearchTask& task,
-                             const std::vector<State>& states,
-                             std::vector<float>* state_scores,
-                             std::vector<std::vector<float>>* stage_scores) {
-    LOG(FATAL) << "Not Implemented";
-  }
-
-  static constexpr const char *_type_key = "ansor.CostModel";
-  TVM_DECLARE_BASE_OBJECT_INFO(CostModelNode, Object);
-};
-TVM_DEFINE_MUTABLE_OBJECT_REF(CostModel, CostModelNode);
-
-/*! \brief The cost model returns random value for all predictions */
-class RandomModelNode: public CostModelNode {
- public:
-  const PackedFunc* random_number_func;
-
-  void Update(const Array<MeasureInput>& inputs,
-              const Array<MeasureResult>& results) final;
-  void Predict(const SearchTask& task, const std::vector<State>& states,
-      std::vector<float>* scores) final;
-
-  static constexpr const char *_type_key = "ansor.RandomModel";
-  TVM_DECLARE_FINAL_OBJECT_INFO(RandomModelNode, CostModelNode);
-};
-
-/*!
- * \brief Managed reference to RandomModelNode.
- * \sa RandomModelNode
- */
-class RandomModel : public CostModel {
- public:
-  RandomModel();
-  explicit RandomModel(::tvm::runtime::ObjectPtr<::tvm::runtime::Object> n)
-      : CostModel(n) {}
-
-  RandomModelNode* operator->() const {
-    return static_cast<RandomModelNode*>(data_.get());
-  }
-
-  TVM_DEFINE_DEFAULT_COPY_MOVE_AND_ASSIGN(RandomModel);
-  using ContainerType = RandomModelNode;
-};
-
-/*! \brief The cost model returns actual cost by measurement */
-class MeasureModelNode : public CostModelNode {
- public:
-  ProgramMeasurer measurer;
-
-  void Update(const Array<MeasureInput>& inputs,
-              const Array<MeasureResult>& results) final;
-  void Predict(const SearchTask& task, const std::vector<State>& states,
-               std::vector<float>* scores) final;
-
-  static constexpr const char* _type_key = "ansor.MeasureModel";
-  TVM_DECLARE_FINAL_OBJECT_INFO(MeasureModelNode, CostModelNode);
-};
-
-/*!
- * \brief Managed reference to MeasureModelNode.
- * \sa MeasureModelNode
- */
-class MeasureModel : public CostModel {
- public:
-  MeasureModel(Builder builder, Runner runner);
-
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(MeasureModel, CostModel,
-                                        MeasureModelNode);
-};
-
-/*! \brief  A wrapper for cost model defined by python code
- *  This class will call python's function */
-class PythonBasedModelNode: public CostModelNode {
- public:
-  PackedFunc update_func;
-  PackedFunc predict_func;
-  PackedFunc predict_stage_func;
-
-  void Update(const Array<MeasureInput>& inputs,
-              const Array<MeasureResult>& results) final;
-  void Predict(const SearchTask& task, const std::vector<State>& states,
-      std::vector<float>* scores) final;
-  void PredictStages(const SearchTask& task, const std::vector<State>& states,
-                     std::vector<float>* state_scores,
-                     std::vector<std::vector<float>>* stage_scores) final;
-
-  static constexpr const char *_type_key = "ansor.PythonBasedModel";
-  TVM_DECLARE_FINAL_OBJECT_INFO(PythonBasedModelNode, CostModelNode);
-};
-
-/*!
- * \brief Managed reference to PythonBasedModelNode.
- * \sa PythonBasedModelNode
- */
-class PythonBasedModel : public CostModel {
- public:
-  PythonBasedModel(PackedFunc update_func, PackedFunc predict_func,
-                   PackedFunc predict_stage_func);
-
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PythonBasedModel, CostModel,
-                                        PythonBasedModelNode);
-};
-
-}  // namespace ansor
-}  // namespace tvm
-
-#endif  // TVM_ANSOR_COST_MODEL_COST_MODEL_H_
diff --git a/src/ansor/search_policy/search_policy.cc b/src/ansor/search_policy/search_policy.cc
deleted file mode 100644
index 51a48780813a2..0000000000000
--- a/src/ansor/search_policy/search_policy.cc
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ansor/search_policy/search_policy.cc
- * \brief The base class for search policy
- */
-
-#include "search_policy.h"
-#include <tvm/runtime/registry.h>
-#include "../serialization.h"
-
-namespace tvm {
-namespace ansor {
-
-TVM_REGISTER_OBJECT_TYPE(SearchPolicyNode);
-TVM_REGISTER_OBJECT_TYPE(PreloadMeasuredStatesNode);
-
-void SearchPolicyNode::PreloadMeasuredStates(const std::string& log_file) {
-  LogReader reader = LogReader(log_file);
-  const auto& res = reader->ReadLines(-1);
-  size_t log_size = res.first.size();
-  CHECK_EQ(log_size, res.second.size());
-  if (log_size) {
-    std::vector<State> measured_states;
-    std::vector<float> measured_throughputs;
-    for (size_t i = 0; i < log_size; i++) {
-      const auto& inp = res.first[i];
-      if (inp->task->workload_key == cur_task->workload_key &&
-          inp->task->target->target_name.compare(
-              cur_task->target->target_name) == 0) {
-        State state = cur_task->compute_dag.GetInitState();
-        state.CopyOnWrite()->transform_steps = inp->state->transform_steps;
-        state.DoSteps(inp->state->transform_steps, cur_task->compute_dag);
-        measured_states.emplace_back(std::move(state));
-        measured_throughputs.push_back(res.second[i]->error_no == 0 ?
-            (1.0 / FloatArrayMean(res.second[i]->costs)) : 0.0);
-      }
-    }
-    cur_task->compute_dag.InferBound(&measured_states);
-    for (size_t i = 0; i < measured_states.size(); i ++) {
-      auto& state = measured_states[i];
-      const auto& state_str = state.ToStr();
-      if (!measured_states_set_.count(state_str)) {
-        measured_states_set_.insert(state_str);
-        if (measured_throughputs[i] != 0.0) {
-          measured_states_vector_.emplace_back(std::move(state));
-          measured_states_throughputs_.emplace_back(measured_throughputs[i]);
-        }
-      }
-    }
-
-    StdCout(verbose) << "Successfully load " << measured_states_set_.size()
-                     << " measurement records from " << log_file
-                     << " for " << cur_task->workload_key << std::endl;
-  } else {
-    StdCout(verbose) << "No measurement records found in "
-                     << log_file << " for " << cur_task->workload_key << std::endl;
-  }
-}
-
-void SearchPolicyNode::RunCallbacks(const Array<SearchCallback>& callbacks) {
-  if (callbacks.defined() && callbacks.size()) {
-    PrintTitle("Call search callbacks", verbose);
-    for (const auto& callback : callbacks) {
-      callback->callback(this);
-    }
-  }
-}
-
-PreloadMeasuredStates::PreloadMeasuredStates(std::string filename) {
-  auto node = make_object<PreloadMeasuredStatesNode>();
-  node->filename = std::move(filename);
-  data_ = std::move(node);
-}
-
-void PreloadMeasuredStatesNode::callback(SearchPolicyNode* policy) {
-  policy->PreloadMeasuredStates(filename);
-}
-
-// Search Policy
-TVM_REGISTER_GLOBAL("ansor.SearchPolicyContinueSearchOneRound")
-.set_body_typed([](SearchPolicy policy, SearchTask task, int num_measure,
-                   int verbose, ProgramMeasurer measurer) {
-  Array<MeasureInput> inputs;
-  Array<MeasureResult> results;
-  std::tie(inputs, results) = policy->ContinueSearchOneRound(task, num_measure, verbose, measurer);
-  return Array<ObjectRef>{inputs, results};
-});
-
-TVM_REGISTER_GLOBAL("ansor.SearchPolicyRunCallbacks")
-.set_body_typed([](SearchPolicy policy, Array<SearchCallback> callbacks) {
-  policy->RunCallbacks(callbacks);
-});
-
-TVM_REGISTER_GLOBAL("ansor.SearchPolicySetTask")
-.set_body_typed([](SearchPolicy policy, SearchTask task) {
-  policy->cur_task = task;
-});
-
-TVM_REGISTER_GLOBAL("ansor.SearchPolicySetVerbose")
-.set_body_typed([](SearchPolicy policy, int verbose) {
-  policy->verbose = verbose;
-});
-
-TVM_REGISTER_GLOBAL("ansor.PreloadMeasuredStates")
-.set_body_typed([](std::string filename) {
-  return PreloadMeasuredStates(filename);
-});
-
-}  // namespace ansor
-}  // namespace tvm
diff --git a/src/ansor/search_policy/search_policy.h b/src/ansor/search_policy/search_policy.h
deleted file mode 100644
index 03e7c3f025dfc..0000000000000
--- a/src/ansor/search_policy/search_policy.h
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ansor/search_policy/search_policy.h
- * \brief The base class for search policy
- */
-
-#ifndef TVM_ANSOR_SEARCH_POLICY_SEARCH_POLICY_H_
-#define TVM_ANSOR_SEARCH_POLICY_SEARCH_POLICY_H_
-
-#include "../search_task.h"
-#include <tvm/node/node.h>
-#include <unordered_set>
-#include <vector>
-#include <utility>
-#include <string>
-#include "../measure.h"
-
-namespace tvm {
-namespace ansor {
-
-class SearchPolicyNode;
-
-/*! \brief Callback function to be called before or after the search process */
-class SearchCallbackNode : public Object {
- public:
-  virtual void callback(SearchPolicyNode* policy) = 0;
-
-  static constexpr const char *_type_key = "ansor.SearchCallback";
-  TVM_DECLARE_BASE_OBJECT_INFO(SearchCallbackNode, Object);
-};
-TVM_DEFINE_MUTABLE_OBJECT_REF(SearchCallback, SearchCallbackNode);
-
-/*! \brief Preload measured states from a log file.
- * This can resume the state of the search policy */
-class PreloadMeasuredStatesNode : public SearchCallbackNode {
- public:
-  std::string filename;
-
-  void callback(SearchPolicyNode* policy) final;
-
-  static constexpr const char *_type_key = "ansor.PreloadMeasuredStates";
-  TVM_DECLARE_FINAL_OBJECT_INFO(PreloadMeasuredStatesNode, SearchCallbackNode);
-};
-
-/*!
- * \brief Managed reference to PreloadMeasuredStatesNode.
- * \sa PreloadMeasuredStatesNode
- */
-class PreloadMeasuredStates : public SearchCallback {
- public:
-  explicit PreloadMeasuredStates(std::string filename);
-
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PreloadMeasuredStates, SearchCallback,
-                                        PreloadMeasuredStatesNode);
-};
-
-/*! \brief The base class for search policy */
-class SearchPolicyNode : public Object {
- public:
-  SearchTask cur_task;   // The current task
-  int verbose;           // Verbose level (0 means silent)
-
-  void VisitAttrs(AttrVisitor* v) {
-    v->Visit("cur_task", &cur_task);
-    v->Visit("verbose", &verbose);
-  }
-
-  // Search for a task
-  virtual State Search(SearchTask task, int n_trials,
-                       int early_stopping, int num_measure_per_iter,
-                       int verbose, ProgramMeasurer measurer,
-                       Array<SearchCallback> pre_search_callbacks) = 0;
-
-  // Continue search one round for a task.
-  // This is used in the task scheduler for searching for multiple tasks together.
-  virtual std::pair<Array<MeasureInput>, Array<MeasureResult> > ContinueSearchOneRound(
-      SearchTask task, int num_measure, int verbose, ProgramMeasurer measurer) = 0;
-
-  // Preload measured states from a log file to resume the state of the search policy
-  void PreloadMeasuredStates(const std::string& log_file);
-
-  // Run a list of callback functions
-  void RunCallbacks(const Array<SearchCallback>& callbacks);
-
-  // Dict keys to give hints to the policy
-  static constexpr const char* always_unroll_inner_key = "ansor_always_unroll_inner";
-  static constexpr const char* always_unroll_key = "ansor_always_unroll";
-  static constexpr const char* no_split_at_inner_key = "ansor_no_split_at_inner";
-  static constexpr const char* no_split_at_outer_key = "ansor_no_split_at_outer";
-  static constexpr const char* last_split_is_one_key = "ansor_last_split_is_one";
-  // Flag keys to give hints to the policy
-  static constexpr const char* always_compute_inline_key = "ansor_always_compute_inline";
-  static constexpr const char* no_cache_write_key = "ansor_no_cache_write";
-  static constexpr const char* no_cache_read_key = "ansor_no_cache_read";
-
-  static constexpr const char *_type_key = "ansor.SearchPolicy";
-  TVM_DECLARE_BASE_OBJECT_INFO(SearchPolicyNode, Object);
-
- protected:
-  // The set of the already measured states.
-  // We store the string format for redundancy check
-  std::unordered_set<std::string> measured_states_set_;
-  // The array of already measured states.
-  std::vector<State> measured_states_vector_;
-  // The throughputs of already measured states
-  std::vector<float> measured_states_throughputs_;
-};
-TVM_DEFINE_MUTABLE_OBJECT_REF(SearchPolicy, SearchPolicyNode);
-
-}  // namespace ansor
-}  // namespace tvm
-
-#endif  // TVM_ANSOR_SEARCH_POLICY_SEARCH_POLICY_H_
diff --git a/src/ansor/search_policy/sketch_search_policy.cc b/src/ansor/search_policy/sketch_search_policy.cc
deleted file mode 100644
index 5b2c10c08c815..0000000000000
--- a/src/ansor/search_policy/sketch_search_policy.cc
+++ /dev/null
@@ -1,1538 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ansor/search_policy/sketch_search_policy.h
- * \brief The search policy that searches in a hierarchical search space defined by sketches.
- * The policy randomly samples programs from the space defined by sketches
- * and use evolutionary search to fine-tune them.
- */
-
-#include "sketch_search_policy.h"
-#include <tvm/runtime/registry.h>
-#include <iomanip>
-#include <vector>
-#include <string>
-#include <algorithm>
-#include <limits>
-#include <set>
-#include <unordered_set>
-#include <unordered_map>
-#include <utility>
-#include "utils.h"
-
-#define IS_GPU(task) ((task)->target->device_type == kDLGPU || \
-  (task)->target->device_type == kDLOpenCL)
-
-namespace tvm {
-namespace ansor {
-
-TVM_REGISTER_NODE_TYPE(SketchSearchPolicyNode);
-TVM_REGISTER_OBJECT_TYPE(PreloadCustomSketchRuleNode);
-
-// All possible candidates for auto_unroll
-const std::vector<int> SketchSearchPolicyNode::auto_unroll_configs{0, 16, 64, 512, 1024};
-
-SketchSearchPolicy::SketchSearchPolicy(CostModel program_cost_model,
-                                       Map<String, ObjectRef> params,
-                                       int seed) {
-  auto node = make_object<SketchSearchPolicyNode>();
-  node->program_cost_model = std::move(program_cost_model);
-  node->rand_gen_ = std::mt19937(seed);
-  node->params = std::move(params);
-  data_ = std::move(node);
-}
-
-State SketchSearchPolicyNode::Search(SearchTask task, int n_trials,
-    int early_stopping, int num_measure_per_iter, int verbose,
-    ProgramMeasurer measurer, Array<SearchCallback> pre_search_callbacks) {
-  std::vector<State> best_states, random_states;
-  this->cur_task = task;
-  this->verbose = verbose;
-  num_measure_per_iter_ = num_measure_per_iter;
-
-  RunCallbacks(pre_search_callbacks);
-
-  if (n_trials <= 1) {  // no measurement is allowed
-    SearchOneRound(&best_states, 0, &random_states);
-    CHECK_GT(best_states.size(), 0);
-    return best_states[0];
-  } else {
-    std::vector<MeasureInput> inputs;
-    std::vector<MeasureResult> results;
-    int num_random = static_cast<int>(GetDoubleParam(params, "eps_greedy") * num_measure_per_iter);
-
-    measurer->Reset();
-
-    early_stopping = early_stopping < 0 ? std::numeric_limits<int>::max() >> 1 : early_stopping;
-
-    int ct = 0;
-    while (ct < n_trials) {
-      if (!inputs.empty()) {
-        // retrain cost models
-        PrintTitle("Train cost model", verbose);
-        program_cost_model->Update(inputs, results);
-      }
-
-      // Search one round to get promising states
-      PrintTitle("Search", verbose);
-      SearchOneRound(&best_states, num_random, &random_states);
-
-      // Fill correct bound.This is necessary for computing the correct ToStr() for reduncency check
-      cur_task->compute_dag.InferBound(&best_states);
-      cur_task->compute_dag.InferBound(&random_states);
-
-      // Pick `num_measure_per_iter` states to measure, check hash to remove already measured state
-      // Also pick some random states to do eps-greedy
-      PickStatesWithEpsGreedy(&inputs, best_states, random_states, n_trials - ct);
-
-      // Have traversed all of search space
-      if (inputs.empty()) {
-        StdCout(verbose) << "All candidates in the search space have been measured." << std::endl;
-        break;
-      }
-
-      // Measure candidate states
-      PrintTitle("Measure", verbose);
-      measurer->Measure(cur_task, GetRef<SearchPolicy>(this), inputs, &results);
-      ct += inputs.size();
-
-      if (ct - measurer->best_ct[cur_task->workload_key] > early_stopping) {
-        StdCout(verbose) << "Meet the early stopping condition." << std::endl;
-        break;
-      }
-
-      // Update measured states. These states will join the LocalMutation in later rounds
-      for (const auto& res : results) {
-        measured_states_throughputs_.push_back(1.0 / FloatArrayMean(res->costs));
-      }
-    }
-    PrintTitle("Done", verbose);
-
-    return measurer->best_state[cur_task->workload_key];
-  }
-}
-
-std::pair<Array<MeasureInput>, Array<MeasureResult> >
-    SketchSearchPolicyNode::ContinueSearchOneRound(
-    SearchTask task, int num_measure, int verbose, ProgramMeasurer measurer) {
-  if (cur_task.defined()) {
-    CHECK_EQ(cur_task, task);
-  } else {
-    cur_task = task;
-  }
-  this->verbose = verbose;
-  num_measure_per_iter_ = num_measure;
-
-  std::vector<State> best_states, random_states;
-  std::vector<MeasureInput> inputs;
-  std::vector<MeasureResult> results;
-  int num_random = static_cast<int>(GetDoubleParam(params, "eps_greedy") * num_measure);
-
-  // Search one round to get promising states
-  PrintTitle("Search", verbose);
-  SearchOneRound(&best_states, num_random * 2, &random_states);
-
-  // Fill correct bound. This is necessary for computing the correct ToStr() for reduncency check
-  cur_task->compute_dag.InferBound(&best_states);
-  cur_task->compute_dag.InferBound(&random_states);
-
-  // Pick `num_measure` states to measure, check hash to remove already measured state
-  // Also pick some random states to do eps-greedy
-  PickStatesWithEpsGreedy(&inputs, best_states, random_states, num_measure);
-
-  // Measure candidate states
-  PrintTitle("Measure", verbose);
-  measurer->Measure(cur_task, GetRef<SearchPolicy>(this), inputs, &results);
-
-  // Update throughputs of measured states. These states will join the LocalMutation in later rounds
-  for (const auto& res : results) {
-    measured_states_throughputs_.push_back(1.0 / FloatArrayMean(res->costs));
-  }
-
-  // Update the cost model
-  Array<MeasureInput> inputs_arr(std::make_move_iterator(inputs.begin()),
-                                 std::make_move_iterator(inputs.end()));
-  Array<MeasureResult> results_arr(std::make_move_iterator(results.begin()),
-                                   std::make_move_iterator(results.end()));
-
-  PrintTitle("Train cost model", verbose);
-  program_cost_model->Update(inputs_arr, results_arr);
-  return std::make_pair(std::move(inputs_arr), std::move(results_arr));
-}
-
-void SketchSearchPolicyNode::PickStatesWithEpsGreedy(
-    std::vector<MeasureInput>* inputs,
-    const std::vector<State>& best_states,
-    const std::vector<State>& random_states,
-    int remaining_n_trials) {
-  int num_random = static_cast<int>(GetDoubleParam(params, "eps_greedy") * num_measure_per_iter_);
-  int num_good = num_measure_per_iter_ - num_random;
-
-  inputs->clear();
-  size_t offset_best = 0, offset_random = 0;
-
-  while (static_cast<int>(inputs->size()) < std::min(num_measure_per_iter_, remaining_n_trials)) {
-    const State* pstate;
-
-    bool has_best = offset_best < best_states.size();
-    bool has_random = offset_random < random_states.size();
-
-    if (static_cast<int>(inputs->size()) < num_good) {
-      // prefer best states
-      if (has_best) {
-        pstate = &best_states[offset_best++];
-      } else if (has_random) {
-        pstate = &random_states[offset_random++];
-      } else {
-        break;
-      }
-    } else {
-      // prefer random states
-      if (has_random) {
-        pstate = &random_states[offset_random++];
-      } else if (has_best) {
-        pstate = &best_states[offset_best++];
-      } else {
-        break;
-      }
-    }
-
-    // Check if it has already been measured
-    std::string state_str = pstate->ToStr();
-
-    if (measured_states_set_.count(state_str)) { continue; }
-    measured_states_set_.insert(state_str);
-
-    inputs->push_back(MeasureInput(cur_task, *pstate));
-    measured_states_vector_.push_back(std::move(*pstate));
-  }
-}
-
-void SketchSearchPolicyNode::SearchOneRound(std::vector<State>* best_states,
-    int num_random_states, std::vector<State>* random_states) {
-  best_states->clear();
-  random_states->clear();
-
-  // Get parameters
-  int population = GetIntParam(params, "evolutionary_search_population");
-  int num_use_measured = std::min(static_cast<int>(measured_states_vector_.size()),
-      static_cast<int>(
-          GetDoubleParam(params, "evolutionary_search_use_measured_ratio") * population));
-  bool have_cost_model = !program_cost_model->IsInstance<RandomModelNode>();
-
-  if (!have_cost_model) {
-    num_use_measured = 0;
-  }
-
-  // Generate sketches
-  std::vector<State> sketches;
-  GenerateSketch(&sketches);
-
-  // PrintAllStates(sketches);
-  // exit(0);
-
-  // Sample the init population
-  std::vector<State> init_population;
-  SampleInitPopulation(sketches, population - num_use_measured, &init_population);
-
-  // PrintAllStates(init_population);
-  // exit(0);
-
-  if (have_cost_model) {
-    // Also insert already measured good states to the initial population
-    std::vector<int> indices;
-    Argsort(measured_states_throughputs_, &indices);
-    for (int i = 0; i < num_use_measured; i++) {
-      init_population.push_back(measured_states_vector_[indices[i]]);
-    }
-
-    // Perform evolutionary search
-    EvolutionarySearch(init_population, num_measure_per_iter_ * 2, best_states);
-  } else {
-    // If the cost model is useless (i.e. RandomCostModel), skip evolutionary search
-    RandomSampleStates(init_population, &rand_gen_, num_measure_per_iter_ * 3, best_states);
-  }
-
-  // Sample some random states for eps-greedy
-  RandomSampleStates(init_population, &rand_gen_, num_random_states * 10, random_states);
-}
-
-// The baseclass of derivation rules used in sketch generation
-class SketchGenerationRule {
- public:
-  enum ConditionEnum {
-    kPass, kApply, kApplyAndSkipRest
-  };
-
-  virtual ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy,
-      const State& state, int stage_id) = 0;
-  virtual std::vector<std::pair<State, int> > Apply(const SketchSearchPolicyNode* policy,
-      const State& state, int stage_id) = 0;
-};
-
-static inline bool ShouldBeCacheRead(
-    const SketchSearchPolicyNode* policy, const State& state, int stage_id) {
-  const SearchTask& task = policy->cur_task;
-  const Stage& stage = state->stages[stage_id];
-
-  if (HasAttrsFlag(state, stage_id,
-                   SearchPolicyNode::no_cache_read_key)) {
-    return false;
-  }
-
-  std::unordered_set<te::Operation, ObjectHash, ObjectEqual> consumers;
-  GetConsumers(task, state, stage->op, &consumers);
-  if (consumers.size() != 1) {
-    return false;
-  }
-
-  int target_stage_id = OperationToStage(*consumers.begin(), state);
-  if (!NeedsMultilevelTiling(task, state,
-                             state->stages[target_stage_id]->op)) {
-    return false;
-  }
-
-  std::unordered_set<te::Operation, ObjectHash, ObjectEqual> producers;
-  GetProducers(task, state, state->stages[target_stage_id]->op, &producers);
-  // Only those directly mapped stages can do CacheRead
-  if (producers.find(stage->op) == producers.end()) {
-    return false;
-  }
-
-  return true;
-}
-
-static inline bool ShouldAlwaysBeInlined(
-    const SketchSearchPolicyNode* policy, const State& state, int stage_id) {
-  const SearchTask& task = policy->cur_task;
-  const Stage& stage = state->stages[stage_id];
-
-  if (stage->op->IsInstance<te::PlaceholderOpNode>()) {
-    return false;
-  }
-
-  // Inline limitation of TVM
-  if (!IsOutputOp(task, state, stage->op) && !HasReduceIter(stage)) {
-    // Always inline condition:
-    // 1. Has attrs that this must be inlined
-    // 2. Analyse shows this is strict inlineable
-    // 3. A GPU stage can be inlined(If it should be cache read, do it first)
-    if (HasAttrsFlag(state, stage_id,
-                     SearchPolicyNode::always_compute_inline_key) ||
-        IsStrictInlineable(task, state, stage->op) ||
-        (IS_GPU(policy->cur_task) &&
-         !ShouldBeCacheRead(policy, state, stage_id))) {
-      return true;
-    }
-  }
-
-  return false;
-}
-
-// The rule that inlines simple elementwise ops
-class RuleAlwaysInline : public SketchGenerationRule {
- public:
-  ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy,
-      const State& state, int stage_id) final {
-    return ShouldAlwaysBeInlined(policy, state, stage_id) ?
-        kApplyAndSkipRest : kPass;
-  }
-
-  std::vector<std::pair<State, int> > Apply(const SketchSearchPolicyNode* policy,
-      const State& state, int stage_id) final {
-    State tmp_s = state;
-    tmp_s.compute_inline(stage_id);
-    return {std::make_pair(std::move(tmp_s), stage_id - 1)};
-  }
-};
-
-// The rule that simply skip the current stage
-class RuleSkipStage : public SketchGenerationRule {
- public:
-  ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy,
-                              const State& state, int stage_id) final {
-    const SearchTask& task = policy->cur_task;
-    const Stage& stage = state->stages[stage_id];
-
-    const auto& attrs = stage->op->attrs;
-    if ((attrs.count(SearchPolicyNode::no_split_at_inner_key) ||
-        attrs.count(SearchPolicyNode::no_split_at_outer_key)) &&
-        NeedsMultilevelTiling(task, state, stage->op)) {
-      // for the transform stages in Winograd
-      return kPass;
-    }
-
-    return kApply;
-  }
-
-  std::vector<std::pair<State, int> > Apply(const SketchSearchPolicyNode* policy,
-                                            const State& state, int stage_id) final {
-    return {std::make_pair(state, stage_id - 1)};
-  }
-};
-
-// The rule that performs multi-level tiling
-class RuleMultiLevelTiling : public SketchGenerationRule {
- public:
-  ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy,
-                              const State& state, int stage_id) final {
-    const SearchTask& task = policy->cur_task;
-    const Stage& stage = state->stages[stage_id];
-
-    return NeedsMultilevelTiling(task, state, stage->op) ?
-           (IS_GPU(policy->cur_task) ? kApplyAndSkipRest : kApply) : kPass;
-  }
-
-  std::vector<std::pair<State, int> > Apply(const SketchSearchPolicyNode* policy,
-                                            const State& state, int stage_id) final {
-    std::string multi_level_tiling_structure = IS_GPU(policy->cur_task) ?
-        GetStringParam(policy->params, "gpu_multi_level_tiling_structure") :
-        GetStringParam(policy->params, "cpu_multi_level_tiling_structure");
-
-    std::vector<int> spatial_split_step_ids;
-    State tmp_s = state;
-    tmp_s = DoMultiLevelTiling(tmp_s, stage_id, multi_level_tiling_structure,
-        &spatial_split_step_ids);
-    return {std::make_pair(std::move(tmp_s), stage_id-1)};
-  }
-};
-
-// The rule that performs multi-level tiling and fuses later consumers
-class RuleMultiLevelTilingWithFusion : public SketchGenerationRule {
- public:
-  ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy,
-                              const State& state, int stage_id) final {
-    const SearchTask& task = policy->cur_task;
-    const Stage& stage = state->stages[stage_id];
-
-    int target_stage_id;
-
-    if (IS_GPU(policy->cur_task)) {
-      return NeedsMultilevelTiling(task, state, stage->op) &&
-             HasSingleElementwiseMatchedConsumer(task, state, stage,
-                                                 &target_stage_id) &&
-             (!HasCacheReadStage(state, stage_id) ||
-              HasCacheWriteStage(state, stage_id)) ?
-          kApplyAndSkipRest : kPass;
-    }
-
-    return NeedsMultilevelTiling(task, state, stage->op) &&
-            HasSingleElementwiseMatchedConsumer(task, state, stage,
-                                                &target_stage_id) ?
-        kApply : kPass;
-  }
-
-  std::vector<std::pair<State, int> > Apply(const SketchSearchPolicyNode* policy,
-                                            const State& state, int stage_id) final {
-    const SearchTask& task = policy->cur_task;
-    const Stage& stage = state->stages[stage_id];
-    std::string multi_level_tiling_structure = IS_GPU(policy->cur_task) ?
-        GetStringParam(policy->params, "gpu_multi_level_tiling_structure") :
-        GetStringParam(policy->params, "cpu_multi_level_tiling_structure");
-
-    std::vector<int> spatial_split_step_ids;
-    int target_stage_id;
-    std::unordered_set<te::Operation, ObjectHash, ObjectEqual> consumers;
-
-    GetConsumers(task, state, state->stages[stage_id]->op, &consumers);
-    CHECK(HasSingleElementwiseMatchedConsumer(task, state, stage, &target_stage_id));
-
-    State base_state = state;
-    base_state = DoMultiLevelTiling(base_state, stage_id,
-        multi_level_tiling_structure, &spatial_split_step_ids);
-    std::vector<int> follow_tiling_levels;
-    if (IS_GPU(policy->cur_task)) {
-      follow_tiling_levels.push_back(3);
-    } else {
-      follow_tiling_levels.push_back(1);
-      follow_tiling_levels.push_back(2);
-    }
-
-    std::vector<std::pair<State, int> > ret;
-    for (int level : follow_tiling_levels) {
-      if (tolower(multi_level_tiling_structure[level-1]) != 's') {
-        continue;
-      }
-      State tmp_s = base_state;
-      tmp_s = FollowTiling(tmp_s, target_stage_id, spatial_split_step_ids, level);
-      const Iterator &target_iter = tmp_s->stages[target_stage_id]->iters[
-          level * spatial_split_step_ids.size() - 1];
-      tmp_s.compute_at(stage_id, target_stage_id, target_iter);
-
-      ret.emplace_back(std::move(tmp_s), stage_id - 1);
-    }
-
-    return ret;
-  }
-};
-
-// The rule that adds a cache write stage
-class RuleAddCacheWrite : public SketchGenerationRule {
- public:
-  ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy,
-                              const State& state, int stage_id) final {
-    const SearchTask& task = policy->cur_task;
-    const Stage& stage = state->stages[stage_id];
-
-    int target_stage_id;
-
-    // Add cache write if a stage needs multi-level tiling,
-    // but does not have a element-wise matched consumer
-    return NeedsMultilevelTiling(task, state, stage->op) &&
-           !HasAttrsFlag(state, stage_id, SearchPolicyNode::no_cache_write_key) &&
-           (!HasSingleElementwiseMatchedConsumer(task, state, stage,
-                                                  &target_stage_id) ||
-             (HasCacheReadStage(state, stage_id) &&
-              !HasCacheWriteStage(state, stage_id))) ?
-        kApply : kPass;
-  }
-
-  std::vector<std::pair<State, int> > Apply(const SketchSearchPolicyNode* policy,
-                                            const State& state, int stage_id) final {
-    const SearchTask& task = policy->cur_task;
-
-    State tmp_s = state;
-    tmp_s.cache_write(stage_id, "local", task->compute_dag);
-    return {std::make_pair(std::move(tmp_s), stage_id)};
-  }
-};
-
-// The rule that adds a cache read stage
-// Mainly used for GPU cooperative fetching
-// Currently only support 1 to 1 match cache read
-class RuleAddCacheRead : public SketchGenerationRule {
- public:
-  ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy,
-                              const State& state, int stage_id) final {
-    return ShouldBeCacheRead(policy, state, stage_id) ?
-        kApplyAndSkipRest : kPass;
-  }
-
-  std::vector<std::pair<State, int> > Apply(const SketchSearchPolicyNode* policy,
-                                            const State& state, int stage_id) final {
-    const SearchTask& task = policy->cur_task;
-    const Stage& stage = state->stages[stage_id];
-
-    std::unordered_set<te::Operation, ObjectHash, ObjectEqual> consumers;
-    GetConsumers(task, state, stage->op, &consumers);
-    CHECK_EQ(consumers.size(), 1);
-    int target_stage_id = OperationToStage(*consumers.begin(), state);
-    State tmp_s = state;
-    int added_stage_id = tmp_s.cache_read(stage_id, "shared",
-                                          {target_stage_id},
-                                          task->compute_dag);
-    target_stage_id++;
-    const auto& share_read_pos = GetLastReduceIteratorInOutermostReduceTile(
-        tmp_s->stages[target_stage_id]);
-    tmp_s.compute_at(added_stage_id, target_stage_id, share_read_pos);
-
-    return {std::make_pair(std::move(tmp_s), stage_id)};
-  }
-};
-
-// The rule that adds rfactor stage
-class RuleAddRfactor : public SketchGenerationRule {
- public:
-  ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy,
-                              const State& state, int stage_id) final {
-    const SearchTask& task = policy->cur_task;
-    const Stage& stage = state->stages[stage_id];
-
-    return NeedsRfactor(task, state, stage->op) &&
-           !HasCacheWriteStage(state, stage_id) ?
-        kApply : kPass;
-  }
-
-  std::vector<std::pair<State, int> > Apply(const SketchSearchPolicyNode* policy,
-                                            const State& state, int stage_id) final {
-    const SearchTask& task = policy->cur_task;
-    const Stage& stage = state->stages[stage_id];
-
-    std::vector<std::pair<State, int> > ret;
-
-    State tmp_s = state;
-
-    // fuse reduce iters
-    std::vector<Iterator> space_iters, reduce_iters;
-    for (const auto &iter : stage->iters) {
-      if (iter->iter_type == kSpace) {
-        space_iters.push_back(iter);
-      } else if (iter->iter_type == kReduce) {
-        reduce_iters.push_back(iter);
-      }
-    }
-    CHECK(!reduce_iters.empty());
-    Iterator fused_reduce_iter;
-    if (reduce_iters.size() > 1) {
-      fused_reduce_iter = tmp_s.fuse(stage_id, reduce_iters);
-    } else {
-      fused_reduce_iter = reduce_iters[0];
-    }
-
-    // split reduce iters
-    const auto &split_res = tmp_s.split(stage_id, fused_reduce_iter, {1});
-    int factor_axis_id = static_cast<int>(space_iters.size());
-    State base_state = tmp_s;
-    for (const auto &split_iter : split_res) {
-      tmp_s = base_state;
-      tmp_s.rfactor(stage_id, split_iter, factor_axis_id, task->compute_dag);
-
-      // reorder the space iterator to innermost for vectorization
-      if (split_iter == split_res[1]) {
-        std::vector<Iterator> new_order;
-        for (size_t i = 0; i < tmp_s->stages[stage_id]->iters.size(); ++i) {
-          if (i != space_iters.size()) {
-            new_order.push_back(tmp_s->stages[stage_id]->iters[i]);
-          }
-        }
-        new_order.push_back(tmp_s->stages[stage_id]->iters[space_iters.size()]);
-        tmp_s.reorder(stage_id, new_order);
-      }
-      ret.emplace_back(std::move(tmp_s), stage_id - 1);
-    }
-
-    return ret;
-  }
-};
-
-void SketchSearchPolicyNode::GenerateSketch(
-    std::vector<State>* out_states) {
-  State init_state = cur_task->compute_dag.GetInitState();
-  std::string cpu_multi_level_tiling_structure =
-      GetStringParam(params, "cpu_multi_level_tiling_structure");
-
-  // two ping pong buffers to avoid copy
-  std::vector<State> states_buf1, states_buf2;
-  std::vector<State> *pnow, *pnext;
-  pnow = &states_buf1;
-  pnext = &states_buf2;
-  pnow->push_back(init_state);
-
-  // A map that maps state to its current working position (stage_id)
-  std::unordered_map<State, int, ObjectHash, ObjectEqual> cur_stage_id_map;
-  cur_stage_id_map[init_state] = static_cast<int>(init_state->stages.size() - 1);
-
-  static RuleSkipStage rule_skip_stage;
-  static RuleAlwaysInline rule_always_inline;
-  static RuleMultiLevelTiling rule_multi_level_tiling;
-  static RuleMultiLevelTilingWithFusion rule_multi_level_tiling_with_fusion;
-  static RuleAddCacheWrite rule_add_cache_write_stage;
-  static RuleAddCacheRead rule_add_cache_read_stage;
-  static RuleAddRfactor rule_add_rfactor;
-  if (sketch_rules.empty()) {
-    // We may apply and skip the rest when processing some rules,
-    // should take care of the rule vector order here
-    sketch_rules.push_back(&rule_always_inline);
-    sketch_rules.push_back(&rule_add_cache_write_stage);
-    sketch_rules.push_back(&rule_multi_level_tiling_with_fusion);
-    sketch_rules.push_back(&rule_multi_level_tiling);
-    sketch_rules.push_back(&rule_add_rfactor);
-    sketch_rules.push_back(&rule_skip_stage);
-    if (IS_GPU(cur_task)) {
-      // Try cache read first before cache write
-      sketch_rules.insert(sketch_rules.begin() + 1, &rule_add_cache_read_stage);
-    }
-    // TODO(xian): Add a new rule to try combination of multi-level
-    // tiling + rfactor
-  }
-
-  // Derivation rule based synthesizer
-  while (!pnow->empty()) {
-    pnext->clear();
-
-    for (const State& state : *pnow) {
-      int stage_id = cur_stage_id_map[state];
-
-      // Reaches to the terminal stage
-      if (stage_id < 0) {
-        out_states->push_back(state);
-        continue;
-      }
-
-      // Try all derivation rules
-      for (const auto& rule : sketch_rules) {
-        auto rule_check = rule->MeetCondition(this, state, stage_id);
-        if (rule_check > SketchGenerationRule::ConditionEnum::kPass) {
-          for (const auto& pair : rule->Apply(this, state, stage_id)) {
-            cur_stage_id_map[pair.first] = pair.second;
-            pnext->push_back(pair.first);
-          }
-          // Skip the reset rules
-          if (rule_check == SketchGenerationRule::ConditionEnum::kApplyAndSkipRest) {
-            break;
-          }
-        }
-      }
-    }
-
-    std::swap(pnow, pnext);
-  }
-
-  // Hack for rfactor: Replace the split factor for rfactor to the undefined Expr(),
-  // so later we can sample random value for the split factor.
-  // Why don't we use Expr() when doing the split for rfactor at the first time?
-  // Because during ApplySteps, a rfactor with undefined Expr() will crash TVM.
-  // So rfactor with undefined Expr() will conflict with cache_write, cache_read, rfactor
-  // in other stages
-  for (size_t i = 0; i < out_states->size(); ++i) {
-    auto pstate = (*out_states)[i].CopyOnWrite();
-    for (size_t step_id = 0; step_id < pstate->transform_steps.size(); ++step_id) {
-      if (pstate->transform_steps[step_id]->IsInstance<RfactorStepNode>()) {
-        CHECK_GE(step_id, 1);
-        int split_step_id = step_id - 1;
-        auto step = pstate->transform_steps[split_step_id].as<SplitStepNode>();
-        CHECK(step != nullptr);
-        pstate->transform_steps[split_step_id]
-            = SplitStep(step->stage_id, step->iter_id, step->extent, {PrimExpr()},
-                        step->inner_to_outer);
-      }
-    }
-  }
-
-  StdCout(verbose) << "Generate Sketches\t\t#s: " << out_states->size() << std::endl;
-}
-
-int InitPopulationFillTileSize(const SketchSearchPolicyNode* policy,
-                               State* state, std::mt19937* rand_gen,
-                               SplitFactorizationMemo* split_memo) {
-  for (size_t step_id = 0; step_id < (*state)->transform_steps.size(); ++step_id) {
-    if (auto ps = (*state)->transform_steps[step_id].as<SplitStepNode>()) {
-      bool defined = true;
-      for (const PrimExpr& len : ps->lengths) {
-        if (!len.defined()) {
-          defined = false;
-        }
-      }
-
-      if (defined) {
-        continue;
-      }
-
-      int extent = GetIntImm(ps->extent);
-      const std::vector<std::vector<PrimExpr> >& candidate_lens =
-          split_memo->GetFactorizationSchemes(
-              extent, ps->lengths.size(),
-              policy->cur_task->hardware_params->max_innermost_split_factor);
-
-      StateNode* pstate = state->CopyOnWrite();
-      pstate->transform_steps[step_id] = SplitStep(
-          ps->stage_id, ps->iter_id, ps->extent,
-          candidate_lens[(*rand_gen)() % candidate_lens.size()],
-          ps->inner_to_outer);
-    }
-  }
-
-  return 0;
-}
-
-int InitPopulationThreadBind(const SketchSearchPolicyNode* policy,
-                             State* state) {
-  for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) {
-    const Stage& stage = (*state)->stages[stage_id];
-    auto pop = stage->op.as<te::ComputeOpNode>();
-
-    if (stage->compute_at != kRoot || stage->op_type == kPlaceholder) {
-      continue;
-    }
-
-    if (HasAnnotationIter(stage, IteratorAnnotation::kThreadX)) {
-      // Skip if this stage has already done thread bind
-      continue;
-    }
-
-    std::vector<Iterator> to_fuse;
-
-    // This stage has not been tiled, but in GPU schedule, we must tile it
-    // to do thread binding
-    if (!HasSplitStep(*state, stage_id)) {
-      for (const auto& it : (*state)->stages[stage_id]->iters) {
-        if (it->iter_type == kReduce) {
-          break;
-        }
-        to_fuse.push_back(it);
-      }
-      const auto& fused_it = state->fuse(stage_id, to_fuse);
-      // Set default vthread=1 & threadIdx.x=default_warp_size
-      // EvolutionarySearch will try more possiblity
-      if (GetExtent(fused_it) <=
-          policy->cur_task->hardware_params->warp_size) {
-        state->bind_thread(stage_id, fused_it, kThreadX);
-      } else {
-        const auto& split_its = state->split(stage_id, fused_it,
-            {1, policy->cur_task->hardware_params->warp_size});
-        state->bind_thread(stage_id, split_its[0], kBlockX);
-        state->bind_thread(stage_id, split_its[1], kVThread);
-        state->bind_thread(stage_id, split_its[2], kThreadX);
-      }
-
-      continue;
-    }
-
-    int total_space_extent = 1;
-    for (const auto& i : pop->root_iter_vars()) {
-      CHECK(i->dom.defined());
-      const auto& pint = i->dom->extent.as<IntImmNode>();
-      CHECK(pint);
-      total_space_extent *= pint->value;
-    }
-
-    // TODO(..): Add ThreadBind support for rfactor
-    if (total_space_extent <= policy->cur_task->hardware_params->warp_size) {
-      for (const auto& it : (*state)->stages[stage_id]->iters) {
-        if (it->iter_type == kReduce) {
-          break;
-        }
-        to_fuse.push_back(it);
-      }
-      const auto& fused_it = state->fuse(stage_id, to_fuse);
-      state->bind_thread(stage_id, fused_it, kThreadX);
-
-      continue;
-    }
-
-    // Fuse the outermost space tile as blockIdx
-    for (size_t i = 0; i < pop->axis.size(); i++) {
-      const auto& it = (*state)->stages[stage_id]->iters[i];
-      if (!StrEndsWith(it->name, ".0")) {
-        break;
-      }
-      to_fuse.push_back(it);
-    }
-    const auto& blockidx_it = state->fuse(stage_id, to_fuse);
-    state->bind_thread(stage_id, blockidx_it, kBlockX);
-
-    // Fuse the second outermost space tile as vthread
-    to_fuse.clear();
-    for (size_t i = 1; i < pop->axis.size() + 1; i++) {
-      const auto& it = (*state)->stages[stage_id]->iters[i];
-      if (!StrEndsWith(it->name, ".1")) {
-        break;
-      }
-      to_fuse.push_back((*state)->stages[stage_id]->iters[i]);
-    }
-    const auto& vthread_it = state->fuse(stage_id, to_fuse);
-    if (GetExtent(vthread_it) >
-        policy->cur_task->hardware_params->max_vthread_extent) {
-      return -1;
-    }
-    state->bind_thread(stage_id, vthread_it, kVThread);
-
-    // Fuse the third outermost space tile as threadIdx
-    to_fuse.clear();
-    for (size_t i = 2; i < pop->axis.size() + 2; i++) {
-      const auto& it = (*state)->stages[stage_id]->iters[i];
-      if (!StrEndsWith(it->name, ".2")) {
-        break;
-      }
-      to_fuse.push_back((*state)->stages[stage_id]->iters[i]);
-    }
-    const auto& threadidx_it = state->fuse(stage_id, to_fuse);
-    if (GetExtent(threadidx_it) <
-        policy->cur_task->hardware_params->warp_size) {
-      return -1;
-    }
-    state->bind_thread(stage_id, threadidx_it, kThreadX);
-  }
-
-  return 0;
-}
-
-int InitPopulationCooperativeFetching(const SketchSearchPolicyNode* policy,
-                                      State* state) {
-  for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) {
-    // Do cooperative fetching with cache read stage
-    // For two stages: A -> B
-    // 1. A -> A_cache_read -> B
-    //               *
-    // 2. A -> A_cache_write -> A_cache_read -> B
-    //                                *
-    if ((stage_id > 0 && HasCacheReadStage((*state), stage_id - 1) &&
-         !HasCacheWriteStage((*state), stage_id - 1)) ||
-        (stage_id > 1 && HasCacheReadStage((*state), stage_id - 2) &&
-         HasCacheWriteStage((*state), stage_id - 2))) {
-      const Stage& target_stage = (*state)->stages[stage_id];
-      if (HasAnnotationIter(target_stage, IteratorAnnotation::kThreadX) ||
-          HasAnnotationIter(target_stage, IteratorAnnotation::kTensorized)) {
-        // Skip if this stage has already done thread bind or has been
-        // tensorized
-        continue;
-      }
-      // Get spatial_split_step_ids from the root stage
-      std::unordered_set<te::Operation, ObjectHash, ObjectEqual> consumers;
-      std::vector<int> spatial_split_step_ids;
-      GetConsumers(policy->cur_task, (*state), target_stage->op, &consumers);
-      CHECK_EQ(consumers.size(), 1);
-      int target_stage_id = OperationToStage(*consumers.begin(), (*state));
-      GetSpaceSplitStepIds((*state), target_stage_id, &spatial_split_step_ids);
-
-      // Fuse all axis to to do cooperative fetching
-      Iterator fused = state->fuse(stage_id,
-                                   (*state)->stages[stage_id]->iters);
-      // Left a vectorized cooperative fetching split placeholder
-      const auto& iters0 = state->split(stage_id, fused, {1});
-      state->vectorize(stage_id, iters0[1]);
-      // Follow split to keep a same thread extent with the root stage
-      const auto& iters1 = state->follow_fused_split(stage_id, iters0[0],
-                                                     spatial_split_step_ids,
-                                                     1, true);
-      state->bind_thread(stage_id, iters1[1], kThreadX);
-    }
-  }
-
-  return 0;
-}
-
-int InitPopulationChangeComputeLocation(const SketchSearchPolicyNode* policy,
-                                        State* state, std::mt19937* rand_gen) {
-  if(GetIntParam(policy->params, "disable_change_compute_location")) {
-    return 0;
-  }
-
-  for (int stage_id = static_cast<int>((*state)->stages.size()) - 1; stage_id >= 0; stage_id--) {
-    const Stage& stage = (*state)->stages[stage_id];
-
-    if (stage->op_type == kPlaceholder) {
-      continue;
-    }
-
-    if (IsTiled(stage) || stage->compute_at == kInlined) {
-      continue;
-    }
-
-    if (NeedsMultilevelTiling(policy->cur_task, (*state), stage->op)) {
-      continue;
-    }
-
-    std::unordered_set<te::Operation, ObjectHash, ObjectEqual> consumers;
-
-    GetConsumers(policy->cur_task, (*state), stage->op, &consumers);
-    if (consumers.empty()) {
-      continue;
-    }
-
-    int target_stage_id;
-    if (consumers.size() == 1) {
-      target_stage_id = OperationToStage(*consumers.begin(), *state);
-    } else {
-      // check all consumers share a common root
-      int common_root_id = -1;
-      bool mismatch = false;
-      for (const auto& consumer : consumers) {
-        int consumer_stage_id = OperationToStage(consumer, *state);
-        int root_id = -1;
-        if ((*state)->stages[consumer_stage_id]->compute_at == kRoot) {
-          root_id = consumer_stage_id;
-        } else if ((*state)->stages[consumer_stage_id]->compute_at == kIter) {
-          root_id = (*state)->attach_map->stage_to_attach_iter.at(consumer_stage_id).first;
-        } else {
-          LOG(FATAL) << "Invalid case";
-        }
-
-        if (common_root_id == -1) {
-          common_root_id = root_id;
-        } else {
-          if (common_root_id != root_id) {
-            mismatch = true;
-            break;
-          }
-        }
-      }
-
-      if (mismatch) {
-        continue;
-      }
-      target_stage_id = common_root_id;
-    }
-
-    const Stage& target_stage = (*state)->stages[target_stage_id];
-    std::set<std::string> to_unroll_name_set;
-    if (target_stage->op->attrs.count(policy->always_unroll_key)) {
-      to_unroll_name_set = GetIterNameSetParam(target_stage->op->attrs,
-                                               policy->always_unroll_key);
-    }
-
-    std::vector<std::pair<int, Iterator> > candidates;
-    bool target_compute_at_other = target_stage->compute_at == kIter;
-    bool target_is_tiled = IsTiled(target_stage);
-
-    bool visited_reduce = false;
-    // enumerate compute_at location at target_stage
-    int ct = 0;
-    for (const auto& target_iter : target_stage->iters) {
-      if (target_iter->iter_type == kReduce) {
-        visited_reduce = true;
-        if (!target_is_tiled) {  // do not go into reduce iter
-          break;
-        }
-      } else if (target_iter->iter_type == kSpace) {
-        if (visited_reduce) {  // do not go into inner tile
-          break;
-        }
-      }
-
-      if (to_unroll_name_set.count(target_iter->name)) {
-          // Do not go into always unroll region
-          break;
-      }
-
-      if (GetExtent(target_iter) == 1) {  // skip iterators with length of 1
-        continue;
-      }
-      if (target_compute_at_other && target_iter->iter_type == kSpace &&
-          StrEndsWith(target_iter->name, ".0")) {
-        // skip the first level iterators if target stage compute_at another stage
-        // In this case, the lengths of first level iterators are always one
-        continue;
-      }
-      candidates.emplace_back(target_stage_id, target_iter);
-
-      if ((*state)->attach_map->iter_to_attached_stages.count(
-          std::make_pair(target_stage_id, ct++))) {
-        break;
-      }
-    }
-
-    // if the target_stage is already compute_at another stage X, try also compute_at X
-    // We call stage X as `target_target_stage`
-    if (target_compute_at_other) {
-      int target_target_stage_id;
-      target_target_stage_id = (*state)->attach_map->stage_to_attach_iter.at(
-          target_stage_id).first;
-      const Stage& target_target_stage = (*state)->stages[target_target_stage_id];
-      if (target_target_stage->op->attrs.count(policy->always_unroll_key)) {
-        to_unroll_name_set = GetIterNameSetParam(target_target_stage->op->attrs,
-                                                 policy->always_unroll_key);
-      } else {
-        to_unroll_name_set.clear();
-      }
-
-      int ct = 0;
-      for (const auto& target_target_iter : target_target_stage->iters) {
-        if (target_target_iter->iter_type == kReduce ||
-            (*state)->attach_map->iter_to_attached_stages.count(
-                std::make_pair(target_target_stage_id, ct++))) {
-          break;
-        }
-
-        if (to_unroll_name_set.count(target_target_iter->name)) {
-            // Do not go into always unroll region
-            break;
-        }
-
-        if (GetExtent(target_target_iter) == 1) {  // skip iterators with length of 1
-          continue;
-        }
-
-        candidates.push_back(std::make_pair(target_target_stage_id, target_target_iter));
-      }
-    }
-
-    int choice = (*rand_gen)() % (candidates.size() + 2);
-
-    if (choice == 0) {
-      if (!HasReduceIter(stage)) {
-        state->compute_inline(stage_id);
-      }
-    } else if (choice == 1) {
-      state->compute_root(stage_id);
-    } else {
-      choice = choice - 2;
-      state->compute_at(stage_id, candidates[choice].first, candidates[choice].second);
-    }
-  }
-
-  return 0;
-}
-
-int InitPopulationParallel(const SketchSearchPolicyNode* policy,
-                           State* state) {
-  std::function<void(const SketchSearchPolicyNode*, State*, int stage_id, int iter_offset)> annotate_parallel;
-
-  annotate_parallel = [&annotate_parallel](
-          const SketchSearchPolicyNode* policy, State* state, int stage_id, int iter_offset) {
-    const Stage& stage = (*state)->stages[stage_id];
-
-    std::vector<Iterator> to_fuse;
-    int64_t parallel_degree = 1;
-
-    // strategy: try to fuse and parallel the outermost n iterators
-    // Stop if we meet reduce iterator or we have enough parallel degree
-    size_t iter_id = iter_offset;
-    for (; iter_id < stage->iters.size(); ++iter_id) {
-      const Iterator& it = stage->iters[iter_id];
-      if (it->iter_type == kReduce || it->annotation != kNone) {
-        break;
-      }
-
-      to_fuse.push_back(it);
-      parallel_degree *= GetExtent(it);
-
-      if (parallel_degree > policy->cur_task->hardware_params->num_cores * 16) {
-        break;
-      }
-
-      if ((*state)->attach_map->iter_to_attached_stages.count(
-          std::make_pair(stage_id, iter_id))) {
-        break;
-      }
-    }
-
-    if (parallel_degree == 1) {
-      auto res = (*state)->attach_map->iter_to_attached_stages.find(std::make_pair(stage_id, iter_id));
-      if (res != (*state)->attach_map->iter_to_attached_stages.end()) {
-        for (int attached_stage_id : res->second) {
-          annotate_parallel(policy, state, attached_stage_id, 0);
-        }
-        annotate_parallel(policy, state, stage_id, iter_id + 1);
-      }
-    }
-
-    if (!to_fuse.empty()) {
-      if (to_fuse.size() == 1) {
-        state->parallel(stage_id, to_fuse[0]);
-      } else {
-        Iterator fused_iter = state->fuse(stage_id, to_fuse);
-        state->parallel(stage_id, fused_iter);
-      }
-    }
-  };
-
-  for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) {
-    const Stage& stage = (*state)->stages[stage_id];
-    if (stage->compute_at != kRoot || stage->op_type == kPlaceholder) {
-      continue;
-    }
-
-    annotate_parallel(policy, state, stage_id, 0);
-  }
-
-  return 0;
-}
-
-int InitPopulationVectorization(const SketchSearchPolicyNode* policy,
-                                State* state, std::mt19937* rand_gen) {
-  for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) {
-    const Stage& stage = (*state)->stages[stage_id];
-
-    if (stage->op_type == kPlaceholder) {
-      continue;
-    }
-
-    // Skip cooperative fetching stage
-    if (IS_GPU(policy->cur_task) &&
-        HasCacheReadStage((*state), stage_id - 1)) {
-      continue;
-    }
-
-    if (HasAnnotationIter(stage, IteratorAnnotation::kTensorized)) {
-      // Skip if this stage has been tensorized
-      continue;
-    }
-
-    // try to fuse and vectorize the space iterators in the inner most tile
-    int cum_length_prod = 1;
-
-    std::set<std::string> to_unroll_name_set;
-    if (stage->op->attrs.count(policy->always_unroll_key)) {
-      to_unroll_name_set = GetIterNameSetParam(stage->op->attrs,
-                                               policy->always_unroll_key);
-    }
-
-    int num_fusible = 0;
-    while (num_fusible < static_cast<int>(stage->iters.size())) {
-      int iter_id = static_cast<int>(stage->iters.size()) - 1 - num_fusible;
-      if ((*state)->attach_map->iter_to_attached_stages.count(
-          std::make_pair(stage_id, iter_id))) {
-        break;
-      }
-
-      const Iterator& it = stage->iters[iter_id];
-
-      // Stop if we meet a reduce iterator
-      if (it->iter_type == kReduce || it->annotation != kNone ||
-          to_unroll_name_set.count(it->name)) {
-        break;
-      }
-
-      // Stop if the memory access is not continuous (vectorizable)
-      // Note: The check is too hard, so we use heuristic here
-      if (IsTiled(stage) && num_fusible != 0) {
-        // If the stage is tiled, then the memory access must not be continuous
-        // for the innermost two iterators
-        break;
-      }
-
-      cum_length_prod *= GetExtent(it);
-      if (cum_length_prod > policy->cur_task->hardware_params->max_unroll_vec) {
-        break;
-      }
-
-      num_fusible++;
-    }
-
-    if (num_fusible > 1) {
-      num_fusible = 1 + (*rand_gen)() % (num_fusible - 1); // Select a random range to fuse
-    }
-
-    if (num_fusible == 1) {
-      state->vectorize(stage_id, stage->iters.back());
-    } else if (num_fusible > 1) {
-      std::vector<Iterator> to_fuse(stage->iters.end() - num_fusible,
-                                    stage->iters.end());
-      state->vectorize(stage_id, state->fuse(stage_id, to_fuse));
-    }
-  }
-
-  return 0;
-}
-
-int InitPopulationUnroll(const SketchSearchPolicyNode* policy,
-                         State* state, std::mt19937* rand_gen) {
-  for (size_t stage_id = 0; stage_id < (*state)->stages.size(); ++stage_id) {
-    const Stage& stage = (*state)->stages[stage_id];
-
-    if (stage->op_type == kPlaceholder) {
-      continue;
-    }
-
-    if (stage->op->attrs.count(policy->always_unroll_inner_key)) {
-      // Special unroll policy
-      auto to_unroll_name_set = GetIterNameSetParam(stage->op->attrs,
-                                                    policy->always_unroll_inner_key);
-      std::set<std::string> visited_names;
-
-      // Unroll the space iterators and reduce iterators listed in the attrs
-      // in the innermost tile
-      int n = static_cast<int>(stage->iters.size()) - 1;
-      visited_names.clear();
-      while (n >= 0) {
-        const Iterator& it = stage->iters[n];
-
-        // If we meet two iterators that come from a same original iterator,
-        // then we are out of the innermost tile
-        size_t size_before = visited_names.size();
-        ExtractOriginalIterators(it->name, &visited_names);
-        if (size_before == visited_names.size()) {
-          break;
-        }
-
-        std::set<std::string> name;
-        ExtractOriginalIterators(it->name, &name);
-        if (name.size() == 1 && to_unroll_name_set.count(*name.begin())) {
-          state->unroll(stage_id, it);
-        }
-
-        n--;
-      }
-    } else if (stage->op->attrs.count(policy->always_unroll_key)) {
-      // Special unroll policy
-      auto to_unroll_name_set = GetIterNameSetParam(stage->op->attrs,
-                                                    policy->always_unroll_key);
-
-      // Unroll the space iterators and reduce iterators listed in the attrs
-      int n = static_cast<int>(stage->iters.size()) - 1;
-      while (n >= 0) {
-        const Iterator& it = stage->iters[n];
-        if (to_unroll_name_set.count(it->name)) {
-          state->unroll(stage_id, it);
-        }
-        n--;
-      }
-    } else if (HasReduceIter(stage)) {
-      // use auto unroll for multi level tiled stage
-      int value = policy->auto_unroll_configs[
-          (*rand_gen)() % policy->auto_unroll_configs.size()];
-      state->pragma(stage_id, (*state)->stages[stage_id]->iters[0],
-                    std::string("auto_unroll_max_step") + "$" + std::to_string(value));
-    }
-  }
-
-  return 0;
-}
-
-void SketchSearchPolicyNode::SampleInitPopulation(const std::vector<State>& sketches,
-    int out_size, std::vector<State>* out_states) {
-  std::uniform_real_distribution<> dis(0.0, 1.0);
-  int continue_count = 0;
-
-  // TODO(...): Maybe try muti thread here
-  while (static_cast<int>(out_states->size()) < out_size &&
-         continue_count < out_size * 10) {
-    State tmp_s = sketches[rand_gen_() % sketches.size()];
-
-    InitPopulationFillTileSize(this, &tmp_s, &rand_gen_, &split_memo_);
-
-    if (IS_GPU(cur_task)) {
-      tmp_s = cur_task->compute_dag.InferBound(tmp_s);
-
-      if (InitPopulationThreadBind(this, &tmp_s)) {
-        continue_count++;
-        if (continue_count == out_size) {
-          StdCout(verbose) << "Initial Population Sampling..." << std::endl;
-        }
-        continue;
-      }
-
-      InitPopulationCooperativeFetching(this, &tmp_s);
-    } else {
-      InitPopulationChangeComputeLocation(this, &tmp_s, &rand_gen_);
-
-      tmp_s = cur_task->compute_dag.InferBound(tmp_s);
-
-      InitPopulationParallel(this, &tmp_s);
-    }
-
-    InitPopulationVectorization(this, &tmp_s, &rand_gen_);
-
-    InitPopulationUnroll(this, &tmp_s, &rand_gen_);
-
-    out_states->push_back(std::move(tmp_s));
-  }
-
-  StdCout(verbose) << "Sample Initial Population\t#s: "
-                   << out_states->size() << std::endl;
-}
-
-void SketchSearchPolicyNode::EvolutionarySearch(
-    const std::vector<State>& init_population,
-    int num_best_states, std::vector<State>* best_states) {
-  auto tic_begin = std::chrono::high_resolution_clock::now();
-
-  // Set parameters for genetic algorithm
-  int population = GetIntParam(params, "evolutionary_search_population");
-  int num_iters =  GetIntParam(params, "evolutionary_search_num_iters");
-  double mutation_prob = GetDoubleParam(params, "evolutionary_search_mutation_prob");
-  int num_cross_over = static_cast<int>(population * 0.0);  // NOT IMPLEMENTED currently
-  int num_cross_over_trial_upper_bound = num_cross_over * 3;
-  CostModel cost_model = program_cost_model;
-
-  // Two ping pong buffers to avoid copy
-  std::vector<State> states_buf1, states_buf2;
-  std::vector<State> *pnow = &states_buf1, *pnext = &states_buf2;
-  states_buf1.reserve(population);
-  states_buf2.reserve(population);
-  states_buf1.insert(states_buf1.begin(), init_population.begin(), init_population.end());
-
-  // A heap to keep the best states during evolution
-  using StateItem = std::pair<State, float>;
-  auto cmp = [](const StateItem& left, const StateItem& right) {
-    return left.second > right.second;
-  };
-  std::vector<StateItem> heap;
-  std::unordered_set<std::string> in_heap(measured_states_set_);
-  heap.reserve(num_best_states);
-
-  // auxiliary global variables
-  std::vector<float> scores;
-  std::vector<double> prefix_sum_probs;
-  double max_score = 0.0;
-  scores.reserve(population);
-  prefix_sum_probs.reserve(population);
-  std::uniform_real_distribution<> dis(0.0, 1.0);
-  int mutation_fail_ct = 0;
-
-  // Genetic Algorithm
-  for (int k = 0; k < num_iters + 1; ++k) {
-    // Maintain the heap
-    cur_task->compute_dag.InferBound(pnow);
-    PruneUndefined(pnow);
-    cost_model->Predict(cur_task, *pnow, &scores);
-
-    for (size_t i = 0; i < pnow->size(); ++i) {
-      const State& state = (*pnow)[i];
-      std::string state_str = state.ToStr();
-
-      if (in_heap.count(state_str) == 0) {
-        if (static_cast<int>(heap.size()) < num_best_states) {
-          heap.emplace_back((*pnow)[i], scores[i]);
-          std::push_heap(heap.begin(), heap.end(), cmp);
-          in_heap.insert(state_str);
-        } else if (scores[i] > heap.front().second) {
-          std::string old_state_str = heap.front().first.ToStr();
-          in_heap.erase(old_state_str);
-          in_heap.insert(state_str);
-
-          std::pop_heap(heap.begin(), heap.end(), cmp);
-          heap.back() = StateItem(state, scores[i]);
-          std::push_heap(heap.begin(), heap.end(), cmp);
-        }
-        if (scores[i] > max_score) {
-          max_score = scores[i];
-        }
-      }
-    }
-
-    if (k % 5 == 0 || k == num_iters) {
-      StdCout(verbose) << "GA Iter: " << k << std::fixed << std::setprecision(4)
-                       << "\tMax score: " << max_score
-                       << "\tMin score: " << heap.front().second
-                       << "\tPop size: " << pnow->size() << std::endl;
-    }
-
-    if (k == num_iters) {
-      break;
-    }
-
-    // Compute selection probability
-    double sum = 0.0;
-    prefix_sum_probs.resize(scores.size());
-    for (size_t i = 0; i < scores.size(); ++i) {
-      sum += std::max(scores[i], 0.0f);
-      prefix_sum_probs[i] = sum;
-    }
-    for (size_t i = 0; i < scores.size(); ++i) {
-      prefix_sum_probs[i] = prefix_sum_probs[i] / sum;
-    }
-
-    // Do cross over
-    int ct = 0;
-    while (static_cast<int>(pnext->size()) < num_cross_over
-        && ct < num_cross_over_trial_upper_bound) {
-      int p1 = RandomChoose(prefix_sum_probs, &rand_gen_);
-      int p2 = RandomChoose(prefix_sum_probs, &rand_gen_);
-
-      if (p1 == p2) {
-        pnext->push_back((*pnow)[p1]);
-      } else {
-        State tmp_s = CrossOverState((*pnow)[p1], (*pnow)[p2]);
-        if (tmp_s.defined()) {
-          pnext->push_back(std::move(tmp_s));
-        }
-      }
-      ct++;
-    }
-
-    // Do mutation
-    mutation_fail_ct = 0;
-    while (static_cast<int>(pnext->size()) < population) {
-      int id = RandomChoose(prefix_sum_probs, &rand_gen_);
-
-      if (dis(rand_gen_) < mutation_prob) {
-        const std::vector<double> rule_prefix_sum_probs{0.9, 1.0};
-
-        int rule_id = RandomChoose(rule_prefix_sum_probs, &rand_gen_);
-
-        if (rule_id == 0) {
-          // Mutate Tile Size
-          State tmp_s = RandomMutateTileSize((*pnow)[id], &split_memo_, &rand_gen_,
-                                             cur_task->hardware_params->max_innermost_split_factor);
-          if (tmp_s.defined()) {
-            pnext->push_back(std::move(tmp_s));
-          } else {
-            mutation_fail_ct++;
-          }
-        } else if (rule_id == 1) {
-          // Mutate auto-unroll max step.
-          State tmp_s = RandomMutateMaxUnrollStep((*pnow)[id], &rand_gen_, auto_unroll_configs);
-          if (tmp_s.defined()) {
-            pnext->push_back(std::move(tmp_s));
-          } else {
-            mutation_fail_ct++;
-          }
-        }
-      } else {
-        pnext->push_back((*pnow)[id]);
-      }
-    }
-
-    std::swap(pnext, pnow); pnext->clear();
-  }
-
-  // Copy best states in the heap to out_states
-  std::sort(heap.begin(), heap.end(), cmp);
-  best_states->clear();
-  for (auto& item : heap) {
-    best_states->push_back(std::move(item.first));
-  }
-
-  double duration = std::chrono::duration_cast<std::chrono::duration<double> >(
-      std::chrono::high_resolution_clock::now()-  tic_begin).count();
-  StdCout(verbose) << "EvolutionarySearch\t\t#s: " << best_states->size()
-                   << "\tTime elapsed: "
-                   << std::fixed << std::setprecision(2) << duration << std::endl;
-}
-
-class RuleCustomSketch : public SketchGenerationRule {
- public:
-  RuleCustomSketch(PackedFunc meet_condition_func, PackedFunc apply_func) :
-      meet_condition_func_(meet_condition_func), apply_func_(apply_func) {}
-
-  inline ConditionEnum MeetCondition(const SketchSearchPolicyNode* policy,
-                                     const State& state, int stage_id) final {
-    auto ret = meet_condition_func_(
-        tvm::runtime::GetRef<SketchSearchPolicy>(policy), state, stage_id);
-    if (ret.type_code() == 0) {
-      return ConditionEnum(static_cast<int>(ret));
-    } else {
-      return kApplyAndSkipRest;
-    }
-  }
-
-  inline std::vector<std::pair<State, int> > Apply(
-      const SketchSearchPolicyNode* policy,
-      const State& state, int stage_id) final {
-    std::vector<std::pair<State, int> > ret;
-
-    Array<Array<ObjectRef>> apply_ret = apply_func_(
-        tvm::runtime::GetRef<SketchSearchPolicy>(policy), state, stage_id);
-
-    for (const auto& item : apply_ret) {
-      CHECK_EQ(item.size(), 2);
-      State state = Downcast<State>(item[0]);
-      auto next = item[1].as<IntImmNode>();
-      ret.emplace_back(state, next->value);
-    }
-    return ret;
-  }
-
- private:
-  PackedFunc meet_condition_func_;
-  PackedFunc apply_func_;
-};
-
-PreloadCustomSketchRule::PreloadCustomSketchRule(PackedFunc meet_condition_func,
-                                                 PackedFunc apply_func) {
-  auto node = make_object<PreloadCustomSketchRuleNode>();
-  node->meet_condition_func = meet_condition_func;
-  node->apply_func = apply_func;
-  data_ = std::move(node);
-}
-
-void PreloadCustomSketchRuleNode::callback(SearchPolicyNode* policy) {
-  CHECK(policy->IsInstance<SketchSearchPolicyNode>());
-  auto sketch_policy = dynamic_cast<SketchSearchPolicyNode*>(policy);
-  sketch_policy->sketch_rules.emplace_back(
-      new RuleCustomSketch(meet_condition_func, apply_func));
-  StdCout(policy->verbose) << "Custom sketch rule added." << std::endl;
-}
-
-TVM_REGISTER_GLOBAL("ansor.SketchSearchPolicy")
-.set_body_typed([](CostModel program_cost_model, Map<String, ObjectRef> params,
-                   int seed){
-  return SketchSearchPolicy(program_cost_model, params, seed);
-});
-
-TVM_REGISTER_GLOBAL("ansor.PreloadCustomSketchRule")
-.set_body_typed([](PackedFunc meet_condition_func, PackedFunc apply_func) {
-  return PreloadCustomSketchRule(meet_condition_func, apply_func);
-});
-
-}  // namespace ansor
-}  // namespace tvm
diff --git a/src/ansor/search_policy/sketch_search_policy.h b/src/ansor/search_policy/sketch_search_policy.h
deleted file mode 100644
index 54a5cdd1fa4ee..0000000000000
--- a/src/ansor/search_policy/sketch_search_policy.h
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ansor/search_policy/sketch_search_policy.h
- * \brief The search policy that searches in a hierarchical search space defined by sketches.
- * The policy randomly samples programs from the space defined by sketches
- * and use evolutionary search to  fine-tune them.
- */
-
-#ifndef TVM_ANSOR_SEARCH_POLICY_SKETCH_SEARCH_POLICY_H_
-#define TVM_ANSOR_SEARCH_POLICY_SKETCH_SEARCH_POLICY_H_
-
-#include <vector>
-#include <string>
-#include <utility>
-#include <unordered_set>
-#include <set>
-#include "search_policy.h"
-#include "../cost_model/cost_model.h"
-#include "../utils.h"
-
-
-namespace tvm {
-namespace ansor {
-
-class SketchGenerationRule;
-
-/*!
- * \brief The search policy that searches in a hierarchical search space defined by sketches.
- * The policy randomly samples programs from the space defined by sketches
- * and use evolutionary search to  fine-tune them.
- */
-class SketchSearchPolicyNode: public SearchPolicyNode {
- public:
-  /*! \brief The cost model for complete programs */
-  CostModel program_cost_model;
-  /*! \brief Random generator */
-  std::mt19937 rand_gen_;
-  /*! \brief The parameters for search. It stores the following parameters:
-   * int evolutionary_search_population    // The population size for evolutionary search
-   * int evolutionary_search_mutation_prob // The probability of mutation for evolutionary search
-   * int evolutionary_search_num_iters;    // The number of iterations for evolutionary search
-   * double local_mutation_use_measured_ratio;   // The maximum percentage of measured states in the initial
-   *                                             // population for evolutionary search
-   * double eps_greedy;          // Always allocate this percentage of measurements to random sampled states
-   * str cpu_multi_level_tiling_structure // The structure of multi-level tiling for CPU
-   * str gpu_multi_level_tiling_structure // The structure of multi-level tiling for GPU
-   */
-  Map<String, ObjectRef> params;
-  /*! \brief The rules to generate sketches */
-  std::vector<SketchGenerationRule*> sketch_rules;
-
-  /*! \brief Search and make n_trails measurements.
-   *  \returns the best state */
-  State Search(SearchTask task, int n_trials,
-               int early_stopping, int num_measure_per_iter,
-               int verbose, ProgramMeasurer measurer,
-               Array<SearchCallback> pre_search_callbacks) final;
-
-  /*! \brief Continue search for one round. This is used by JointTuner
-   * \returns the measurement pairs */
-  std::pair<Array<MeasureInput>, Array<MeasureResult> > ContinueSearchOneRound(
-      SearchTask task, int num_measure, int verbose, ProgramMeasurer measurer) final;
-
-  static constexpr const char *_type_key = "ansor.SketchSearchPolicy";
-  static const std::vector<int> auto_unroll_configs;
-
-  TVM_DECLARE_FINAL_OBJECT_INFO(SketchSearchPolicyNode, SearchPolicyNode);
-
- protected:
-  /*! \brief Pick states from best states and random states with eps-greedy policy */
-  void PickStatesWithEpsGreedy(std::vector<MeasureInput>* inputs,
-                               const std::vector<State>& best_states,
-                               const std::vector<State>& random_states,
-                               int remaining_n_trials);
-
- private:
-  // Run one round of the search pipeline
-  void SearchOneRound(std::vector<State>* best_states,
-                      int num_random_states, std::vector<State>* random_states);
-
-  // Generate sketches without tile size
-  void GenerateSketch(std::vector<State>* out_states);
-
-  // Sample init population
-  void SampleInitPopulation(const std::vector<State>& sketches,
-      int out_size, std::vector<State>* out_states);
-
-  // Perform evolutionary search
-  void EvolutionarySearch(const std::vector<State>& init_population,
-      int num_best_states, std::vector<State>* best_states);
-
-  SplitFactorizationMemo split_memo_;  // Memorize split space for Split
-  int num_measure_per_iter_;   // The number of states to measure per iteration
-};
-
-/*!
- * \brief Managed reference to SketchSearchPolicyNode.
- * \sa SketchSearchPolicyNode
- */
-class SketchSearchPolicy : public SearchPolicy {
- public:
-  SketchSearchPolicy(CostModel program_cost_model,
-                     Map<String, ObjectRef> params,
-                     int seed);
-
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(SketchSearchPolicy, SearchPolicy,
-                                        SketchSearchPolicyNode);
-};
-
-/*! \brief Pre-search callback function to load custom rules for sketch generation */
-class PreloadCustomSketchRuleNode : public SearchCallbackNode {
- public:
-  // TODO(jcf94): Use tvm::runtime::TypedPackedFunc?
-  PackedFunc meet_condition_func;
-  PackedFunc apply_func;
-
-  void callback(SearchPolicyNode* policy) final;
-
-  static constexpr const char *_type_key = "ansor.PreloadCustomSketchRule";
-  TVM_DECLARE_FINAL_OBJECT_INFO(PreloadCustomSketchRuleNode, SearchCallbackNode);
-};
-
-/*!
- * \brief Managed reference to PreloadCustomSketchRuleNode.
- * \sa PreloadCustomSketchRuleNode
- */
-class PreloadCustomSketchRule : public SearchCallback {
- public:
-  PreloadCustomSketchRule(PackedFunc meet_condition_func,
-                          PackedFunc apply_func);
-
-  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PreloadCustomSketchRule, SearchCallback,
-                                        PreloadCustomSketchRuleNode);
-};
-
-}  // namespace ansor
-}  // namespace tvm
-
-#endif  // TVM_ANSOR_SEARCH_POLICY_SKETCH_SEARCH_POLICY_H_
diff --git a/src/ansor/search_policy/utils.cc b/src/ansor/search_policy/utils.cc
deleted file mode 100644
index 412d0afcca98d..0000000000000
--- a/src/ansor/search_policy/utils.cc
+++ /dev/null
@@ -1,429 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ansor/search_policy/utils.cc
- * \brief Common utilities for search policies
- */
-
-#include "utils.h"
-#include "search_policy.h"
-
-namespace tvm {
-namespace ansor {
-
-void GetSpaceSplitStepIds(const State& s, int stage_id, std::vector<int>* spatial_split_step_ids) {
-  auto pop = s->stages[stage_id]->op.as<te::ComputeOpNode>();
-  CHECK(pop != nullptr);
-
-  auto no_split_name_pair = QueryNoSplitAxis(s->stages[stage_id]);
-  std::set<std::string> no_split_at_inner_name_set = no_split_name_pair.first;
-  std::set<std::string> no_split_at_outer_name_set = no_split_name_pair.second;
-  size_t reduce_count = 0;
-  for (const auto axis : pop->reduce_axis) {
-    if (!no_split_at_inner_name_set.count(axis->var->name_hint) &&
-        !no_split_at_outer_name_set.count(axis->var->name_hint)) {
-      reduce_count++;
-    }
-  }
-
-  for (int i = static_cast<int>(s->transform_steps.size()) - 1; i >= 0; --i) {
-    if (s->transform_steps[i]->IsInstance<CacheWriteStepNode>() ||
-        s->transform_steps[i]->IsInstance<CacheReadStepNode>() ||
-        s->transform_steps[i]->IsInstance<RfactorStepNode>()) {
-      if (stage_id > s->transform_steps[i]->stage_id) {
-        stage_id--;
-      }
-    } else if (auto ps = s->transform_steps[i].as<SplitStepNode>()) {
-      if (stage_id == ps->stage_id) {
-        if (reduce_count) {
-          reduce_count--;
-        } else {
-          spatial_split_step_ids->push_back(i);
-        }
-      }
-    }
-  }
-}
-
-State DoMultiLevelTiling(const State& state, int stage_id, const std::string& format,
-                         std::vector<int>* spatial_split_step_ids) {
-  std::vector<std::vector<Iterator> > space_levels;
-  std::vector<std::vector<Iterator> > reduce_levels;
-  std::vector<Iterator> space_outer, space_inner, reduce_outer, reduce_inner;
-  std::vector<Iterator> split_res;
-
-  for (const auto c : format) {
-    if (tolower(c) == 's') {
-      space_levels.emplace_back();
-    } else if (tolower(c) == 'r') {
-      reduce_levels.emplace_back();
-    } else {
-      LOG(FATAL) << "Invalid multi level tiling format: " << format;
-    }
-  }
-  size_t n_space = space_levels.size();
-  size_t n_reduce = reduce_levels.size();
-
-  spatial_split_step_ids->clear();
-
-  State tmp_s = state;
-  const Stage& stage = state->stages[stage_id];
-  auto no_split_name_pair = QueryNoSplitAxis(stage);  // handle special split strategy
-  auto last_split_is_one_name_set = QueryLastSplitIsOneAxis(stage);
-  std::set<std::string> no_split_at_inner_name_set = no_split_name_pair.first;
-  std::set<std::string> no_split_at_outer_name_set = no_split_name_pair.second;
-
-  for (const auto& iter : state->stages[stage_id]->iters) {
-    if (iter->iter_type == kSpace) {
-      if (!no_split_at_inner_name_set.count(iter->name) &&
-          !no_split_at_outer_name_set.count(iter->name)) {
-        CHECK_GE(n_space, 1);
-        int tmp_n_space = n_space;
-
-        if (last_split_is_one_name_set.count(iter->name)) {
-          tmp_n_space--;
-        }
-
-        if (tmp_n_space == 1) {
-          space_levels[0].push_back(iter);
-        } else {
-          split_res = tmp_s.split(stage_id, iter, std::vector<PrimExpr>(tmp_n_space - 1));
-          for (int i = 0; i < tmp_n_space; i++) {
-            space_levels[i].push_back(std::move(split_res[i]));
-          }
-          spatial_split_step_ids->push_back(tmp_s->transform_steps.size() - 1);
-        }
-      } else {
-        if (no_split_at_inner_name_set.count(iter->name)) {
-          space_inner.push_back(iter);
-        }
-        if (no_split_at_outer_name_set.count(iter->name)) {
-          space_outer.push_back(iter);
-        }
-      }
-    } else if (iter->iter_type == kReduce) {
-      // for reduce iterator, split it into two iterators
-      if (!no_split_at_inner_name_set.count(iter->name) &&
-          !no_split_at_outer_name_set.count(iter->name)) {
-        CHECK_GE(n_reduce, 1);
-        if (n_reduce == 1) {
-          reduce_levels[0].push_back(iter);
-        } else {
-          split_res = tmp_s.split(stage_id, iter, std::vector<PrimExpr>(n_reduce - 1));
-          for (size_t i = 0; i < n_reduce; i++) {
-            reduce_levels[i].push_back(std::move(split_res[i]));
-          }
-        }
-      } else {
-        if (no_split_at_inner_name_set.count(iter->name)) {
-          reduce_inner.push_back(iter);
-        }
-        if (no_split_at_outer_name_set.count(iter->name)) {
-          reduce_outer.push_back(iter);
-        }
-      }
-    } else {
-      LOG(FATAL) << "Invalid iter type: " << iter->iter_type;
-    }
-  }
-
-  if (!space_outer.empty()) {
-    CHECK(!space_levels.empty());
-    space_levels.front().insert(space_levels.front().begin(),
-            space_outer.begin(), space_outer.end());
-  }
-  if (!space_inner.empty()) {
-    CHECK(!space_levels.empty());
-    space_levels.back().insert(space_levels.back().begin(),
-            space_inner.begin(), space_inner.end());
-  }
-
-  if (!reduce_outer.empty()) {
-    CHECK(!reduce_levels.empty());
-    reduce_levels.front().insert(reduce_levels.front().begin(),
-            reduce_outer.begin(), reduce_outer.end());
-  }
-  if (!reduce_inner.empty()) {
-    CHECK(!reduce_levels.empty());
-    reduce_levels.back().insert(reduce_levels.back().begin(),
-            reduce_inner.begin(), reduce_inner.end());
-  }
-
-  std::vector<Iterator> order;
-  int space_ct = 0, reduce_ct = 0;
-  for (const auto c : format) {
-    if (tolower(c) == 's') {
-      order.insert(order.end(), std::make_move_iterator(space_levels[space_ct].begin()),
-              std::make_move_iterator(space_levels[space_ct].end()));
-      space_ct++;
-    } else if (tolower(c) == 'r') {
-      order.insert(order.end(), std::make_move_iterator(reduce_levels[reduce_ct].begin()),
-              std::make_move_iterator(reduce_levels[reduce_ct].end()));
-      reduce_ct++;
-    } else {
-      LOG(FATAL) << "Invalid multi level tiling format: " << format;
-    }
-  }
-
-  tmp_s.reorder(stage_id, order);
-  return tmp_s;
-}
-
-State FollowTiling(const State& state, int stage_id,
-                   const std::vector<int>& split_step_ids, int n_split) {
-  if (n_split < 1 || n_split > 3) {
-    LOG(FATAL) << "Invalid split parts, currently only support 1, 2 and 3";
-  }
-  // Apply up to three-level tiling structure:  space_L0, space_L1, space_L2
-  std::vector<Iterator> space_0, space_1, space_2, space_3;
-  std::vector<Iterator> split_res, tmp_order;
-
-  auto pop = state->stages[stage_id]->op.as<te::ComputeOpNode>();
-  CHECK(pop != nullptr);
-  const Stage& stage = state->stages[stage_id];
-  auto no_split_name_pair = QueryNoSplitAxis(stage);  // handle special split strategy
-  const std::set<std::string>& no_split_at_inner_name_set = no_split_name_pair.first;
-  const std::set<std::string>& no_split_at_outer_name_set = no_split_name_pair.second;
-  int no_split_at_inner_name_in_stage_cnt = 0;
-  int no_split_at_outer_name_in_stage_cnt = 0;
-  for (const auto& iter : state->stages[stage_id]->iters) {
-    no_split_at_inner_name_in_stage_cnt += no_split_at_inner_name_set.count(iter->name);
-    no_split_at_outer_name_in_stage_cnt += no_split_at_outer_name_set.count(iter->name);
-  }
-
-  CHECK_EQ(state->stages[stage_id]->iters.size()
-               - no_split_at_inner_name_in_stage_cnt
-               - no_split_at_outer_name_in_stage_cnt,
-           split_step_ids.size());
-
-  State tmp_s = state;
-  int ct = 0;
-  for (const auto& iter : state->stages[stage_id]->iters) {
-    if (iter->iter_type == kSpace) {
-      // For spatial iterator, split it into multi iterators
-      if (!no_split_at_inner_name_set.count(iter->name) &&
-          !no_split_at_outer_name_set.count(iter->name)) {
-        IteratorAnnotation ann_type = iter->annotation;
-        split_res = tmp_s.follow_split(stage_id, iter, split_step_ids[ct],
-                                       n_split);
-        // Restore annotation. Move unroll and vectorize to inner, move parallel
-        // to outer
-        switch (ann_type) {
-          case kUnroll:
-            split_res[n_split] = tmp_s.unroll(stage_id, split_res[n_split]);
-            break;
-          case kVectorize:
-            split_res[n_split] = tmp_s.vectorize(stage_id, split_res[n_split]);
-            break;
-          case kParallel:
-            split_res[0] = tmp_s.parallel(stage_id, split_res[0]); break;
-          default:
-            break;
-        }
-
-        space_0.push_back(std::move(split_res[0]));
-        space_1.push_back(std::move(split_res[1]));
-        if (n_split >= 2) {
-          space_2.push_back(std::move(split_res[2]));
-          if (n_split == 3) {
-            space_3.push_back(std::move(split_res[3]));
-          }
-        }
-        ct++;
-      } else {
-        if (no_split_at_outer_name_set.count(iter->name)) {
-          space_0.push_back(iter);
-        }
-        if (no_split_at_inner_name_set.count(iter->name)) {
-          if (n_split == 1) {
-            space_1.push_back(iter);
-          } else if (n_split == 2) {
-            space_2.push_back(iter);
-          } else {
-            CHECK_EQ(n_split, 3);
-            space_3.push_back(iter);
-          }
-        }
-      }
-    } else {
-      LOG(FATAL) << "Invalid iter type: " << iter->iter_type;
-    }
-  }
-  if (n_split == 3) {
-    ConcatenateMove(&tmp_order, &space_0, &space_1, &space_2, &space_3);
-  } else if (n_split == 2) {
-    ConcatenateMove(&tmp_order, &space_0, &space_1, &space_2);
-  } else {
-    ConcatenateMove(&tmp_order, &space_0, &space_1);
-  }
-  tmp_s.reorder(stage_id, tmp_order);
-  return tmp_s;
-}
-
-State RandomMutateTileSize(const State& old_state, SplitFactorizationMemo* split_memo,
-                           std::mt19937* random_gen, int max_innermost_split_factor) {
-  State tmp_s = old_state;
-
-  // Extract all SplitStep
-  std::vector<size_t> split_step_ids;
-  for (size_t i = 0; i < tmp_s->transform_steps.size(); ++i) {
-    if (auto ps = tmp_s->transform_steps[i].as<SplitStepNode>()) {
-      if (ps->extent.defined() && ps->extent->IsInstance<IntImmNode>() &&
-          GetIntImm(ps->lengths.back()) <= max_innermost_split_factor) {
-        split_step_ids.push_back(i);
-      }
-    }
-  }
-  if (split_step_ids.empty()) {
-    return State();
-  }
-
-  // Find a SplitStep with extent != 1
-  int retry_ct = 0;
-  int64_t extent = 1;
-  int step_id;
-  const SplitStepNode* ps;
-
-  do {
-    step_id = split_step_ids[(*random_gen)() % split_step_ids.size()];
-    ps = tmp_s->transform_steps[step_id].as<SplitStepNode>();
-    CHECK(ps != nullptr);
-    extent = GetIntImm(ps->extent);
-    retry_ct += 1;
-  } while (retry_ct < static_cast<int>(split_step_ids.size()) << 2 &&
-           (extent == 1 || extent == 0));
-
-  if (extent == 0 || extent == 1) {
-    return State();
-  }
-
-  // Mutate tile size
-  std::vector<int> lengths(ps->lengths.size() + 1, 1);
-  for (int i = 0; i < static_cast<int>(ps->lengths.size()); ++i) {
-    lengths[i + 1] = GetIntImm(ps->lengths[i]);
-  }
-  lengths[0] = extent / ElementProduct(lengths);
-
-  std::vector<int> random_perm;
-  RandomPermutation(lengths.size(), &random_perm, random_gen);
-
-  for (size_t i = 0; i < random_perm.size(); ++i) {
-    size_t src_idx = random_perm[i];
-    int length = lengths[src_idx];
-
-    if (length == 1) {
-      continue;
-    }
-
-    // Divide one factor from lengths[src_idx] and multiply it to lengths[dst_idx]
-    size_t dst_idx = random_perm[(i + 1) % random_perm.size()];
-
-    const std::vector<int>& factors = split_memo->GetFactors(length);
-    CHECK_GE(factors.size(), 1);
-
-    int divide_factor;
-    if (dst_idx == lengths.size() - 1) {
-      // Maintain the restriction of hardware_params.max_innermost_split_factor
-      int max_factor_index = static_cast<int>(factors.size()) - 1;
-      for (; max_factor_index >= 1; max_factor_index--) {
-        if (factors[max_factor_index] * lengths[dst_idx] <= max_innermost_split_factor) {
-          break;
-        }
-      }
-      if (max_factor_index == 0) {
-        // failed on this dst_idx, try next one
-        continue;
-      }
-      divide_factor = factors[1 + (*random_gen)() % (max_factor_index)];
-    } else {
-      divide_factor = factors[1 + (*random_gen)() % (factors.size() - 1)];
-    }
-
-    std::vector<PrimExpr> new_lengths;
-    for (size_t j = 1; j < lengths.size(); ++j) {
-      if (j == src_idx) {
-        new_lengths.emplace_back(lengths[j] / divide_factor);
-      } else if (j == dst_idx) {
-        new_lengths.emplace_back(lengths[j] * divide_factor);
-      } else {
-        new_lengths.emplace_back(lengths[j]);
-      }
-    }
-
-    CHECK_LE(GetIntImm(new_lengths.back()), max_innermost_split_factor);
-
-    auto pstate = tmp_s.CopyOnWrite();
-    pstate->transform_steps[step_id] =
-        SplitStep(ps->stage_id, ps->iter_id, ps->extent, new_lengths, ps->inner_to_outer);
-    return tmp_s;
-  }
-
-  return State();
-}
-
-State RandomMutateMaxUnrollStep(const State& old_state, std::mt19937* random_gen,
-    const std::vector<int>& auto_unroll_configs) {
-  State tmp_s = old_state;
-
-  // Extract all auto_unroll_max_step pragma steps.
-  std::vector<int> annotate_steps;
-  for (size_t i = 0; i < old_state->transform_steps.size(); ++i) {
-    if (auto ps = tmp_s->transform_steps[i].as<PragmaStepNode>()) {
-      if (ps->pragma_type.find("auto_unroll_max_step") != std::string::npos) {
-        annotate_steps.push_back(i);
-      }
-    }
-  }
-  if (annotate_steps.empty()) {
-    return State();
-  }
-
-  // Randomly pick one step.
-  auto step_id = annotate_steps[(*random_gen)() % annotate_steps.size()];
-  auto ps = tmp_s->transform_steps[step_id].as<PragmaStepNode>();
-  auto val = std::to_string(auto_unroll_configs[(*random_gen)() % auto_unroll_configs.size()]);
-
-  auto pstate = tmp_s.CopyOnWrite();
-  pstate->transform_steps[step_id] = PragmaStep(
-      ps->stage_id, ps->iter_id, std::string("auto_unroll_max_step") + "$" + val);
-  return tmp_s;
-}
-
-void PruneUndefined(std::vector<State>* states) {
-  size_t pt = 0;
-  for (size_t i = 0; i < states->size(); ++i) {
-    if (!(*states)[i].defined()) {
-      continue;
-    }
-    (*states)[pt++] = std::move((*states)[i]);
-  }
-
-  if (pt == 0) {
-    LOG(FATAL) << "All states are undefined.";
-  } else {
-    states->resize(pt);
-  }
-}
-
-State CrossOverState(const State& p1, const State& p2) { return State(); }
-
-}  // namespace ansor
-}  // namespace tvm
-
diff --git a/src/ansor/search_policy/utils.h b/src/ansor/search_policy/utils.h
deleted file mode 100644
index 5f15397e7e905..0000000000000
--- a/src/ansor/search_policy/utils.h
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file ansor/search_policy/utils.cc
- * \brief Common utilities for search policies
- */
-
-#ifndef TVM_ANSOR_SEARCH_POLICY_UTILS_H_
-#define TVM_ANSOR_SEARCH_POLICY_UTILS_H_
-
-#include <tvm/te/operation.h>
-#include <set>
-#include <string>
-#include <unordered_set>
-#include <utility>
-#include <vector>
-#include "../cost_model/cost_model.h"
-#include "../utils.h"
-#include "../loop_state.h"
-#include "../transform_step.h"
-#include "search_policy.h"
-
-namespace tvm {
-namespace ansor {
-
-// Get an integer from a tvm str Map
-inline int GetIntParam(const Map<String, ObjectRef>& attr_dict,
-                       const std::string& key) {
-  CHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict;
-  auto pint = attr_dict[key].as<IntImmNode>();
-  CHECK(pint != nullptr);
-  return pint->value;
-}
-
-// Get a double from a tvm str Map
-inline double GetDoubleParam(const Map<String, ObjectRef>& attr_dict,
-                             const std::string& key) {
-  CHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict;
-  auto pdouble = attr_dict[key].as<FloatImmNode>();
-  CHECK(pdouble != nullptr);
-  return pdouble->value;
-}
-
-// Get a string from a tvm str Map
-inline std::string GetStringParam(const Map<String, ObjectRef>& attr_dict,
-                                  const std::string& key) {
-  CHECK_GT(attr_dict.count(key), 0)
-      << "Cannot find key: \"" << key << "\" in " << attr_dict;
-  const auto& target = attr_dict[key];
-  if (auto pstr = target.as<StringImmNode>()) {
-    return pstr->value;
-  }
-  auto pstr = target.as<StringObj>();
-  CHECK(pstr != nullptr);
-  return pstr->data;
-}
-
-// Get a iterator name set from a tvm str Map
-inline std::set<std::string> GetIterNameSetParam(const Map<String, ObjectRef>& attr_dict,
-                                                 const std::string& key) {
-  std::set<std::string> ret;
-  CHECK_GT(attr_dict.count(key), 0) << "Cannot find key: \"" << key << "\" in " << attr_dict;
-  auto names = attr_dict[key].as<ArrayNode>();
-  CHECK(names != nullptr);
-  for (auto name = names->begin(); name != names->end(); name++) {
-    ret.insert(name->as<StringImmNode>()->value);
-  }
-  return ret;
-}
-
-// Convert operation to stage id
-inline int OperationToStage(const te::Operation& op, const State& state) {
-  for (size_t i = 0; i < state->stages.size(); ++i) {
-    if (op == state->stages[i]->op) {
-      return i;
-    }
-  }
-  LOG(FATAL) << "Cannot find op: " << op;
-  return -1;
-}
-
-// Return the extent of an iterator
-inline int64_t GetExtent(const Iterator& it) {
-  if (it->range.defined()) {
-    if (auto pint = it->range->extent.as<IntImmNode>()) {
-      return pint->value;
-    }
-  }
-  return -1;
-}
-
-// Return whether an op is strict inlineable
-inline bool IsStrictInlineable(const SearchTask& task,
-    const State& state, const te::Operation& op) {
-  if (state->task_dag.defined()) {
-    return state->task_dag->access_analyzer.IsStrictInlineable(op);
-  } else {
-    return task->compute_dag->access_analyzer.IsStrictInlineable(op);
-  }
-}
-
-// Return whether an op is an output op
-inline bool IsOutputOp(const SearchTask& task, const State& state, const te::Operation& op) {
-  if (state->task_dag.defined()) {
-    return state->task_dag->access_analyzer.IsOutput(op);
-  } else {
-    return task->compute_dag->access_analyzer.IsOutput(op);
-  }
-}
-
-// Return whether the stage has an attribute flag
-inline bool HasAttrsFlag(const State& state, int stage_id, const char* target) {
-  if (state->stages[stage_id]->op->attrs.count(target)) {
-    return GetStringParam(state->stages[stage_id]->op->attrs, target) == "True";
-  }
-  return false;
-}
-
-// Return whether the stage has reduce iterators
-inline bool HasReduceIter(const Stage& stage) {
-  for (const auto& iter : stage->iters) {
-    if (iter->iter_type != kSpace) {
-      return true;
-    }
-  }
-  return false;
-}
-
-// Return whether the stage has specific annotated iterators
-inline bool HasAnnotationIter(const Stage& stage, IteratorAnnotation type) {
-  for (const auto& iter : stage->iters) {
-    if (iter->annotation == type) {
-      return true;
-    }
-  }
-  return false;
-}
-
-// Return whether an op needs multi level tiling
-inline bool NeedsMultilevelTiling(const SearchTask& task,
-    const State& state, const te::Operation& op) {
-  if (state->task_dag.defined()) {
-    return state->task_dag->access_analyzer.NeedsMultiLevelTiling(op);
-  } else {
-    return task->compute_dag->access_analyzer.NeedsMultiLevelTiling(op);
-  }
-}
-
-// Get all consumers for an op. This will take inline into consideration
-inline void GetConsumers(const SearchTask& task, const State& state, const te::Operation& op,
-    std::unordered_set<te::Operation, ObjectHash, ObjectEqual>* consumers) {
-  if (state->task_dag.defined()) {
-    state->task_dag->access_analyzer.GetConsumers(state, op, consumers);
-  } else {
-    task->compute_dag->access_analyzer.GetConsumers(state, op, consumers);
-  }
-}
-
-inline void GetProducers(const SearchTask& task, const State& state, const te::Operation& op,
-    std::unordered_set<te::Operation, ObjectHash, ObjectEqual>* producers) {
-  if (state->task_dag.defined()) {
-    state->task_dag->access_analyzer.GetProducers(state, op, producers);
-  } else {
-    task->compute_dag->access_analyzer.GetProducers(state, op, producers);
-  }
-}
-
-// Return whether two ops are elementwise-matched
-inline bool ElementwiseMatch(const SearchTask& task, const State& state, const te::Operation& op,
-                             const te::Operation& target_op) {
-  if (state->task_dag.defined()) {
-    return state->task_dag->access_analyzer.ElementWiseMatch(op, target_op);
-  } else {
-    return task->compute_dag->access_analyzer.ElementWiseMatch(op, target_op);
-  }
-}
-
-// Return whether the stage has only one consumer and they are elementwise-matched
-inline bool HasSingleElementwiseMatchedConsumer(const SearchTask& task,
-    const State& state, const Stage& stage, int* target_stage_id) {
-  std::unordered_set<te::Operation, ObjectHash, ObjectEqual> consumers;
-
-  GetConsumers(task, state, stage->op, &consumers);
-  if (consumers.size() == 1) {
-    *target_stage_id = OperationToStage(*consumers.begin(), state);
-    const Stage& target_stage = state->stages[*target_stage_id];
-    if (ElementwiseMatch(task, state, stage->op, target_stage->op) &&
-        (!(HasReduceIter(stage) && HasReduceIter(target_stage)))) {
-      return true;
-    }
-  }
-  return false;
-}
-
-// Return whether this stage needs rfactor
-inline bool NeedsRfactor(const SearchTask& task, const State& state, const te::Operation& op) {
-  if (op->IsInstance<te::ComputeOpNode>()) {
-    // Compute the product of lengths of all space iters and all reduce iters
-    int64_t cum_space_len = 1, cum_reduce_len = 1;
-    int stage_id = OperationToStage(op, state);
-    for (const auto& iter : state->stages[stage_id]->iters) {
-      if (iter->iter_type == kSpace) {
-        cum_space_len *= GetExtent(iter);
-      } else if (iter->iter_type == kReduce) {
-        cum_reduce_len *= GetExtent(iter);
-      }
-    }
-
-    if (NeedsMultilevelTiling(task, state, op)) {
-      // Do not use rfactor if we have enough parallelism on space iters
-      if (cum_space_len > cum_reduce_len ||
-          cum_space_len > task->hardware_params->num_cores * 16) {
-        return false;
-      } else {
-        return true;
-      }
-    } else if (cum_reduce_len > 1) {
-      // Always try rfactor for reduction ops
-      return true;
-    }
-  }
-
-  return false;
-}
-
-// Return whether the state did cache_write for stage_id
-inline bool HasCacheWriteStage(const State& s, int stage_id) {
-  for (int i = static_cast<int>(s->transform_steps.size()) - 1; i >= 0; --i) {
-    if (auto ps = s->transform_steps[i].as<CacheWriteStepNode>()) {
-      if (stage_id > ps->stage_id) {
-        stage_id--;
-      } else if (stage_id == ps->stage_id) {
-        return true;
-      }
-    } else if (auto ps = s->transform_steps[i].as<CacheReadStepNode>()) {
-      if (stage_id > ps->stage_id) {
-        stage_id--;
-      }
-    } else if (auto ps = s->transform_steps[i].as<RfactorStepNode>()) {
-      if (stage_id > ps->stage_id) {
-        stage_id--;
-      }
-    }
-  }
-  return false;
-}
-
-// Return whether the state did cache_read for stage_id
-inline bool HasCacheReadStage(const State& s, int stage_id) {
-  for (int i = static_cast<int>(s->transform_steps.size()) - 1; i >= 0; --i) {
-    if (auto ps = s->transform_steps[i].as<CacheWriteStepNode>()) {
-      if (stage_id > ps->stage_id) {
-        stage_id--;
-      }
-    } else if (auto ps = s->transform_steps[i].as<CacheReadStepNode>()) {
-      if (stage_id > ps->stage_id) {
-        stage_id--;
-      } else if (stage_id == ps->stage_id) {
-        return true;
-      }
-    } else if (auto ps = s->transform_steps[i].as<RfactorStepNode>()) {
-      if (stage_id > ps->stage_id) {
-        stage_id--;
-      }
-    }
-  }
-  return false;
-}
-
-// Get all split step on spatial iterators
-void GetSpaceSplitStepIds(const State& s, int stage_id, std::vector<int>* spatial_split_step_ids);
-
-// Return whether the state did split/follow_split/follow_fused_split in stage_id
-inline bool HasSplitStep(const State& s, int stage_id) {
-  for (int i = static_cast<int>(s->transform_steps.size()) - 1; i >= 0; --i) {
-    if (s->transform_steps[i]->IsInstance<CacheWriteStepNode>() ||
-        s->transform_steps[i]->IsInstance<CacheReadStepNode>() ||
-        s->transform_steps[i]->IsInstance<RfactorStepNode>()) {
-      if (stage_id > s->transform_steps[i]->stage_id) {
-        stage_id--;
-      }
-    } else if (s->transform_steps[i]->IsInstance<SplitStepNode>() ||
-        s->transform_steps[i]->IsInstance<FollowSplitStepNode>() ||
-        s->transform_steps[i]->IsInstance<FollowFusedSplitStepNode>()) {
-      if (stage_id == s->transform_steps[i]->stage_id) {
-        return true;
-      }
-    }
-  }
-  return false;
-}
-
-// Return whether the stage has been tiled already
-inline bool IsTiled(const Stage& stage) {
-  auto op = stage->op.as<te::ComputeOpNode>();
-  CHECK(op != nullptr);
-  return stage->iters.size() != op->axis.size() + op->reduce_axis.size();
-}
-
-// Query axes that should not be splitted according to the attribute from tvm.compute
-inline std::pair<std::set<std::string>, std::set<std::string> > QueryNoSplitAxis(
-    const Stage& stage) {
-  std::pair<std::set<std::string>, std::set<std::string> > ret;
-  if (stage->op->attrs.count(SearchPolicyNode::no_split_at_inner_key)) {
-    ret.first = GetIterNameSetParam(stage->op->attrs, SearchPolicyNode::no_split_at_inner_key);
-  }
-  if (stage->op->attrs.count(SearchPolicyNode::no_split_at_outer_key)) {
-    ret.second = GetIterNameSetParam(stage->op->attrs, SearchPolicyNode::no_split_at_outer_key);
-  }
-  return ret;
-}
-
-// Query axes that last split is one
-inline std::set<std::string> QueryLastSplitIsOneAxis(const Stage& stage) {
-  std::set<std::string> ret;
-  if (stage->op->attrs.count(SearchPolicyNode::last_split_is_one_key)) {
-    ret = GetIterNameSetParam(stage->op->attrs, SearchPolicyNode::last_split_is_one_key);
-  }
-  return ret;
-}
-
-// Extract primitive iterators from a nested fused or splitted iterator's name
-inline void ExtractOriginalIterators(const std::string& name, std::set<std::string>* rets) {
-  size_t last_pos = 0;
-  for (size_t i = 0; i < name.size(); ++i) {
-    if (name[i] == '@' || name[i] == '.') {  // '@' for fuse and '.' for split
-      if (!isdigit(name[last_pos]) && name[last_pos] != '@' && name[last_pos] != '.') {
-        rets->insert(name.substr(last_pos, i - last_pos));
-      }
-      last_pos = i + 1;
-    }
-  }
-
-  if (last_pos < name.size() && !isdigit(name[last_pos]) &&
-      name[last_pos] != '@' && name[last_pos] != '.') {
-    rets->insert(name.substr(last_pos, name.size() - last_pos));
-  }
-}
-
-// Get the last space iterator in the outer most tile
-inline const Iterator& GetLastSpaceIteratorInOutermostTile(const Stage& stage) {
-  auto pop = stage->op.as<te::ComputeOpNode>();
-  CHECK(pop != nullptr);
-  std::set<std::string> original_names;
-
-  for (const auto& iter : stage->iters) {
-    ExtractOriginalIterators(iter->name, &original_names);
-    if (original_names.size() == pop->axis.size()) {
-      return iter;
-    }
-  }
-
-  LOG(FATAL) << "Cannot find the iterator.";
-  return stage->iters[0];
-}
-
-// Get the last reduce iterator in the outermost reduce tile
-inline const Iterator& GetLastReduceIteratorInOutermostReduceTile(const Stage& stage) {
-  auto pop = stage->op.as<te::ComputeOpNode>();
-  CHECK(pop != nullptr);
-  std::set<std::string> original_names;
-
-  auto no_split_name_pair = QueryNoSplitAxis(stage);
-  std::set<std::string> no_split_at_inner_name_set = no_split_name_pair.first;
-  size_t axis_size = 0;
-  for (const auto axis : pop->axis) {
-    if (!no_split_at_inner_name_set.count(axis->var->name_hint)) {
-      axis_size++;
-    }
-  }
-  size_t reduce_axis_size = 0;
-  for (const auto axis : pop->reduce_axis) {
-    if (!no_split_at_inner_name_set.count(axis->var->name_hint)) {
-      reduce_axis_size++;
-    }
-  }
-
-  if (reduce_axis_size) {
-    for (const auto& iter : stage->iters) {
-      ExtractOriginalIterators(iter->name, &original_names);
-      if (original_names.size() == axis_size + reduce_axis_size) {
-        return iter;
-      }
-    }
-  } else {
-    for (size_t i = 0; i < stage->iters.size(); i++) {
-      ExtractOriginalIterators(stage->iters[i]->name, &original_names);
-      if (original_names.size() == axis_size + 1) {
-        return stage->iters[i-1];
-      }
-    }
-  }
-
-  LOG(FATAL) << "Cannot find the iterator.";
-  return stage->iters[0];
-}
-
-// Random sample states
-inline void RandomSampleStates(const std::vector<State>& in_states, std::mt19937* random_gen,
-        size_t out_size, std::vector<State>* out_states) {
-  out_states->clear();
-  for (size_t i = 0; i < out_size; i++) {
-    out_states->push_back(in_states[(*random_gen)() % in_states.size()]);
-  }
-}
-
-// Random choose an index according to a prefix sum probability
-inline int RandomChoose(const std::vector<double>& prefix_sum_probs, std::mt19937* random_gen) {
-  std::uniform_real_distribution<> dis(0.0, 1.0);
-  double x = dis(*random_gen);
-
-  CHECK(!prefix_sum_probs.empty());
-
-  return std::lower_bound(prefix_sum_probs.begin(), prefix_sum_probs.end(), x) -
-      prefix_sum_probs.begin();
-}
-
-// Print all states
-inline void PrintAllStates(const std::vector<State>& states) {
-  for (size_t i = 0; i < states.size(); ++i) {
-    std::cerr << i << std::endl;
-    std::cerr << states[i];
-    std::cerr << "==============================================" << std::endl;
-  }
-}
-
-// Apply multi-level tiling structure according to a string format,
-// where "S" stands a space level, "R" stands for a reudciton level.
-// For example, if the format is "SSRSRS", the we will
-// use tiling structure:  space_L0, space_L1, reduce_L0, space_L2, reduce_L1, space_L3
-// For example, if apply "SSRSRS" to matrix multiplication,
-// we have space iterators i and j, reduce iterator k.
-// Then the tiling structure is : i0, j0, i1, j1, k0, i2, j2, k1, i3, j3
-State DoMultiLevelTiling(const State& state, int stage_id, const std::string& format,
-                         std::vector<int>* spatial_split_step_ids);
-
-// Apply tiling structure: space, space
-// But use tile sizes from other SplitStep
-State FollowTiling(const State& state, int stage_id,
-                   const std::vector<int>& split_step_ids, int n_split);
-
-// Randomly mutate the tile size of one SplitStep
-State RandomMutateTileSize(const State& old_state, SplitFactorizationMemo* split_memo,
-                           std::mt19937* random_gen, int max_innermost_split_factor);
-
-// Randomly mutate the value of one auto_unroll_max_step PragmaStep
-State RandomMutateMaxUnrollStep(const State& old_state, std::mt19937* random_gen,
-                                const std::vector<int>& auto_unroll_configs);
-
-// GA: Crossover two states
-State CrossOverState(const State& p1, const State& p2);
-
-// Prune undefined states.
-void PruneUndefined(std::vector<State>* states);
-
-}  // namespace ansor
-}  // namespace tvm
-
-#endif  // TVM_ANSOR_SEARCH_POLICY_UTILS_H_
diff --git a/src/relay/analysis/type_solver.cc b/src/relay/analysis/type_solver.cc
index 5b063eca43375..a192002825e65 100644
--- a/src/relay/analysis/type_solver.cc
+++ b/src/relay/analysis/type_solver.cc
@@ -219,7 +219,6 @@ class TypeSolver::Unifier : public TypeFunctor<Type(const Type&, const Type&)> {
       return Type(nullptr);
     }
 
-    tt1 = tt2;
     tvm::Array<IndexExpr> shape;
     if (tt1->shape.size() != tt2->shape.size()) {
       this->solver_->ReportError(ErrorBuilder() << "tensor type `" << PrettyPrint(tt1) << "` has "
diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc
index a8cd1d3c24626..b6cd9e2c6b772 100644
--- a/src/relay/backend/build_module.cc
+++ b/src/relay/backend/build_module.cc
@@ -335,18 +335,6 @@ class RelayBuildModule : public runtime::ModuleNode {
 
     // Fuse the operations if it is needed.
     relay_module = transform::FuseOps()(relay_module);
-
-    if (targets.size() == 1) {
-      pass_seqs.push_back(transform::KernelLayoutTransform());
-      pass_seqs.push_back(transform::DeFuseOps());
-      pass_seqs.push_back(transform::FoldConstant());
-      transform::Pass seq = transform::Sequential(pass_seqs);
-      const auto& it = targets.begin();
-      With<Target> tctx((*it).second);
-      relay_module = seq(relay_module);
-      relay_module = transform::FuseOps()(relay_module);
-    }
-
     relay_module = transform::InferType()(relay_module);
     // Inline the functions that have been lifted by the module scope.
     //
diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc
index fde880b10f1d0..2aae8546248fa 100644
--- a/src/relay/backend/compile_engine.cc
+++ b/src/relay/backend/compile_engine.cc
@@ -68,11 +68,6 @@ CCacheKey::CCacheKey(Function source_func, Target target) {
   auto n = make_object<CCacheKeyNode>();
   n->source_func = std::move(source_func);
   n->target = std::move(target);
-  n->disabled = false;
-  char* envar = getenv("TVM_RELAY_DISABLE_BUILD_CACHE");
-  if (envar != nullptr && strcmp(envar, "true") == 0) {
-    n->disabled = true;
-  }
   data_ = std::move(n);
 }
 
diff --git a/src/relay/backend/compile_engine.h b/src/relay/backend/compile_engine.h
index b290462a4b22e..a5f3f6359f893 100644
--- a/src/relay/backend/compile_engine.h
+++ b/src/relay/backend/compile_engine.h
@@ -115,8 +115,6 @@ class CCacheKeyNode : public Object {
   /*! \brief The hardware target.*/
   Target target;
 
-  bool disabled;
-
   void VisitAttrs(tvm::AttrVisitor* v) {
     v->Visit("source_func", &source_func);
     v->Visit("target", &target);
@@ -261,7 +259,6 @@ inline size_t CCacheKeyNode::Hash() const {
 }
 
 inline bool CCacheKeyNode::Equal(const CCacheKeyNode* other) const {
-  if (disabled) return false;
   if (Hash() != other->Hash()) return false;
   return this->target->str() == other->target->str() &&
          tvm::StructuralEqual()(this->source_func, other->source_func);
diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc
index 18ace14a0b75e..ee5e291e3d532 100644
--- a/src/relay/op/tensor/transform.cc
+++ b/src/relay/op/tensor/transform.cc
@@ -2455,62 +2455,6 @@ the input array by output[n, c, h, w, C] = data[n, C*16+c, h, w]
     .set_support_level(5)
     .set_attr<FTVMCompute>("FTVMCompute", LayoutTransformCompute);
 
-// relay.kernel_layout_transform
-TVM_REGISTER_NODE_TYPE(KernelLayoutTransformAttrs);
-
-Array<te::Tensor> KernelLayoutTransformCompute(const Attrs& attrs,
-                                               const Array<te::Tensor>& inputs,
-                                               const Type& out_type) {
-  //const Target& target) {
-  const auto* param = attrs.as<KernelLayoutTransformAttrs>();
-  CHECK(param != nullptr);
-  return Array<te::Tensor>{
-      topi::kernel_layout_transform(inputs[0], param->src_layout, param->dst_layout)
-  };
-}
-
-bool KernelLayoutTransformRel(const Array<Type>& types,
-                              int num_inputs,
-                              const Attrs& attrs,
-                              const TypeReporter& reporter) {
-
-  const auto* data = types[0].as<TensorTypeNode>();
-  CHECK(data != nullptr);
-  const KernelLayoutTransformAttrs* params = attrs.as<KernelLayoutTransformAttrs>();
-
-  Array<IndexExpr> dst_shape;
-  std::vector<std::string> dst_axes;
-
-  topi::parse_kernel_layout(params->dst_layout, &dst_shape, &dst_axes);
-
-  reporter->Assign(types[1], TensorType(dst_shape, data->dtype));
-  return true;
-}
-
-Expr MakeKernelLayoutTransform(Expr data,
-                               String src_layout,
-                               String dst_layout) {
-  auto attrs = make_object<KernelLayoutTransformAttrs>();
-  attrs->src_layout = std::move(src_layout);
-  attrs->dst_layout = std::move(dst_layout);
-  static const Op& op = Op::Get("kernel_layout_transform");
-  return Call(op, {data}, Attrs(attrs), {});
-}
-
-TVM_REGISTER_GLOBAL("relay.op._make.kernel_layout_transform")
-.set_body_typed(MakeKernelLayoutTransform);
-
-RELAY_REGISTER_OP("kernel_layout_transform")
-    .describe(R"code(Transform the input kernel layout.
-)code" TVM_ADD_FILELINE)
-    .set_attrs_type<KernelLayoutTransformAttrs>()
-    .set_num_inputs(1)
-    .add_argument("data", "Tensor", "The input tensor.")
-    .add_type_rel("kernel_layout_transform", KernelLayoutTransformRel)
-    .set_support_level(5)
-    .set_attr<FTVMCompute>("FTVMCompute", KernelLayoutTransformCompute);
-
-
 /* relay._contrib_reverse_reshape */
 Expr MakeReverseReshape(Expr data, Array<Integer> newshape) {
   auto attrs = make_object<ReshapeAttrs>();
diff --git a/src/relay/transforms/defuse_ops.cc b/src/relay/transforms/defuse_ops.cc
deleted file mode 100644
index f7c9037df6875..0000000000000
--- a/src/relay/transforms/defuse_ops.cc
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <tvm/relay/expr_functor.h>
-#include <tvm/relay/analysis.h>
-#include <tvm/relay/transform.h>
-#include <tvm/relay/op_attr_types.h>
-#include <tvm/relay/attrs/transform.h>
-#include <tvm/relay/transform.h>
-#include <tvm/te/operation.h>
-#include <tuple>
-#include <vector>
-#include <functional>
-#include <string>
-#include <utility>
-#include <unordered_map>
-
-#include "pattern_util.h"
-
-namespace tvm {
-namespace relay {
-
-class DefuseOpsMutator : public ExprMutator {
- public:
-
-  class FuncBodyMutator : public ExprMutator {
-   public:
-    Array<Expr> args_;
-
-    FuncBodyMutator(const Array<Expr>& args) : ExprMutator() {
-      args_ = args;
-    }
-
-    Expr VisitExpr_(const VarNode* n) {
-      const std::string& name = n->name_hint();
-      CHECK_EQ(name[0], 'p');
-      std::string id_str = name.substr(1);
-      int id = atoi(id_str.c_str());
-      CHECK(id >= 0 && size_t(id) < args_.size());
-      return args_[id];
-    }
-  };
-
-  Expr VisitExpr_(const CallNode* n) {
-    auto new_n = ExprMutator::VisitExpr_(n);
-
-    const auto* call = new_n.as<CallNode>();
-    if (call) {
-      const auto* func = call->op.as<FunctionNode>();
-      if (func) {
-        const auto& func_call = func->body.as<CallNode>();
-        if (func_call) {
-          return FuncBodyMutator(call->args).Mutate(func->body);
-        }
-      }
-    }
-    return new_n;
-  }
-};
-
-Expr DeFuseOps(const Expr& expr) {
-  return DefuseOpsMutator().Mutate(expr);
-}
-
-namespace transform {
-
-Pass DeFuseOps() {
-  runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
-    [=](Function f, IRModule m, PassContext pc) {
-      return Downcast<Function>(relay::DeFuseOps(f));
-  };
-  return CreateFunctionPass(pass_func, 3, "DeFuseOps",
-                            {"InferType"});
-}
-
-TVM_REGISTER_GLOBAL("relay._transform.DeFuseOps")
-.set_body_typed(DeFuseOps);
-
-}  // namespace transform
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/relay/transforms/kernel_layout_transform.cc b/src/relay/transforms/kernel_layout_transform.cc
deleted file mode 100644
index 681785c8123c9..0000000000000
--- a/src/relay/transforms/kernel_layout_transform.cc
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-#include <tvm/relay/expr_functor.h>
-#include <tvm/relay/transform.h>
-#include <tvm/relay/op_attr_types.h>
-#include <tvm/relay/attrs/transform.h>
-#include <tvm/te/operation.h>
-#include <functional>
-#include "kernel_layout_transform.h"
-
-namespace tvm {
-namespace relay {
-
-// Todo: do not use global variables
-std::deque<std::string> KernelLayoutVisitor::global_ori_layouts_queue;
-std::deque<std::string> KernelLayoutVisitor::global_new_layouts_queue;
-
-Expr KernelLayoutTransform(const Expr& expr) {
-  KernelLayoutVisitor visitor;
-
-  // Do a pre-order DFS to gather the optimal kernel layouts for all conv2d nodes.
-  // These layouts were written to global static variables in python function `prepare_layout_rewrite`
-  visitor.VisitExpr(expr);
-
-  // Do a post-order DSF to mutate layout for all conv2d nodes
-  return KernelLayoutTransformer(&visitor).Mutate(expr);
-}
-
-namespace transform {
-
-Pass KernelLayoutTransform() {
-  runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
-    [=](Function f, IRModule m, PassContext pc) {
-      return Downcast<Function>(relay::KernelLayoutTransform(f));
-  };
-  return CreateFunctionPass(pass_func, 3, "KernelLayoutTransform",
-                            {"InferType"});
-}
-
-TVM_REGISTER_GLOBAL("relay._transform.KernelLayoutTransform")
-.set_body_typed(KernelLayoutTransform);
-
-}  // namespace transform
-
-}  // namespace relay
-}  // namespace tvm
diff --git a/src/relay/transforms/kernel_layout_transform.h b/src/relay/transforms/kernel_layout_transform.h
deleted file mode 100644
index c82a96b306122..0000000000000
--- a/src/relay/transforms/kernel_layout_transform.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#include <tvm/relay/expr_functor.h>
-#include <tvm/relay/transform.h>
-#include <tuple>
-#include <unordered_map>
-
-#include "pattern_util.h"
-
-#include "../../ansor/compute_dag.h"
-
-namespace tvm {
-namespace relay {
-
-/*! \brief A visitor to gather the optimal kernel layout for all conv2d nodes. */
-class KernelLayoutVisitor : public ExprVisitor {
- public:
-  void VisitExpr_(const CallNode *n) {
-    if (n && n->op.as<OpNode>() &&
-        (std::find(op_white_lists.begin(), op_white_lists.end(), n->op.as<OpNode>()->name) !=
-         op_white_lists.end()) && n->args[1]->type_as<TensorTypeNode>()->shape[3].as<IntImmNode>()->value > 1 &&
-        !global_ori_layouts_queue.empty() && !global_new_layouts_queue.empty()) {
-      ori_layouts_map[n] = global_ori_layouts_queue.front();
-      new_layouts_map[n] = global_new_layouts_queue.front();
-      // std::cout << "ori_layout " << global_ori_layouts_queue.front()
-      //     << " Filter_shape " << n->args[1]->type_as<TensorTypeNode>()->shape << std::endl;
-      global_ori_layouts_queue.pop_front();
-      global_new_layouts_queue.pop_front();
-    }
-    ExprVisitor::VisitExpr_(n);
-  }
-
-  std::unordered_map<const CallNode *, std::string> ori_layouts_map;
-  std::unordered_map<const CallNode *, std::string> new_layouts_map;
-  std::vector<std::string> op_white_lists {"nn.contrib_conv2d_winograd_without_weight_transform",
-                                           "nn.conv2d", "nn.conv3d"};
-
-  static std::deque<std::string> global_ori_layouts_queue;
-  static std::deque<std::string> global_new_layouts_queue;
-};
-
-
-/*! \brief A mutator to rewrite kernel layout for all conv2d nodes */
-class KernelLayoutTransformer : public ExprMutator {
- public:
-  KernelLayoutTransformer(KernelLayoutVisitor* visitor): ExprMutator(), visitor_(visitor) {}
-
-  Expr VisitExpr_(const CallNode* n) {
-    auto new_n = ExprMutator::VisitExpr_(n);
-
-    const auto* call = new_n.as<CallNode>();
-    std::vector<std::string> op_white_lists {"nn.contrib_conv2d_winograd_without_weight_transform",
-                                             "nn.conv2d", "nn.conv3d"};
-    if (call && call->op.as<OpNode>() &&
-        (std::find(op_white_lists.begin(), op_white_lists.end(), n->op.as<OpNode>()->name) !=
-         op_white_lists.end() && n->args[1]->type_as<TensorTypeNode>()->shape[3].as<IntImmNode>()->value > 1)) {
-      auto ori_layout_iter = visitor_->ori_layouts_map.find(n);
-      auto new_layout_iter = visitor_->new_layouts_map.find(n);
-      if (ori_layout_iter != visitor_->ori_layouts_map.end() &&
-          new_layout_iter != visitor_->new_layouts_map.end()) {
-        const std::string& ori_layout = ori_layout_iter->second;
-        const std::string& new_layout = new_layout_iter->second;
-        Expr updated_kernel = MakeKernelLayoutTransform(call->args[1], ori_layout, new_layout);
-        Array<Expr> updated_args = {call->args[0], updated_kernel};
-        new_n = Call(call->op, updated_args,
-                               call->attrs);
-      }
-    }
-    return new_n;
-  }
-
- private:
-  KernelLayoutVisitor* visitor_;
-};
-
-
-} // namespace relay
-} // namespace tvm
diff --git a/src/relay/transforms/pattern_util.h b/src/relay/transforms/pattern_util.h
index a9d3b5168e474..7518eb9ac81a1 100644
--- a/src/relay/transforms/pattern_util.h
+++ b/src/relay/transforms/pattern_util.h
@@ -685,8 +685,6 @@ Expr MakeExpandDims(Expr data, int axis, int num_newaxis);
 
 Expr MakeLayoutTransform(Expr data, String src_layout, String dst_layout);
 
-Expr MakeKernelLayoutTransform(Expr data, String src_layout, String dst_layout);
-
 Expr StopFusion(Expr data);
 
 Expr CastHint(Expr data, DataType dtype);
diff --git a/tests/python/unittest/test_ansor_relay_integration.py b/tests/python/unittest/test_ansor_relay_integration.py
deleted file mode 100644
index 1ad507e2f3715..0000000000000
--- a/tests/python/unittest/test_ansor_relay_integration.py
+++ /dev/null
@@ -1,114 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-""" Test Relay Integration """
-
-import tempfile
-import numpy as np
-
-import tvm
-from tvm import ansor, relay
-import tvm.contrib.graph_runtime as runtime
-from tvm.relay.testing import dqn
-
-def test_tune_dense_graph():
-    def dense_graph(N, dtype="float32"):
-        ori_data = relay.var("data", shape=(N, N), dtype=dtype)
-        weight = relay.var("weight", shape=(N, N), dtype=dtype)
-        data = relay.multiply(ori_data, relay.const(2, dtype=dtype))
-        dense = relay.nn.dense(data, weight, out_dtype=dtype)
-        dense = relay.add(dense, weight)
-        dense = relay.nn.dense(dense, weight, out_dtype=dtype)
-        return ori_data, weight, dense
-
-    N = 128
-    data, weight, dense = dense_graph(N)
-    mod = relay.Function([data, weight], dense)
-    mod = tvm.IRModule.from_expr(mod)
-
-    ctx = tvm.context("llvm")
-    target = tvm.target.create("llvm")
-    d = tvm.nd.array(np.random.uniform(size=(N, N)).astype(data.type_annotation.dtype), ctx)
-    w = tvm.nd.array(np.random.uniform(size=(N, N)).astype(weight.type_annotation.dtype), ctx)
-    wkl_keys, wkl_weights = ansor.extract_from_program(mod, {}, target=target)
-
-    assert len(wkl_keys) == 2
-    assert len(wkl_weights) == 2
-
-    tasks = []
-    for wkl_key in wkl_keys:
-        dag = ansor.workload_key_to_dag(wkl_key)
-        tasks.append(ansor.SearchTask(dag, wkl_key, target))
-
-    tuner = ansor.SimpleTaskScheduler(tasks)
-    measure_ctx = ansor.LocalRPCMeasureContext()
-    with tempfile.NamedTemporaryFile() as fp:
-        tuner.tune(ansor.TuneOption(n_trials=2, runner=measure_ctx.runner,
-                                    measure_callbacks=[ansor.LogToFile(fp.name)]))
-        with ansor.apply_history_best(fp.name):
-            with tvm.transform.PassContext(opt_level=3,  disabled_pass={"AlterOpLayout"}):
-                graph, lib, opt_params = relay.build_module.build(
-                    mod, target=target)
-
-                m = runtime.create(graph, lib, ctx)
-                m.set_input('data', d)
-                m.set_input('weight', w)
-                m.run()
-                res = m.get_output(0)
-
-    del measure_ctx
-
-    d = d.asnumpy()
-    d = d * 2
-    w = w.asnumpy()
-    d = np.dot(d, np.transpose(w))
-    d = d + w
-    d = np.dot(d, np.transpose(w))
-
-    tvm.testing.assert_allclose(res.asnumpy(), d, rtol=1e-5)
-
-
-def test_tune_dqn():
-    mod, params = dqn.get_workload(1, image_shape=(84, 84, 4), layout='NHWC')
-    target = tvm.target.create('llvm')
-
-    wkl_keys, wkl_weights = ansor.extract_from_program(mod, params, target)
-
-    tasks = []
-    for wkl_key in wkl_keys:
-        dag = ansor.workload_key_to_dag(wkl_key)
-        tasks.append(ansor.SearchTask(dag, wkl_key, target))
-
-    assert len(tasks) == 5
-
-    tuner = ansor.SimpleTaskScheduler(tasks)
-    measure_ctx = ansor.LocalRPCMeasureContext()
-    with tempfile.NamedTemporaryFile() as fp:
-        tuner.tune(ansor.TuneOption(n_trials=len(tasks), runner=measure_ctx.runner,
-                                    measure_callbacks=[ansor.LogToFile('tmp.json')]),
-                   search_policy='sketch.random')
-        with ansor.apply_history_best('tmp.json'):
-            ansor.prepare_layout_rewrite(mod, params, target)
-            with tvm.transform.PassContext(opt_level=3,  disabled_pass={"AlterOpLayout"}):
-                graph, lib, opt_params = relay.build_module.build(mod, target=target)
-            ansor.finish_layout_rewrite()
-
-    del measure_ctx
-
-if __name__ == "__main__":
-    test_tune_dense_graph()
-    test_tune_dqn()
-
diff --git a/tests/python/unittest/test_ansor_search_policy.py b/tests/python/unittest/test_ansor_search_policy.py
deleted file mode 100644
index deff561a4547d..0000000000000
--- a/tests/python/unittest/test_ansor_search_policy.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test search policy"""
-
-import random
-import numpy as np
-import tempfile
-import threading
-
-import tvm
-from tvm import ansor
-
-from test_ansor_common import matmul_ansor_test
-
-def search_common(target="llvm", seed=random.randint(1, 1 << 30), runner='local',
-                  cost_model=ansor.RandomModel(), n_trials=2, params=None,
-                  pre_search_callbacks=None):
-    print("Test %s schedule search with the default search policy" % (target))
-
-    random.seed(seed)
-    N = 128
-    workload_key = ansor.make_workload_key_func(matmul_ansor_test, (N, N, N))
-    dag = ansor.workload_key_to_dag(workload_key)
-    target = tvm.target.create(target)
-    task = ansor.SearchTask(dag, workload_key, target)
-
-    with tempfile.NamedTemporaryFile() as fp:
-        log_file = fp.name
-
-        search_policy = ansor.SketchSearchPolicy(cost_model, params=params, seed=seed)
-        tune_option = ansor.TuneOption(n_trials=n_trials, runner=runner,
-                                       measure_callbacks=[ansor.LogToFile(log_file)],
-                                       pre_search_callbacks=pre_search_callbacks)
-        sch, args = ansor.auto_schedule(task, search_policy=search_policy,
-                                        tune_option=tune_option)
-        inp, res = ansor.best_measure_pair_in_file(log_file, workload_key, target)
-
-        print("==== Python Code ====")
-        print(dag.print_python_code_from_state(inp.state))
-
-        try:
-            print("==== Lowered Stmt ====")
-            print(tvm.lower(sch, args, simple_mode=True))
-            mod = tvm.build(sch, args, target)
-
-            ctx = tvm.context(str(target), 0)
-            dtype = dag.tensors[0].dtype
-            a = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx)
-            b = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx)
-            c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx)
-            mod(a, b, c)
-            tvm.testing.assert_allclose(c.asnumpy(), np.dot(
-                a.asnumpy(), b.asnumpy()), rtol=1e-5)
-            print("==== Verification passed ====")
-        except Exception:
-            raise Exception("Error encountered with seed: %d" % (seed))
-    print()
-
-
-def test_search_basic():
-    # wrap the search in a new thread to avoid the conflict
-    # between python's multiprocessing and tvm's thread pool
-    t = threading.Thread(target=search_common, kwargs={'seed': 944563397})
-    t.start()
-    t.join()
-
-
-def test_search_xgb_model_rpc_runner():
-    measure_ctx = ansor.LocalRPCMeasureContext()
-    search_common(seed=456787236, cost_model=ansor.XGBModel(),
-                  runner=measure_ctx.runner)
-
-
-def test_search_opencl():
-    if tvm.context("opencl", 0).exist:
-        measure_ctx = ansor.LocalRPCMeasureContext()
-        search_common("opencl", 380344973, measure_ctx.runner)
-    else:
-        print("OpenCL device not found, skip this test.")
-
-
-def test_search_cuda():
-    if tvm.context("cuda", 0).exist:
-        measure_ctx = ansor.LocalRPCMeasureContext()
-        search_common("cuda", 903667810, measure_ctx.runner)
-    else:
-        print("CUDA device not found, skip this test.")
-
-
-def test_search_custom_sketch_rule():
-    def meet_condition_func(meta_policy, state, stage_id):
-        # Apply and Skip the Rest if this function does not return
-        pass
-
-    # Expecting:
-    # i.0
-    #   i.1
-    #     i.2
-    #       j.0
-    #         j.1
-    #           ax0
-    #             ax1
-    #               B.global
-    #           j.2
-    #             k
-    #               C
-    def apply_func1(meta_policy, state, stage_id):
-        # Stage by stage way
-        ret = []
-        if stage_id == 2:
-            state = ansor.loop_state.State(state, meta_policy.cur_task.compute_dag)
-            state.split(2, state.stages[2].iters[0], [4, 4])
-            state.split(2, state.stages[2].iters[3], [4, 4])
-            ret.append([state.state_object, stage_id - 1])
-        elif stage_id == 1:
-            state = ansor.loop_state.State(state, meta_policy.cur_task.compute_dag)
-            state.cache_read(1, "global", [2])
-            state.compute_at(2, 3, state.stages[3].iters[4])
-            ret.append([state.state_object, stage_id - 1])
-        else:
-            ret.append([state, stage_id - 1])
-        return ret
-
-    def apply_func2(meta_policy, state, stage_id):
-        # More template like way
-        ret = []
-        state = ansor.loop_state.State(state, meta_policy.cur_task.compute_dag)
-
-        state.split(2, state.stages[2].iters[0], [4, 4])
-        state.split(2, state.stages[2].iters[3], [4, 4])
-        state.cache_read(1, "global", [2])
-        state.compute_at(2, 3, state.stages[3].iters[4])
-
-        ret.append([state.state_object, -1])
-        return ret
-
-    measure_ctx = ansor.LocalRPCMeasureContext()
-    search_common(seed=887823438, runner=measure_ctx.runner,
-                  pre_search_callbacks=[ansor.PreloadCustomSketchRule(
-                      meet_condition_func, apply_func1)],
-                  params={'disable_change_compute_location': 1})
-    search_common(seed=887823438, runner=measure_ctx.runner,
-                  pre_search_callbacks=[ansor.PreloadCustomSketchRule(
-                      meet_condition_func, apply_func2)],
-                  params={'disable_change_compute_location': 1})
-
-
-if __name__ == "__main__":
-    test_search_basic()
-    test_search_xgb_model_rpc_runner()
-    test_search_opencl()
-    test_search_cuda()
-    test_search_custom_sketch_rule()
diff --git a/tests/python/unittest/test_ansor_task_scheduler.py b/tests/python/unittest/test_ansor_task_scheduler.py
deleted file mode 100644
index 53cf2059c1f3f..0000000000000
--- a/tests/python/unittest/test_ansor_task_scheduler.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Test the task scheduler """
-
-import threading
-
-import tvm
-from tvm import ansor
-
-from test_ansor_common import matmul_ansor_test
-
-def test_task_scheduler_basic():
-    N = 128
-    A, B, C = matmul_ansor_test(N, N, N)
-    dag = ansor.ComputeDAG([A, B, C])
-    tgt = tvm.target.create("llvm")
-    task1 = ansor.SearchTask(dag, "test", tgt)
-    task2 = ansor.SearchTask(dag, "test", tgt)
-
-    def basic_test_func(task1, task2):
-        def objective(costs):
-            return sum(costs)
-
-        task_scheduler = ansor.SimpleTaskScheduler([task1, task2], objective)
-        tune_option = ansor.TuneOption(n_trials=3, runner='local')
-        task_scheduler.tune(tune_option)
-
-    # Ansor search process with local runner has some modification on thread
-    # binding, wrap this to a subprocess to eliminate the impacts to other tests
-    t = threading.Thread(target=basic_test_func,
-                         kwargs={'task1': task1, 'task2': task2})
-    t.start()
-    t.join()
-
-
-if __name__ == "__main__":
-    test_task_scheduler_basic()
diff --git a/topi/include/topi/transform.h b/topi/include/topi/transform.h
index 7dd782f5b6228..e0e4556678894 100644
--- a/topi/include/topi/transform.h
+++ b/topi/include/topi/transform.h
@@ -1295,75 +1295,6 @@ inline Tensor layout_transform(const Tensor& src, const std::string& src_layout,
       name, tag);
 }
 
-/*!
- * \brief utility function for kernel_layout_transform
- */
-inline void parse_kernel_layout(const String& layout,
-                                Array<PrimExpr>* shape,
-                                std::vector<std::string>* axes) {
-  int32_t factor = 0;
-  std::string axis = "";
-  for (char c : std::string(layout)) {
-    if (c >= 'A' && c <= 'z') {
-      axis += c;
-      if (factor != 0) {
-        shape->push_back(factor);
-        factor = 0;
-      }
-    } else if (c >= '0' && c <= '9') {
-      factor = factor * 10 + c - '0';
-      if (!axis.empty()) {
-        axes->push_back(axis);
-        axis = "";
-      }
-    } else {
-      LOG(FATAL) << "Invalid layout " << layout;
-    }
-  }
-  if (!axis.empty()) {
-    axes->push_back(axis);
-  }
-}
-
-/*!
- * \brief Transform the kernel layout according to \p src_layout and \p dst_layout
- * \param src the source input.
- * \param src_layout the source layout.
- * \param dst_layout the destination layout.
- * \param name output tensor name.
- * \param tag output tensor tag.
- * \return A tensor with shape in \p dst_layout
- */
-inline Tensor kernel_layout_transform(const Tensor& src,
-                                      const String& src_layout,
-                                      const String& dst_layout,
-                                      const String name = "T_kernel_layout_trans",
-                                      const String tag = kInjective) {
-  Array<PrimExpr> src_shape;
-  std::vector<std::string> src_axes;
-  Array<PrimExpr> dst_shape;
-  std::vector<std::string> dst_axes;
-
-  parse_kernel_layout(src_layout, &src_shape, &src_axes);
-  parse_kernel_layout(dst_layout, &dst_shape, &dst_axes);
-  return compute(
-      dst_shape, [&](const Array<Var>& dst_indices) {
-        Array<PrimExpr> dst_indices_expr(dst_indices.begin(), dst_indices.end());
-        Array<PrimExpr> src_indices;
-        for (const std::string& src_axis : src_axes) {
-          PrimExpr src_index = 0;
-          CHECK_EQ(dst_indices_expr.size(), dst_axes.size());
-          for (size_t i = 0; i < dst_axes.size(); ++i) {
-            if (dst_axes[i] == src_axis) {
-              src_index = src_index * dst_shape[i] + dst_indices_expr[i];
-            }
-          }
-          src_indices.push_back(src_index);
-        }
-        return src(src_indices);
-      }, name, tag);
-}
-
 /*!
  * \brief Get the shape of input tensor.
  * \param src the input tensor.
diff --git a/topi/python/topi/nn/conv2d.py b/topi/python/topi/nn/conv2d.py
index 6800129c12aab..4c7941b49692c 100644
--- a/topi/python/topi/nn/conv2d.py
+++ b/topi/python/topi/nn/conv2d.py
@@ -20,7 +20,7 @@
 from __future__ import absolute_import as _abs
 from collections import namedtuple
 import tvm
-from tvm import te, ansor
+from tvm import te
 
 from .pad import pad
 from .util import get_pad_tuple
@@ -342,37 +342,7 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'):
         dilation_h, dilation_w = dilation
 
     batch, in_height, in_width, in_channel = Input.shape
-    if ansor.GLOBAL_SCOPE.topi_in_compute_rewrite_mode:
-        # infer shape for the rewritten layout
-        if len(Filter.shape) >= 10:
-            # For cpu tile structure SSRSRS
-            base = len(Filter.shape) - 10
-            kernel_h = Filter.shape[2 + base] * Filter.shape[6 + base]
-            kernel_w = Filter.shape[3 + base] * Filter.shape[7 + base]
-            channel = Filter.shape[4 + base] * Filter.shape[8 + base]
-            num_filter = Filter.shape[5 + base] * Filter.shape[9 + base]
-            for i in range(base + 2):
-                num_filter *= Filter.shape[i]
-        elif len(Filter.shape) == 6:
-            # For cpu tile structure SRS
-            num_filter = Filter.shape[0] * Filter.shape[1] * Filter.shape[5]
-            kernel_h = Filter.shape[2]
-            kernel_w = Filter.shape[3]
-            channel = Filter.shape[4]
-        elif len(Filter.shape) == 5:
-            # For cpu tile structure SRS
-            num_filter = Filter.shape[0] * Filter.shape[4]
-            kernel_h = Filter.shape[1]
-            kernel_w = Filter.shape[2]
-            channel = Filter.shape[3]
-        elif len(Filter.shape) == 4:
-            num_filter, kernel_h, kernel_w, channel = Filter.shape
-        else:
-            raise ValueError("Don't know how to infer layout for filter shape: %s. " \
-                             "You can add a new branch for it to fix this." % str(Filter))
-    else:
-        kernel_h, kernel_w, channel, num_filter = Filter.shape
-
+    kernel_h, kernel_w, channel, num_filter = Filter.shape
     # compute the output shape
     dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
     dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
@@ -392,9 +362,8 @@ def conv2d_nhwc(Input, Filter, stride, padding, dilation, out_dtype='float32'):
         lambda nn, yy, xx, ff: te.sum(
             PaddedInput[nn, yy * stride_h + ry * dilation_h,
                         xx * stride_w + rx * dilation_w, rc].astype(out_dtype) *
-            Filter[ry, rx, rc, ff].astype(out_dtype)
-            , axis=[ry, rx, rc]),
-        name="Conv2dOutput", tag="conv2d_nhwc", attrs={"layout_free_placeholders": [Filter]})
+            Filter[ry, rx, rc, ff].astype(out_dtype), axis=[ry, rx, rc]),
+        name="Conv2dOutput", tag="conv2d_nhwc")
     return Output
 
 
diff --git a/tutorials/ansor/README.txt b/tutorials/ansor/README.txt
deleted file mode 100644
index 85b6ba401daec..0000000000000
--- a/tutorials/ansor/README.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-.. _tutorial-ansor-auto-schedule:
-
-Ansor: Template Free Auto Scheduling
-------------------------------------
diff --git a/tutorials/ansor/tune_conv2d_cuda.py b/tutorials/ansor/tune_conv2d_cuda.py
deleted file mode 100644
index 03f1b24a768ee..0000000000000
--- a/tutorials/ansor/tune_conv2d_cuda.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-Auto-scheduling High Performance Convolution on NVIDIA GPUs
-===========================================================
-**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, \
-            `Chengfan Jia <https://github.com/jcf94>`_, \
-            `Minmin Sun <https://github.com/minminsun>`_, \
-            `Zhao Wu <https://github.com/FrozenGene>`_
-
-This is an tutorial for searching high performance schedule for NVIDIA GPU using
-Ansor auto-scheduler. By running Ansor on this template, we can outperform the
-vendor provided library CuDNN in many cases.
-"""
-
-######################################################################
-# Install dependencies
-# --------------------
-# To use autotvm package in tvm, we need to install some extra dependencies.
-# (change "3" to "2" if you use python2):
-#
-# .. code-block:: bash
-#
-#   pip3 install --user psutil xgboost tornado
-#
-# To make TVM run faster in tuning, it is recommended to use cython
-# as FFI of tvm. In the root directory of tvm, execute
-#
-# .. code-block:: bash
-#
-#   pip3 install --user cython
-#   sudo make cython3
-#
-# Now return to python code. Import packages.
-
-import random
-import sys
-
-import numpy as np
-import tvm
-import topi
-from topi.testing import conv2d_nchw_python
-from tvm import te
-
-# the module is called `ansor`
-from tvm import ansor
-
-######################################################################
-# Step 1:  Define the search task
-# -------------------------------
-# There are plenty of useful schedule primitives in tvm. You can also find
-# some tutorials that describe them in more details, such as
-# (1). :ref:`opt-conv-gpu`
-# (2). `Optimizing DepthwiseConv on NVIDIA GPU <https://tvm.apache.org/2017/08/22/Optimize-Deep-Learning-GPU-Operators-with-TVM-A-Depthwise-Convolution-Example>`_
-#
-# It's usually a hard job if one wants to get a high performance schedule for a
-# specific workload. Even writing an AutoTVM tunable template needs user to have
-# expertises on how each schedule primitive works as well as how they finally
-# reflect on the hardward architecture.
-#
-# However, with Ansor this will be quite simple. Firstly, define the target workload.
-# Both :code:`tvm.te` API or topi op API are fine to be used.
-#
-# We can use the retuned :code:`Tensors` to create a ComputeDAG just like what we do
-# in :ref:`ansor-simple-subgraph`, while the way to use workload registry is more
-# recommended.
-
-# Use an extra function decorator to regist this workload
-@ansor.register_workload_func
-def conv2d_nchw(N, H, W, CO, CI, KH, KW, stride, padding):
-    data = te.placeholder((N, CI, H, W), name='data')
-    kernel = te.placeholder((CO, CI, KH, KW), name='kernel')
-    conv = topi.nn.conv2d_nchw(data, kernel, stride, padding, dilation=1, out_dtype='float32')
-
-    return [data, kernel, conv]
-
-######################################################################
-# Step 2:  Search through the schedule space
-# ------------------------------------------
-# We pick the last layer on resnet as test case.
-# Since our space is very large, :code:`XGBModel` is most suitable
-# for our case. Here we only do 20 trials for demonstration.
-# In practice, making 1000 trials usually can find some good kernels
-# for this workload.
-
-tgt = tvm.target.cuda()
-
-# The last layer in resnet
-N, H, W, CO, CI, KH, KW, strides, padding = 1, 7, 7, 512, 512, 3, 3, (1, 1), (1, 1)
-# Generate workload key with the ansor API
-wkl_key = ansor.make_workload_key_func(conv2d_nchw, (N, H, W, CO, CI, KH, KW, strides, padding))
-# Generate ComputeDAG using the workload key
-dag = ansor.workload_key_to_dag(wkl_key)
-task = ansor.SearchTask(dag, wkl_key, target=tgt)
-
-log_file = "conv2d_nchw.json"
-seed = 0
-random.seed(seed)
-cost_model = ansor.XGBModel(seed=seed)
-search_policy = ansor.SketchSearchPolicy(cost_model, seed=seed)
-
-#########################################################################
-# The :code:`ansor.LocalRPCMeasureContext` is used to create a RPC runner environment.
-# 
-# Use local gpu, measure 10 times for every schedule to reduce variance. The timeout
-# for each running is set to 4 seconds.
-#
-# During the searching process, we may generate several invalid schedules and they
-# will be filtered out. It's fine to see "Encountered errors during feature extraction."
-# in the tuning logs.
-# :code:`ansor.LogToFile` callback will log the tuning results into a
-# log file, which can be used to get the best config later.
-# :code:`ansor.PreloadMeasuredStates` callback will load measured states
-# from history log before schedule search, we can add this callback to make
-# sure a same schedule will never be measured for multiple times.
-
-measure_ctx = ansor.LocalRPCMeasureContext(repeat=3, min_repeat_ms=100, timeout=4)
-tune_option = ansor.TuneOption(n_trials=20,
-                               runner=measure_ctx.runner,
-                               measure_callbacks=[ansor.LogToFile(log_file)],
-                               pre_search_callbacks=[ansor.PreloadMeasuredStates(log_file)])
-s, arg_bufs = ansor.auto_schedule(task, search_policy=search_policy, tune_option=tune_option)
-
-print("==== Get Lowered Stmt ====")
-print(tvm.lower(s, arg_bufs, simple_mode=True))
-
-# Release the RPC runner environment
-del measure_ctx
-
-#########################################################################
-# From the example lower result showed above, we can see that Ansor has tried
-# techniques such as `Shared Memory Cooperative Fetching`, `Kernel Fusion`,
-# `Axis unroll`, `Axis Vectorize` and so on. There is no need for users to care
-# about the details, and Ansor will catch them well.
-#
-# Finally we can directly use the returned result to get the generated schedule,
-# while in the following tutorial we'll show how to inspect the best config from
-# log file, check correctness, and measure running time.
-
-# Get history best from log file
-inp, res = ansor.best_measure_pair_in_file(log_file)
-# Get the task ComputeDAG from log result
-dag = ansor.workload_key_to_dag(inp.task.workload_key)
-# Apply log result to TVM schedule
-s, arg_bufs = dag.apply_steps_from_state(inp.state)
-func = tvm.build(s, arg_bufs, target=tgt)
-
-# check correctness
-a_np = np.random.uniform(size=(N, CI, H, W)).astype(np.float32)
-w_np = np.random.uniform(size=(CO, CI, KH, KW)).astype(np.float32)
-c_np = conv2d_nchw_python(a_np, w_np, strides, padding)
-
-ctx = tvm.gpu()
-a_tvm = tvm.nd.array(a_np, ctx=ctx)
-w_tvm = tvm.nd.array(w_np, ctx=ctx)
-c_tvm = tvm.nd.empty(c_np.shape, ctx=ctx)
-func(a_tvm, w_tvm, c_tvm)
-
-tvm.testing.assert_allclose(c_np, c_tvm.asnumpy(), rtol=1e-2)
-
-# Evaluate running time. Here we choose a large repeat number (400) to reduce the noise
-# and the overhead of kernel launch. You can also use nvprof to validate the result.
-evaluator = func.time_evaluator(func.entry_name, ctx, number=400)
-print('Time cost of this operator: %f s' % evaluator(a_tvm, w_tvm, c_tvm).mean)
-
diff --git a/tutorials/ansor/tune_simple_subgraph.py b/tutorials/ansor/tune_simple_subgraph.py
deleted file mode 100644
index 00bef82cf855c..0000000000000
--- a/tutorials/ansor/tune_simple_subgraph.py
+++ /dev/null
@@ -1,193 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-"""
-.. _ansor-simple-subgraph:
-
-Writing compute expression and Using Ansor auto-scheduler
-=========================================================
-**Author**: `Lianmin Zheng <https://github.com/merrymercy>`_, \
-            `Chengfan Jia <https://github.com/jcf94>`_, \
-            `Minmin Sun <https://github.com/minminsun>`_, \
-            `Zhao Wu <https://github.com/FrozenGene>`_
-
-This is an introduction tutorial to the auto-scheduler module in TVM.
-
-There are two steps in auto-scheduling.
-The first step is defining the target task.
-The second step is running a search algorithm to auto explore the schedule.
-In this tutorial, you can learn how to perform these two steps in TVM.
-The whole workflow is illustrated by a matrix multiplication with bias add example.
-"""
-
-######################################################################
-# Install dependencies
-# --------------------
-# To use Ansor package in TVM, we need to install some extra dependencies.
-# This step (installing xgboost) can be skipped as it doesn't need XGBoost
-# (change "3" to "2" if you use python2):
-#
-# .. code-block:: bash
-#
-#   pip3 install --user psutil xgboost
-#
-# To make TVM run faster in tuning, it is recommended to use cython
-# as FFI of TVM. In the root directory of TVM, execute
-# (change "3" to "2" if you use python2):
-#
-# .. code-block:: bash
-#
-#   pip3 install --user cython
-#   sudo make cython3
-#
-# Now return to python code. Import packages.
-
-import random
-import sys
-
-import numpy as np
-import tvm
-from tvm import te
-
-# the module is called `ansor`
-from tvm import ansor
-
-######################################################################
-# Step 1:  Define the target compute subgraph
-# -------------------------------------------
-# In this section, we will write a deterministic TVM compute expression code
-# to a compute subgraph.
-#
-# .. note:: Comparing to :ref:`tutorials-autotvm-sec`
-#
-#  In Ansor, we do not need users to provide a schedule template, the only input
-#  is the compute expression writing by :code:`tvm.te` API or topi op API.
-#
-# Here is how we implement a matrix multiplication subgraph in TVM.
-
-# Matmul with bias add
-def matmul_add(N, L, M, dtype):
-    A = te.placeholder((N, L), name='A', dtype=dtype)
-    B = te.placeholder((L, M), name='B', dtype=dtype)
-    C = te.placeholder((N, M), name='C', dtype=dtype)
-
-    k = te.reduce_axis((0, L), name='k')
-    mul = te.compute((N, M), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k),
-                     name='Mul')
-    D = te.compute((N, M), lambda i, j: C[i, j] + mul[i, j], name='D')
-
-    return [A, B, C, D]
-
-######################################################################
-# Step 2:  Search through the schedule space
-# ------------------------------------------
-# In step 1, we build the compute subgraph.
-# The next step is to pick a cost model as well as a search policy and explore the
-# possible schedule.
-#
-# Auto-scheduler in TVM
-# ^^^^^^^^^^^^^^^^^^^^^
-# The job for the Ansor auto-scheduler can be described by following pseudo code
-#
-#   .. code-block:: c
-#
-#    ct = 0
-#    while ct < max_number_of_trials:
-#        auto generate a batch of schedules
-#        measure this batch of schedules on real hardware and get results
-#        ct += batch_size
-#
-# When proposing the next batch of schedules, Ansor can take different cost models to
-# guide the schedule generating process.
-#
-# * :code:`RandomModel`: Generate and take new schedule randomly
-# * :code:`XGBModel`: Use XGBoost model to estimate the performance of potential schedules, try to pick schedules with better performance in each step
-#
-# XGBModel can explore more efficiently and find better schedules.
-
-################################################################
-# Begin tuning
-# ^^^^^^^^^^^^
-# Here we continue our matrix multiplication example.
-#
-# The :code:`ansor.ComputeDAG` takes the Tensor list as input, and generates
-# a dag structure. During which process, :code:`ansor.ComputeDAG` will
-# do some analyzes with the target subgraph and the results will be used in
-# search policy later.
-#
-# Then we create the :code:`tvm.target` and a tuning task.
-
-N, L, M = 128, 128, 128
-A, B, C, D = matmul_add(N, L, M, 'float32')
-dag = ansor.ComputeDAG([A, B, C, D])
-
-print(dag)
-print(dag.access_analyzer)
-
-tgt = tvm.target.create("llvm")
-task = ansor.SearchTask(dag, "test", tgt)
-
-################################################################
-# Next, we choose random model and create a default search policy:
-# :code:`ansor.SketchSearchPolicy`.
-#
-# We only make 5 trials in this tutorial for demonstration. In practice,
-# you can do more trials according to your time budget.
-# :code:`ansor.LogToFile` callback will log the tuning results into a
-# log file, which can be used to get the best config later.
-# :code:`ansor.PreloadMeasuredStates` callback will load measured states
-# from history log before schedule search, we can add this callback to make
-# sure a same schedule will never be measured for multiple times.
-
-log_file = "matmul_add.json"
-
-seed = 0
-random.seed(seed)
-cost_model = ansor.RandomModel()
-search_policy = ansor.SketchSearchPolicy(cost_model, seed=seed)
-
-tune_option = ansor.TuneOption(n_trials=5,
-                               measure_callbacks=[ansor.LogToFile(log_file)],
-                               pre_search_callbacks=[ansor.PreloadMeasuredStates(log_file)])
-
-################################################################
-# Then just call :code:`ansor.auto_schedule` and Ansor will try to find a high
-# performance schedule for the target subgraph automatically.
-#
-# The returned result will be a :code:`te.schedule` and a list of :code:`te.Tensor`,
-# which can be used as the input of :code:`tvm.lower` or :code:`tvm.build`.
-
-s, arg_bufs = ansor.auto_schedule(task, search_policy=search_policy,
-                                  tune_option=tune_option)
-
-print("==== Get Lowered Stmt ====")
-print(tvm.lower(s, arg_bufs, simple_mode=True))
-
-#########################################################################
-# Check the correctness to make sure we generate a right schedule.
-
-func = tvm.build(s, arg_bufs)
-
-# check correctness
-a_np = np.random.uniform(size=(N, L)).astype(np.float32)
-b_np = np.random.uniform(size=(L, M)).astype(np.float32)
-c_np = np.random.uniform(size=(N, M)).astype(np.float32)
-d_np = a_np.dot(b_np) + c_np
-
-d_tvm = tvm.nd.empty(d_np.shape)
-func(tvm.nd.array(a_np), tvm.nd.array(b_np), tvm.nd.array(c_np), d_tvm)
-
-tvm.testing.assert_allclose(d_np, d_tvm.asnumpy(), rtol=1e-2)
diff --git a/tutorials/autotvm/README.txt b/tutorials/autotvm/README.txt
index 4ad36c000e3c2..38e3b3343f4ea 100644
--- a/tutorials/autotvm/README.txt
+++ b/tutorials/autotvm/README.txt
@@ -1,4 +1,4 @@
 .. _tutorials-autotvm-sec:
 
-AutoTVM: Template Based Auto Tuning
------------------------------------
+Auto tuning
+-----------