From 33a7dcbf9d263f641dc7ba4daa9d554f61ba126b Mon Sep 17 00:00:00 2001
From: Zihao Ye
Date: Wed, 3 Nov 2021 02:55:14 -0700
Subject: [PATCH] revert

---
 .github/CODEOWNERS | 20 +-
 .github/workflows/main.yml | 26 -
 .gitmodules | 3 -
 3rdparty/cutlass | 1 -
 CMakeLists.txt | 6 -
 CONTRIBUTORS.md | 6 +-
 Jenkinsfile | 4 +-
 LICENSE | 5 -
 apps/hexagon_launcher/launcher_core.cc | 10 +-
 apps/hexagon_launcher/launcher_core.h | 2 -
 apps/hexagon_launcher/launcher_hexagon.cc | 17 +-
 apps/ios_rpc/tvmrpc/ViewController.mm | 1 -
 .../template_project/microtvm_api_server.py | 13 +-
 .../template_project/microtvm_api_server.py | 9 -
 .../template_project/src/aot_demo/main.c | 86 ++-
 cmake/config.cmake | 4 -
 cmake/modules/Arduino.cmake | 78 ---
 cmake/modules/Hexagon.cmake | 41 +-
 cmake/modules/StandaloneCrt.cmake | 15 +-
 cmake/modules/Zephyr.cmake | 78 ---
 cmake/modules/contrib/CUTLASS.cmake | 23 -
 cmake/utils/Utils.cmake | 13 -
 ...buntu_download_arm_compute_lib_binaries.sh | 10 +-
 docker/install/ubuntu_install_cmake_source.sh | 4 +-
 docs/conf.py | 6 -
 docs/contribute/git_howto.rst | 3 +-
 docs/how_to/deploy/arm_compute_lib.rst | 27 +-
 docs/install/from_source.rst | 3 -
 docs/legacy_redirect.py | 272 ----------
 .../work_with_microtvm/micro_autotune.py | 15 +-
 .../how_to/work_with_microtvm/micro_tflite.py | 7 +-
 gallery/tutorial/tensor_ir_blitz_course.py | 191 -------
 include/tvm/runtime/vm/executable.h | 7 -
 include/tvm/runtime/vm/vm.h | 4 +-
 licenses/LICENSE.cutlass.txt | 23 -
 python/setup.py | 7 -
 python/tvm/contrib/cutlass/__init__.py | 18 -
 python/tvm/contrib/cutlass/build.py | 255 ---------
 python/tvm/contrib/cutlass/gemm_operation.py | 262 ---------
 python/tvm/contrib/cutlass/gemm_profiler.py | 196 -------
 python/tvm/contrib/cutlass/gen_gemm.py | 389 --------------
 python/tvm/contrib/cutlass/library.py | 219 --------
 python/tvm/contrib/target/onnx.py | 10 +-
 python/tvm/driver/tvmc/composite_target.py | 9 +-
 python/tvm/driver/tvmc/frontends.py | 4 +-
 python/tvm/ir/instrument.py | 158 ++----
 python/tvm/micro/__init__.py | 1 -
 python/tvm/micro/build.py | 46 --
 python/tvm/micro/project.py | 2 +-
 python/tvm/micro/testing.py | 52 --
 .../backend/contrib/ethosu/tir/compiler.py | 1 -
 python/tvm/relay/backend/vm.py | 12 +-
 python/tvm/relay/frontend/paddlepaddle.py | 338 ------------
 python/tvm/relay/frontend/pytorch.py | 78 +--
 python/tvm/relay/frontend/tflite.py | 4 +
 python/tvm/relay/op/contrib/__init__.py | 1 -
 python/tvm/relay/op/contrib/cutlass.py | 74 ---
 python/tvm/relay/op/contrib/ethosn.py | 45 +-
 python/tvm/rpc/server_ios_launcher.py | 498 ------------------
 python/tvm/runtime/vm.py | 7 -
 python/tvm/testing/plugin.py | 8 +-
 python/tvm/topi/arm_cpu/injective.py | 3 +-
 rust/tvm/src/ir/relay/mod.rs | 2 +-
 src/driver/driver_api.cc | 3 +-
 src/ir/module.cc | 9 +-
 .../backend/contrib/codegen_c/codegen_c.h | 17 +-
 src/relay/backend/contrib/cutlass/codegen.cc | 409 --------------
 src/relay/backend/contrib/dnnl/codegen.cc | 6 +
 src/relay/backend/contrib/ethosn/codegen.cc | 24 +-
 .../backend/contrib/ethosn/codegen_ethosn.h | 20 +-
 src/relay/backend/te_compiler.cc | 4 +-
 src/relay/backend/te_compiler_cache.cc | 21 +-
 src/relay/backend/vm/compiler.cc | 25 +-
 src/relay/backend/vm/inline_primitives.cc | 2 +-
 src/relay/op/annotation/annotation.cc | 10 -
 src/relay/qnn/utils.h | 6 +-
 src/relay/transforms/device_aware_visitors.cc | 2 +-
 src/relay/transforms/device_planner.cc | 2 +-
 src/relay/transforms/fold_constant.cc | 398 ++++++--------
 src/relay/transforms/memory_alloc.cc | 16 +-
 src/runtime/cuda/cuda_device_api.cc | 10 +-
 src/runtime/dso_library.cc | 130 ++---
 src/runtime/hexagon/android/hexagon_device.h | 135 -----
 src/runtime/hexagon/hexagon/hexagon_buffer.cc | 122 -----
 src/runtime/hexagon/hexagon/hexagon_buffer.h | 135 -----
 src/runtime/hexagon/hexagon/hexagon_common.cc | 136 -----
 src/runtime/hexagon/hexagon/hexagon_common.h | 65 ---
 .../hexagon/hexagon/hexagon_device_api_v2.cc | 130 -----
 .../hexagon/hexagon/hexagon_device_api_v2.h | 108 ----
 src/runtime/hexagon/hexagon/hexagon_module.cc | 56 --
 .../{android => }/hexagon_device_api.cc | 2 +-
 .../hexagon/{android => }/hexagon_module.cc | 6 +-
 src/runtime/hexagon/hexagon_module.h | 101 ++++
 .../hexagon/{android => }/hexagon_posix.cc | 0
 .../{android => }/sim/driver/CMakeLists.txt | 4 +-
 .../{android => }/sim/driver/README.md | 0
 .../{android => }/sim/driver/fake_pthread.cc | 0
 .../{android => }/sim/driver/pthread.h | 6 +-
 .../hexagon/{android => }/sim/driver/sched.h | 6 +-
 .../{android => }/sim/driver/sim_device.cc | 0
 .../{android => }/sim/hexagon_device_sim.cc | 4 +-
 .../{android => }/sim/hexagon_sim_proto.h | 6 +-
 .../target/fastrpc/CMakeLists.txt | 0
 .../{android => }/target/fastrpc/README.md | 0
 .../target/fastrpc/include/tvm_remote.idl | 0
 .../target/fastrpc/include/tvm_remote_nd.idl | 0
 .../target/fastrpc/src/tvm_hvx.cc | 0
 .../target/fastrpc/src/tvm_hvx.h | 6 +-
 .../target/fastrpc/src/tvm_remote_imp.cc | 0
 .../target/fastrpc/src/tvm_remote_nd_imp.cc | 0
 .../target/fastrpc/src/tvm_wrap_pthread.cc | 0
 .../target/hexagon_device_target.cc | 2 +-
 .../{android => }/target/hexagon_dsprpcapi.cc | 0
 .../{android => }/target/hexagon_dsprpcapi.h | 6 +-
 .../{android => }/target/hexagon_stubapi.cc | 0
 .../{android => }/target/hexagon_stubapi.h | 6 +-
 .../{android => }/target/hexagon_target_log.h | 6 +-
 src/runtime/library_module.cc | 24 +-
 src/runtime/library_module.h | 21 +-
 src/runtime/opencl/opencl_device_api.cc | 4 -
 src/runtime/vm/executable.cc | 32 +-
 src/runtime/vm/memory_manager.cc | 8 +-
 src/runtime/vm/pooled_allocator.h | 6 +-
 src/runtime/vm/serialize_utils.h | 4 +-
 src/runtime/vm/vm.cc | 36 +-
 src/support/utils.h | 2 +-
 src/target/source/codegen_cuda.cc | 2 +-
 src/target/source/codegen_opencl.cc | 25 -
 src/target/source/codegen_opencl.h | 4 -
 src/tir/analysis/verify_gpu_code.cc | 8 +-
 ...merge_dynamic_shared_memory_allocations.cc | 487 +----------------
 tests/cpp/support_test.cc | 6 -
 tests/micro/arduino/conftest.py | 14 +-
 tests/micro/zephyr/test_utils.py | 78 ++-
 tests/micro/zephyr/test_zephyr.py | 4 +-
 tests/micro/zephyr/test_zephyr_aot.py | 49 +-
 tests/micro/zephyr/test_zephyr_armv7m.py | 30 +-
 tests/python/contrib/test_cutlass.py | 222 --------
 .../contrib/test_ethosn/infrastructure.py | 9 +-
 .../test_ethosn/test_partition_params.py | 123 -----
 .../contrib/test_ethosu/test_replace_copy.py | 64 +--
 tests/python/contrib/test_hexagon/README.md | 6 +-
 .../contrib/test_hexagon/infrastructure.py | 63 +--
 .../test_hexagon/test_conv2d_blocked.py | 349 +++++-----
 tests/python/contrib/test_onnx.py | 34 +-
 .../python/contrib/test_rpc_server_device.py | 440 ----------------
 tests/python/contrib/test_tensorrt.py | 5 +-
 tests/python/driver/tvmc/test_compiler.py | 35 +-
 .../driver/tvmc/test_composite_target.py | 1 -
 tests/python/driver/tvmc/test_target.py | 30 +-
 .../frontend/paddlepaddle/test_forward.py | 299 +----------
 tests/python/frontend/pytorch/test_forward.py | 30 +-
 tests/python/frontend/tflite/test_forward.py | 129 +----
 tests/python/relay/aot/aot_test_utils.py | 10 +-
 tests/python/relay/test_pass_fold_constant.py | 100 +---
 tests/python/relay/test_pass_instrument.py | 18 +-
 tests/python/relay/test_prng.py | 7 +-
 tests/python/relay/test_vm.py | 87 ---
 .../unittest/test_target_codegen_opencl.py | 1 -
 .../unittest/test_target_codegen_vulkan.py | 5 +-
 .../test_tir_analysis_verify_gpu_code.py | 29 -
 ...merge_dynamic_shared_memory_allocations.py | 85 +-
 tests/scripts/task_python_ethosn_tests.sh | 2 +-
 tests/scripts/task_python_integration.sh | 4 +-
 .../task_python_integration_i386only.sh | 23 -
 165 files changed, 1087 insertions(+), 7861 deletions(-)
 delete mode 160000 3rdparty/cutlass
 delete mode 100644 cmake/modules/Arduino.cmake
 delete mode 100644 cmake/modules/Zephyr.cmake
 delete mode 100644 cmake/modules/contrib/CUTLASS.cmake
 delete mode 100644 docs/legacy_redirect.py
 delete mode 100644 gallery/tutorial/tensor_ir_blitz_course.py
 delete mode 100644 licenses/LICENSE.cutlass.txt
 delete mode 100644 python/tvm/contrib/cutlass/__init__.py
 delete mode 100644 python/tvm/contrib/cutlass/build.py
 delete mode 100644 python/tvm/contrib/cutlass/gemm_operation.py
 delete mode 100644 python/tvm/contrib/cutlass/gemm_profiler.py
 delete mode 100644 python/tvm/contrib/cutlass/gen_gemm.py
 delete mode 100644 python/tvm/contrib/cutlass/library.py
 delete mode 100644 python/tvm/relay/op/contrib/cutlass.py
 delete mode 100644 python/tvm/rpc/server_ios_launcher.py
 delete mode 100644 src/relay/backend/contrib/cutlass/codegen.cc
 delete mode 100644 src/runtime/hexagon/android/hexagon_device.h
 delete mode 100644 src/runtime/hexagon/hexagon/hexagon_buffer.cc
 delete mode 100644 src/runtime/hexagon/hexagon/hexagon_buffer.h
 delete mode 100644 src/runtime/hexagon/hexagon/hexagon_common.cc
 delete mode 100644 src/runtime/hexagon/hexagon/hexagon_common.h
 delete mode 100644 src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc
 delete mode 100644 src/runtime/hexagon/hexagon/hexagon_device_api_v2.h
 delete mode 100644 src/runtime/hexagon/hexagon/hexagon_module.cc
 rename src/runtime/hexagon/{android => }/hexagon_device_api.cc (99%)
 rename src/runtime/hexagon/{android => }/hexagon_module.cc (99%)
 rename src/runtime/hexagon/{android => }/hexagon_posix.cc (100%)
 rename src/runtime/hexagon/{android => }/sim/driver/CMakeLists.txt (95%)
 rename src/runtime/hexagon/{android => }/sim/driver/README.md (100%)
 rename src/runtime/hexagon/{android => }/sim/driver/fake_pthread.cc (100%)
 rename src/runtime/hexagon/{android => }/sim/driver/pthread.h (94%)
 rename src/runtime/hexagon/{android => }/sim/driver/sched.h (84%)
 rename src/runtime/hexagon/{android => }/sim/driver/sim_device.cc (100%)
 rename src/runtime/hexagon/{android => }/sim/hexagon_device_sim.cc (99%)
 rename src/runtime/hexagon/{android => }/sim/hexagon_sim_proto.h (90%)
 rename src/runtime/hexagon/{android => }/target/fastrpc/CMakeLists.txt (100%)
 rename src/runtime/hexagon/{android => }/target/fastrpc/README.md (100%)
 rename src/runtime/hexagon/{android => }/target/fastrpc/include/tvm_remote.idl (100%)
 rename src/runtime/hexagon/{android => }/target/fastrpc/include/tvm_remote_nd.idl (100%)
 rename src/runtime/hexagon/{android => }/target/fastrpc/src/tvm_hvx.cc (100%)
 rename src/runtime/hexagon/{android => }/target/fastrpc/src/tvm_hvx.h (95%)
 rename src/runtime/hexagon/{android => }/target/fastrpc/src/tvm_remote_imp.cc (100%)
 rename src/runtime/hexagon/{android => }/target/fastrpc/src/tvm_remote_nd_imp.cc (100%)
 rename src/runtime/hexagon/{android => }/target/fastrpc/src/tvm_wrap_pthread.cc (100%)
 rename src/runtime/hexagon/{android => }/target/hexagon_device_target.cc (99%)
 rename src/runtime/hexagon/{android
=> }/target/hexagon_dsprpcapi.cc (100%) rename src/runtime/hexagon/{android => }/target/hexagon_dsprpcapi.h (96%) rename src/runtime/hexagon/{android => }/target/hexagon_stubapi.cc (100%) rename src/runtime/hexagon/{android => }/target/hexagon_stubapi.h (98%) rename src/runtime/hexagon/{android => }/target/hexagon_target_log.h (87%) delete mode 100644 tests/python/contrib/test_cutlass.py delete mode 100644 tests/python/contrib/test_ethosn/test_partition_params.py delete mode 100644 tests/python/contrib/test_rpc_server_device.py delete mode 100755 tests/scripts/task_python_integration_i386only.sh diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 97cf467cca07d..02134b64b6195 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -33,10 +33,10 @@ ############################## # Top-level Fallbacks ############################## -include/** @tqchen @jroesch @yzhliu @icemelon @junrushao1994 @comaniac @zhiics -src/** @tqchen @jroesch @yzhliu @icemelon @junrushao1994 @comaniac @zhiics -apps/** @tqchen @jroesch @yzhliu @icemelon @junrushao1994 @comaniac @zhiics -python/** @tqchen @jroesch @yzhliu @icemelon @junrushao1994 @comaniac @zhiics +include/** @tqchen @jroesch @yzhliu @icemelon9 @junrushao1994 @comaniac @zhiics +src/** @tqchen @jroesch @yzhliu @icemelon9 @junrushao1994 @comaniac @zhiics +apps/** @tqchen @jroesch @yzhliu @icemelon9 @junrushao1994 @comaniac @zhiics +python/** @tqchen @jroesch @yzhliu @icemelon9 @junrushao1994 @comaniac @zhiics # Thirdparty license audit 3rdparty/** @tqchen @jroesch @@ -67,11 +67,11 @@ rust/** @jroesch @nhynes @nhynes vta/** @tmoreau89 @vegaluisjose # docs -docs/** @comaniac @junrushao1994 @tqchen @jroesch @areusch @yzhliu @merrymercy @icemelon -tutorials/** @comaniac @junrushao1994 @tqchen @jroesch @areusch @yzhliu @merrymercy @icemelon +docs/** @comaniac @junrushao1994 @tqchen @jroesch @areusch @yzhliu @merrymercy @icemelon9 +tutorials/** @comaniac @junrushao1994 @tqchen @jroesch @areusch @yzhliu @merrymercy @icemelon9 # tests -tests/** @comaniac @junrushao1994 @tqchen @jroesch @areusch @yzhliu @merrymercy @icemelon +tests/** @comaniac @junrushao1994 @tqchen @jroesch @areusch @yzhliu @merrymercy @icemelon9 ############################## # Specific modules @@ -129,9 +129,9 @@ include/tvm/runtime/micro/** @areusch @liangfu @tmoreau89 @manupa-arm python/tvm/micro/** @areusch @liangfu @tmoreau89 @manupa-arm # relay -src/relay/** @jroesch @slyubomirsky @icemelon @MarisaKirisame @ZihengJiang @yzhliu @vinx13 @mbrookhart @jwfromm @zhiics @anijain2305 @wweic @eqy @junrushao1994 -include/tvm/relay/** @jroesch @slyubomirsky @icemelon @MarisaKirisame @ZihengJiang @yzhliu @vinx13 @mbrookhart @jwfromm @zhiics @anijain2305 @wweic @eqy @junrushao1994 -python/tvm/relay/** @jroesch @slyubomirsky @icemelon @MarisaKirisame @ZihengJiang @yzhliu @vinx13 @mbrookhart @jwfromm @zhiics @anijain2305 @wweic @eqy @junrushao1994 +src/relay/** @jroesch @slyubomirsky @icemelon9 @MarisaKirisame @ZihengJiang @yzhliu @vinx13 @mbrookhart @jwfromm @zhiics @anijain2305 @wweic @eqy @junrushao1994 +include/tvm/relay/** @jroesch @slyubomirsky @icemelon9 @MarisaKirisame @ZihengJiang @yzhliu @vinx13 @mbrookhart @jwfromm @zhiics @anijain2305 @wweic @eqy @junrushao1994 +python/tvm/relay/** @jroesch @slyubomirsky @icemelon9 @MarisaKirisame @ZihengJiang @yzhliu @vinx13 @mbrookhart @jwfromm @zhiics @anijain2305 @wweic @eqy @junrushao1994 # relay/qnn diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index ca7d0f2a5052e..1b9ebb3411e23 100644 --- a/.github/workflows/main.yml 
+++ b/.github/workflows/main.yml @@ -71,23 +71,6 @@ jobs: run: >- conda build --output-folder=conda/pkg conda/recipe && conda install tvm -c ./conda/pkg - - name: Build iOS RPC@MacOS - if: startsWith(matrix.os, 'macOS') - run: | - IOS_VERSION="14.0" - CMAKE_FLAGS="-DCMAKE_BUILD_TYPE=Release \ - -DCMAKE_SYSTEM_NAME=iOS \ - -DCMAKE_SYSTEM_VERSION=${IOS_VERSION} \ - -DCMAKE_OSX_SYSROOT=iphonesimulator \ - -DCMAKE_OSX_ARCHITECTURES=x86_64 \ - -DCMAKE_OSX_DEPLOYMENT_TARGET=14.0 \ - -DCMAKE_BUILD_WITH_INSTALL_NAME_DIR=ON \ - -DUSE_IOS_RPC=ON" - - mkdir build-ios-simulator - cd build-ios-simulator - cmake .. ${CMAKE_FLAGS} - cmake --build . --target ios_rpc - name: Test@Win if: startsWith(matrix.os, 'windows') shell: cmd /C call {0} @@ -98,12 +81,3 @@ jobs: shell: bash -l {0} run: >- python -m pytest -v tests/python/all-platform-minimal-test - - name: Test iOS RPC@MacOS - if: startsWith(matrix.os, 'macOS') - shell: bash -l {0} - run: >- - python -m pip install tornado psutil cloudpickle && - export PYTHONPATH=tests/python/contrib:${PYTHONPATH} && - export BUNDLE_ID=org.apache.tvmrpc && - export BUNDLE_PATH=build-ios-simulator/apps/ios_rpc/ios_rpc/src/ios_rpc-build/Release-iphonesimulator/tvmrpc.app && - python -m pytest -v tests/python/contrib/test_rpc_server_device.py diff --git a/.gitmodules b/.gitmodules index 8dfda44d10a03..6ef740e331532 100644 --- a/.gitmodules +++ b/.gitmodules @@ -13,6 +13,3 @@ [submodule "3rdparty/libbacktrace"] path = 3rdparty/libbacktrace url = https://github.com/tlc-pack/libbacktrace.git -[submodule "3rdparty/cutlass"] - path = 3rdparty/cutlass - url = https://github.com/NVIDIA/cutlass diff --git a/3rdparty/cutlass b/3rdparty/cutlass deleted file mode 160000 index a3bcc6981d5da..0000000000000 --- a/3rdparty/cutlass +++ /dev/null @@ -1 +0,0 @@ -Subproject commit a3bcc6981d5dad3afb212689e2c7853d1b1ee45d diff --git a/CMakeLists.txt b/CMakeLists.txt index 7eb2ffd943e25..24f0653b3a781 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -69,7 +69,6 @@ tvm_option(USE_MKLDNN "Build with MKLDNN" OFF) tvm_option(USE_DNNL_CODEGEN "Enable MKLDNN (DNNL) codegen" OFF) tvm_option(USE_CUDNN "Build with cuDNN" OFF) tvm_option(USE_CUBLAS "Build with cuBLAS" OFF) -tvm_option(USE_CUTLASS "Build with CUTLASS" OFF) tvm_option(USE_THRUST "Build with Thrust" OFF) tvm_option(USE_MIOPEN "Build with ROCM:MIOpen" OFF) tvm_option(USE_ROCBLAS "Build with ROCM:RoCBLAS" OFF) @@ -414,8 +413,6 @@ endif(USE_PIPELINE_EXECUTOR) # Module rules include(cmake/modules/VTA.cmake) include(cmake/modules/StandaloneCrt.cmake) -include(cmake/modules/Zephyr.cmake) -include(cmake/modules/Arduino.cmake) include(cmake/modules/CUDA.cmake) include(cmake/modules/Hexagon.cmake) include(cmake/modules/OpenCL.cmake) @@ -431,7 +428,6 @@ include(cmake/modules/contrib/EthosU.cmake) include(cmake/modules/contrib/BLAS.cmake) include(cmake/modules/contrib/CODEGENC.cmake) include(cmake/modules/contrib/DNNL.cmake) -include(cmake/modules/contrib/CUTLASS.cmake) include(cmake/modules/contrib/ExampleTargetHooks.cmake) include(cmake/modules/contrib/Random.cmake) include(cmake/modules/contrib/Posit.cmake) @@ -504,8 +500,6 @@ if(USE_MICRO) # Unix Makefiles generator, need to add these explicit target-level dependency) add_dependencies(tvm host_standalone_crt) add_dependencies(tvm_runtime host_standalone_crt) - add_dependencies(tvm_runtime zephyr) - add_dependencies(tvm_runtime arduino) endif() if(USE_CPP_RPC) diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index d2c2745c8f852..ca438a75849c8 100644 --- a/CONTRIBUTORS.md +++ 
b/CONTRIBUTORS.md @@ -59,7 +59,7 @@ We do encourage everyone to work anything they are interested in. - [Giuseppe Rossini](https://github.com/giuseros): @giuseros - aot, arm - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - frontends - [Junru Shao](https://github.com/junrushao1994) (PMC): @junrushao1994 - relay, compiler -- [Haichen Shen](https://github.com/icemelon) (PMC): @icemelon - relay, topi +- [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi - [Siva Rama Krishna Reddy](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang - [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web - [Andrew Tulloch](https://github.com/ajtulloch): @ajtulloch - topi, compiler, runtime @@ -131,14 +131,13 @@ We do encourage everyone to work anything they are interested in. - [Giuseppe Rossini](https://github.com/giuseros): @giuseros - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - [Junru Shao](https://github.com/junrushao1994): @junrushao1994 -- [Haichen Shen](https://github.com/icemelon): @icemelon +- [Haichen Shen](https://github.com/icemelon9): @icemelon9 - [Xingjian Shi](https://github.com/sxjscience): @sxjscience - [Mark Shields](https://github.com/mbs-octoml): @mbs-octoml - [Christopher Sidebottom](https://github.com/mousius): @mousius - [Siva Rama Krishna Reddy](https://github.com/srkreddy1238): @srkreddy1238 - [Dmitriy Smirnov](https://github.com/d-smirnov): @d-smirnov - [Jon Soifer](https://github.com/soiferj): @soiferj -- [Chris Sullivan](https://github.com/csullivan): @csullivan - [Zhixun Tan](https://github.com/phisiart): @phisiart - [Andrew Tulloch](https://github.com/ajtulloch): @ajtulloch - [Jorn Tuyls](https://github.com/jtuyls): @jtuyls @@ -151,7 +150,6 @@ We do encourage everyone to work anything they are interested in. - [Logan Weber](https://github.com/weberlo): @weberlo - [Matt Welsh](https://github.com/mdw-octoml): @mdw-octoml - [Jian Weng](https://github.com/were): @were -- [wrongtest](https://github.com/wrongtest): @wrongtest - [Yong Wu](https://github.com/yongwww): @yongwww - [Zhao Wu](https://github.com/FrozenGene): @FrozenGene - [Bing Xu](https://github.com/antinucleon): @antinucleon diff --git a/Jenkinsfile b/Jenkinsfile index 12ab17b193db0..6a7a1f4e3d365 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -46,7 +46,7 @@ import org.jenkinsci.plugins.pipeline.modeldefinition.Utils // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. 
--> ci_lint = "tlcpack/ci-lint:v0.67" -ci_gpu = "tlcpack/ci-gpu:v0.78" +ci_gpu = "tlcpack/ci-gpu:v0.77" ci_cpu = "tlcpack/ci-cpu:v0.78" ci_wasm = "tlcpack/ci-wasm:v0.71" ci_i386 = "tlcpack/ci-i386:v0.74" @@ -353,7 +353,7 @@ stage('Unit Test') { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${ci_i386} ./tests/scripts/task_ci_setup.sh" sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_unittest.sh" - sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh" + sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration.sh" sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_vta_fsim.sh" junit "build/pytest-results/*.xml" } diff --git a/LICENSE b/LICENSE index 18718f986baa8..52b2219396d2f 100644 --- a/LICENSE +++ b/LICENSE @@ -238,8 +238,3 @@ The Unlicense ------------- 3rdparty/rang - -BSD 3-Clause "New" or "Revised" License ---------------------------------------- - -3rdparty/cutlass \ No newline at end of file diff --git a/apps/hexagon_launcher/launcher_core.cc b/apps/hexagon_launcher/launcher_core.cc index 0fe9f9f59e4ad..6a5704d3888ac 100644 --- a/apps/hexagon_launcher/launcher_core.cc +++ b/apps/hexagon_launcher/launcher_core.cc @@ -148,13 +148,12 @@ const tvm::runtime::PackedFunc get_module_func(tvm::runtime::Module module, } void reset_device_api() { - const tvm::runtime::PackedFunc api = get_runtime_func("device_api.hexagon.v2"); + const tvm::runtime::PackedFunc api = get_runtime_func("device_api.cpu"); tvm::runtime::Registry::Register("device_api.hexagon", true).set_body(api); } tvm::runtime::Module load_module(const std::string& file_name) { - static const tvm::runtime::PackedFunc loader = - get_runtime_func("runtime.module.loadfile_hexagon"); + static const tvm::runtime::PackedFunc loader = get_runtime_func("runtime.module.loadfile_so"); tvm::runtime::TVMRetValue rv = loader(file_name); if (rv.type_code() == kTVMModuleHandle) { return rv.operator tvm::runtime::Module(); @@ -170,10 +169,7 @@ tvm::runtime::Module create_graph_executor(const std::string& graph_json, uint64_t device_type = device.device_type; uint64_t device_id = device.device_id; - std::string linked_params = "tvm.runtime.hexagon.lookup_linked_params"; - const tvm::runtime::PackedFunc lookup_linked_params = get_runtime_func(linked_params); // Use default param lookup function (linked into the module). 
- tvm::runtime::TVMRetValue rv = - create_executor(graph_json, graph_module, lookup_linked_params, device_type, device_id); + tvm::runtime::TVMRetValue rv = create_executor(graph_json, graph_module, device_type, device_id); return rv.operator tvm::runtime::Module(); } diff --git a/apps/hexagon_launcher/launcher_core.h b/apps/hexagon_launcher/launcher_core.h index 91384133ab7bf..f2aa8f10d0a68 100644 --- a/apps/hexagon_launcher/launcher_core.h +++ b/apps/hexagon_launcher/launcher_core.h @@ -89,8 +89,6 @@ struct Model { static tvm::Device device() { return tvm::Device{static_cast(kDLHexagon), 0}; } - static tvm::Device external() { return tvm::Device{static_cast(kDLCPU), 0}; } - tvm::runtime::PackedFunc run; }; diff --git a/apps/hexagon_launcher/launcher_hexagon.cc b/apps/hexagon_launcher/launcher_hexagon.cc index 6925e1da9bfa9..0a5d1f55e0c25 100644 --- a/apps/hexagon_launcher/launcher_hexagon.cc +++ b/apps/hexagon_launcher/launcher_hexagon.cc @@ -26,8 +26,6 @@ extern "C" { #include } -#include - #include #include #include @@ -108,7 +106,7 @@ AEEResult __QAIC_HEADER(launcher_rpc_set_input)(remote_handle64 handle, int inpu DLTensor tensor{ const_cast(input_value), - Model::external(), + Model::device(), meta->ndim, meta->dtype, const_cast(meta->shape), @@ -155,16 +153,6 @@ AEEResult __QAIC_HEADER(launcher_rpc_get_output)(remote_handle64 handle, int out tvm::runtime::PackedFunc get_output = get_module_func(TheModel->graph_executor, "get_output"); tvm::runtime::NDArray output = get_output(output_idx); - std::vector shape_vec{output->shape, output->shape + output->ndim}; - - auto* container = new tvm::runtime::NDArray::Container( - static_cast(output_value), shape_vec, output->dtype, Model::external()); - container->SetDeleter([](tvm::Object* container) { - delete static_cast(container); - }); - - tvm::runtime::NDArray host_output(GetObjectPtr(container)); - if (meta_size != 0) { auto* meta = reinterpret_cast(output_meta); if (meta_size < meta->meta_size(output->ndim)) { @@ -182,7 +170,8 @@ AEEResult __QAIC_HEADER(launcher_rpc_get_output)(remote_handle64 handle, int out return error_too_small(__func__, "value_size", value_size, data_size); } - host_output.CopyFrom(output); + auto data = reinterpret_cast(output->data); + std::copy(data, data + data_size, output_value); } return AEE_SUCCESS; diff --git a/apps/ios_rpc/tvmrpc/ViewController.mm b/apps/ios_rpc/tvmrpc/ViewController.mm index 9b476bbd47ce3..3f8c647fa4f29 100644 --- a/apps/ios_rpc/tvmrpc/ViewController.mm +++ b/apps/ios_rpc/tvmrpc/ViewController.mm @@ -94,7 +94,6 @@ - (void)open { server_.port = self.proxyPort.text.intValue; server_.key = self.proxyKey.text; server_.custom_addr = [NSString stringWithUTF8String:args.custom_addr]; - server_.verbose = args.verbose; server_.delegate = self; [server_ start]; diff --git a/apps/microtvm/arduino/template_project/microtvm_api_server.py b/apps/microtvm/arduino/template_project/microtvm_api_server.py index 1768c61197a9d..18c1b659dafde 100644 --- a/apps/microtvm/arduino/template_project/microtvm_api_server.py +++ b/apps/microtvm/arduino/template_project/microtvm_api_server.py @@ -31,7 +31,6 @@ import tempfile import time from string import Template -import re import serial import serial.tools.list_ports @@ -103,7 +102,7 @@ def server_info_query(self, tvm_version): return server.ServerInfo( platform_name="arduino", is_template=IS_TEMPLATE, - model_library_format_path="" if IS_TEMPLATE else MODEL_LIBRARY_FORMAT_PATH, + model_library_format_path=MODEL_LIBRARY_FORMAT_PATH, 
project_options=PROJECT_OPTIONS, ) @@ -288,11 +287,11 @@ def _find_modified_include_path(self, project_dir, file_path, include_path): return include_path def _get_platform_version(self, arduino_cli_path: str) -> float: - # sample output of this command: - # 'arduino-cli alpha Version: 0.18.3 Commit: d710b642 Date: 2021-05-14T12:36:58Z\n' version_output = subprocess.check_output([arduino_cli_path, "version"], encoding="utf-8") - full_version = re.findall("version: ([\.0-9]*)", version_output.lower()) - full_version = full_version[0].split(".") + version_output = ( + version_output.replace("\n", "").replace("\r", "").replace(":", "").lower().split(" ") + ) + full_version = version_output[version_output.index("version") + 1].split(".") version = float(f"{full_version[0]}.{full_version[1]}") return version @@ -330,7 +329,7 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec # Unpack the MLF and copy the relevant files metadata = self._disassemble_mlf(model_library_format_path, source_dir) - shutil.copy2(model_library_format_path, project_dir / MODEL_LIBRARY_FORMAT_RELPATH) + shutil.copy2(model_library_format_path, source_dir / "model") # For AOT, template model.h with metadata to minimize space usage if options["project_type"] == "example_project": diff --git a/apps/microtvm/zephyr/template_project/microtvm_api_server.py b/apps/microtvm/zephyr/template_project/microtvm_api_server.py index 7e13f928b2882..36a5ff12193ff 100644 --- a/apps/microtvm/zephyr/template_project/microtvm_api_server.py +++ b/apps/microtvm/zephyr/template_project/microtvm_api_server.py @@ -275,10 +275,6 @@ def _get_nrf_device_args(options): choices=(True, False), help="Treat warnings as errors and raise an Exception.", ), - server.ProjectOption( - "compile_definitions", - help="Extra definitions added project compile.", - ), ] @@ -423,11 +419,6 @@ def generate_project(self, model_library_format_path, standalone_crt_dir, projec cmake_f.write(line) - if options.get("compile_definitions"): - flags = options.get("compile_definitions") - for item in flags: - cmake_f.write(f"target_compile_definitions(app PUBLIC {item})\n") - self._create_prj_conf(project_dir, options) # Populate crt-config.h diff --git a/apps/microtvm/zephyr/template_project/src/aot_demo/main.c b/apps/microtvm/zephyr/template_project/src/aot_demo/main.c index 3946727b26a8c..a96e3b4d0a4ed 100644 --- a/apps/microtvm/zephyr/template_project/src/aot_demo/main.c +++ b/apps/microtvm/zephyr/template_project/src/aot_demo/main.c @@ -38,21 +38,14 @@ #include "posix_board_if.h" #endif -// WORKSPACE_SIZE defined in Project API Makefile +#define WORKSPACE_SIZE (270 * 1024) static uint8_t g_aot_memory[WORKSPACE_SIZE]; tvm_workspace_t app_workspace; -// Transport Commands. -// Commands on host end with `\n` -// Commands on microTVM device end with `%` -const unsigned char CMD_WAKEUP[] = "wakeup\n"; -const unsigned char CMD_READY[] = "ready\n"; -const unsigned char CMD_INIT[] = "init"; -const unsigned char CMD_INFER[] = "infer"; - -#define CMD_SIZE 80u -#define CMD_TERMINATOR '%' +// Wakeup sequence used to wake up QEMU on the host. 
+const unsigned char g_wakeup_sequence[] = "#wakeup\n"; +const char g_start_cmd[] = "start\n"; size_t TVMPlatformFormatMessage(char* out_buf, size_t out_buf_size_bytes, const char* fmt, va_list args) { @@ -170,10 +163,35 @@ int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { } static uint8_t main_rx_buf[128]; -static uint8_t g_cmd_buf[128]; +static uint8_t cmd_buf[128]; static size_t g_cmd_buf_ind; -void TVMInfer() { +void main(void) { + g_cmd_buf_ind = 0; + memset((char*)cmd_buf, 0, sizeof(cmd_buf)); + TVMPlatformUARTInit(); + k_timer_init(&g_microtvm_timer, NULL, NULL); + // Wake up host side. + TVMPlatformWriteSerial(g_wakeup_sequence, sizeof(g_wakeup_sequence)); + + // Wait for start command + while (true) { + int bytes_read = TVMPlatformUartRxRead(main_rx_buf, sizeof(main_rx_buf)); + if (bytes_read > 0) { + memcpy((char*)cmd_buf + g_cmd_buf_ind, main_rx_buf, bytes_read); + g_cmd_buf_ind += bytes_read; + } + if (g_cmd_buf_ind >= 6) { + if (!strcmp((char*)(cmd_buf), g_start_cmd)) { + break; + } else { + memset((char*)cmd_buf, 0, sizeof(cmd_buf)); + g_cmd_buf_ind = 0; + } + } + } + TVMLogf("Zephyr AOT Runtime\n"); + struct tvmgen_default_inputs inputs = { .input_1 = input_data, }; @@ -201,47 +219,7 @@ void TVMInfer() { max_val = output_data[i]; } } - TVMLogf("result:%d:%d\n", max_ind, (uint32_t)(elapsed_time * 1000)); -} - -// Execute functions based on received command -void command_ready(char* command) { - if (strncmp(command, CMD_INIT, CMD_SIZE) == 0) { - TVMPlatformWriteSerial(CMD_WAKEUP, sizeof(CMD_WAKEUP)); - } else if (strncmp(command, CMD_INFER, CMD_SIZE) == 0) { - TVMInfer(); - } else { - TVMPlatformWriteSerial(CMD_READY, sizeof(CMD_READY)); - } -} - -// Append received characters to buffer and check for termination character. -void serial_callback(char* message, int len_bytes) { - for (int i = 0; i < len_bytes; i++) { - if (message[i] == CMD_TERMINATOR) { - g_cmd_buf[g_cmd_buf_ind] = (char)0; - command_ready(g_cmd_buf); - g_cmd_buf_ind = 0; - } else { - g_cmd_buf[g_cmd_buf_ind] = message[i]; - g_cmd_buf_ind += 1; - } - } -} - -void main(void) { - g_cmd_buf_ind = 0; - memset((char*)g_cmd_buf, 0, sizeof(g_cmd_buf)); - TVMPlatformUARTInit(); - k_timer_init(&g_microtvm_timer, NULL, NULL); - - while (true) { - int bytes_read = TVMPlatformUartRxRead(main_rx_buf, sizeof(main_rx_buf)); - if (bytes_read > 0) { - serial_callback(main_rx_buf, bytes_read); - } - } - + TVMLogf("#result:%d:%d\n", max_ind, (uint32_t)(elapsed_time * 1000)); #ifdef CONFIG_ARCH_POSIX posix_exit(0); #endif diff --git a/cmake/config.cmake b/cmake/config.cmake index 960adaff4036b..1fce11f90aed2 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -347,7 +347,3 @@ set(USE_PAPI OFF) # Note that cmake will use `find_package` to find GTest. Please use cmake's # predefined variables to specify the path to the GTest package if needed. set(USE_GTEST AUTO) - -# Enable using CUTLASS as a BYOC backend -# Need to have USE_CUDA=ON -set(USE_CUTLASS OFF) diff --git a/cmake/modules/Arduino.cmake b/cmake/modules/Arduino.cmake deleted file mode 100644 index 54c144081efa7..0000000000000 --- a/cmake/modules/Arduino.cmake +++ /dev/null @@ -1,78 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more contributor -# license agreements. See the NOTICE file distributed with this work for -# additional information regarding copyright ownership. 
The ASF licenses this -# file to you under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. - -if(USE_MICRO) - message(STATUS "Add Arduino for microTVM") - - function(microtvm_add_arduino) - list( - APPEND - ARDUINO_FILE_COPY_JOBS - "apps/microtvm/arduino/template_project microtvm_api_server.py -> arduino" - "apps/microtvm/arduino/template_project boards.json -> arduino" - "apps/microtvm/arduino/template_project/src/example_project *.c -> arduino/src/example_project" - "apps/microtvm/arduino/template_project/src/example_project *.h -> arduino/src/example_project" - "apps/microtvm/arduino/template_project/src/example_project *.ino -> arduino/src/example_project" - "apps/microtvm/arduino/template_project/src/host_driven *.c -> arduino/src/host_driven" - "apps/microtvm/arduino/template_project/src/host_driven *.ino -> arduino/src/host_driven" - "apps/microtvm/arduino/template_project/crt_config *.h -> arduino/crt_config" - ) - - foreach(job_spec IN LISTS ARDUINO_FILE_COPY_JOBS) - string(REPLACE " " ";" job_spec "${job_spec}") - list(LENGTH job_spec job_spec_length) - math(EXPR job_spec_length_mod "${job_spec_length} % 3") - if(NOT "${job_spec_length_mod}" EQUAL 1) - message( - FATAL_ERROR - "Arduino copy job spec list length is ${job_spec_length}; parsed job spec is ${job_spec}" - ) - endif() - math(EXPR job_spec_stop "${job_spec_length} - 3") - - list(GET job_spec 0 job_src_base) - set(job_src_base "${CMAKE_SOURCE_DIR}/${job_src_base}") - foreach(copy_pattern_index RANGE 1 "${job_spec_stop}" 3) - list(GET job_spec ${copy_pattern_index} copy_pattern) - math(EXPR copy_dest_index "${copy_pattern_index} + 2") - list(GET job_spec ${copy_dest_index} copy_dest) - - file( - GLOB_RECURSE copy_files - RELATIVE "${job_src_base}" - "${job_src_base}/${copy_pattern}") - list(LENGTH copy_files copy_files_length) - if("${copy_files_length}" EQUAL 0) - message( - FATAL_ERROR - "Arduino copy job matched 0 files: ${job_src_base}/${copy_pattern} -> ${copy_dest}" - ) - endif() - foreach(copy_src IN LISTS copy_files) - get_filename_component( - dest_path "${MICROTVM_TEMPLATE_PROJECTS}/${copy_dest}/${copy_src}" - ABSOLUTE) - tvm_micro_add_copy_file(arduino_template_deps - ${job_src_base}/${copy_src} ${dest_path}) - endforeach() - endforeach() - endforeach() - - add_custom_target(arduino DEPENDS ${arduino_template_deps}) - endfunction() - - microtvm_add_arduino() - -endif(USE_MICRO) diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake index 1ae250f1bee3d..88623ab045fde 100644 --- a/cmake/modules/Hexagon.cmake +++ b/cmake/modules/Hexagon.cmake @@ -53,22 +53,18 @@ if(BUILD_FOR_HEXAGON) include_directories(SYSTEM ${HEXAGON_SDK_INCLUDES} ${HEXAGON_QURT_INCLUDES}) endif() -# Don't run these checks when compiling Hexagon device code, -# e.g. when compiling the TVM runtime for Hexagon. 
-if (NOT BUILD_FOR_HEXAGON) - if(USE_HEXAGON_LAUNCHER STREQUAL "ON") - set(USE_HEXAGON_DEVICE "${PICK_SIM}") - else() - if(USE_HEXAGON_DEVICE STREQUAL "OFF") - list(APPEND COMPILER_SRCS src/target/opt/build_hexagon_off.cc) - return() - elseif(NOT USE_HEXAGON_DEVICE STREQUAL "${PICK_SIM}" AND - NOT USE_HEXAGON_DEVICE STREQUAL "${PICK_HW}") - set(ERROR_MSG - "USE_HEXAGON_DEVICE must be one of [${PICK_NONE}|${PICK_SIM}|${PICK_HW}]") - message(SEND_ERROR "${ERROR_MSG}") - return() - endif() +if(USE_HEXAGON_LAUNCHER STREQUAL "ON") + set(USE_HEXAGON_DEVICE "${PICK_SIM}") +else() + if(USE_HEXAGON_DEVICE STREQUAL "OFF") + list(APPEND COMPILER_SRCS src/target/opt/build_hexagon_off.cc) + return() + elseif(NOT USE_HEXAGON_DEVICE STREQUAL "${PICK_SIM}" AND + NOT USE_HEXAGON_DEVICE STREQUAL "${PICK_HW}") + set(ERROR_MSG + "USE_HEXAGON_DEVICE must be one of [${PICK_NONE}|${PICK_SIM}|${PICK_HW}]") + message(SEND_ERROR "${ERROR_MSG}") + return() endif() endif() @@ -141,12 +137,12 @@ endif() if(USE_HEXAGON_DEVICE STREQUAL "${PICK_SIM}") find_hexagon_toolchain() message(STATUS "Hexagon toolchain: ${HEXAGON_TOOLCHAIN}") - file(GLOB RUNTIME_HEXAGON_SIM_SRCS src/runtime/hexagon/android/sim/*.cc) + file(GLOB RUNTIME_HEXAGON_SIM_SRCS src/runtime/hexagon/sim/*.cc) include_directories(SYSTEM "${HEXAGON_TOOLCHAIN}/include/iss") link_directories("${HEXAGON_TOOLCHAIN}/lib/iss") list(APPEND TVM_RUNTIME_LINKER_LIBS "-lwrapper") ExternalProject_Add(sim_dev - SOURCE_DIR "${CMAKE_SOURCE_DIR}/src/runtime/hexagon/android/sim/driver" + SOURCE_DIR "${CMAKE_SOURCE_DIR}/src/runtime/hexagon/sim/driver" CMAKE_ARGS "-DCMAKE_C_COMPILER=${HEXAGON_TOOLCHAIN}/bin/hexagon-clang" "-DCMAKE_CXX_COMPILER=${HEXAGON_TOOLCHAIN}/bin/hexagon-clang++" @@ -156,7 +152,7 @@ if(USE_HEXAGON_DEVICE STREQUAL "${PICK_SIM}") elseif(USE_HEXAGON_DEVICE STREQUAL "${PICK_HW}") find_hexagon_sdk_root("${USE_HEXAGON_SDK}" "${USE_HEXAGON_ARCH}") find_hexagon_toolchain() - file(GLOB RUNTIME_HEXAGON_DEVICE_SRCS src/runtime/hexagon/android/target/*.cc) + file(GLOB RUNTIME_HEXAGON_DEVICE_SRCS src/runtime/hexagon/target/*.cc) include_directories(SYSTEM ${HEXAGON_SDK_INCLUDES} @@ -170,10 +166,7 @@ elseif(USE_HEXAGON_DEVICE STREQUAL "${PICK_HW}") endif() endif() -if(BUILD_FOR_HEXAGON AND USE_HEXAGON_DEVICE STREQUAL "${PICK_NONE}") - file(GLOB RUNTIME_HEXAGON_SRCS src/runtime/hexagon/hexagon/*.cc) -else() - file(GLOB RUNTIME_HEXAGON_SRCS src/runtime/hexagon/android/*.cc) -endif() +file(GLOB RUNTIME_HEXAGON_SRCS src/runtime/hexagon/*.cc) list(APPEND RUNTIME_SRCS ${RUNTIME_HEXAGON_SRCS} ${RUNTIME_HEXAGON_SIM_SRCS} ${RUNTIME_HEXAGON_DEVICE_SRCS}) + diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake index 5d822844ae349..9f79c7da3cdf6 100644 --- a/cmake/modules/StandaloneCrt.cmake +++ b/cmake/modules/StandaloneCrt.cmake @@ -16,9 +16,20 @@ # under the License. if(USE_MICRO) - message(STATUS "Build standalone CRT for microTVM") + message(STATUS "Build standalone CRT for micro TVM") file(GLOB crt_srcs src/runtime/crt/**) + function(tvm_crt_add_copy_file var src dest) + get_filename_component(basename "${src}" NAME) + get_filename_component(dest_parent_dir "${dest}" DIRECTORY) + add_custom_command( + OUTPUT "${dest}" + COMMAND "${CMAKE_COMMAND}" -E copy "${src}" "${dest}" + DEPENDS "${src}") + list(APPEND "${var}" "${dest}") + set("${var}" "${${var}}" PARENT_SCOPE) + endfunction(tvm_crt_add_copy_file) + function(tvm_crt_define_targets) # Build an isolated build directory, separate from the TVM tree. 
list(APPEND CRT_FILE_COPY_JOBS @@ -72,7 +83,7 @@ if(USE_MICRO) endif() foreach(copy_src IN LISTS copy_files) get_filename_component(dest_path "${standalone_crt_base}/${copy_dest}/${copy_src}" ABSOLUTE) - tvm_micro_add_copy_file(host_isolated_build_deps ${job_src_base}/${copy_src} ${dest_path}) + tvm_crt_add_copy_file(host_isolated_build_deps ${job_src_base}/${copy_src} ${dest_path}) endforeach() endforeach() endforeach() diff --git a/cmake/modules/Zephyr.cmake b/cmake/modules/Zephyr.cmake deleted file mode 100644 index 048240375cd6e..0000000000000 --- a/cmake/modules/Zephyr.cmake +++ /dev/null @@ -1,78 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more contributor -# license agreements. See the NOTICE file distributed with this work for -# additional information regarding copyright ownership. The ASF licenses this -# file to you under the Apache License, Version 2.0 (the "License"); you may not -# use this file except in compliance with the License. You may obtain a copy of -# the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT -# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the -# License for the specific language governing permissions and limitations under -# the License. - -if(USE_MICRO) - message(STATUS "Add Zephyr for microTVM") - - function(microtvm_add_zephyr) - list( - APPEND - ZEPHYR_FILE_COPY_JOBS - "apps/microtvm/zephyr/template_project microtvm_api_server.py -> zephyr" - "apps/microtvm/zephyr/template_project boards.json -> zephyr" - "apps/microtvm/zephyr/template_project CMakeLists.txt.template -> zephyr" - "apps/microtvm/zephyr/template_project/src/aot_demo *.c -> zephyr/src/aot_demo" - "apps/microtvm/zephyr/template_project/src/aot_demo *.h -> zephyr/src/aot_demo" - "apps/microtvm/zephyr/template_project/src/host_driven *.c -> zephyr/src/host_driven" - "apps/microtvm/zephyr/template_project/qemu-hack * -> zephyr/qemu-hack" - "apps/microtvm/zephyr/template_project/crt_config *.h -> zephyr/crt_config" - ) - - foreach(job_spec IN LISTS ZEPHYR_FILE_COPY_JOBS) - string(REPLACE " " ";" job_spec "${job_spec}") - list(LENGTH job_spec job_spec_length) - math(EXPR job_spec_length_mod "${job_spec_length} % 3") - if(NOT "${job_spec_length_mod}" EQUAL 1) - message( - FATAL_ERROR - "Zephyr copy job spec list length is ${job_spec_length}; parsed job spec is ${job_spec}" - ) - endif() - math(EXPR job_spec_stop "${job_spec_length} - 3") - - list(GET job_spec 0 job_src_base) - set(job_src_base "${CMAKE_SOURCE_DIR}/${job_src_base}") - foreach(copy_pattern_index RANGE 1 "${job_spec_stop}" 3) - list(GET job_spec ${copy_pattern_index} copy_pattern) - math(EXPR copy_dest_index "${copy_pattern_index} + 2") - list(GET job_spec ${copy_dest_index} copy_dest) - - file( - GLOB_RECURSE copy_files - RELATIVE "${job_src_base}" - "${job_src_base}/${copy_pattern}") - list(LENGTH copy_files copy_files_length) - if("${copy_files_length}" EQUAL 0) - message( - FATAL_ERROR - "Zephyr copy job matched 0 files: ${job_src_base}/${copy_pattern} -> ${copy_dest}" - ) - endif() - foreach(copy_src IN LISTS copy_files) - get_filename_component( - dest_path "${MICROTVM_TEMPLATE_PROJECTS}/${copy_dest}/${copy_src}" - ABSOLUTE) - tvm_micro_add_copy_file(zephyr_template_deps - ${job_src_base}/${copy_src} ${dest_path}) - endforeach() - endforeach() - endforeach() - - add_custom_target(zephyr DEPENDS 
${zephyr_template_deps}) - endfunction() - - microtvm_add_zephyr() - -endif(USE_MICRO) diff --git a/cmake/modules/contrib/CUTLASS.cmake b/cmake/modules/contrib/CUTLASS.cmake deleted file mode 100644 index 10309f0d90b33..0000000000000 --- a/cmake/modules/contrib/CUTLASS.cmake +++ /dev/null @@ -1,23 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -if(USE_CUDA AND USE_CUTLASS) - file(GLOB CUTLASS_RELAY_CONTRIB_SRC src/relay/backend/contrib/cutlass/*.cc) - list(APPEND COMPILER_SRCS ${CUTLASS_RELAY_CONTRIB_SRC}) - - message(STATUS "Build with CUTLASS") -endif() diff --git a/cmake/utils/Utils.cmake b/cmake/utils/Utils.cmake index 44f622126abbc..4e6762b148944 100644 --- a/cmake/utils/Utils.cmake +++ b/cmake/utils/Utils.cmake @@ -75,19 +75,6 @@ function(assign_source_group group) endforeach() endfunction(assign_source_group) -function(tvm_micro_add_copy_file var src dest) - get_filename_component(basename "${src}" NAME) - get_filename_component(dest_parent_dir "${dest}" DIRECTORY) - add_custom_command( - OUTPUT "${dest}" - COMMAND "${CMAKE_COMMAND}" -E copy "${src}" "${dest}" - DEPENDS "${src}") - list(APPEND "${var}" "${dest}") - set("${var}" "${${var}}" PARENT_SCOPE) -endfunction(tvm_micro_add_copy_file) - -set(MICROTVM_TEMPLATE_PROJECTS "${CMAKE_CURRENT_BINARY_DIR}/microtvm_template_projects") - # From cmake documentation: # True if the constant is 1, ON, YES, TRUE, Y, or a non-zero number. # False if the constant is 0, OFF, NO, FALSE, N, IGNORE, NOTFOUND, the empty string, or ends in the suffix -NOTFOUND. 
diff --git a/docker/install/ubuntu_download_arm_compute_lib_binaries.sh b/docker/install/ubuntu_download_arm_compute_lib_binaries.sh index 5097fad3d0b67..c68654c753921 100755 --- a/docker/install/ubuntu_download_arm_compute_lib_binaries.sh +++ b/docker/install/ubuntu_download_arm_compute_lib_binaries.sh @@ -27,19 +27,17 @@ if [ "$architecture_type" != "aarch64" ]; then gcc-aarch64-linux-gnu fi -compute_lib_version="v21.08" -compute_lib_variant="arm64-v8a-neon" -compute_lib_full_name="arm_compute-${compute_lib_version}-bin-linux-${compute_lib_variant}" +compute_lib_version="v21.05" compute_lib_base_url="https://github.com/ARM-software/ComputeLibrary/releases/download/${compute_lib_version}" -compute_lib_file_name="${compute_lib_full_name}.tar.gz" +compute_lib_file_name="arm_compute-${compute_lib_version}-bin-linux.tar.gz" compute_lib_download_url="${compute_lib_base_url}/${compute_lib_file_name}" -target_lib="${compute_lib_variant}" +target_lib="linux-arm64-v8a-neon" # uncomment line below if you need asserts/debug version of the library # target_lib="${target_lib}-asserts" -extract_dir="${compute_lib_full_name}" +extract_dir="arm_compute-${compute_lib_version}-bin-linux" install_path="/opt/acl" tmpdir=$(mktemp -d) diff --git a/docker/install/ubuntu_install_cmake_source.sh b/docker/install/ubuntu_install_cmake_source.sh index 18335c98c4035..f818fba9721b2 100644 --- a/docker/install/ubuntu_install_cmake_source.sh +++ b/docker/install/ubuntu_install_cmake_source.sh @@ -20,8 +20,8 @@ set -e set -u set -o pipefail -v=3.14 -version=3.14.7 +v=3.13 +version=3.13.5 wget https://cmake.org/files/v${v}/cmake-${version}.tar.gz tar xvf cmake-${version}.tar.gz cd cmake-${version} diff --git a/docs/conf.py b/docs/conf.py index 893d89c26156f..766fda49997fc 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -53,7 +53,6 @@ sys.path.insert(0, str(tvm_path.resolve() / "python")) sys.path.insert(0, str(tvm_path.resolve() / "vta" / "python")) -sys.path.insert(0, str(tvm_path.resolve() / "docs")) # -- General configuration ------------------------------------------------ @@ -259,7 +258,6 @@ def git_describe_version(original_version): "tensor_expr_get_started.py", "autotvm_matmul_x86.py", "auto_scheduler_matmul_x86.py", - "tensor_ir_blitz_course.py", "topi.pi", "cross_compilation_and_rpc.py", "relay_quick_start.py", @@ -468,9 +466,5 @@ def process_docstring(app, what, name, obj, options, lines): update_alias_docstring(name, obj, lines) -from legacy_redirect import build_legacy_redirect - - def setup(app): app.connect("autodoc-process-docstring", process_docstring) - app.connect("build-finished", build_legacy_redirect(tvm_path)) diff --git a/docs/contribute/git_howto.rst b/docs/contribute/git_howto.rst index 765153be220be..458573630aa5e 100644 --- a/docs/contribute/git_howto.rst +++ b/docs/contribute/git_howto.rst @@ -23,8 +23,7 @@ Git Usage Tips Here are some tips for git workflow. -How to resolve a conflict with `main` -------------------------------------- +## How to resolve conflict with main - First rebase to most recent main diff --git a/docs/how_to/deploy/arm_compute_lib.rst b/docs/how_to/deploy/arm_compute_lib.rst index a7ec8b9501c78..831438273ccab 100644 --- a/docs/how_to/deploy/arm_compute_lib.rst +++ b/docs/how_to/deploy/arm_compute_lib.rst @@ -34,31 +34,32 @@ Before installing Arm Compute Library, it is important to know what architecture to determine this is to use `lscpu` and look for the "Model name" of the CPU. You can then use this to determine the architecture by looking online. 
-TVM only supports a single version of ACL, currently this is v21.08, there are two recommended ways to build and install -the required libraries: - -* Use the script located at `docker/install/ubuntu_download_arm_compute_lib_binaries.sh`. You can use this - script for downloading ACL binaries for the architecture and extensions specified in `target_lib`, these - will be installed to the location denoted by `install_path`. -* Alternatively, you can download the pre-built binaries from: +We recommend two different ways to build and install ACL: + +* Use the script located at `docker/install/ubuntu_install_arm_compute_lib.sh`. You can use this + script for building ACL from source natively or for cross-compiling the library on an x86 machine. + You may need to change the architecture of the device you wish to compile for by altering the + `target_arch` variable. Binaries will be built from source and installed to the location denoted by + `install_path`. +* Alternatively, you can download and use pre-built binaries from: https://github.com/ARM-software/ComputeLibrary/releases. When using this package, you will need to - select the binaries for the architecture and extensions you require, then make sure they are visible - to CMake: + select the binaries for the architecture you require and make sure they are visible to cmake. This + can be done like so: .. code:: bash cd /lib - mv .//* . + mv ./linux--neon/* . In both cases you will need to set USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR to the path where the ACL package -is located. CMake will look in /path-to-acl/ along with /path-to-acl/lib and /path-to-acl/build for the +is located. Cmake will look in /path-to-acl/ along with /path-to-acl/lib and /path-to-acl/build for the required binaries. See the section below for more information on how to use these configuration options. Building with ACL support ------------------------- -The current implementation has two separate build options in CMake. The reason for this split is +The current implementation has two separate build options in cmake. The reason for this split is because ACL cannot be used on an x86 machine. However, we still want to be able compile an ACL runtime module on an x86 machine. @@ -72,7 +73,7 @@ need to use USE_ARM_COMPUTE_LIB=ON on the x86 machine and USE_ARM_COMPUTE_LIB_GR AArch64 device. By default both options are set to OFF. Using USE_ARM_COMPUTE_LIB_GRAPH_EXECUTOR=ON will mean that ACL -binaries are searched for by CMake in the default locations +binaries are searched for by cmake in the default locations (see https://cmake.org/cmake/help/v3.4/command/find_library.html). In addition to this, /path-to-tvm-project/acl/ will also be searched. It is likely that you will need to set your own path to locate ACL. This can be done by specifying a path in the place of ON. diff --git a/docs/install/from_source.rst b/docs/install/from_source.rst index 4fad42b0af763..23be3198bf7c9 100644 --- a/docs/install/from_source.rst +++ b/docs/install/from_source.rst @@ -123,9 +123,6 @@ The configuration of TVM can be modified by editing `config.cmake` and/or by pas - Note that apt-package append ``llvm-config`` with version number. For example, set ``set(USE_LLVM llvm-config-10)`` if you installed LLVM 10 package - - If you are a PyTorch user, it is recommended to set ``(USE_LLVM "/path/to/llvm-config --link-static")`` and ``set(HIDE_PRIVATE_SYMBOLS ON)`` - to avoid potential symbol conflicts between different versions LLVM used by TVM and PyTorch. 
- - We can then build tvm and related libraries. .. code:: bash diff --git a/docs/legacy_redirect.py b/docs/legacy_redirect.py deleted file mode 100644 index 0f1340e5491fe..0000000000000 --- a/docs/legacy_redirect.py +++ /dev/null @@ -1,272 +0,0 @@ -# -*- coding: utf-8 -*- - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -from string import Template -import json -import os - -legacy_redirects = [ - ["dev/benchmark.html", "../arch/benchmark.html"], - ["dev/convert_layout.html", "../arch/convert_layout.html"], - ["dev/debugger.html", "../arch/debugger.html"], - ["dev/device_target_interactions.html", "../arch/device_target_interactions.html"], - ["dev/frontend/tensorflow.html", "../../arch/frontend/tensorflow.html"], - ["dev/hybrid_script.html", "../arch/hybrid_script.html"], - ["dev/index.html", "../arch/index.html"], - ["dev/inferbound.html", "../arch/inferbound.html"], - [ - "dev/introduction_to_module_serialization.html", - "../arch/introduction_to_module_serialization.html", - ], - ["dev/microtvm_design.html", "../arch/microtvm_design.html"], - ["dev/model_library_format.html", "../arch/model_library_format.html"], - ["dev/pass_infra.html", "../arch/pass_infra.html"], - ["dev/relay_intro.html", "../arch/relay_intro.html"], - ["dev/relay_op_strategy.html", "../arch/relay_op_strategy.html"], - ["dev/runtime.html", "../arch/runtime.html"], - ["dev/runtimes/vulkan.html", "../../arch/runtimes/vulkan.html"], - ["dev/security.html", "../arch/security.html"], - ["dev/virtual_machine.html", "../arch/virtual_machine.html"], - ["dev/how_to.html", "index.html"], - ["dev/pytest_target_parametrization.html", "how_to/pytest_target_parametrization.html"], - ["dev/relay_add_op.html", "how_to/relay_add_op.html"], - ["dev/relay_add_pass.html", "how_to/relay_add_pass.html"], - ["dev/relay_bring_your_own_codegen.html", "how_to/relay_bring_your_own_codegen.html"], - ["dev/codebase_walkthrough.html", "tutorial/codebase_walkthrough.html"], - ["deploy/android.html", "../how_to/deploy/android.html"], - ["deploy/arm_compute_lib.html", "../how_to/deploy/arm_compute_lib.html"], - ["deploy/bnns.html", "../how_to/deploy/bnns.html"], - ["deploy/cpp_deploy.html", "../how_to/deploy/cpp_deploy.html"], - ["deploy/hls.html", "../how_to/deploy/hls.html"], - ["deploy/index.html", "../how_to/deploy/index.html"], - ["deploy/integrate.html", "../how_to/deploy/integrate.html"], - ["deploy/tensorrt.html", "../how_to/deploy/tensorrt.html"], - ["deploy/vitis_ai.html", "../how_to/deploy/vitis_ai.html"], - ["profiling/index.html", "../how_to/profile/index.html"], - ["profiling/papi.html", "../how_to/profile/papi.html"], - ["api/links.html", "../reference/api/links.html"], - ["api/python/auto_scheduler.html", "../../reference/api/python/auto_scheduler.html"], - ["api/python/autotvm.html", 
"../../reference/api/python/autotvm.html"], - ["api/python/contrib.html", "../../reference/api/python/contrib.html"], - ["api/python/driver.html", "../../reference/api/python/driver.html"], - ["api/python/error.html", "../../reference/api/python/error.html"], - ["api/python/graph_executor.html", "../../reference/api/python/graph_executor.html"], - ["api/python/index.html", "../../reference/api/python/index.html"], - ["api/python/ir.html", "../../reference/api/python/ir.html"], - ["api/python/micro.html", "../../reference/api/python/micro.html"], - ["api/python/ndarray.html", "../../reference/api/python/ndarray.html"], - ["api/python/relay/analysis.html", "../../../reference/api/python/relay/analysis.html"], - ["api/python/relay/backend.html", "../../../reference/api/python/relay/backend.html"], - [ - "api/python/relay/dataflow_pattern.html", - "../../../reference/api/python/relay/dataflow_pattern.html", - ], - ["api/python/relay/frontend.html", "../../../reference/api/python/relay/frontend.html"], - ["api/python/relay/image.html", "../../../reference/api/python/relay/image.html"], - ["api/python/relay/index.html", "../../../reference/api/python/relay/index.html"], - ["api/python/relay/nn.html", "../../../reference/api/python/relay/nn.html"], - ["api/python/relay/testing.html", "../../../reference/api/python/relay/testing.html"], - ["api/python/relay/transform.html", "../../../reference/api/python/relay/transform.html"], - ["api/python/relay/vision.html", "../../../reference/api/python/relay/vision.html"], - ["api/python/rpc.html", "../../reference/api/python/rpc.html"], - ["api/python/runtime.html", "../../reference/api/python/runtime.html"], - ["api/python/target.html", "../../reference/api/python/target.html"], - ["api/python/te.html", "../../reference/api/python/te.html"], - ["api/python/tir.html", "../../reference/api/python/tir.html"], - ["api/python/topi.html", "../../reference/api/python/topi.html"], - ["api/python/vta/index.html", "../../../reference/api/python/vta/index.html"], - ["langref/hybrid_script.html", "../reference/langref/hybrid_script.html"], - ["langref/index.html", "../reference/langref/index.html"], - ["langref/relay_adt.html", "../reference/langref/relay_adt.html"], - ["langref/relay_expr.html", "../reference/langref/relay_expr.html"], - ["langref/relay_op.html", "../reference/langref/relay_op.html"], - ["langref/relay_pattern.html", "../reference/langref/relay_pattern.html"], - ["langref/relay_type.html", "../reference/langref/relay_type.html"], - ["microtvm/index.html", "../topic/microtvm/index.html"], - ["vta/dev/config.html", "../../topic/vta/dev/config.html"], - ["vta/dev/hardware.html", "../../topic/vta/dev/hardware.html"], - ["vta/dev/index.html", "../../topic/vta/dev/index.html"], - ["vta/index.html", "../topic/vta/index.html"], - ["vta/install.html", "../topic/vta/install.html"], - ["tutorials/frontend/from_caffe2.html", "../../how_to/compile_models/from_caffe2.html"], - ["tutorials/frontend/from_coreml.html", "../../how_to/compile_models/from_coreml.html"], - ["tutorials/frontend/from_darknet.html", "../../how_to/compile_models/from_darknet.html"], - ["tutorials/frontend/from_keras.html", "../../how_to/compile_models/from_keras.html"], - ["tutorials/frontend/from_mxnet.html", "../../how_to/compile_models/from_mxnet.html"], - ["tutorials/frontend/from_onnx.html", "../../how_to/compile_models/from_onnx.html"], - ["tutorials/frontend/from_paddle.html", "../../how_to/compile_models/from_paddle.html"], - ["tutorials/frontend/from_pytorch.html", 
"../../how_to/compile_models/from_pytorch.html"], - ["tutorials/frontend/from_tensorflow.html", "../../how_to/compile_models/from_tensorflow.html"], - ["tutorials/frontend/from_tflite.html", "../../how_to/compile_models/from_tflite.html"], - [ - "tutorials/frontend/deploy_model_on_android.html", - "../../how_to/deploy_models/deploy_model_on_android.html", - ], - [ - "tutorials/frontend/deploy_model_on_rasp.html", - "../../how_to/deploy_models/deploy_model_on_rasp.html", - ], - [ - "tutorials/frontend/deploy_object_detection_pytorch.html", - "../../how_to/deploy_models/deploy_object_detection_pytorch.html", - ], - [ - "tutorials/frontend/deploy_prequantized.html", - "../../how_to/deploy_models/deploy_prequantized.html", - ], - [ - "tutorials/frontend/deploy_prequantized_tflite.html", - "../../how_to/deploy_models/deploy_prequantized_tflite.html", - ], - [ - "tutorials/frontend/deploy_quantized.html", - "../../how_to/deploy_models/deploy_quantized.html", - ], - ["tutorials/frontend/deploy_sparse.html", "../../how_to/deploy_models/deploy_sparse.html"], - [ - "tutorials/frontend/deploy_ssd_gluoncv.html", - "../../how_to/deploy_models/deploy_ssd_gluoncv.html", - ], - [ - "tutorials/dev/bring_your_own_datatypes.html", - "../../how_to/extend_tvm/bring_your_own_datatypes.html", - ], - [ - "tutorials/dev/low_level_custom_pass.html", - "../../how_to/extend_tvm/low_level_custom_pass.html", - ], - ["tutorials/dev/use_pass_infra.html", "../../how_to/extend_tvm/use_pass_infra.html"], - ["tutorials/dev/use_pass_instrument.html", "../../how_to/extend_tvm/use_pass_instrument.html"], - ["tutorials/optimize/opt_conv_cuda.html", "../../how_to/optimize_operators/opt_conv_cuda.html"], - [ - "tutorials/optimize/opt_conv_tensorcore.html", - "../../how_to/optimize_operators/opt_conv_tensorcore.html", - ], - ["tutorials/optimize/opt_gemm.html", "../../how_to/optimize_operators/opt_gemm.html"], - [ - "tutorials/auto_scheduler/tune_conv2d_layer_cuda.html", - "../../how_to/tune_with_autoscheduler/tune_conv2d_layer_cuda.html", - ], - [ - "tutorials/auto_scheduler/tune_network_arm.html", - "../../how_to/tune_with_autoscheduler/tune_network_arm.html", - ], - [ - "tutorials/auto_scheduler/tune_network_cuda.html", - "../../how_to/tune_with_autoscheduler/tune_network_cuda.html", - ], - [ - "tutorials/auto_scheduler/tune_network_mali.html", - "../../how_to/tune_with_autoscheduler/tune_network_mali.html", - ], - [ - "tutorials/auto_scheduler/tune_network_x86.html", - "../../how_to/tune_with_autoscheduler/tune_network_x86.html", - ], - [ - "tutorials/auto_scheduler/tune_sparse_x86.html", - "../../how_to/tune_with_autoscheduler/tune_sparse_x86.html", - ], - [ - "tutorials/autotvm/tune_conv2d_cuda.html", - "../../how_to/tune_with_autotvm/tune_conv2d_cuda.html", - ], - ["tutorials/autotvm/tune_relay_arm.html", "../../how_to/tune_with_autotvm/tune_relay_arm.html"], - [ - "tutorials/autotvm/tune_relay_cuda.html", - "../../how_to/tune_with_autotvm/tune_relay_cuda.html", - ], - [ - "tutorials/autotvm/tune_relay_mobile_gpu.html", - "../../how_to/tune_with_autotvm/tune_relay_mobile_gpu.html", - ], - ["tutorials/autotvm/tune_relay_x86.html", "../../how_to/tune_with_autotvm/tune_relay_x86.html"], - ["tutorials/micro/micro_autotune.html", "../../how_to/work_with_microtvm/micro_autotune.html"], - [ - "tutorials/micro/micro_reference_vm.html", - "../../how_to/work_with_microtvm/micro_reference_vm.html", - ], - ["tutorials/micro/micro_tflite.html", "../../how_to/work_with_microtvm/micro_tflite.html"], - ["tutorials/frontend/build_gcn.html", 
"../../how_to/work_with_relay/build_gcn.html"], - [ - "tutorials/frontend/using_external_lib.html", - "../../how_to/work_with_relay/using_external_lib.html", - ], - ["tutorials/language/extern_op.html", "../../how_to/work_with_schedules/extern_op.html"], - ["tutorials/language/intrin_math.html", "../../how_to/work_with_schedules/intrin_math.html"], - ["tutorials/language/reduction.html", "../../how_to/work_with_schedules/reduction.html"], - ["tutorials/language/scan.html", "../../how_to/work_with_schedules/scan.html"], - [ - "tutorials/language/schedule_primitives.html", - "../../how_to/work_with_schedules/schedule_primitives.html", - ], - ["tutorials/language/tedd.html", "../../how_to/work_with_schedules/tedd.html"], - ["tutorials/language/tensorize.html", "../../how_to/work_with_schedules/tensorize.html"], - ["tutorials/language/tuple_inputs.html", "../../how_to/work_with_schedules/tuple_inputs.html"], - [ - "tutorials/get_started/auto_scheduler_matmul_x86.html", - "../../tutorial/auto_scheduler_matmul_x86.html", - ], - ["tutorials/get_started/autotvm_matmul_x86.html", "../../tutorial/autotvm_matmul_x86.html"], - ["tutorials/get_started/autotvm_relay_x86.html", "../../tutorial/autotvm_relay_x86.html"], - [ - "tutorials/get_started/cross_compilation_and_rpc.html", - "../../tutorial/cross_compilation_and_rpc.html", - ], - ["tutorials/get_started/install.html", "../../tutorial/install.html"], - ["tutorials/topi/intro_topi.html", "../../tutorial/intro_topi.html"], - ["tutorials/get_started/introduction.html", "../../tutorial/introduction.html"], - ["tutorials/get_started/relay_quick_start.html", "../../tutorial/relay_quick_start.html"], - [ - "tutorials/get_started/tensor_expr_get_started.html", - "../../tutorial/tensor_expr_get_started.html", - ], - [ - "tutorials/get_started/tvmc_command_line_driver.html", - "../../tutorial/tvmc_command_line_driver.html", - ], -] - -redirect_template = """ - - - - - - - -""" - - -def build_legacy_redirect(tvm_path): - def legacy_redirect(app, docname): # Sphinx expects two arguments - if app.builder.name == "html": - - src = Template(redirect_template) - - for frm, to in legacy_redirects: - frm = tvm_path.resolve() / "docs" / "_build" / "html" / frm - redirect = src.substitute({"to": to}) - os.makedirs(os.path.dirname(frm), exist_ok=True) - with open(frm, "w") as f: - f.write(redirect) - - return legacy_redirect diff --git a/gallery/how_to/work_with_microtvm/micro_autotune.py b/gallery/how_to/work_with_microtvm/micro_autotune.py index d3106712aa991..e7a1fa84a1104 100644 --- a/gallery/how_to/work_with_microtvm/micro_autotune.py +++ b/gallery/how_to/work_with_microtvm/micro_autotune.py @@ -113,9 +113,12 @@ # choose other options by choosing from `PLATFORM` list. 
# +repo_root = pathlib.Path( + subprocess.check_output(["git", "rev-parse", "--show-toplevel"], encoding="utf-8").strip() +) module_loader = tvm.micro.AutoTvmModuleLoader( - template_project_dir=pathlib.Path(tvm.micro.get_microtvm_template_projects("crt")), + template_project_dir=repo_root / "src" / "runtime" / "crt" / "host", project_options={"verbose": False}, ) builder = tvm.autotvm.LocalBuilder( @@ -131,7 +134,7 @@ # Compiling for physical hardware # -------------------------------------------------------------------------- # module_loader = tvm.micro.AutoTvmModuleLoader( -# template_project_dir=pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")), +# template_project_dir=repo_root / "apps" / "microtvm" / "zephyr" / "template_project", # project_options={ # "zephyr_board": BOARD, # "west_cmd": "west", @@ -180,7 +183,7 @@ temp_dir = tvm.contrib.utils.tempdir() project = tvm.micro.generate_project( - str(tvm.micro.get_microtvm_template_projects("crt")), + str(repo_root / "src" / "runtime" / "crt" / "host"), lowered, temp_dir / "project", {"verbose": False}, @@ -189,7 +192,7 @@ # Compiling for physical hardware # -------------------------------------------------------------------------- # project = tvm.micro.generate_project( -# str(tvm.micro.get_microtvm_template_projects("zephyr")), +# str(repo_root / "apps" / "microtvm" / "zephyr" / "template_project"), # lowered, # temp_dir / "project", # { @@ -223,7 +226,7 @@ temp_dir = tvm.contrib.utils.tempdir() project = tvm.micro.generate_project( - str(tvm.micro.get_microtvm_template_projects("crt")), + str(repo_root / "src" / "runtime" / "crt" / "host"), lowered_tuned, temp_dir / "project", {"verbose": False}, @@ -232,7 +235,7 @@ # Compiling for physical hardware # -------------------------------------------------------------------------- # project = tvm.micro.generate_project( -# str(tvm.micro.get_microtvm_template_projects("zephyr")), +# str(repo_root / "apps" / "microtvm" / "zephyr" / "template_project"), # lowered_tuned, # temp_dir / "project", # { diff --git a/gallery/how_to/work_with_microtvm/micro_tflite.py b/gallery/how_to/work_with_microtvm/micro_tflite.py index 35b08d87b9ee0..cab105cb450fe 100644 --- a/gallery/how_to/work_with_microtvm/micro_tflite.py +++ b/gallery/how_to/work_with_microtvm/micro_tflite.py @@ -269,7 +269,10 @@ import subprocess import pathlib -template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("crt")) +repo_root = pathlib.Path( + subprocess.check_output(["git", "rev-parse", "--show-toplevel"], encoding="utf-8").strip() +) +template_project_path = repo_root / "src" / "runtime" / "crt" / "host" project_options = {} # You can use options to provide platform-specific options through TVM. 
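For reference, the path-resolution pattern these tutorials switch to can be exercised on its own; a minimal sketch, assuming it runs from inside a TVM git checkout:

import pathlib
import subprocess

# Locate the root of the TVM checkout via git, then derive the template
# project directories used above: CRT for host emulation, Zephyr for boards.
repo_root = pathlib.Path(
    subprocess.check_output(
        ["git", "rev-parse", "--show-toplevel"], encoding="utf-8"
    ).strip()
)
crt_template = repo_root / "src" / "runtime" / "crt" / "host"
zephyr_template = repo_root / "apps" / "microtvm" / "zephyr" / "template_project"
print(crt_template, zephyr_template)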
# Compiling for physical hardware (or an emulated board, like the mps_an521) @@ -277,7 +280,7 @@ # For physical hardware, you can try out the Zephyr platform by using a different template project # and options: # -# template_project_path = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) +# template_project_path = repo_root / "apps" / "microtvm" / "zephyr" / "template_project" # project_options = {"project_type": "host_driven", zephyr_board": "nucleo_f746zg"}} # Create a temporary directory diff --git a/gallery/tutorial/tensor_ir_blitz_course.py b/gallery/tutorial/tensor_ir_blitz_course.py deleted file mode 100644 index e9a0801f34a81..0000000000000 --- a/gallery/tutorial/tensor_ir_blitz_course.py +++ /dev/null @@ -1,191 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -.. _tir_blitz: - -Blitz Course to TensorIR -======================== -**Author**: `Siyuan Feng `_ - -TensorIR is a domain specific language for deep learning programs serving two broad purposes: - -- An implementation for transforming and optimizing programs on various hardware backends. - -- An abstraction for automatic tensorized program optimization. - -""" - -import tvm -from tvm.ir.module import IRModule -from tvm.script import tir as T -import numpy as np - -################################################################################################ -# IRModule -# -------- -# An IRModule is the central data structure in TVM, which contains deep learning programs. -# It is the basic object of interest of IR transformation and model building. -# -# .. image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_life_of_irmodule.png -# :align: center -# :width: 85% -# -# This is the life cycle of an IRModule, which can be created from TVMScript. TensorIR schedule -# primitives and passes are two major ways to transform an IRModule. Also, a sequence of -# transformations on an IRModule is acceptable. Note that we can print an IRModule at **ANY** stage -# to TVMScript. After all transformations and optimizations are complete, we can build the IRModule -# to a runnable module to deploy on target devices. -# -# Based on the design of TensorIR and IRModule, we are able to create a new programming method: -# -# 1. Write a program by TVMScript in a python-AST based syntax. -# -# 2. Transform and optimize a program with python api. -# -# 3. Interactively inspect and try the performance with an imperative style transformation API. - - -################################################################################################ -# Create an IRModule -# ------------------ -# IRModule can be created by writing TVMScript, which is a round-trippable syntax for TVM IR. 
-# -# Different than creating a computational expression by Tensor Expression -# (:ref:`tutorial-tensor-expr-get-started`), TensorIR allow users to program through TVMScript, -# a language embedded in python AST. The new method makes it possible to write complex programs -# and further schedule and optimize it. -# -# Following is a simple example for vector addition. -# - - -@tvm.script.ir_module -class MyModule: - @T.prim_func - def main(a: T.handle, b: T.handle): - # We exchange data between function by handles, which are similar to pointer. - T.func_attr({"global_symbol": "main", "tir.noalias": True}) - # Create buffer from handles. - A = T.match_buffer(a, (8,), dtype="float32") - B = T.match_buffer(b, (8,), dtype="float32") - for i in range(8): - # A block is an abstraction for computation. - with T.block("B"): - # Define a spatial block iterator and bind it to value i. - vi = T.axis.spatial(8, i) - B[vi] = A[vi] + 1.0 - - -ir_module = MyModule -print(type(ir_module)) -print(ir_module.script()) - -################################################################################################ -# Besides, we can also use tensor expression DSL to write simple operators, and convert them -# to an IRModule. -# - -from tvm import te - -A = te.placeholder((8,), dtype="float32", name="A") -B = te.compute((8,), lambda *i: A(*i) + 1.0, name="B") -func = te.create_prim_func([A, B]) -ir_module_from_te = IRModule({"main": func}) -print(ir_module_from_te.script()) - - -################################################################################################ -# Build and Run an IRModule -# ------------------------- -# We can build the IRModule into a runnable module with specific target backends. -# - -mod = tvm.build(ir_module, target="llvm") # The module for CPU backends. -print(type(mod)) - -################################################################################################ -# Prepare the input array and output array, then run the module. -# - -a = tvm.nd.array(np.arange(8).astype("float32")) -b = tvm.nd.array(np.zeros((8,)).astype("float32")) -mod(a, b) -print(a) -print(b) - - -################################################################################################ -# Transform an IRModule -# --------------------- -# The IRModule is the central data structure for program optimization, which can be transformed -# by :code:`Schedule`. -# A schedule contains multiple primitive methods to interactively transform the program. -# Each primitive transforms the program in certain ways to bring additional performance optimizations. -# -# .. image:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_tensor_ir_opt_flow.png -# :align: center -# :width: 100% -# -# The image above is a typical workflow for optimizing a tensor program. First, we need to create a -# schedule on the initial IRModule created from either TVMScript or Tensor Expression. Then, a -# sequence of schedule primitives will help to improve the performance. And at last, we can lower -# and build it into a runnable module. -# -# Here we just demostrate a very simple tranformation. First we create schedule on the input `ir_module`. - -sch = tvm.tir.Schedule(ir_module) -print(type(sch)) - -################################################################################################ -# Tile the loop into 3 loops and print the result. - -# Get block by its name -block_b = sch.get_block("B") -# Get loops surronding the block -(i,) = sch.get_loops(block_b) -# Tile the loop nesting. 
-i_0, i_1, i_2 = sch.split(i, factors=[2, 2, 2]) -print(sch.mod.script()) - - -################################################################################################ -# We can also reorder the loops. Now we move loop `i_2` to outside of `i_1`. -sch.reorder(i_0, i_2, i_1) -print(sch.mod.script()) - - -################################################################################################ -# Transform to a GPU program -# ~~~~~~~~~~~~~~~~~~~~~~~~~~ -# If we want to deploy models on GPUs, thread binding is necessary. Fortunately, we can -# also use primitives and do incrementally transformation. -# - -sch.bind(i_0, "blockIdx.x") -sch.bind(i_2, "threadIdx.x") -print(sch.mod.script()) - - -################################################################################################ -# After binding the threads, now build the IRModule with :code:`cuda` backends. -ctx = tvm.cuda(0) -cuda_mod = tvm.build(sch.mod, target="cuda") -cuda_a = tvm.nd.array(np.arange(8).astype("float32"), ctx) -cuda_b = tvm.nd.array(np.zeros((8,)).astype("float32"), ctx) -cuda_mod(cuda_a, cuda_b) -print(cuda_a) -print(cuda_b) diff --git a/include/tvm/runtime/vm/executable.h b/include/tvm/runtime/vm/executable.h index 6e564fd623802..2cdd180730ec0 100644 --- a/include/tvm/runtime/vm/executable.h +++ b/include/tvm/runtime/vm/executable.h @@ -131,13 +131,6 @@ class Executable : public ModuleNode { */ std::string GetBytecode() const; - /*! - * \brief Returns a description of all the contants in the executable in human-readable - * format. Not intended to be machine readable, but rather to help with debugging and - * diffing generated code. - */ - std::string GetConstants() const; - /*! * \brief Print the detailed statistics of the given code, i.e. number of * globls and constants, etc. diff --git a/include/tvm/runtime/vm/vm.h b/include/tvm/runtime/vm/vm.h index ece73fcfda34d..039b1894d7c4c 100644 --- a/include/tvm/runtime/vm/vm.h +++ b/include/tvm/runtime/vm/vm.h @@ -84,11 +84,11 @@ struct VMFunction { /*! \brief The size of the frame for this function */ Index register_file_size; /*! \brief The device type of each parameter for this function. */ - std::vector params_device_type; + std::vector params_device_type; VMFunction(const std::string& name, std::vector params, const std::vector& instructions, Index register_file_size, - const std::vector params_device_type = {}) + const std::vector params_device_type = {}) : name(name), params(params), instructions(instructions), diff --git a/licenses/LICENSE.cutlass.txt b/licenses/LICENSE.cutlass.txt deleted file mode 100644 index 64a49d680b1ed..0000000000000 --- a/licenses/LICENSE.cutlass.txt +++ /dev/null @@ -1,23 +0,0 @@ -Copyright (c) 2017 - 2020, NVIDIA CORPORATION. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - * Neither the name of the NVIDIA CORPORATION nor the - names of its contributors may be used to endorse or promote products - derived from this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND -ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED -WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY -DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; -LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND -ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/python/setup.py b/python/setup.py index 5d21af6b58781..1b2a9d3ee9658 100644 --- a/python/setup.py +++ b/python/setup.py @@ -62,13 +62,6 @@ def get_lib_path(): libs.append(candidate_path) break - # Add microTVM template projects - for name in lib_path: - candidate_path = os.path.join(os.path.dirname(name), "microtvm_template_projects") - if os.path.isdir(candidate_path): - libs.append(candidate_path) - break - else: libs = None diff --git a/python/tvm/contrib/cutlass/__init__.py b/python/tvm/contrib/cutlass/__init__.py deleted file mode 100644 index 69d3e9c4bd7c1..0000000000000 --- a/python/tvm/contrib/cutlass/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""BYOC support for CUTLASS.""" -from .build import tune_cutlass_kernels, build_cutlass_kernels, build_cutlass_kernels_vm diff --git a/python/tvm/contrib/cutlass/build.py b/python/tvm/contrib/cutlass/build.py deleted file mode 100644 index 58e7a115444ea..0000000000000 --- a/python/tvm/contrib/cutlass/build.py +++ /dev/null @@ -1,255 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=invalid-name -"""Driver for partitioning and building a Relay module for CUTLASS offload.""" -import logging -import os -import multiprocessing -import tvm -from tvm import runtime, relay -from tvm.contrib.nvcc import find_cuda_path, get_cuda_version -from .gen_gemm import CutlassGemmProfiler - -logger = logging.getLogger("cutlass") - - -def _get_cutlass_path(): - tvm_root = os.path.join(os.path.dirname(os.path.realpath(__file__)), "../../../../") - cutlass_path = os.path.join(tvm_root, "3rdparty/cutlass") - assert os.path.exists( - cutlass_path - ), """The CUTLASS root directory not found in {}. - Currently, using CUTLASS requires building TVM from source.""".format( - cutlass_path - ) - return cutlass_path - - -def _get_cutlass_compile_options(sm, threads): - cutlass_root = _get_cutlass_path() - cutlass_include = os.path.join(cutlass_root, "include") - cutlass_util_include = os.path.join(cutlass_root, "tools/util/include") - - kwargs = {} - kwargs["cc"] = "nvcc" - kwargs["options"] = [ - "-DCUTLASS_ENABLE_TENSOR_CORE_MMA=1", - "-gencode=arch=compute_%d,code=[sm_%d,compute_%d]" % (sm, sm, sm), - "-Xcompiler=-fPIC", - "-Xcompiler=-Wconversion", - "-Xcompiler=-fno-strict-aliasing", - "-O3", - "-std=c++14", - "-I" + cutlass_include, - "-I" + cutlass_util_include, - ] - cuda_path = find_cuda_path() - cuda_ver = get_cuda_version(cuda_path) - if cuda_ver >= 11.2: - ncpu = multiprocessing.cpu_count() if threads < 0 else threads - kwargs["options"].append("-t %d" % ncpu) - return kwargs - - -class GemmAnnotator(tvm.relay.ExprVisitor): - """Annotates partitioned functions with shape and dtype information.""" - - def __init__(self): - super().__init__() - self.signature = {} - - def visit_call(self, call): - op = call.op - if isinstance(op, relay.Function) and "PartitionedFromPattern" in op.attrs: - self.signature["op_type"] = op.attrs["Composite"] - for i, arg in enumerate(op.params): - self.signature["arg%d_shape" % i] = arg.checked_type.shape - self.signature["arg%d_dtype" % i] = arg.checked_type.dtype - self.signature["ret_shape"] = op.ret_type.shape - self.signature["ret_dtype"] = op.ret_type.dtype - - -def tune_cutlass_kernels(mod, sm, profile_all=True, use_multiprocessing=False, tmp_dir="./tmp"): - """Given a module partitioned for CUTLASS offloading, profile each workload to select which - kernels to emit. - - Parameters - ---------- - mod : IRModule - The Relay module with cutlass partitions. - - sm : int - An integer specifying the compute capability. For example, 75 for Turing and - 80 or 86 for Ampere. - - profile_all : bool - Whether or not profile all candidate kernels, or stop profiling after - the first applicable kernel is found. - - use_multiprocessing : bool - Whether or not compile profiler executables for different kernels in parallel. - - tmp_dir : string, optional - A temporary directory where intermediate compiled artifacts will be stored. - - Returns - ------- - mod : IRModule - The updated module annotated with cutlass profiling information. - - num_cutlass_partition : int - The number of partitioned functions created for CUTLASS. 
- """ - cutlass_profiler = CutlassGemmProfiler(sm, _get_cutlass_path(), tmp_dir) - num_cutlass_partition = 0 - for var in mod.get_global_vars(): - fun_name = var.name_hint - func = mod[fun_name] - annotator = GemmAnnotator() - if "cutlass" in fun_name: - num_cutlass_partition += 1 - annotator.visit(func) - # call cutlass profiler to find best settings, update attr - new_attrs = {} - new_attrs.update(annotator.signature) - for key in func.attrs.keys(): - new_attrs[key] = func.attrs[key] - # call profiler - arg0_shape = new_attrs["arg0_shape"] - arg1_shape = new_attrs["arg1_shape"] - MM = arg0_shape[0] - KK = arg0_shape[1] - NN = arg1_shape[0] - out_dtype = annotator.signature["ret_dtype"] - if any(isinstance(s, tvm.tir.Any) for s in [MM, KK, NN]): - out = cutlass_profiler.get_default(out_dtype) - logger.info("Picked the default kernel %s", out["name"]) - else: - out = cutlass_profiler.profile( - MM, NN, KK, out_dtype, profile_all, use_multiprocessing - ) - if profile_all: - logger.info("The best kernel is %s", out["name"]) - else: - logger.info("Picked the first kernel found %s", out["name"]) - - if new_attrs["op_type"] == "cutlass.dense": - new_attrs["cutlass_op_def"] = out["opdef"] - elif new_attrs["op_type"] == "cutlass.dense_bias": - new_attrs["cutlass_op_def"] = out["opdef_bias"] - elif new_attrs["op_type"] == "cutlass.dense_bias_relu": - new_attrs["cutlass_op_def"] = out["opdef_bias_relu"] - elif "cutlass.dense_bias_gelu" in new_attrs["op_type"]: - new_attrs["cutlass_op_def"] = out["opdef_bias_gelu"] - else: - raise ValueError("%s pattern is not implemented." % new_attrs["op_type"]) - new_attrs["cutlass_op_name"] = out["name"] - - if new_attrs["cutlass_op_name"].find("_tn_align") > 0: - new_attrs["lda"] = "K" - new_attrs["ldb"] = "K" - new_attrs["ldc"] = "N" - else: - raise ValueError("%s unsupported operation" % new_attrs["cutlass_op_name"]) - - new_attrs = tvm.ir.make_node("DictAttrs", **new_attrs) - new_func = relay.Function( - func.params, - func.body, - ret_type=func.ret_type, - type_params=func.type_params, - attrs=new_attrs, - ) - mod.update_func(var, new_func) - - return mod, num_cutlass_partition - - -def build_cutlass_kernels(lib, sm, tmp_dir="./tmp", lib_path="compile.so", threads=-1): - """Compile CUTLASS kernels in lib and return the runtime module ready to run. - - Parameters - ---------- - lib : GraphExecutorFactoryModule - The output from relay.build containing compiled host code and non-cutlass kernels. - - sm : int - An integer specifying the compute capability. For example, 75 for Turing and - 80 or 86 for Ampere. - - tmp_dir : string, optional - A temporary directory where intermediate compiled artifacts will be stored. - - lib_path : string, optional - The path to a shared library which will be generated as the result of the build process. - - threads : int, optional - The number of threads to use for compiling generated kernels. Only available for - CUDA 11.2 or later. Use all physical cores by default. - - Returns - ------- - updated_lib : runtime.Module - The updated module with compiled cutlass kernels. - """ - kwargs = _get_cutlass_compile_options(sm, threads) - lib.export_library(lib_path, workspace_dir=tmp_dir, **kwargs) - return runtime.load_module(lib_path) - - -def build_cutlass_kernels_vm( - vm_exec, sm, tmp_dir="./tmp", lib_path="compile.so", vmcode_path="vmcode.ro", threads=-1 -): - """Compile CUTLASS kernels in vm_exec and return a VM executable ready to run. 
- - Parameters - ---------- - vm_exec : vm.Executable - The output from relay.vm.compile containing compiled host code and non-cutlass kernels. - - sm : int - An integer specifying the compute capability. For example, 75 for Turing and - 80 or 86 for Ampere. - - tmp_dir : string, optional - A temporary directory where intermediate compiled artifacts will be stored. - - lib_path : string, optional - The path to a shared library which will be generated as the result of the build process. - - vmcode_path : string, optional - The path where the VM bytecode will be serialized to. - - threads : int, optional - The number of threads to use for compiling generated kernels. Only available for - CUDA 11.2 or later. Use all physical cores by default. - - Returns - ------- - updated_vm_exec: vm.Executable - The updated exectuable with compiled cutlass kernels. - """ - code, lib = vm_exec.save() - kwargs = _get_cutlass_compile_options(sm, threads) - lib_path = os.path.join(tmp_dir, lib_path) - vmcode_path = os.path.join(tmp_dir, vmcode_path) - lib.export_library(lib_path, workspace_dir=tmp_dir, **kwargs) - with open(vmcode_path, "wb") as fo: - fo.write(code) - lib = tvm.runtime.load_module(lib_path) - code = bytearray(open(vmcode_path, "rb").read()) - return tvm.runtime.vm.Executable.load_exec(code, lib) diff --git a/python/tvm/contrib/cutlass/gemm_operation.py b/python/tvm/contrib/cutlass/gemm_operation.py deleted file mode 100644 index e53b3ee7b93a9..0000000000000 --- a/python/tvm/contrib/cutlass/gemm_operation.py +++ /dev/null @@ -1,262 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name, unused-wildcard-import, wildcard-import -"""Generator for CUTLASS GEMM kernels.""" -from .library import * - - -class GemmOperation: - """Describes various attributes for instantiating GEMM kernels.""" - - def __init__( - self, - arch, - tile_description, - A, - B, - C, - element_epilogue, - epilogue_functor=EpilogueFunctor.LinearCombination, - swizzling_functor=SwizzlingFunctor.Identity8, - ): - self.operation_kind = OperationKind.Gemm - self.arch = arch - self.tile_description = tile_description - self.A = A - self.B = B - self.C = C - self.element_epilogue = element_epilogue - self.epilogue_functor = epilogue_functor - self.swizzling_functor = swizzling_functor - - def accumulator_type(self): - return self.tile_description.math_instruction.element_accumulator - - def short_math_name(self): - return ShortDataTypeNames[self.accumulator_type()] - - def core_name(self): - """ The basic operation kind is prefixed with a letter indicating the accumulation type. 
""" - inst_shape = "" - intermediate_type = "" - - if ( - self.tile_description.math_instruction.opcode_class == OpcodeClass.TensorOp - or self.tile_description.math_instruction.opcode_class == OpcodeClass.WmmaTensorOp - ): - inst_shape = "%d%d%d" % tuple(self.tile_description.math_instruction.instruction_shape) - if ( - self.tile_description.math_instruction.element_a != self.A.element - and self.tile_description.math_instruction.element_a - != self.tile_description.math_instruction.element_accumulator - ): - intermediate_type = DataTypeNames[self.tile_description.math_instruction.element_a] - - return "%s%s%s%s" % ( - self.short_math_name(), - inst_shape, - intermediate_type, - "gemm", - ) - - def extended_name(self): - """ Append data types if they differ from compute type. """ - if ( - self.C.element != self.tile_description.math_instruction.element_accumulator - and self.A.element != self.tile_description.math_instruction.element_accumulator - ): - extended_name = "${element_c}_${core_name}_${element_a}" - elif ( - self.C.element == self.tile_description.math_instruction.element_accumulator - and self.A.element != self.tile_description.math_instruction.element_accumulator - ): - extended_name = "${core_name}_${element_a}" - else: - extended_name = "${core_name}" - - extended_name = substitute_template( - extended_name, - { - "element_a": DataTypeNames[self.A.element], - "element_c": DataTypeNames[self.C.element], - "core_name": self.core_name(), - }, - ) - - return extended_name - - def layout_name(self): - return "%s%s" % (ShortLayoutTypeNames[self.A.layout], ShortLayoutTypeNames[self.B.layout]) - - def procedural_name(self): - """The full procedural name indicates architecture, extended name, tile size, - and layout. - """ - threadblock = self.tile_description.procedural_name() - opcode_class_name = OpcodeClassNames[self.tile_description.math_instruction.opcode_class] - - return substitute_template( - "cutlass_${opcode_class}_${extended_name}_${threadblock}_${layout}_align${alignment}", - { - "opcode_class": opcode_class_name, - "extended_name": self.extended_name(), - "threadblock": threadblock, - "layout": self.layout_name(), - "alignment": "%d" % self.A.alignment, - }, - ) - - def leading_dim(self): - """ lda, ldb, ldc, according to the leading dimension. 
""" - if self.A.layout == LayoutType.RowMajor: - lda = "K" - elif self.A.layout == LayoutType.ColumnMajor: - lda = "M" - else: - ValueError("The layout of A is not implemented.") - - if self.B.layout == LayoutType.RowMajor: - ldb = "N" - elif self.B.layout == LayoutType.ColumnMajor: - ldb = "K" - else: - ValueError("The layout of B is not implemented.") - - if self.C.layout == LayoutType.RowMajor: - ldc = "N" - elif self.C.layout == LayoutType.ColumnMajor: - ldc = "M" - else: - ValueError("The layout of B is not implemented.") - - return substitute_template( - "int lda = ${lda_val};\n\tint ldb = ${ldb_val};\n\tint ldc = ${ldc_val};\n", - { - "lda_val": lda, - "ldb_val": ldb, - "ldc_val": ldc, - }, - ) - - -class EmitGemmInstance: - """ Responsible for emitting a CUTLASS template definition.""" - - def __init__(self): - self.epilogue_default = """ - ${epilogue_functor}< - ${element_c}, - ${epilogue_vector_length}, - ${element_accumulator}, - ${element_epilogue} - >""" - self.epilogue_no_beta_scaling = """ - ${epilogue_functor}< - ${element_c}, - ${epilogue_vector_length}, - ${element_accumulator}, - ${element_epilogue}, - cutlass::epilogue::thread::ScaleType::NoBetaScaling - >""" - self.gemm_template = """ - // Gemm operator ${operation_name} - using Operation_${operation_name} = cutlass::gemm::device::Gemm< - ${element_a}, ${layout_a}, - ${element_b}, ${layout_b}, - ${element_c}, ${layout_c}, - ${element_accumulator}, - ${opcode_class}, - ${arch}, - cutlass::gemm::GemmShape<${threadblock_shape_m}, ${threadblock_shape_n}, ${threadblock_shape_k}>, - cutlass::gemm::GemmShape<${warp_shape_m}, ${warp_shape_n}, ${warp_shape_k}>, - cutlass::gemm::GemmShape<${instruction_shape_m}, ${instruction_shape_n}, ${instruction_shape_k}>, - ${epilogue}, - ${swizzling_functor}, - ${stages}, - ${align_a}, - ${align_b}, - false, - ${math_operation} - ${residual} - >; -""" - - def emit(self, operation, no_beta_scaling=False): - """Instantiate a GEMM kernel from given `operation`.""" - warp_shape = [ - operation.tile_description.threadblock_shape[idx] - // operation.tile_description.warp_count[idx] - for idx in range(3) - ] - epilogue_vector_length = ( - min(operation.C.alignment * DataTypeSize[operation.C.element], 128) - // DataTypeSize[operation.C.element] - ) - residual = "" - complex_transform_tag = "cutlass::ComplexTransform::kNone" - values = { - "operation_name": operation.procedural_name(), - "element_a": DataTypeTag[operation.A.element], - "layout_a": LayoutTag[operation.A.layout], - "element_b": DataTypeTag[operation.B.element], - "layout_b": LayoutTag[operation.B.layout], - "element_c": DataTypeTag[operation.C.element], - "layout_c": LayoutTag[operation.C.layout], - "element_accumulator": DataTypeTag[operation.accumulator_type()], - "opcode_class": OpcodeClassTag[ - operation.tile_description.math_instruction.opcode_class - ], - "arch": "cutlass::arch::Sm%d" % operation.arch, - "threadblock_shape_m": str(operation.tile_description.threadblock_shape[0]), - "threadblock_shape_n": str(operation.tile_description.threadblock_shape[1]), - "threadblock_shape_k": str(operation.tile_description.threadblock_shape[2]), - "warp_shape_m": str(warp_shape[0]), - "warp_shape_n": str(warp_shape[1]), - "warp_shape_k": str(warp_shape[2]), - "instruction_shape_m": str( - operation.tile_description.math_instruction.instruction_shape[0] - ), - "instruction_shape_n": str( - operation.tile_description.math_instruction.instruction_shape[1] - ), - "instruction_shape_k": str( - 
operation.tile_description.math_instruction.instruction_shape[2] - ), - "epilogue_vector_length": str(epilogue_vector_length), - "element_epilogue": str(DataTypeTag[operation.element_epilogue]), - "epilogue_functor": EpilogueFunctorTag[operation.epilogue_functor], - "swizzling_functor": SwizzlingFunctorTag[operation.swizzling_functor], - "stages": str(operation.tile_description.stages), - "align_a": str(operation.A.alignment), - "align_b": str(operation.B.alignment), - "transform_a": complex_transform_tag, - "transform_b": complex_transform_tag, - "math_operation": MathOperationTag[ - operation.tile_description.math_instruction.math_operation - ], - "residual": residual, - } - - gemm_template = substitute_template( - self.gemm_template, - { - "epilogue": self.epilogue_no_beta_scaling - if no_beta_scaling - else self.epilogue_default - }, - ) - return substitute_template(gemm_template, values) diff --git a/python/tvm/contrib/cutlass/gemm_profiler.py b/python/tvm/contrib/cutlass/gemm_profiler.py deleted file mode 100644 index 13679cd05c42e..0000000000000 --- a/python/tvm/contrib/cutlass/gemm_profiler.py +++ /dev/null @@ -1,196 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
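To make the emitter above concrete, a GemmOperation can be assembled from the library types and handed to EmitGemmInstance; the instruction shape, tile sizes, and dtypes below are illustrative values in the style of the SM75 generator, not a recommendation:

from tvm.contrib.cutlass.gemm_operation import EmitGemmInstance, GemmOperation
from tvm.contrib.cutlass.library import (
    DataType, LayoutType, MathInstruction, MathOperation,
    OpcodeClass, TensorDescription, TileDescription,
)

# fp16 inputs accumulating in fp32 on tensor cores; all shapes are illustrative.
math_inst = MathInstruction(
    [16, 8, 8], DataType.f16, DataType.f16, DataType.f32,
    OpcodeClass.TensorOp, MathOperation.multiply_add,
)
tile = TileDescription([256, 128, 32], 2, [4, 2, 1], math_inst, 75, 1024)
A = TensorDescription(DataType.f16, LayoutType.RowMajor, 8)
B = TensorDescription(DataType.f16, LayoutType.ColumnMajor, 8)
C = TensorDescription(DataType.f32, LayoutType.RowMajor, 8)

op = GemmOperation(75, tile, A, B, C, DataType.f32)
print(op.procedural_name())         # e.g. cutlass_tensorop_s1688gemm_f16_256x128_32x2_tn_align8
print(EmitGemmInstance().emit(op))  # the C++ kernel definition string fed to the profiler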
-# pylint: disable=import-outside-toplevel, invalid-name -"""Instantiate a C++ source for profiling CUTLASS kernels.""" - - -class GemmProfilerEmitter(object): - """Emit a C++ source for profiling CUTLASS kernels.""" - - def __init__(self): - from jinja2 import Template - - self.template = Template( - """ -#include -#include -#include -#include - -#include "cuda_runtime.h" -#include "cutlass/gemm/device/gemm.h" - -#define CUTLASS_CHECK(status) \\ - { \\ - cutlass::Status error = status; \\ - if (error != cutlass::Status::kSuccess) { \\ - std::cerr << "Got cutlass error: " << cutlassGetStatusString(error) << " at: " << __LINE__ \\ - << std::endl; \\ - exit(EXIT_FAILURE); \\ - } \\ - } - -#define CUDA_CHECK(status) \\ - { \\ - cudaError_t error = status; \\ - if (error != cudaSuccess) { \\ - std::cerr << "Got bad cuda status: " << cudaGetErrorString(error) \\ - << " at line: " << __LINE__ << std::endl; \\ - exit(EXIT_FAILURE); \\ - } \\ - } - -template -cudaError_t CutlassGemmRCR( - int M, - int N, - int K, - DTypeC alpha, - DTypeA const *A, - int lda, - DTypeB const *B, - int ldb, - DTypeC beta, - DTypeC *C, - int ldc) { - using namespace std::chrono; - {{OperatorDef}} - Operation_{{OperatorName}} gemm_operator; - Operation_{{OperatorName}}::Arguments args({M, N, K}, - {A, lda}, - {B, ldb}, - {C, ldc}, - {C, ldc}, - {alpha, beta}); - cutlass::Status status = gemm_operator(args); - CUTLASS_CHECK(status) - - high_resolution_clock::time_point t1 = high_resolution_clock::now(); - for (int i = 0; i < 100; ++i) { - status = gemm_operator(args); - } - cudaDeviceSynchronize(); - high_resolution_clock::time_point t2 = high_resolution_clock::now(); - duration time_span = duration_cast>(t2 - t1); - std::cout << time_span.count() << std::endl; - return cudaSuccess; -} - - -template -cudaError_t AllocateMatrix(DType **matrix, int ldm, int rows, int columns, int seed = 0) { - cudaError_t result; - - size_t sizeof_matrix = sizeof(DType) * rows * columns; - - // Allocate device memory. - result = cudaMalloc(reinterpret_cast(matrix), sizeof_matrix); - - if (result != cudaSuccess) { - std::cerr << "Failed to allocate matrix: " - << cudaGetErrorString(result) << std::endl; - return result; - } - - // Clear the allocation. 
- result = cudaMemset(*matrix, 0, sizeof_matrix); - - if (result != cudaSuccess) { - std::cerr << "Failed to clear matrix device memory: " - << cudaGetErrorString(result) << std::endl; - return result; - } - - if (result != cudaSuccess) { - std::cerr << "Failed to initialize matrix: " - << cudaGetErrorString(result) << std::endl; - return result; - } - - return result; -} - -template -cudaError_t TestCutlassGemm(int M, int N, int K, DTypeC alpha, DTypeC beta) { - cudaError_t result; - - {{LeadingDim}} - // size_t sizeof_C = sizeof(DTypeC) * ldc * N; - DTypeA *A; - DTypeB *B; - DTypeC *C_cutlass; - result = AllocateMatrix(&A, lda, M, K, 0); - if (result != cudaSuccess) { - return result; - } - result = AllocateMatrix(&B, ldb, K, N, 17); - if (result != cudaSuccess) { - cudaFree(A); - return result; - } - result = AllocateMatrix(&C_cutlass, ldc, M, N, 101); - if (result != cudaSuccess) { - cudaFree(A); - cudaFree(B); - return result; - } - result = CutlassGemmRCR(M, N, K, alpha, A, lda, B, ldb, - beta, C_cutlass, ldc); - if (result != cudaSuccess) { - std::cerr << "CUTLASS GEMM kernel failed: " - << cudaGetErrorString(result) << std::endl; - cudaFree(C_cutlass); - cudaFree(B); - cudaFree(A); - - return result; - } - cudaFree(C_cutlass); - cudaFree(B); - cudaFree(A); - return cudaSuccess; -} - -int main(int argc, const char *arg[]) { - int problem[3] = { 4096, 4096, 4096 }; - for (int i = 1; i < argc && i < 4; ++i) { - std::stringstream ss(arg[i]); - ss >> problem[i - 1]; - } - float scalars[2] = { 1, 0 }; - cudaError_t result = TestCutlassGemm< {{DTypeA}}, {{DTypeB}}, {{DTypeC}}>( - problem[0], // GEMM M dimension - problem[1], // GEMM N dimension - problem[2], // GEMM K dimension - static_cast<{{DTypeC}}>(scalars[0]), // alpha - static_cast<{{DTypeC}}>(scalars[1]) // beta - ); - return result == cudaSuccess ? 0 : -1; -} -""" - ) - - def emit(self, op_name, op_def, dtype_a, dtype_b, dtype_c, ld): - src = self.template.render( - OperatorName=op_name, - OperatorDef=op_def, - DTypeA=dtype_a, - DTypeB=dtype_b, - DTypeC=dtype_c, - LeadingDim=ld, - ) - return src diff --git a/python/tvm/contrib/cutlass/gen_gemm.py b/python/tvm/contrib/cutlass/gen_gemm.py deleted file mode 100644 index a43c6d414e389..0000000000000 --- a/python/tvm/contrib/cutlass/gen_gemm.py +++ /dev/null @@ -1,389 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-# pylint: disable=invalid-name -"""Kernel generator and profiler for CUTLASS.""" -import logging -import os -import re -import tempfile -import subprocess -import multiprocessing -from .gemm_operation import GemmOperation, EmitGemmInstance -from .gemm_profiler import GemmProfilerEmitter -from .library import ( - EpilogueFunctor, - SwizzlingFunctor, - TensorDescription, - DataTypeTag, - LayoutType, - MathInstruction, - DataType, - OpcodeClass, - MathOperation, - TileDescription, -) - -logger = logging.getLogger("cutlass") - - -def create_gemm_operator( - layouts, - tile_descriptions, - data_type, - alignment_constraints, - epilogue_functor=EpilogueFunctor.LinearCombination, - swizzling_functor=SwizzlingFunctor.Identity8, -): - """Exhaustively instantiate all kernels from a given configuration.""" - ret = [] - kernel_emitter = EmitGemmInstance() - profiler_emitter = GemmProfilerEmitter() - - element_a, element_b, element_c, element_epilogue = data_type - - for layout in layouts: - for tile_description in tile_descriptions: - for alignment in alignment_constraints: - alignment_c = min(8, alignment) - - A = TensorDescription(element_a, layout[0], alignment) - B = TensorDescription(element_b, layout[1], alignment) - C = TensorDescription(element_c, layout[2], alignment_c) - - op_entry = {} - op = GemmOperation( - tile_description.minimum_compute_capability, - tile_description, - A, - B, - C, - element_epilogue, - epilogue_functor, - swizzling_functor, - ) - op_bias = GemmOperation( - tile_description.minimum_compute_capability, - tile_description, - A, - B, - C, - element_epilogue, - EpilogueFunctor.LinearCombinationBias, - swizzling_functor, - ) - op_bias_relu = GemmOperation( - tile_description.minimum_compute_capability, - tile_description, - A, - B, - C, - element_epilogue, - EpilogueFunctor.LinearCombinationRelu, - swizzling_functor, - ) - op_bias_gelu = GemmOperation( - tile_description.minimum_compute_capability, - tile_description, - A, - B, - C, - element_epilogue, - EpilogueFunctor.LinearCombinationGelu, - swizzling_functor, - ) - - kernel_emitter = EmitGemmInstance() - op_entry["op"] = op - op_entry["name"] = op.procedural_name() - op_entry["opdef"] = kernel_emitter.emit(op) - op_entry["opdef_bias"] = kernel_emitter.emit(op_bias, no_beta_scaling=True) - op_entry["opdef_bias_relu"] = kernel_emitter.emit( - op_bias_relu, no_beta_scaling=True - ) - op_entry["opdef_bias_gelu"] = kernel_emitter.emit(op_bias_gelu) - op_entry["src"] = profiler_emitter.emit( - op.procedural_name(), - op_entry["opdef"], - DataTypeTag[element_a], - DataTypeTag[element_b], - DataTypeTag[element_c], - op.leading_dim(), - ) - op_entry["runtime"] = 9999999 - ret.append(op_entry) - return ret - - -def generate_tensor_op_common(math_instructions, alignment_constraints, get_tile_descriptions): - """Common kernel generator to be used by archtecture specific generators.""" - ops = [] - layouts = [ - (LayoutType.RowMajor, LayoutType.ColumnMajor, LayoutType.RowMajor), - ] - for math_inst in math_instructions: - tile_descriptions = get_tile_descriptions(math_inst) - data_type = [ - math_inst.element_a, - math_inst.element_b, - math_inst.element_accumulator, - math_inst.element_accumulator, - ] - - out = create_gemm_operator(layouts, tile_descriptions, data_type, alignment_constraints) - - ops.extend(out) - - return ops - - -def generate_sm75_tensor_op_1688(out_dtype): - """Generate GEMM kernels for Turing.""" - assert out_dtype in ["float32", "float16"] - math_instructions = { - "float32": [ - MathInstruction( - [16, 8, 
8], - DataType.f16, - DataType.f16, - DataType.f32, - OpcodeClass.TensorOp, - MathOperation.multiply_add, - ) - ], - "float16": [ - MathInstruction( - [16, 8, 8], - DataType.f16, - DataType.f16, - DataType.f16, - OpcodeClass.TensorOp, - MathOperation.multiply_add, - ) - ], - }[out_dtype] - - alignment_constraints = [8, 4, 2, 1] - - def get_tile_descriptions(math_inst): - min_cc = 75 - max_cc = 1024 - return [ - TileDescription([256, 128, 32], 2, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 32], 2, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([64, 128, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([64, 64, 32], 2, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([64, 128, 64], 2, [1, 2, 2], math_inst, min_cc, max_cc), - ] - - return generate_tensor_op_common( - math_instructions, alignment_constraints, get_tile_descriptions - ) - - -def generate_sm80_tensor_op_16816(out_dtype): - """Generate GEMM kernels for Ampere.""" - assert out_dtype in ["float32", "float16"] - math_instructions = { - "float32": [ - MathInstruction( - [16, 8, 16], - DataType.f16, - DataType.f16, - DataType.f32, - OpcodeClass.TensorOp, - MathOperation.multiply_add, - ) - ], - "float16": [ - MathInstruction( - [16, 8, 16], - DataType.f16, - DataType.f16, - DataType.f16, - OpcodeClass.TensorOp, - MathOperation.multiply_add, - ) - ], - }[out_dtype] - - alignment_constraints = [8, 4, 2] - - def get_tile_descriptions(math_inst): - min_cc = 80 - max_cc = 1024 - max_cc_smem_limited = 80 - return [ - TileDescription([256, 128, 32], 3, [4, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 256, 32], 3, [2, 4, 1], math_inst, min_cc, max_cc), - TileDescription([256, 64, 32], 4, [4, 1, 1], math_inst, min_cc, max_cc), - TileDescription([64, 256, 32], 4, [1, 4, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 32], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 32], 4, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 128, 32], 5, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([64, 128, 32], 6, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([64, 64, 32], 10, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([256, 128, 64], 3, [4, 2, 1], math_inst, min_cc, max_cc_smem_limited), - TileDescription([128, 256, 64], 3, [2, 4, 1], math_inst, min_cc, max_cc_smem_limited), - TileDescription([256, 64, 64], 4, [4, 1, 1], math_inst, min_cc, max_cc_smem_limited), - TileDescription([64, 256, 64], 4, [1, 4, 1], math_inst, min_cc, max_cc_smem_limited), - TileDescription([128, 128, 64], 4, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([128, 64, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([64, 128, 64], 3, [2, 2, 1], math_inst, min_cc, max_cc), - TileDescription([64, 64, 64], 5, [2, 2, 1], math_inst, min_cc, max_cc), - ] - - return generate_tensor_op_common( - math_instructions, alignment_constraints, get_tile_descriptions - ) - - -GENERATOR_FUNC_TABLE = { - 75: generate_sm75_tensor_op_1688, - 80: generate_sm80_tensor_op_16816, -} - -# TODO(masahi): A sensible way to pick reasonable default kernels -DEFAULT_KERNELS = { - 75: { - "float16": "cutlass_tensorop_h1688gemm_128x64_32x2_tn_align4", - "float32": "cutlass_tensorop_s1688gemm_f16_64x64_32x2_tn_align4", 
- }, - 80: { - "float16": "cutlass_tensorop_h16816gemm_128x256_32x3_tn_align4", - "float32": "cutlass_tensorop_s16816gemm_f16_128x128_32x3_tn_align4", - }, -} - - -class ProfilerEngine: - """Compile and run a given profiler executable.""" - - def __init__(self, cuda_arch, cutlass_path, binary_prefix): - self.cuda_arch = cuda_arch - self.binary_prefix = binary_prefix - self.cutlass = cutlass_path - self.cflags = "-I{cutlass}/include -I{cutlass}/tools/util/include -O3 -std=c++11".format( - cutlass=cutlass_path - ) - self.cflags += " -DCUTLASS_ENABLE_TENSOR_CORE_MMA=1" - self.cflags += " -gencode=arch=compute_{arch},code=[sm_{arch},compute_{arch}]".format( - arch=cuda_arch - ) - self.cflags += " -Xcompiler=-Wconversion -Xcompiler=-fno-strict-aliasing" - self.cmd = "nvcc {cflags} {src} -o {output}" - - def _compile(self, op): - os.makedirs(self.binary_prefix, exist_ok=True) - opath = os.path.join(self.binary_prefix, op["name"]) - if os.path.exists(opath): - return - fi = tempfile.NamedTemporaryFile("w", delete=False, suffix=".cu") - fi.write(op["src"]) - fi.close() - cmd = self.cmd.format(cflags=self.cflags, src=fi.name, output=opath) - os.system(cmd) - os.unlink(fi.name) - - def compile_all(self, ops, use_multiprocessing=False): - """Compile all profiler executables.""" - if use_multiprocessing: - pool = multiprocessing.Pool(multiprocessing.cpu_count()) - pool.map(self._compile, ops) - else: - for op in ops: - self._compile(op) - - def evaluate(self, op, args): - """Run the profiler executable corresponding to op_name with args.""" - op_name = op["name"] - opath = os.path.join(self.binary_prefix, op_name) - if not os.path.exists(opath): - self._compile(op) - cmd = [opath] - if args is not None: - cmd.append(str(args[0])) - cmd.append(str(args[1])) - cmd.append(str(args[2])) - if len(args) > 3: - cmd.append(str(args[3])) - try: - sp = subprocess.run(cmd, capture_output=True, check=True) - rt = float(sp.stdout) - logger.info("%s, %f", op_name, rt) - except subprocess.CalledProcessError: - rt = -1 - return rt - - -class CutlassGemmProfiler(object): - """Profile all candidate kernels and select the best one.""" - - def __init__(self, sm, cutlass_path, binary_path): - assert sm in GENERATOR_FUNC_TABLE and sm in DEFAULT_KERNELS, "sm%d not supported yet." % sm - self.engine = ProfilerEngine(sm, cutlass_path, binary_path) - self.sm = sm - self.cache = {} - - def check_align(self, op_name, M): - """Filter out kernels that cannot be supported.""" - aligns = re.findall(r"align[1|2|4|8]", op_name) - assert len(aligns) == 1 - align = int(aligns[0][-1]) - if M % align != 0: - return False - return True - - def get_default(self, out_dtype): - """Return the default kernel for the requested architecture. - For now, the default kernel was picked arbitrary. - """ - ops = GENERATOR_FUNC_TABLE[self.sm](out_dtype) - default_kernel_name = DEFAULT_KERNELS[self.sm][out_dtype] - filtered = list(filter(lambda op: op["name"] == default_kernel_name, ops)) - assert len(filtered) == 1 - return filtered[0] - - def profile(self, M, N, K, out_dtype, profile_all=True, use_multiprocessing=False): - """Profile and select the best kernel from candidate kernels. - If profile_all is False, return immediately after the first applicable kernel is found. - If use_multiprocessing is True, compile all profiler executables in parallel. 
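The alignment filter used by check_align above is easy to exercise in isolation; a small sketch of the same rule, reusing the regex from the removed code:

import re

def usable(op_name, M):
    # A kernel advertising alignN in its name is only applicable when the
    # GEMM M dimension is divisible by N.
    align = int(re.findall(r"align[1|2|4|8]", op_name)[0][-1])
    return M % align == 0

print(usable("cutlass_tensorop_h1688gemm_128x64_32x2_tn_align4", 1024))  # True
print(usable("cutlass_tensorop_h1688gemm_128x64_32x2_tn_align4", 1022))  # False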
- """ - if (M, N, K) in self.cache: - return self.cache[(M, N, K)] - - ops = GENERATOR_FUNC_TABLE[self.sm](out_dtype) - ops = list(filter(lambda op: self.check_align(op["name"], M), ops)) - - for op in ops: - op["runtime"] = -1 - - if profile_all: - self.engine.compile_all(ops, use_multiprocessing) - - for op in ops: - out = self.engine.evaluate(op, [M, N, K]) - op["runtime"] = out - if out > 0 and profile_all is False: - break - - valid_ops = filter(lambda op: op["runtime"] > 0, ops) - output = sorted(valid_ops, key=lambda i: i["runtime"]) - self.cache[(M, N, K)] = output[0] - return output[0] diff --git a/python/tvm/contrib/cutlass/library.py b/python/tvm/contrib/cutlass/library.py deleted file mode 100644 index 7d544293901a9..0000000000000 --- a/python/tvm/contrib/cutlass/library.py +++ /dev/null @@ -1,219 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=invalid-name -"""Various type definitions to help instantiate CUTLASS kernels.""" -import re -import enum -from enum import auto as enum_auto - - -class GeneratorTarget(enum.Enum): - Library = enum_auto() - - -class DataType(enum.Enum): - f16 = enum_auto() - f32 = enum_auto() - - -ShortDataTypeNames = { - DataType.f16: "h", - DataType.f32: "s", -} - - -DataTypeNames = { - DataType.f16: "f16", - DataType.f32: "f32", -} - -DataTypeTag = { - DataType.f16: "cutlass::half_t", - DataType.f32: "float", -} - -DataTypeSize = { - DataType.f16: 16, - DataType.f32: 32, -} - - -class MathOperation(enum.Enum): - multiply_add = enum_auto() - - -MathOperationTag = { - MathOperation.multiply_add: "cutlass::arch::OpMultiplyAdd", -} - - -class LayoutType(enum.Enum): - ColumnMajor = enum_auto() - RowMajor = enum_auto() - - -LayoutTag = { - LayoutType.ColumnMajor: "cutlass::layout::ColumnMajor", - LayoutType.RowMajor: "cutlass::layout::RowMajor", -} - - -TransposedLayout = { - LayoutType.ColumnMajor: LayoutType.RowMajor, - LayoutType.RowMajor: LayoutType.ColumnMajor, -} - - -ShortLayoutTypeNames = { - LayoutType.ColumnMajor: "n", - LayoutType.RowMajor: "t", -} - - -class OpcodeClass(enum.Enum): - Simt = enum_auto() - TensorOp = enum_auto() - WmmaTensorOp = enum_auto() - - -OpcodeClassNames = { - OpcodeClass.Simt: "simt", - OpcodeClass.TensorOp: "tensorop", - OpcodeClass.WmmaTensorOp: "wmma_tensorop", -} - -OpcodeClassTag = { - OpcodeClass.Simt: "cutlass::arch::OpClassSimt", - OpcodeClass.TensorOp: "cutlass::arch::OpClassTensorOp", - OpcodeClass.WmmaTensorOp: "cutlass::arch::OpClassWmmaTensorOp", -} - - -class OperationKind(enum.Enum): - Gemm = enum_auto() - - -OperationKindNames = { - OperationKind.Gemm: "gemm", -} - - -class Target(enum.Enum): - library = enum_auto() - - -def substitute_template(template, values): - """Instantiate a kernel template using `values`.""" - text = template - changed = 
True - while changed: - changed = False - for key, value in values.items(): - regex = "\\$\\{%s\\}" % key - newtext = re.sub(regex, value, text) - if newtext != text: - changed = True - text = newtext - return text - - -class GemmKind(enum.Enum): - Gemm = enum_auto() - - -GemmKindNames = { - GemmKind.Gemm: "gemm", -} - - -class EpilogueFunctor(enum.Enum): - LinearCombination = enum_auto() - LinearCombinationRelu = enum_auto() - LinearCombinationBias = enum_auto() - LinearCombinationGelu = enum_auto() - - -EpilogueFunctorTag = { - EpilogueFunctor.LinearCombination: "cutlass::epilogue::thread::LinearCombination", - EpilogueFunctor.LinearCombinationRelu: "cutlass::epilogue::thread::LinearCombinationRelu", - EpilogueFunctor.LinearCombinationBias: "cutlass::epilogue::thread::LinearCombination", - EpilogueFunctor.LinearCombinationGelu: "cutlass::epilogue::thread::LinearCombinationGELU", -} - - -class SwizzlingFunctor(enum.Enum): - Identity1 = enum_auto() - Identity2 = enum_auto() - Identity4 = enum_auto() - Identity8 = enum_auto() - - -SwizzlingFunctorTag = { - SwizzlingFunctor.Identity1: "cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<1>", - SwizzlingFunctor.Identity2: "cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<2>", - SwizzlingFunctor.Identity4: "cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<4>", - SwizzlingFunctor.Identity8: "cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>", -} - - -class MathInstruction: - """Describe characteristics of a math instruction.""" - - def __init__( - self, - instruction_shape, - element_a, - element_b, - element_accumulator, - opcode_class, - math_operation=MathOperation.multiply_add, - ): - self.instruction_shape = instruction_shape - self.element_a = element_a - self.element_b = element_b - self.element_accumulator = element_accumulator - self.opcode_class = opcode_class - self.math_operation = math_operation - - -class TileDescription: - """Describe characteristics of a GEMM tile.""" - - def __init__( - self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute - ): - self.threadblock_shape = threadblock_shape - self.stages = stages - self.warp_count = warp_count - self.math_instruction = math_instruction - self.minimum_compute_capability = min_compute - self.maximum_compute_capability = max_compute - - def procedural_name(self): - return "%dx%d_%dx%d" % ( - self.threadblock_shape[0], - self.threadblock_shape[1], - self.threadblock_shape[2], - self.stages, - ) - - -class TensorDescription: - def __init__(self, element, layout, alignment=1): - self.element = element - self.layout = layout - self.alignment = alignment diff --git a/python/tvm/contrib/target/onnx.py b/python/tvm/contrib/target/onnx.py index 2b142dc75a053..c26255fc55173 100644 --- a/python/tvm/contrib/target/onnx.py +++ b/python/tvm/contrib/target/onnx.py @@ -70,10 +70,6 @@ def get_onnx_version(): return onnx.__version__ -def get_node_shape(node): - return tuple("Any" if isinstance(i, tvm.tir.Any) else int(i) for i in node.shape) - - def infer_type(node): """A method to infer the type of a relay expression.""" mod = tvm.IRModule.from_expr(node) @@ -525,7 +521,7 @@ def convert(cls, node_entry, model_container, node_dict): input_node = node_dict[node_entry["inputs"][0]] assert len(input_node) == 1, "input node can not be a Tuple" input_node = input_node[0] - shape = get_node_shape(input_node["types"][0]) + shape = input_node["types"][0].concrete_shape indices_or_sect = attrs["indices_or_section"] axis = attrs["axis"] 
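As a standalone illustration of the ${key} substitution loop in library.py above: the outer while loop runs to a fixed point, so a value may itself contain further placeholders. The template and values here are made up for the example:

import re

def substitute_template(template, values):
    # Replace every ${key} occurrence; repeat until nothing changes so that
    # values containing placeholders of their own are also expanded.
    text = template
    changed = True
    while changed:
        changed = False
        for key, value in values.items():
            regex = "\\$\\{%s\\}" % key
            newtext = re.sub(regex, value, text)
            if newtext != text:
                changed = True
                text = newtext
    return text

values = {"operation": "${kind}_${dtype}", "kind": "gemm", "dtype": "f16"}
print(substitute_template("using Op = ${operation};", values))
# using Op = gemm_f16;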
@@ -1023,7 +1019,7 @@ def _add_input(self, node_entry, idx): node_type = node_entry["types"][0] dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[numpy.dtype(node_type.dtype)] input = onnx.helper.make_tensor_value_info( - node_entry["name"], dtype, shape=get_node_shape(node_type) + node_entry["name"], dtype, shape=node_type.concrete_shape ) self._mc.add_inputs([input]) @@ -1034,7 +1030,7 @@ def _add_output(self, node_entries): for node_type, output_name in zip(node_entry["types"], node_entry["output_names"]): dtype = onnx.mapping.NP_TYPE_TO_TENSOR_TYPE[numpy.dtype(node_type.dtype)] output = onnx.helper.make_tensor_value_info( - output_name, dtype, shape=get_node_shape(node_type) + output_name, dtype, shape=node_type.concrete_shape ) self._mc.add_outputs([output]) diff --git a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py index 0c04d2b7248fc..ba7862378557c 100644 --- a/python/tvm/driver/tvmc/composite_target.py +++ b/python/tvm/driver/tvmc/composite_target.py @@ -23,8 +23,7 @@ import tvm.contrib.target.vitis_ai # pylint: disable=unused-import from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib -from tvm.relay.op.contrib.ethosn import partition_for_ethosn77 -from tvm.relay.op.contrib.ethosn import partition_for_ethosn78 +from tvm.relay.op.contrib.ethosn import partition_for_ethosn from tvm.relay.op.contrib.cmsisnn import partition_for_cmsisnn from tvm.relay.op.contrib.ethosu import partition_for_ethosu from tvm.relay.op.contrib.bnns import partition_for_bnns @@ -58,11 +57,7 @@ }, "ethos-n77": { "config_key": "relay.ext.ethos-n.options", - "pass_pipeline": partition_for_ethosn77, - }, - "ethos-n78": { - "config_key": "relay.ext.ethos-n.options", - "pass_pipeline": partition_for_ethosn78, + "pass_pipeline": partition_for_ethosn, }, "ethos-u": { "config_key": "relay.ext.ethosu.options", diff --git a/python/tvm/driver/tvmc/frontends.py b/python/tvm/driver/tvmc/frontends.py index 13ab3dd170c34..21d3d59fb0133 100644 --- a/python/tvm/driver/tvmc/frontends.py +++ b/python/tvm/driver/tvmc/frontends.py @@ -262,9 +262,7 @@ def load(self, path, shape_dict=None, **kwargs): input_shapes = list(shape_dict.items()) logger.debug("parse Torch model and convert into Relay computation graph") - return relay.frontend.from_pytorch( - traced_model, input_shapes, keep_quantized_weight=True, **kwargs - ) + return relay.frontend.from_pytorch(traced_model, input_shapes, **kwargs) class PaddleFrontend(Frontend): diff --git a/python/tvm/ir/instrument.py b/python/tvm/ir/instrument.py index 70c482e06125c..1948a6787eaca 100644 --- a/python/tvm/ir/instrument.py +++ b/python/tvm/ir/instrument.py @@ -29,140 +29,52 @@ class PassInstrument(tvm.runtime.Object): """A pass instrument implementation. - To use, a user class can either subclass from PassInstrument - directly, or can apply the :py:func:`pass_instrument` wrapper. In - either case, the `enter_pass_ctx`, `exit_pass_ctx`, `should_run`, - `run_before_pass`, and `run_after_pass` methods can be defined to - adjust the instrument's behavior. See the no-op implementations - in this class definition for more information on each. - + Users don't need to interact with this class directly. + Instead, a `PassInstrument` instance should be created through + :py:func:`pass_instrument` """ - def __init__(self): - # initialize handle in case pi_cls creation failed. - self.handle = None - cls = type(self) - - # If the child class declared the method, then use it. 
- # Otherwise, pass None to avoid a C++ -> Python round trip for - # a no-op. - def get_child_method(name): - if getattr(cls, name) is getattr(PassInstrument, name): - return None - - return getattr(self, name) - - # Create runtime pass instrument object. - # reister instance's enter_pass_ctx,exit_pass_ctx, should_run, run_before_pass and - # run_after_pass methods to it if present. - self.__init_handle_by_constructor__( - _ffi_instrument_api.PassInstrument, - cls.__name__, - get_child_method("enter_pass_ctx"), - get_child_method("exit_pass_ctx"), - get_child_method("should_run"), - get_child_method("run_before_pass"), - get_child_method("run_after_pass"), - ) - - def enter_pass_ctx(self): - """Called when entering the instrumented context. - - Returns - ------- - None - """ - - def exit_pass_ctx(self): - """Called when exiting the instrumented context. - - Returns - ------- - None - """ - - def should_run(self, mod, info): - """Determine whether to run the pass or not. - - Called once for each pass that is run while the instrumented - context is active. - - Parameters - ---------- - mod : tvm.ir.module.IRModule - - The module on which an optimization pass is being run. - - info : tvm.transform.PassInfo - - The pass information. - - Returns - ------- - should_run : bool - - True to run the pass, or False to skip the pass. - """ - - def run_before_pass(self, mod, info): - """Instrument before the pass runs. - - Called once for each pass that is run while the instrumented - context is active. - - Parameters - ---------- - mod : tvm.ir.module.IRModule - - The module on which an optimization pass is being run. - - info : tvm.transform.PassInfo - - The pass information. - - Returns - ------- - None - """ - - def run_after_pass(self, mod, info): - """Instrument after the pass runs. - - Called once for each pass that is run while the instrumented - context is active. - - Parameters - ---------- - mod : tvm.ir.module.IRModule - - The module on which an optimization pass is being run. - - info : tvm.transform.PassInfo - - The pass information. - - Returns - ------- - None - """ - def _wrap_class_pass_instrument(pi_cls): """Wrap a python class as pass instrument""" - # No additional wrapping needed if the user class already - # inherits. - if issubclass(pi_cls, PassInstrument): - return pi_cls - - class PyPassInstrument(pi_cls, PassInstrument): + class PyPassInstrument(PassInstrument): """Internal wrapper class to create a class instance.""" def __init__(self, *args, **kwargs): # initialize handle in case pi_cls creation failed. self.handle = None - pi_cls.__init__(self, *args, **kwargs) - PassInstrument.__init__(self) + inst = pi_cls(*args, **kwargs) + + # check method declartion within class, if found, wrap it. + def create_method(method): + if hasattr(inst, method) and inspect.ismethod(getattr(inst, method)): + + def func(*args): + return getattr(inst, method)(*args) + + func.__name__ = "_" + method + return func + return None + + # create runtime pass instrument object + # reister instance's enter_pass_ctx,exit_pass_ctx, should_run, run_before_pass and + # run_after_pass methods to it if present. 
+ self.__init_handle_by_constructor__( + _ffi_instrument_api.PassInstrument, + pi_cls.__name__, + create_method("enter_pass_ctx"), + create_method("exit_pass_ctx"), + create_method("should_run"), + create_method("run_before_pass"), + create_method("run_after_pass"), + ) + + self._inst = inst + + def __getattr__(self, name): + # fall back to instance attribute if there is not any + return self._inst.__getattribute__(name) functools.update_wrapper(PyPassInstrument.__init__, pi_cls.__init__) PyPassInstrument.__name__ = pi_cls.__name__ diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index ba966d3791bb2..2aea9d3fd61d3 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -19,7 +19,6 @@ from .build import autotvm_build_func from .build import AutoTvmModuleLoader from .build import get_standalone_crt_dir -from .build import get_microtvm_template_projects from .model_library_format import export_model_library_format, UnsupportedInModelLibraryFormatError from .project import generate_project, GeneratedProject, TemplateProject from .session import ( diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index 795a61edcbb34..9e278081933cb 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -22,7 +22,6 @@ import os import pathlib import contextlib -import enum from typing import Union from .._ffi import libinfo @@ -35,24 +34,10 @@ STANDALONE_CRT_DIR = None -class MicroTVMTemplateProject(enum.Enum): - ZEPHYR = "zephyr" - ARDUINO = "arduino" - CRT = "crt" - - @classmethod - def list(cls): - return list(map(lambda c: c.value, cls)) - - class CrtNotFoundError(Exception): """Raised when the standalone CRT dirtree cannot be found.""" -class MicroTVMTemplateProjectNotFoundError(Exception): - """Raised when the microTVM template project dirtree cannot be found.""" - - def get_standalone_crt_dir() -> str: """Find the standalone_crt directory. @@ -79,37 +64,6 @@ def get_standalone_crt_dir() -> str: return STANDALONE_CRT_DIR -def get_microtvm_template_projects(platform: str) -> str: - """Find microTVM template project directory for specific platform. - - Parameters - ---------- - platform : str - Platform type which should be defined in MicroTVMTemplateProject. - - Returns - ------- - str : - Path to template project directory for platform. - """ - if platform not in MicroTVMTemplateProject.list(): - raise ValueError(f"platform {platform} is not supported.") - - if platform == MicroTVMTemplateProject.CRT.value: - return os.path.join(get_standalone_crt_dir(), "template", "host") - - microtvm_template_projects = None - for path in libinfo.find_lib_path(): - template_path = os.path.join(os.path.dirname(path), "microtvm_template_projects") - if os.path.isdir(template_path): - microtvm_template_projects = template_path - break - else: - raise MicroTVMTemplateProjectNotFoundError() - - return os.path.join(microtvm_template_projects, platform) - - class AutoTvmModuleLoader: """MicroTVM AutoTVM Module Loader diff --git a/python/tvm/micro/project.py b/python/tvm/micro/project.py index a5e54aa816a3f..d1a36ac79d640 100644 --- a/python/tvm/micro/project.py +++ b/python/tvm/micro/project.py @@ -184,7 +184,7 @@ def generate_project_from_mlf( mlf_path : pathlib.Path or str Path to the Model Library Format archive that will be used when creating - the new project. The archive file will be copied to project_dir. + the new project. options : dict Project API options given to the microTVM API server for the specified platform. 
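For context, the get_microtvm_template_projects helper removed above boils down to an enum-validated directory search. The sketch below replaces the libinfo lookup with an explicit list of candidate directories, omits the special CRT case, and uses hypothetical paths in the usage comment:

import enum
import os

class TemplatePlatform(enum.Enum):
    ZEPHYR = "zephyr"
    ARDUINO = "arduino"
    CRT = "crt"

def find_template_project(platform, search_dirs):
    # Reject anything that is not a declared platform.
    if platform not in [p.value for p in TemplatePlatform]:
        raise ValueError(f"platform {platform} is not supported.")
    # Look for a microtvm_template_projects directory next to each candidate path.
    for candidate in search_dirs:
        template_path = os.path.join(candidate, "microtvm_template_projects")
        if os.path.isdir(template_path):
            return os.path.join(template_path, platform)
    raise FileNotFoundError("microtvm_template_projects directory not found")

# Hypothetical usage:
#   find_template_project("zephyr", ["/opt/tvm/build", "/usr/local/lib/tvm"])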
diff --git a/python/tvm/micro/testing.py b/python/tvm/micro/testing.py index 81e29a92a86a1..124f66e021a3f 100644 --- a/python/tvm/micro/testing.py +++ b/python/tvm/micro/testing.py @@ -19,16 +19,8 @@ import pathlib import json -import logging -import tarfile -import time from typing import Union -from tvm.micro.project_api.server import IoTimeoutError - -# Timeout in seconds for AOT transport. -TIMEOUT_SEC = 10 - def check_tune_log(log_path: Union[pathlib.Path, str]): """Read the tuning log and check each result.""" @@ -39,47 +31,3 @@ def check_tune_log(log_path: Union[pathlib.Path, str]): if len(line) > 0: tune_result = json.loads(line) assert tune_result["result"][0][0] < 1000000000.0 - - -def aot_transport_init_wait(transport): - """Send init message to microTVM device until it receives wakeup sequence.""" - while True: - try: - aot_transport_find_message(transport, "wakeup", timeout_sec=TIMEOUT_SEC) - break - except IoTimeoutError: - transport.write(b"init%", timeout_sec=TIMEOUT_SEC) - - -def aot_transport_find_message(transport, expression: str, timeout_sec: int) -> str: - """Read transport message until it finds the expression.""" - timeout = timeout_sec - start_time = time.monotonic() - while True: - data = _read_line(transport, timeout) - logging.debug("new line: %s", data) - if expression in data: - return data - timeout = max(0, timeout_sec - (time.monotonic() - start_time)) - - -def _read_line(transport, timeout_sec: int) -> str: - data = bytearray() - while True: - new_data = transport.read(1, timeout_sec=timeout_sec) - logging.debug("read data: %s", new_data) - for item in new_data: - data.append(item) - if str(chr(item)) == "\n": - return data.decode(encoding="utf-8") - - -def mlf_extract_workspace_size_bytes(mlf_tar_path: Union[pathlib.Path, str]) -> int: - """Extract an MLF archive file and read workspace size from metadata file.""" - - with tarfile.open(mlf_tar_path, "r:*") as tar_file: - tar_members = [ti.name for ti in tar_file.getmembers()] - assert "./metadata.json" in tar_members - with tar_file.extractfile("./metadata.json") as f: - metadata = json.load(f) - return metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"] diff --git a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py index bc95a9a3bab7d..c792ade066435 100644 --- a/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py +++ b/python/tvm/relay/backend/contrib/ethosu/tir/compiler.py @@ -78,7 +78,6 @@ def lower_ethosu(sch, args, const_dict, name="main"): mod = tvm.tir.transform.Simplify()(mod) mod = tvm.tir.transform.StorageFlatten(64)(mod) mod = tvm.tir.transform.UnrollLoop()(mod) - mod = tvm.tir.transform.Simplify()(mod) mod = tvm.tir.transform.LoopPartition()(mod) mod = RemoveZeroStores()(mod) mod = tvm.tir.transform.Simplify()(mod) diff --git a/python/tvm/relay/backend/vm.py b/python/tvm/relay/backend/vm.py index 1dde27f172b12..363ff893df8b5 100644 --- a/python/tvm/relay/backend/vm.py +++ b/python/tvm/relay/backend/vm.py @@ -275,18 +275,14 @@ def __init__(self, mod, device, target): self.mod = mod self.device = device self.target = target - self.executable = None - self.vm = None + self.executable = compile(mod, target) + self.vm = vm_rt.VirtualMachine(self.executable, device) def _make_executor(self, expr=None): - if expr: - self.mod["main"] = expr - - self.executable = compile(self.mod, self.target) - self.vm = vm_rt.VirtualMachine(self.executable, self.device) + main = self.mod["main"] def _vm_wrapper(*args, **kwargs): - args 
= self._convert_args(self.mod["main"], args, kwargs) + args = self._convert_args(main, args, kwargs) return self.vm.run(*args) return _vm_wrapper diff --git a/python/tvm/relay/frontend/paddlepaddle.py b/python/tvm/relay/frontend/paddlepaddle.py index fa7c80c912d95..ef361d6c55e83 100644 --- a/python/tvm/relay/frontend/paddlepaddle.py +++ b/python/tvm/relay/frontend/paddlepaddle.py @@ -115,32 +115,6 @@ def convert_binary_logical_op(g, op, block): g.add_node(op.output("Out")[0], out) -def convert_addmm(g, op, block): - """Operator converter for addmm.""" - - input_x = g.get_node(op.input("Input")[0]) - x = g.get_node(op.input("X")[0]) - y = g.get_node(op.input("Y")[0]) - - alpha = op.attr("Alpha") - beta = op.attr("Beta") - dtype = block.var(op.output("Out")[0]).dtype - dtype = _convert_dtype_value(dtype) - - if not isinstance(alpha, _expr.Expr) and alpha != 1: - alpha = _expr.const(alpha, dtype) - x *= alpha - - if not isinstance(beta, _expr.Expr) and beta != 1: - beta = _expr.const(beta, dtype) - input_x *= beta - - transposed_y = _op.transpose(y, axes=[1, 0]) - dense_out = _op.nn.dense(x, transposed_y) - out = dense_out + input_x - g.add_node(op.output("Out")[0], out) - - def convert_arg_max_min(g, op, block): """Operator converter for arg_max and arg_min.""" @@ -218,26 +192,6 @@ def convert_batch_norm(g, op, block): g.add_node(op.output("Y")[0], out[0]) -def convert_bmm(g, op, block): - """Operator converter for bmm.""" - - x = g.get_node(op.input("X")[0]) - y = g.get_node(op.input("Y")[0]) - y = _op.transpose(y, [0, 2, 1]) - out = _op.nn.batch_matmul(x, y) - g.add_node(op.output("Out")[0], out) - - -def convert_brelu(g, op, block): - """Operator converter for brelu.""" - - x = g.get_node(op.input("X")[0]) - t_max = op.attr("t_max") - t_min = op.attr("t_min") - out = _op.tensor.clip(x, t_min, t_max) - g.add_node(op.output("Out")[0], out) - - def convert_cast(g, op, block): """Operator converter for cast.""" @@ -459,29 +413,6 @@ def convert_fill_constant(g, op, block): g.add_node(op.output("Out")[0], out) -def convert_gather(g, op, block): - """Operator converter for gather.""" - - x = g.get_node(op.input("X")[0]) - index = g.get_node(op.input("Index")[0]) - axis = op.attr("axis") - out = _op.take(x, index, axis) - g.add_node(op.output("Out")[0], out) - - -def convert_gather_nd(g, op, block): - """Operator converter for gather_nd.""" - - x = g.get_node(op.input("X")[0]) - index = g.get_node(op.input("Index")[0]) - shape = infer_shape(index) - perm = list(range(0, len(shape) - 1)) - perm.insert(0, len(shape) - 1) - index = _op.transpose(index, axes=perm) - out = _op.gather_nd(x, index, 0, shape[-1]) - g.add_node(op.output("Out")[0], out) - - def convert_gelu(g, op, block): """Operator converter for gelu.""" @@ -493,39 +424,6 @@ def convert_gelu(g, op, block): g.add_node(op.output("Out")[0], out) -def convert_group_norm(g, op, block): - """Operator converter for group_norm.""" - - x = g.get_node(op.input("X")[0]) - num_groups = op.attr("groups") - epsilon = op.attr("epsilon") - gamma = g.get_node(op.input("Scale")[0]) - beta = g.get_node(op.input("Bias")[0]) - out = _op.nn.group_norm( - x, - gamma=gamma, - beta=beta, - num_groups=num_groups, - axis=1, - epsilon=epsilon, - center=True, - scale=True, - ) - g.add_node(op.output("Y")[0], out) - - -def convert_hard_shrink(g, op, block): - """Operator converter for hard_shrink.""" - - x = g.get_node(op.input("X")[0]) - dtype = infer_type(x).checked_type.dtype - threshold = op.attr("threshold") - threshold = _op.const(threshold, dtype) - out = 
_op.logical_or(x < _op.const(-1.0, dtype) * threshold, x > threshold) - out = _op.cast(out, dtype) * x - g.add_node(op.output("Out")[0], out) - - def convert_hard_sigmoid(g, op, block): """Operator converter for hard_sigmoid.""" @@ -592,15 +490,6 @@ def convert_leaky_relu(g, op, block): g.add_node(op.output("Out")[0], out) -def convert_logical_not(g, op, block): - """Operator converter for logical_not op.""" - - ipt0 = g.get_node(op.input("X")[0]) - op_func = get_relay_op(op.type) - out = op_func(ipt0) - g.add_node(op.output("Out")[0], out) - - def convert_lookup_table(g, op, block): """Operator converter for lookup_table_v2.""" @@ -874,62 +763,6 @@ def convert_pool2d(g, op, block): g.add_node(op.output("Out")[0], out) -def convert_pow(g, op, block): - """Operator converter for pow.""" - - x = g.get_node(op.input("X")[0]) - dtype = block.var(op.output("Out")[0]).dtype - dtype = _convert_dtype_value(dtype) - factor = op.attr("factor") - factor = _expr.const(factor, dtype=dtype) - out = _op.power(x, factor) - g.add_node(op.output("Out")[0], out) - - -def convert_reciprocal(g, op, block): - """Operator converter for reciprocal.""" - - x = g.get_node(op.input("X")[0]) - dtype = infer_type(x).checked_type.dtype - out = _expr.const(1.0, dtype) / x - g.add_node(op.output("Out")[0], out) - - -def convert_reduce(g, op, block): - """Operator converter for series of reduce operators.""" - - op_map = { - "reduce_all": "all", - "reduce_any": "any", - "reduce_max": "max", - "reduce_min": "min", - "reduce_prod": "prod", - "reduce_sum": "sum", - "reduce_mean": "mean", - } - op_name = op_map[op.type] - input_x = g.get_node(op.input("X")[0]) - axis = op.attr("dim") - if op.attr("reduce_all"): - axis = None - keepdims = op.attr("keep_dim") - out = get_relay_op(op_name)(input_x, axis=axis, keepdims=keepdims) - if not axis and not keepdims: - # use `expand_dims` to solve the following situation - # for TVM, the shape of `out` will be (, ) - # for Paddle, the shape of `out` will be [1] - out = _op.expand_dims(out, axis=0) - g.add_node(op.output("Out")[0], out) - - -def convert_relu6(g, op, block): - """Operator converter for relu6.""" - - x = g.get_node(op.input("X")[0]) - out = _op.clip(x, 0.0, 6.0) - g.add_node(op.output("Out")[0], out) - - def convert_reshape(g, op, block): """Operator converter for reshape.""" @@ -983,56 +816,6 @@ def convert_scale(g, op, block): g.add_node(op.output("Out")[0], out) -def convert_scatter(g, op, block): - """Operator converter for scatter.""" - - x = g.get_node(op.input("X")[0]) - index = g.get_node(op.input("Ids")[0]) - updates = g.get_node(op.input("Updates")[0]) - overwrite = op.attr("overwrite") - - shape = infer_shape(updates) - ndims = len(shape) - index = _op.expand_dims(index, axis=-1, num_newaxis=ndims - 1) - index = _op.transform.broadcast_to(index, shape) - - if overwrite: - out = _op.scatter(x, index, updates, axis=0) - else: - out = _op.scatter_add(_op.zeros_like(x), index, updates, axis=0) - out += _op.scatter(x, index, _op.zeros_like(updates), axis=0) - g.add_node(op.output("Out")[0], out) - - -def convert_scatter_nd_add(g, op, block): - """Operator converter for scatter_nd_add.""" - - x = g.get_node(op.input("X")[0]) - index = g.get_node(op.input("Index")[0]) - updates = g.get_node(op.input("Updates")[0]) - indices_dim = len(infer_shape(index)) - axes = list(range(indices_dim)) - index = _op.transpose(index, axes[-1:] + axes[:-1]) - out = _op.scatter_nd(x, index, updates, mode="add") - g.add_node(op.output("Out")[0], out) - - -def convert_selu(g, op, block): - 
"""Operator converter for selu.""" - - x = g.get_node(op.input("X")[0]) - dtype = infer_type(x).checked_type.dtype - alpha = _op.const(op.attr("alpha"), dtype) - scale = _op.const(op.attr("scale"), dtype) - out = ( - _expr.const(-1.0, dtype=dtype) - * alpha - * _op.nn.relu(_expr.const(1.0, dtype=dtype) - _op.exp(x)) - ) - out = scale * (out + _op.nn.relu(x)) - g.add_node(op.output("Out")[0], out) - - def convert_shape(g, op, block): """Operator converter for shape.""" @@ -1041,15 +824,6 @@ def convert_shape(g, op, block): g.add_node(op.output("Out")[0], out) -def convert_size(g, op, block): - """Operator converter for size.""" - - input_x = g.get_node(op.input("Input")[0]) - out = _op.ndarray_size(input_x, dtype="int64") - out = _op.expand_dims(out, axis=0) - g.add_node(op.output("Out")[0], out) - - def convert_slice(g, op, block): """Operator converter for slice.""" @@ -1104,36 +878,6 @@ def convert_softmax(g, op, block): g.add_node(op.output("Out")[0], out) -def convert_softplus(g, op, block): - """Operator converter for softplus.""" - - x = g.get_node(op.input("X")[0]) - dtype = infer_type(x).checked_type.dtype - beta = op.attr("beta") - beta = _expr.const(beta, dtype=dtype) - out = _op.log(_op.exp(x * beta) + _expr.const(1.0, dtype=dtype)) / beta - g.add_node(op.output("Out")[0], out) - - -def convert_softsign(g, op, block): - """Operator converter for softsign.""" - - x = g.get_node(op.input("X")[0]) - dtype = infer_type(x).checked_type.dtype - out = x / (_op.const(1.0, dtype) + _op.abs(x)) - g.add_node(op.output("Out")[0], out) - - -def convert_square(g, op, block): - """Operator converter for square.""" - - x = g.get_node(op.input("X")[0]) - dtype = block.var(op.output("Out")[0]).dtype - dtype = _convert_dtype_value(dtype) - out = _op.power(x, _expr.const(2, dtype)) - g.add_node(op.output("Out")[0], out) - - def convert_squeeze(g, op, block): """Operator converter for squeeze2.""" @@ -1145,23 +889,6 @@ def convert_squeeze(g, op, block): g.add_node(op.output("Out")[0], x) -def convert_swish(g, op, block): - """Operator converter for swish.""" - - x = g.get_node(op.input("X")[0]) - dtype = infer_type(x).checked_type.dtype - out = x / (_op.const(1.0, dtype) + _op.exp(_op.const(-1.0, dtype) * x)) - g.add_node(op.output("Out")[0], out) - - -def convert_transpose(g, op, block): - """Operator converter for transpose.""" - - perm = op.attr("axis") - out = _op.transpose(g.get_node(op.input("X")[0]), axes=perm) - g.add_node(op.output("Out")[0], out) - - def convert_unsqueeze(g, op, block): """Operator converter for unsqueeze.""" @@ -1173,56 +900,31 @@ def convert_unsqueeze(g, op, block): _convert_map = { - "abs": convert_unary_op, - "acos": convert_unary_op, - "addmm": convert_addmm, "arg_max": convert_arg_max_min, "arg_min": convert_arg_max_min, "argsort": convert_argsort, - "asin": convert_unary_op, "assign": convert_assign, "assign_value": convert_assign_value, - "atan": convert_unary_op, "batch_norm": convert_batch_norm, - "bmm": convert_bmm, - "brelu": convert_brelu, "cast": convert_cast, - "ceil": convert_unary_op, "concat": convert_concat, "conv2d": convert_conv2d, - "cos": convert_unary_op, - "cosh": convert_unary_op, "cumsum": convert_cumsum, "depthwise_conv2d": convert_conv2d, "dot": convert_dot, "dropout": convert_dropout, "elementwise_add": convert_elementwise_op, "elementwise_div": convert_elementwise_op, - "elementwise_floordiv": convert_elementwise_op, - "elementwise_max": convert_elementwise_op, - "elementwise_min": convert_elementwise_op, - "elementwise_mod": 
convert_elementwise_op, "elementwise_mul": convert_elementwise_op, - "elementwise_pow": convert_elementwise_op, - "elementwise_prod": convert_elementwise_op, "elementwise_sub": convert_elementwise_op, "equal": convert_elementwise_op, - "erf": convert_unary_op, "exp": convert_unary_op, "expand_v2": convert_expand, "expand_as_v2": convert_expand_as, "feed": convert_feed, "fill_any_like": convert_fill_any_like, "fill_constant": convert_fill_constant, - "floor": convert_unary_op, - "floor_mod": convert_elementwise_op, - "gather": convert_gather, - "gather_nd": convert_gather_nd, "gelu": convert_gelu, - "greater_equal": convert_elementwise_op, - "greater_than": convert_elementwise_op, - "group_norm": convert_group_norm, - "hard_shrink": convert_hard_shrink, "hard_sigmoid": convert_hard_sigmoid, "hard_swish": convert_hard_swish, "isfinite_v2": convert_unary_op, @@ -1230,59 +932,23 @@ def convert_unsqueeze(g, op, block): "isnan_v2": convert_unary_op, "layer_norm": convert_layer_norm, "leaky_relu": convert_leaky_relu, - "less_equal": convert_elementwise_op, - "less_than": convert_elementwise_op, - "log": convert_unary_op, - "log2": convert_unary_op, - "log10": convert_unary_op, "logical_and": convert_binary_logical_op, - "logical_not": convert_logical_not, "logical_or": convert_binary_logical_op, "logical_xor": convert_binary_logical_op, "lookup_table_v2": convert_lookup_table, "matmul": convert_matmul, "matmul_v2": convert_matmul, "mul": convert_mul, - "not_equal": convert_elementwise_op, - "pad1d": convert_padding, - "pad2d": convert_padding, "pad3d": convert_padding, "pool2d": convert_pool2d, - "pow": convert_pow, "relu": convert_unary_op, - "relu6": convert_relu6, "reshape2": convert_reshape, - "round": convert_unary_op, - "reciprocal": convert_reciprocal, - "reduce_all": convert_reduce, - "reduce_any": convert_reduce, - "reduce_max": convert_reduce, - "reduce_min": convert_reduce, - "reduce_prod": convert_reduce, - "reduce_sum": convert_reduce, - "reduce_mean": convert_reduce, - "rsqrt": convert_unary_op, "scale": convert_scale, - "scatter": convert_scatter, - "scatter_nd_add": convert_scatter_nd_add, - "selu": convert_selu, "shape": convert_shape, - "sigmoid": convert_unary_op, - "sign": convert_unary_op, - "sin": convert_unary_op, - "sinh": convert_unary_op, - "size": convert_size, "slice": convert_slice, "softmax": convert_softmax, - "softplus": convert_softplus, - "softsign": convert_softsign, - "sqrt": convert_unary_op, - "square": convert_square, "squeeze2": convert_squeeze, - "swish": convert_swish, - "tan": convert_unary_op, "tanh": convert_unary_op, - "transpose2": convert_transpose, "unsqueeze2": convert_unsqueeze, } @@ -1457,10 +1123,6 @@ def from_paddle(program_or_layer, shape_dict=None, scope=None): import paddle - # disable system signal capturing in paddle framework - # the signal capturing may cause conflict while running autotvm with paddle frontend - paddle.disable_signal_handler() - g = GraphProto() if isinstance(program_or_layer, paddle.jit.TranslatedLayer): # model is loaded by `paddle.jit.load` diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 13704ff7aad94..3fc202a7cc91f 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -849,23 +849,35 @@ def hard_swish(self, inputs, input_types): data = inputs[0] return data * self.hard_sigmoid(inputs, input_types) - def adaptive_avg_pool(self, op, inputs, input_types): + def adaptive_avg_pool_2d(self, inputs, input_types): data = inputs[0] 
output_size = inputs[1] def func(x): - return op(x, output_size=output_size) + return _op.nn.adaptive_avg_pool2d(x, output_size=output_size) if self.is_quantized_tensor(data): return qnn_torch.apply_with_upcast(data, func) return func(data) - def adaptive_max_pool(self, op, inputs, input_types): + def adaptive_max_pool_2d(self, inputs, input_types): data = inputs[0] output_size = inputs[1] + # returns dummy indices too - return op(data, output_size=output_size), None + return _op.nn.adaptive_max_pool2d(data, output_size=output_size), None + + def adaptive_max_pool_3d(self, inputs, input_types): + data = inputs[0] + output_size = inputs[1] + # returns dummy indices too + return _op.nn.adaptive_max_pool3d(data, output_size=output_size), None + + def adaptive_avg_pool_3d(self, inputs, input_types): + data = inputs[0] + output_size = inputs[1] + return _op.nn.adaptive_avg_pool3d(data, output_size=output_size) @staticmethod def convert_const_list(data): @@ -2782,39 +2794,6 @@ def searchsorted(self, inputs, input_types): def bucketize(self, inputs, input_types): return self.searchsorted_common(inputs[1], inputs[0], inputs[2], inputs[3]) - def roll(self, inputs, input_types): - def slide_axes(inp, shape, ax): - axes = list(range(len(shape))) - axes = axes[:ax] + [-1] + axes[ax:-1] - return _op.transpose(inp, axes) - - x = inputs[0] - shifts = inputs[1] - dims = inputs[2] - shape = self.infer_shape(x) - start = _expr.const(0, "int64") - step = _expr.const(1, "int64") - - out = x - for i, dim in enumerate(dims): - roll_dim = _expr.const(shape[dim], "int64") - indices_1d = _op.mod( - _op.transform.arange(start, roll_dim, step, "int64") - - _expr.const(shifts[i], "int64") - + roll_dim, - roll_dim, - ) - # First fill in the last axis with roll indices, and then do transpose to - # bring the roll indices into the desired axis. 
- indices = slide_axes( - _op.tile(indices_1d, shape[:dim] + shape[dim + 1 :] + (1,)), - shape, - dim, - ) - out = _op.gather(out, dim, indices) - - return out - # Operator mappings def create_convert_map(self): self.convert_map = { @@ -2872,26 +2851,9 @@ def create_convert_map(self): "aten::gelu": self.gelu, "aten::selu": self.selu, "aten::silu": self.silu, - "aten::silu_": self.silu, "aten::log_sigmoid": self.log_sigmoid, - "aten::adaptive_avg_pool1d": functools.partial( - self.adaptive_avg_pool, _op.nn.adaptive_avg_pool1d - ), - "aten::adaptive_avg_pool2d": functools.partial( - self.adaptive_avg_pool, _op.nn.adaptive_avg_pool2d - ), - "aten::adaptive_avg_pool3d": functools.partial( - self.adaptive_avg_pool, _op.nn.adaptive_avg_pool3d - ), - "aten::adaptive_max_pool1d": functools.partial( - self.adaptive_max_pool, _op.nn.adaptive_max_pool1d - ), - "aten::adaptive_max_pool2d": functools.partial( - self.adaptive_max_pool, _op.nn.adaptive_max_pool2d - ), - "aten::adaptive_max_pool3d": functools.partial( - self.adaptive_max_pool, _op.nn.adaptive_max_pool3d - ), + "aten::adaptive_avg_pool2d": self.adaptive_avg_pool_2d, + "aten::adaptive_max_pool2d": self.adaptive_max_pool_2d, "aten::max_pool2d": self.maxpool_2d, "aten::max_pool2d_with_indices": self.maxpool_2d_with_indices, "aten::max_pool1d": self.maxpool_1d, @@ -2977,7 +2939,6 @@ def create_convert_map(self): "aten::rsqrt": self.make_unary("rsqrt"), "aten::ceil": self.make_unary("ceil"), "aten::floor": self.make_unary("floor"), - "aten::floor_": self.make_unary("floor"), "aten::round": self.make_unary("round"), "aten::isfinite": self.make_unary("isfinite"), "aten::isinf": self.make_unary("isinf"), @@ -3003,6 +2964,8 @@ def create_convert_map(self): "aten::bitwise_xor": self.bitwise_xor, "aten::Bool": self.Bool, "aten::Float": self.Float, + "aten::adaptive_avg_pool3d": self.adaptive_avg_pool_3d, + "aten::adaptive_max_pool3d": self.adaptive_max_pool_3d, "aten::rsub": self.rsub, "aten::embedding": self.embedding, "aten::one_hot": self.one_hot, @@ -3058,7 +3021,6 @@ def create_convert_map(self): "aten::any": functools.partial(self.all_any_common, _op.any), "aten::searchsorted": self.searchsorted, "aten::bucketize": self.bucketize, - "aten::roll": self.roll, } def update_convert_map(self, custom_map): diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 97382ffe6fcd5..3688ff5ff4e57 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -1135,6 +1135,8 @@ def _convert_unary_elemwise(self, relay_op, op): def convert_abs(self, op): """Convert TFLite ABS""" + if self.is_quantized(op): + raise tvm.error.OpNotImplemented("TFlite quantized ABS operator is not supported yet.") return self._convert_unary_elemwise(_op.abs, op) def convert_ceil(self, op): @@ -1201,6 +1203,8 @@ def convert_rsqrt(self, op): def convert_neg(self, op): """Convert TFLite NEG""" + if self.is_quantized(op): + raise tvm.error.OpNotImplemented("TFlite quantized NEG operator is not supported yet.") return self._convert_unary_elemwise(_op.negative, op) def convert_elu(self, op): diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py index 1dd6da6c2747f..30c2db0ddf0b9 100644 --- a/python/tvm/relay/op/contrib/__init__.py +++ b/python/tvm/relay/op/contrib/__init__.py @@ -24,4 +24,3 @@ from .coreml import * from .ethosn import * from .tensorrt import * -from .cutlass import * diff --git a/python/tvm/relay/op/contrib/cutlass.py b/python/tvm/relay/op/contrib/cutlass.py 
deleted file mode 100644 index 631089ce766d8..0000000000000 --- a/python/tvm/relay/op/contrib/cutlass.py +++ /dev/null @@ -1,74 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Patterns supported CUTLASS.""" -from tvm.relay import transform -from ...dataflow_pattern import wildcard, is_op, is_constant - - -def make_gelu_pattern(bias_out, out_dtype="float16"): - mul = is_op("multiply")(bias_out, is_constant()) - if out_dtype == "float16": - erf = is_op("cast")(is_op("erf")(is_op("cast")(mul))) - else: - erf = is_op("erf")(mul) - mul_half = is_op("multiply")(erf, is_constant()) - add = is_op("add")(mul_half, is_constant()) - return is_op("multiply")(add, bias_out) - - -def make_gemm_pattern(with_bias=True, with_act=None, out_dtype="float16"): - """Create a pattern for dense op followed by activations.""" - data = wildcard() - weight = wildcard() - bias = wildcard() - gemm = is_op("nn.dense")(data, weight) - if with_bias: - add_or_bias_add = is_op("add") | is_op("nn.bias_add") - gemm_out = add_or_bias_add(gemm, bias) - else: - gemm_out = gemm - - if with_act is None: - return gemm_out - if isinstance(with_act, str) and with_act == "relu": - return is_op("nn.relu")(gemm_out) - - assert isinstance(with_act, str) and with_act == "gelu" - return make_gelu_pattern(gemm_out, out_dtype) - - -def partition_for_cutlass(mod): - """Partition the input module into CUTLASS-supported subgraphs.""" - dense_pat = ("cutlass.dense", make_gemm_pattern(False, None)) - dense_bias_pat = ("cutlass.dense_bias", make_gemm_pattern(True, None)) - dense_bias_relu_pat = ("cutlass.dense_bias_relu", make_gemm_pattern(True, "relu")) - dense_bias_gelu_fp16_pat = ("cutlass.dense_bias_gelu_fp16", make_gemm_pattern(True, "gelu")) - dense_bias_gelu_fp32_pat = ( - "cutlass.dense_bias_gelu_fp32", - make_gemm_pattern(True, "gelu", out_dtype="float32"), - ) - cutlass_patterns = [ - dense_bias_gelu_fp16_pat, - dense_bias_gelu_fp32_pat, - dense_bias_relu_pat, - dense_bias_pat, - dense_pat, - ] - mod = transform.MergeComposite(cutlass_patterns)(mod) - mod = transform.AnnotateTarget(["cutlass"])(mod) - mod = transform.PartitionGraph()(mod) - return mod diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index 412ae713bae16..39ecec7049b3c 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ b/python/tvm/relay/op/contrib/ethosn.py @@ -46,7 +46,7 @@ def ethosn_available(): return Available.SW_AND_HW if hw else Available.SW_ONLY -def partition_for_ethosn77(mod, params=None, **opts): +def partition_for_ethosn(mod, params=None, **opts): """Partition the graph greedily offloading supported operators to Arm Ethos-N NPU. @@ -61,49 +61,6 @@ def partition_for_ethosn77(mod, params=None, **opts): ------- ret : annotated and partitioned module. 
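To show the pattern-table mechanism that the deleted partition_for_cutlass relies on (and that partition_for_ethosn below uses as well), here is a minimal sketch assuming a standard TVM build; the composite name "example.dense_relu" and the tensor shapes are arbitrary:

import tvm
from tvm import relay
from tvm.relay.dataflow_pattern import is_op, wildcard

# Pattern: nn.dense followed by nn.relu, registered under an illustrative name.
dense_relu = ("example.dense_relu", is_op("nn.relu")(is_op("nn.dense")(wildcard(), wildcard())))

x = relay.var("x", shape=(16, 32), dtype="float16")
w = relay.var("w", shape=(64, 32), dtype="float16")
mod = tvm.IRModule.from_expr(relay.nn.relu(relay.nn.dense(x, w)))

# MergeComposite rewrites the matched region into a "Composite" function;
# AnnotateTarget and PartitionGraph would then hand it to the external codegen.
mod = relay.transform.MergeComposite([dense_relu])(mod)
print(mod)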
""" - if opts: - tops = opts.get("tops", None) - ple_ratio = opts.get("ple_ratio", None) - sram_size = opts.get("sram_size", None) - if tops or ple_ratio or sram_size: - raise ValueError( - "Setting tops, ple_ratio or sram_size has no effect when targeting Ethos(TM)-N77" - ) - - if params: - mod["main"] = bind_params_by_name(mod["main"], params) - - seq = tvm.transform.Sequential( - [ - transform.InferType(), - transform.MergeComposite(pattern_table()), - transform.AnnotateTarget("ethos-n"), - transform.MergeCompilerRegions(), - transform.PartitionGraph(), - ] - ) - - return seq(mod) - - -def partition_for_ethosn78(mod, params=None, **opts): - """Partition the graph greedily offloading supported - operators to Arm Ethos(TM)-N NPU. - - Parameters - ---------- - mod : Module - The module to run passes on. - params : Optional[Dict[str, NDArray]] - Constant input parameters. - - Returns - ------- - ret : annotated and partitioned module. - """ - if not opts or opts.get("variant", "").lower() != "ethos-n78": - raise ValueError("When targeting Ethos(TM)-N78, -variant=Ethos-N78 should be set.") - if params: mod["main"] = bind_params_by_name(mod["main"], params) diff --git a/python/tvm/rpc/server_ios_launcher.py b/python/tvm/rpc/server_ios_launcher.py deleted file mode 100644 index 2e31586f64566..0000000000000 --- a/python/tvm/rpc/server_ios_launcher.py +++ /dev/null @@ -1,498 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -Python wrapper for running a RPC Server through iOS RPC -on the iOS simulator using the simctl command line tool. -""" -# pylint: disable=invalid-name -import os -import json -import time -import threading -import subprocess -from enum import Enum -from typing import Dict, List, AnyStr - - -class OSName(Enum): - """The names of the operating systems available on the simulator.""" - - iOS = "iOS" - tvOS = "tvOS" - watchOS = "watchOS" - - -class IOSDevice(Enum): - """The names of available iOS devices.""" - - iPhone = "iPhone" - iPod = "iPod" - iPad = "iPad" - - -class RPCServerMode(Enum): - """Server modes available in the iOS RPC application.""" - - standalone = "standalone" - proxy = "proxy" - tracker = "tracker" - - -def get_list_of_available_simulators() -> Dict[AnyStr, List]: - """ - List of simulators available on the system. Simulators are presented as a dictionary. - The dictionary key is the name of the operating system of the simulator. - The dictionary value is a list of all simulators with a given operating system. 
- """ - - with subprocess.Popen( - "xcrun simctl list devices available --json", - shell=True, - stdin=subprocess.PIPE, - stdout=subprocess.PIPE, - ) as proc: - out, _ = proc.communicate() - available_simulators = json.loads(out)["devices"] - available_simulators = { - key: value for key, value in available_simulators.items() if value != [] - } - return available_simulators - - -def grep_by_system(available_devices: Dict[AnyStr, List], system_name: OSName) -> List[Dict]: - """Search for simulators that use the target operating system.""" - - def find_index_of_substr(search_field: List[AnyStr], target: AnyStr) -> int: - for i, item in enumerate(search_field): - if target in item: - return i - raise ValueError("Search field doesn't content target") - - keys = list(available_devices.keys()) - - return available_devices[keys[find_index_of_substr(keys, system_name.value)]] - - -def grep_by_device(available_devices: List[Dict], device_name: IOSDevice) -> List[Dict]: - """Search for simulators that emulate a given device.""" - - return [item for item in available_devices if device_name.value in item["name"]] - - -def get_device_uid(target_device: Dict) -> AnyStr: - """Get a unique device ID.""" - - return target_device["udid"] - - -def check_call_with_runtime_error(cmd: AnyStr, error_message: AnyStr) -> None: - """Calling the function `subprocess.check_call` and catching its possible thrown exception.""" - - try: - subprocess.check_call(cmd.split(" ")) - except subprocess.CalledProcessError as called_process_error: - raise called_process_error from RuntimeError(error_message) - - -def boot_device(udid: AnyStr) -> None: - """Boot the device by its unique ID.""" - - cmd = f"xcrun simctl boot {udid}" - error_message = f"Failed to boot device with unique id: {udid}" - check_call_with_runtime_error(cmd, error_message) - if not is_booted(udid): - raise RuntimeError(error_message) - - -def shutdown_device(udid: AnyStr) -> None: - """Shutdown the device by its unique ID.""" - - cmd = f"xcrun simctl shutdown {udid}" - error_message = f"Failed to shut down device with unique id: {udid}" - check_call_with_runtime_error(cmd, error_message) - if not is_turned_off(udid): - raise RuntimeError(error_message) - - -def deploy_bundle_to_simulator(udid: AnyStr, bundle_path: AnyStr) -> None: - """Deploy iOS RPC bundle to simulator with its unique ID .""" - - check_call_with_runtime_error( - cmd=f"xcrun simctl install {udid} {bundle_path}", - error_message=f"Failed to deploy bundle <{bundle_path}> to device with unique id: {udid}", - ) - - -def delete_bundle_from_simulator(udid: AnyStr, bundle_id: AnyStr) -> None: - """Delete iOS RPC bundle from simulator with its unique ID .""" - - check_call_with_runtime_error( - cmd=f"xcrun simctl uninstall {udid} {bundle_id}", - error_message=f"Failed to uninstall bundle <{bundle_id}> " - f"from device with unique id: {udid}", - ) - - -def launch_ios_rpc( - udid: AnyStr, bundle_id: AnyStr, host_url: AnyStr, host_port: int, key: AnyStr, mode: AnyStr -): # pylint: disable=too-many-arguments, consider-using-with - """ - Launch iOS RPC application on simulator with No UI interconnection. - - udid : str - Unique device ID. - - bundle_id : str - iOS RPC bundle ID. - - host_url : str - The tracker/proxy address. - - host_port : int - The tracker/proxy port. - - key : str - The key used to identify the device type in tracker. - - mode : str - Server mode. See RPCServerMode. 
- """ - - cmd = ( - f"xcrun simctl launch --console {udid} {bundle_id}" - f" --immediate_connect" - f" --host_url={host_url}" - f" --host_port={host_port}" - f" --key={key}" - f" --server_mode={mode}" - f" --verbose" - ) - proc = subprocess.Popen( - cmd.split(" "), - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT, - bufsize=1, - universal_newlines=True, - ) - return proc - - -def terminate_ios_rpc(udid: AnyStr, bundle_id: AnyStr) -> None: - """Terminate iOS RPC application.""" - - check_call_with_runtime_error( - cmd=f"xcrun simctl terminate {udid} {bundle_id}", - error_message=f"Failed to terminate bundle <{bundle_id}> " - f"from device with unique id: {udid}", - ) - - -def is_booted(udid: AnyStr) -> bool: - """Check that the device has booted.""" - - device = find_device(udid) - return device["state"] == "Booted" - - -def is_turned_off(udid: AnyStr) -> bool: - """Check that the device has turned off.""" - - device = find_device(udid) - return device["state"] == "Shutdown" - - -def check_booted_device(devices: List[Dict]) -> Dict: - """Check if there is already a booted device. If so, return this device.""" - - for device in devices: - if device["state"] == "Booted": - return device - return {} - - -def find_device(udid: AnyStr) -> Dict: - """Find device by its unique ID.""" - - return_value = {} - available_devices = get_list_of_available_simulators() - for devices in available_devices.values(): - for device in devices: - if device["udid"] == udid: - return_value = device - return return_value - - -class ServerIOSLauncher: - """ - Python wrapper for launch iOS RPC to simulator. - - mode : str - Server mode. See RPCServerMode. - - host : str - The tracker/proxy address. - - port : int - The tracker/proxy port. - - key : str - The key used to identify the device type in tracker. - """ - - booted_devices = [] - bundle_id = os.environ.get("BUNDLE_ID") - bundle_path = os.environ.get("BUNDLE_PATH") - - class ConsoleMarkers(Enum): - """ - Marker-messages that iOS RPC Server should print to the console output - when its states change (see apps/ios_rpc/tvmrpc/RPCServer.mm). - - STOPPED : str - iOS RPC Server process was stopped - - CALLSTACK : str - Call stack if RPC Server was stopped with an error. - - CONNECTED : str - RPC Server reports that it successfully connected. - - SERVER_IP : str - IP on which RPC Server started (for standalone mode). - - SERVER_PORT : str - HOST on which RPC Server started (for standalone mode). - """ - - STOPPED = "PROCESS_STOPPED" - CALLSTACK = "First throw call stack" - CONNECTED = "[IOS-RPC] STATE: 2" - SERVER_IP = "[IOS-RPC] IP: " - SERVER_PORT = "[IOS-RPC] PORT: " - - def __init__(self, mode, host, port, key): - if not ServerIOSLauncher.is_compatible_environment(): - raise RuntimeError( - "Can't create ServerIOSLauncher instance." - " No environment variables set for iOS RPC Server." 
- ) - - self.host = host - self.port = port - - self.external_booted_device = None - if not ServerIOSLauncher.booted_devices: - self._boot_or_find_booted_device() - - self.udid = get_device_uid( - self.external_booted_device - if self.external_booted_device is not None - else ServerIOSLauncher.booted_devices[-1] - ) - - self.bundle_was_deployed = False - deploy_bundle_to_simulator(self.udid, self.bundle_path) - self.bundle_was_deployed = True - - self.server_was_started = False - self.launch_process = launch_ios_rpc(self.udid, self.bundle_id, host, port, key, mode) - self._wait_launch_complete( - waiting_time=60, - hz=10, - should_print_host_and_port=mode == RPCServerMode.standalone.value, - ) - self.server_was_started = True - - def terminate(self): - """Terminate iOS RPC server.""" - - if self.bundle_was_deployed and self.server_was_started: - try: - terminate_ios_rpc(self.udid, self.bundle_id) - self.launch_process.terminate() - self.server_was_started = False - except RuntimeError as e: - print(e) - if self.bundle_was_deployed: - try: - delete_bundle_from_simulator(self.udid, self.bundle_id) - self.bundle_was_deployed = False - except RuntimeError as e: - print(e) - - def __del__(self): - self.terminate() - - @staticmethod - def is_compatible_environment(): - """Check that the current environment has the required variables.""" - - return bool(os.environ.get("BUNDLE_ID")) and bool(os.environ.get("BUNDLE_PATH")) - - @staticmethod - def shutdown_booted_devices(): - """Shutdown simulators that have been booted using this class.""" - - for device_meta in ServerIOSLauncher.booted_devices: - try: - shutdown_device(get_device_uid(device_meta)) - except RuntimeError as e: - print(e) - ServerIOSLauncher.booted_devices = [] - - def _boot_or_find_booted_device(self): - """ - Boot the required simulator if there is no suitable booted simulator - among the available simulators. If there is a suitable booted simulator, - then take it as a simulator to which the iOS RPC application will be deployed. - """ - - target_system = OSName.iOS - target_device_type = IOSDevice.iPhone - available_devices = get_list_of_available_simulators() - if not available_devices: - raise ValueError("No devices available in this environment") - target_devices = grep_by_system(available_devices, target_system) - if not target_devices: - raise ValueError(f"No available simulators for target system: {target_system.value}") - target_devices = grep_by_device(target_devices, target_device_type) - if not target_devices: - raise ValueError( - f"No available simulators for target device type: {target_device_type.value}" - ) - - maybe_booted = check_booted_device(target_devices) - if maybe_booted: - self.external_booted_device = maybe_booted - else: - take_latest_model = True - target_device = target_devices[-1 if take_latest_model else 0] - boot_device(get_device_uid(target_device)) - ServerIOSLauncher.booted_devices.append(target_device) - - def _wait_launch_complete(self, waiting_time, hz, should_print_host_and_port=False): - # pylint: disable=too-many-locals - """ - Wait for the iOS RPC server to start. - - waiting_time : int - The maximum waiting time during which it is necessary - to receive a message from RPC Server. - - hz : int - The frequency of checking (in hertz) messages from RPC Server. - Checks for messages from the server will occur every 1 / hz second. - - should_print_host_and_port : bool - A flag that indicates that RPC Server should print the host and port - on which it was started. - Used for standalone mode. 
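The launch-completion wait above pairs a reader loop with a watchdog thread. A reduced sketch of the same idea using threading.Event, with the command line purely illustrative:

import subprocess
import threading

def first_line_or_kill(cmd, timeout_sec):
    # Kill the child if it prints nothing within timeout_sec, otherwise return its first line.
    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, text=True)
    got_output = threading.Event()

    def watchdog():
        if not got_output.wait(timeout_sec):
            proc.terminate()

    threading.Thread(target=watchdog, daemon=True).start()
    line = proc.stdout.readline()
    got_output.set()
    return line or None  # None if the process was terminated before printing anything

print(first_line_or_kill(["echo", "ready"], timeout_sec=5.0))  # "ready\n"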
- """ - - class Switch: - """A simple helper class for boolean switching.""" - - def __init__(self): - self._on = False - - def toggle(self): - """Toggle flag.""" - self._on = not self._on - - @property - def on(self): - """Flag of this switch.""" - return self._on - - def watchdog(): - for _ in range(waiting_time * hz): - time.sleep(1.0 / hz) - if switch_have_data.on: - break - if not switch_have_data.on: - self.launch_process.terminate() - switch_process_was_terminated.toggle() - - switch_have_data = Switch() - switch_process_was_terminated = Switch() - watchdog_thread = threading.Thread(target=watchdog) - - host, port = None, None - watchdog_thread.start() - for line in self.launch_process.stdout: - if not switch_have_data.on: - switch_have_data.toggle() - - found = str(line).find(ServerIOSLauncher.ConsoleMarkers.STOPPED.value) - if found != -1: - raise RuntimeError("[ERROR] Crash during RCP Server launch.. ") - - found = str(line).find(ServerIOSLauncher.ConsoleMarkers.CALLSTACK.value) - if found != -1: - raise RuntimeError("[ERROR] Crash during RCP Server launch.. ") - - found = str(line).find(ServerIOSLauncher.ConsoleMarkers.SERVER_IP.value) - if found != -1: - ip = str(line)[ - found + len(ServerIOSLauncher.ConsoleMarkers.SERVER_IP.value) : - ].rstrip("\n") - host = ip - - found = str(line).find(ServerIOSLauncher.ConsoleMarkers.SERVER_PORT.value) - if found != -1: - port = str(line)[ - found + len(ServerIOSLauncher.ConsoleMarkers.SERVER_PORT.value) : - ].rstrip("\n") - port = int(port) - - if str(line).find(ServerIOSLauncher.ConsoleMarkers.CONNECTED.value) != -1: - # rpc server reports that it successfully connected - break - watchdog_thread.join() - - if switch_process_was_terminated.on: - raise TimeoutError("Can't get a response from the iOS Server.") - if should_print_host_and_port: - if host is None or port is None: - raise RuntimeError("No messages with actual host and port.") - self.port = port - - -class ServerIOSContextManager: - """ - Context manager for ServerIOSLauncher. - To work with ServerIOSLauncher, it is preferable to use this class - so that the terminate method is called in any case. - """ - - def __init__(self, mode, host, port, key): - self.__mode = mode - self.__host = host - self.__port = port - self.__key = key - self.__ios_rpc_server_launcher = None - - def __enter__(self): - self.__ios_rpc_server_launcher = ServerIOSLauncher( - self.__mode, self.__host, self.__port, self.__key - ) - return self.__ios_rpc_server_launcher - - def __exit__(self, exc_type, exc_val, exc_tb): - if self.__ios_rpc_server_launcher is not None: - self.__ios_rpc_server_launcher.terminate() - self.__ios_rpc_server_launcher = None diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index c1cbc966acdc6..8ebb0f6301d25 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -71,7 +71,6 @@ def __init__(self, mod): self._save = self.mod["save"] self._get_lib = self.mod["get_lib"] self._get_bytecode = self.mod["get_bytecode"] - self._get_constants = self.mod["get_constants"] self._get_stats = self.mod["get_stats"] self._get_function_arity = self.mod["get_function_arity"] self._get_function_param_name = self.mod["get_function_param_name"] @@ -245,12 +244,6 @@ def bytecode(self): """ return self._get_bytecode() - @property - def constants(self): - """Returns a human-readable description of all the constants in the executable. 
- Useful for debugging and diffing generated executables in unit tests.""" - return self._get_constants() - @property def globals(self): """Get the globals used by the Relay VM executable. diff --git a/python/tvm/testing/plugin.py b/python/tvm/testing/plugin.py index c0decb7747bdb..2cb228c357e5a 100644 --- a/python/tvm/testing/plugin.py +++ b/python/tvm/testing/plugin.py @@ -253,13 +253,7 @@ def _sort_tests(items): Should be called from pytest_collection_modifyitems. """ - - def sort_key(item): - filename, lineno, test_name = item.location - test_name = test_name.split("[")[0] - return filename, lineno, test_name - - items.sort(key=sort_key) + items.sort(key=lambda item: item.location) def _target_to_requirement(target): diff --git a/python/tvm/topi/arm_cpu/injective.py b/python/tvm/topi/arm_cpu/injective.py index 330144b33fb6d..55f47c5dee4d4 100644 --- a/python/tvm/topi/arm_cpu/injective.py +++ b/python/tvm/topi/arm_cpu/injective.py @@ -17,7 +17,6 @@ # pylint: disable=invalid-name, unused-variable """Schedule for pooling operators""" import tvm -import numpy as np from tvm import te from ..utils import is_empty_shape @@ -68,7 +67,7 @@ def schedule_injective(outs): if list(s[x].op.axis): # do not vectorize for broadcast - (io, ii) = s[x].split(list(s[x].op.axis)[-1], 16 // np.dtype(x.dtype).itemsize) + (io, ii) = s[x].split(list(s[x].op.axis)[-1], 4) s[x].vectorize(ii) tvm.te.schedule.AutoInlineInjective(s) diff --git a/rust/tvm/src/ir/relay/mod.rs b/rust/tvm/src/ir/relay/mod.rs index b65b784bf400f..f43967f28d603 100644 --- a/rust/tvm/src/ir/relay/mod.rs +++ b/rust/tvm/src/ir/relay/mod.rs @@ -163,7 +163,7 @@ impl Call { span: Span, ) -> Call { let node = CallNode { - base: ExprNode::base::(span), + base: ExprNode::base::(span), op: op, args: args, attrs: attrs, diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index 8f9c8589e65d9..34661f81c8474 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -590,6 +590,7 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target) mixed_pass_list.push_back(BindTarget(target)); mixed_pass_list.push_back(tir::transform::VerifyMemory()); + mixed_pass_list.push_back(tir::transform::MergeDynamicSharedMemoryAllocations()); if (ShouldAnnotateEntryFunc(target, mixed_mod)) { mixed_pass_list.push_back(AnnotateEntryFunc(true)); @@ -602,8 +603,6 @@ transform::Sequential MixedModulePassManager(IRModule mixed_mod, Target target) } mixed_pass_list.push_back(tir::transform::ThreadSync("shared")); - mixed_pass_list.push_back(tir::transform::ThreadSync("shared.dyn")); - mixed_pass_list.push_back(tir::transform::MergeDynamicSharedMemoryAllocations()); mixed_pass_list.push_back(tir::transform::ThreadSync("warp")); mixed_pass_list.push_back(tir::transform::InferFragment()); mixed_pass_list.push_back(tir::transform::LowerThreadAllreduce()); diff --git a/src/ir/module.cc b/src/ir/module.cc index 8ea83cfb40f06..3deb70dd766cd 100644 --- a/src/ir/module.cc +++ b/src/ir/module.cc @@ -187,12 +187,9 @@ void WarnIfMalformed(const IRModule& mod, relay::Function func) { auto fv = relay::FreeVars(func); auto ftv = relay::FreeTypeVars(func, mod); // TODO(@jroesch): refactor to use diagnostic context - ICHECK_EQ(fv.size(), 0) << "Function:" << std::endl - << PrettyPrint(func) << std::endl - << "contains free variables: " << fv; - ICHECK_EQ(ftv.size(), 0) << "Function:" << std::endl - << PrettyPrint(func) << std::endl - << "contains free type variables: " << fv; + ICHECK_EQ(fv.size(), 0) << "There are free variables: " << fv << 
std::endl; + ICHECK_EQ(ftv.size(), 0) << "There are free type variables: " << fv + << " in function: " << AsText(func, false); } void IRModuleNode::Add(const GlobalVar& var, const BaseFunc& f, bool update) { diff --git a/src/relay/backend/contrib/codegen_c/codegen_c.h b/src/relay/backend/contrib/codegen_c/codegen_c.h index 964d7dee3ad10..0d575b3ec4987 100644 --- a/src/relay/backend/contrib/codegen_c/codegen_c.h +++ b/src/relay/backend/contrib/codegen_c/codegen_c.h @@ -44,12 +44,6 @@ struct Output { bool need_copy; }; -struct GenerateBodyOutput { - std::string decl; - std::vector buffers; - std::vector outputs; -}; - class CSourceModuleCodegenBase { public: CSourceModuleCodegenBase() = default; @@ -160,8 +154,7 @@ class CodegenCBase { * \endcode */ void GenerateBackendCFunc(const std::string& func_name, const Array& args, - const std::string& const_arr_name, const std::vector& outs, - bool pass_dl_tensor = false) { + const std::string& const_arr_name, const std::vector& outs) { // Print signature code_stream_ << "\n"; @@ -182,12 +175,8 @@ class CodegenCBase { PrintIndents(); code_stream_ << func_name << "_("; for (size_t i = 0; i < args.size(); i++) { - if (pass_dl_tensor) { - code_stream_ << "arg" << i << ",\n"; - } else { - const auto& dtype_str = GetDtypeString(args[i]); - code_stream_ << "(" << dtype_str << "*)(arg" << i << "->data),\n"; - } + const auto& dtype_str = GetDtypeString(args[i]); + code_stream_ << "(" << dtype_str << "*)(arg" << i << "->data),\n"; PrintIndents(); } for (size_t i = 0; i < outs.size() - 1; i++) { diff --git a/src/relay/backend/contrib/cutlass/codegen.cc b/src/relay/backend/contrib/cutlass/codegen.cc deleted file mode 100644 index c1217a08b7120..0000000000000 --- a/src/relay/backend/contrib/cutlass/codegen.cc +++ /dev/null @@ -1,409 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file src/relay/backend/contrib/cutlass/codegen.cc - * \brief Implementation of CUTLASS codegen. 
- */ - -#include -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "../../utils.h" -#include "../codegen_c/codegen_c.h" - -namespace tvm { -namespace relay { -namespace contrib { - -using namespace backend; -using Str2StrMap = std::unordered_map; - -static Str2StrMap dtype_map = {{"float16", "cutlass::half_t"}, {"float32", "float"}}; - -constexpr const char* kAnyDim = "Any"; - -std::string GetDimAsStr(ObjectRef dim) { - if (auto d = dim.as()) { - return std::to_string(d->value); - } - return kAnyDim; -} - -Str2StrMap DenseArgs(const Map& attrs) { - Str2StrMap args; - auto arg0_dtype = std::string(attrs["arg0_dtype"].as()->data); - auto arg1_dtype = std::string(attrs["arg1_dtype"].as()->data); - auto ret_dtype = std::string(attrs["ret_dtype"].as()->data); - auto arg0_shape = attrs["arg0_shape"].as(); - auto arg1_shape = attrs["arg1_shape"].as(); - args["ElementInputA"] = dtype_map.at(arg0_dtype); - args["ElementInputB"] = dtype_map.at(arg1_dtype); - args["ElementOutput"] = dtype_map.at(ret_dtype); - args["M"] = GetDimAsStr(arg0_shape->at(0)); - args["K"] = GetDimAsStr(arg0_shape->at(1)); - args["N"] = GetDimAsStr(arg1_shape->at(0)); - args["op_def"] = std::string(attrs["cutlass_op_def"].as()->data); - args["op_name"] = std::string(attrs["cutlass_op_name"].as()->data); - args["op_type"] = std::string(attrs["op_type"].as()->data); - args["lda"] = std::string(attrs["lda"].as()->data); - args["ldb"] = std::string(attrs["ldb"].as()->data); - args["ldc"] = std::string(attrs["ldc"].as()->data); - return args; -} - -inline void CutlassPrint(std::ostringstream& os, const std::string& stmt, int indent = 2) { - for (int i = 0; i < indent; ++i) { - os << " "; - } - os << stmt; -} - -std::string DenseOp(std::string id, const Str2StrMap& attrs, - const std::vector& func_args) { - bool has_bias = false; - bool is_gelu = - attrs.at("op_type").find("cutlass.dense_bias_gelu") != std::string::npos; // fp32 or fp16 - if (attrs.at("op_type") == "cutlass.dense_bias" || - attrs.at("op_type") == "cutlass.dense_bias_relu" || is_gelu) { - has_bias = true; - } - std::ostringstream gemm_decl; - CutlassPrint(gemm_decl, "using ElementInputA = " + attrs.at("ElementInputA") + ";\n"); - CutlassPrint(gemm_decl, "using ElementInputB = " + attrs.at("ElementInputB") + ";\n"); - CutlassPrint(gemm_decl, "using ElementOutput = " + attrs.at("ElementOutput") + ";\n"); - CutlassPrint(gemm_decl, "using ElementComputeEpilogue = " + attrs.at("ElementOutput") + ";\n"); - CutlassPrint(gemm_decl, attrs.at("op_def")); - CutlassPrint(gemm_decl, "using Gemm = Operation_" + attrs.at("op_name") + ";\n"); - - auto get_dim = [&attrs, &func_args](const std::string& axis, int arg_idx, int axis_idx) { - if (attrs.at(axis) == kAnyDim) { - return func_args[arg_idx] + "->shape[" + std::to_string(axis_idx) + "]"; - } else { - return attrs.at(axis); - } - }; - CutlassPrint(gemm_decl, "int M = " + get_dim("M", 0, 0) + ";\n"); - CutlassPrint(gemm_decl, "int N = " + get_dim("N", 1, 0) + ";\n"); - CutlassPrint(gemm_decl, "int K = " + get_dim("K", 0, 1) + ";\n"); - CutlassPrint(gemm_decl, "cutlass::gemm::GemmCoord problem_size(M, N, K);\n"); - // Initialize alpha for dot product computation - CutlassPrint(gemm_decl, "ElementComputeEpilogue alpha = ElementComputeEpilogue(1);\n"); - if (is_gelu) { - // GeLU epilogue does not compile with NoBetaScaling, so we explicitly specify the scale. 
- CutlassPrint(gemm_decl, "ElementComputeEpilogue beta = ElementComputeEpilogue(1);\n"); - } else { - CutlassPrint(gemm_decl, "ElementComputeEpilogue beta = ElementComputeEpilogue(0);\n"); - } - - // Split K dimension into 1 partitions - CutlassPrint(gemm_decl, "int split_k_slices = 1;\n"); - - // Create a tuple of gemm kernel arguments. This is later passed as arguments to launch - // instantiated CUTLASS kernel - ICHECK(func_args.size() >= 2); - CutlassPrint(gemm_decl, "void* ptr_a = (void*)(" + func_args[0] + "->data);\n"); - CutlassPrint(gemm_decl, "void* ptr_b = (void*)(" + func_args[1] + "->data);\n"); - if (has_bias) { - ICHECK(func_args.size() >= 3); - CutlassPrint(gemm_decl, "void* ptr_c_bias = (void*)(" + func_args[2] + "->data);\n"); - } - CutlassPrint(gemm_decl, "void* ptr_out = (void*)(out0);\n"); - - CutlassPrint(gemm_decl, "typename Gemm::Arguments arguments{\n"); - CutlassPrint(gemm_decl, " problem_size,\n"); - CutlassPrint(gemm_decl, " {static_cast(ptr_a), " + attrs.at("lda") + "},\n"); - CutlassPrint(gemm_decl, " {static_cast(ptr_b), " + attrs.at("ldb") + "},\n"); - if (has_bias) { - CutlassPrint(gemm_decl, " {static_cast(ptr_c_bias), 0},\n"); - } else { - CutlassPrint(gemm_decl, " {static_cast(ptr_out), " + attrs.at("ldc") + "},\n"); - } - CutlassPrint(gemm_decl, " {static_cast(ptr_out), " + attrs.at("ldc") + "},\n"); - if (has_bias && !is_gelu) { - CutlassPrint(gemm_decl, " {alpha},\n"); - } else { - // For GeLU, we explicitly specify the scale. - CutlassPrint(gemm_decl, " {alpha, beta},\n"); - } - CutlassPrint(gemm_decl, " split_k_slices};\n"); - - // Using the arguments, query for extra workspace required for matrix multiplication computation - CutlassPrint(gemm_decl, "size_t workspace_size = Gemm::get_workspace_size(arguments);\n"); - // Allocate workspace memory - CutlassPrint(gemm_decl, - "cutlass::device_memory::allocation workspace(workspace_size);\n"); - // Instantiate CUTLASS kernel depending on template - CutlassPrint(gemm_decl, "Gemm gemm_op;\n"); - // Check the problem size is supported or not - CutlassPrint(gemm_decl, "cutlass::Status status = gemm_op.can_implement(arguments);\n"); - CutlassPrint(gemm_decl, "CHECK(status == cutlass::Status::kSuccess);\n"); - // Initialize CUTLASS kernel with arguments and workspace pointer - CutlassPrint(gemm_decl, "status = gemm_op.initialize(arguments, workspace.get());\n"); - CutlassPrint(gemm_decl, "CHECK(status == cutlass::Status::kSuccess);\n"); - // Launch initialized CUTLASS kernel - CutlassPrint(gemm_decl, "status = gemm_op();\n"); - CutlassPrint(gemm_decl, "CHECK(status == cutlass::Status::kSuccess);\n"); - return gemm_decl.str(); -} - -class CodegenCutlass : public MemoizedExprTranslator>, public CodegenCBase { - public: - CodegenCutlass(const std::string& id, const Map& attrs) { - this->ext_func_id_ = id; - this->attrs_ = attrs; - } - - std::vector VisitExprDefault_(const Object* op) final { - LOG(FATAL) << "Cutlass codegen doesn't support: " << op->GetTypeKey(); - return {}; - } - - std::vector VisitExpr_(const VarNode* node) final { - ext_func_args_.push_back(GetRef(node)); - Output output; - output.name = node->name_hint(); - return {output}; - } - - std::vector VisitExpr_(const CallNode* call) final { - const auto* func = call->op.as(); - ICHECK(func) << "Only composite function is supported for CUTLASS."; - GenerateBodyOutput ret = GenerateCompositeFunctionCall(func, call); - ext_func_body_.push_back(ret.decl); - return ret.outputs; - } - - std::string JIT(const std::vector& out) { - code_stream_ << "void " 
<< ext_func_id_ << "_("; - - for (const auto& arg : ext_func_args_) { - code_stream_ << "DLTensor* " << arg->name_hint() << ", "; - } - for (size_t i = 0; i < out.size() - 1; ++i) { - code_stream_ << out[i].dtype << "* out" << i << ", "; - } - code_stream_ << out.back().dtype << "* out" << out.size() - 1 << ") {\n"; - this->EnterScope(); - - // Function body - for (auto decl : buf_decl_) { - this->PrintIndents(); - code_stream_ << decl << "\n"; - } - code_stream_ << "\n"; - for (auto stmt : ext_func_body_) { - this->PrintIndents(); - code_stream_ << stmt << "\n"; - } - - this->ExitScope(); - code_stream_ << "}\n"; - - this->GenerateBackendCFunc(ext_func_id_, ext_func_args_, const_array_name_, out, true); - return code_stream_.str(); - } - - private: - std::vector GetArgumentNames(const CallNode* call) { - std::vector arg_names; - for (size_t i = 0; i < call->args.size(); ++i) { - auto res = VisitExpr(call->args[i]); - for (const auto& out : res) { - arg_names.push_back(out.name); - } - } - return arg_names; - } - - GenerateBodyOutput GenerateCompositeFunctionCall(const FunctionNode* callee, - const CallNode* caller) { - const auto pattern_name = callee->GetAttr(attr::kComposite); - ICHECK(pattern_name.defined()) << "Only functions with composite attribute are supported."; - - if (pattern_name == "cutlass.dense") { - const auto* dense_call = GetRootCall(callee->body.as(), 0, {"nn.dense"}); - return GenerateBody(dense_call, "cutlass_dense", GetArgumentNames(caller), - DenseArgs(std::ref(attrs_))); - } else if (pattern_name == "cutlass.dense_bias") { - const CallNode* current_call = callee->body.as(); - std::string add_or_bias_add = current_call->op.as()->name; - const auto* dense_call = - GetRootCall(callee->body.as(), 1, {"nn.dense", add_or_bias_add}); - return GenerateBody(dense_call, "cutlass_dense_bias", GetArgumentNames(caller), - DenseArgs(std::ref(attrs_))); - } else if (pattern_name == "cutlass.dense_bias_relu") { - const CallNode* current_call = callee->body.as(); - std::string add_or_bias_add = current_call->args[0].as()->op.as()->name; - const auto* dense_call = - GetRootCall(callee->body.as(), 2, {"nn.dense", add_or_bias_add, "nn.relu"}); - return GenerateBody(dense_call, "cutlass_dense_bias_relu", GetArgumentNames(caller), - DenseArgs(std::ref(attrs_))); - } else if (pattern_name == "cutlass.dense_bias_gelu_fp16") { - const CallNode* current_call = callee->body.as(); - std::string add_or_bias_add = current_call->args[1].as()->op.as()->name; - const auto* dense_call = GetRootCall(callee->body.as(), 8, - {"nn.dense", add_or_bias_add, "multiply", "cast", "erf", - "cast", "multiply", "add", "multiply"}); - return GenerateBody(dense_call, "cutlass_dense_bias_gelu", GetArgumentNames(caller), - DenseArgs(std::ref(attrs_))); - } else if (pattern_name == "cutlass.dense_bias_gelu_fp32") { - const CallNode* current_call = callee->body.as(); - std::string add_or_bias_add = current_call->args[1].as()->op.as()->name; - const auto* dense_call = GetRootCall( - callee->body.as(), 6, - {"nn.dense", add_or_bias_add, "multiply", "erf", "multiply", "add", "multiply"}); - return GenerateBody(dense_call, "cutlass_dense_bias_gelu", GetArgumentNames(caller), - DenseArgs(std::ref(attrs_))); - } - LOG(FATAL) << "Unknown composite function: " << pattern_name; - return {}; - } - - GenerateBodyOutput GenerateBody(const CallNode* root_call, const std::string& func_name, - const std::vector& func_args, - const Str2StrMap& attribute_args) { - // Make function call with input buffers when visiting arguements - 
ICHECK_GT(func_args.size(), 0); - std::ostringstream decl_stream; - decl_stream << "(" << func_args[0]; - for (size_t i = 1; i < func_args.size(); ++i) { - decl_stream << ", " << func_args[i]; - } - // Analyze the output buffers - std::vector out_types; - if (root_call->checked_type()->IsInstance()) { - auto type_node = root_call->checked_type().as(); - for (auto field : type_node->fields) { - ICHECK(field->IsInstance()); - out_types.push_back(field); - } - } else if (root_call->checked_type()->IsInstance()) { - ICHECK(root_call->checked_type()->IsInstance()); - out_types.push_back(root_call->checked_type()); - } else { - LOG(FATAL) << "Unrecognized type node: " << AsText(root_call->checked_type(), false); - } - GenerateBodyOutput ret; - for (const auto& out_type : out_types) { - const std::string out = "out" + std::to_string(buf_idx_++); - decl_stream << ", " << out; - Output output; - output.name = out; - output.dtype = GetDtypeString(out_type.as()); - output.need_copy = false; - ret.outputs.push_back(output); - } - decl_stream << ");"; - if (func_name == "cutlass_dense" || func_name == "cutlass_dense_bias" || - func_name == "cutlass_dense_bias_relu" || func_name == "cutlass_dense_bias_gelu") { - ret.decl = DenseOp(ext_func_id_, attribute_args, func_args); - } - return ret; - } - /*! \brief The id of the external cutlass ext_func. */ - std::string ext_func_id_{""}; - /*! \brief The attrs of the external cutlass ext_func. */ - Map attrs_; - /*! - * \brief The index to track the output buffer. Each kernel will redirect the - * output to a buffer that may be consumed by other kernels. - */ - int buf_idx_{0}; - /*! \brief The arguments used by a wrapped function that calls CUTLASS kernels. */ - Array ext_func_args_; - /*! \brief Statement of the function that will be compiled using CUTLASS kernels. */ - std::vector ext_func_body_; - /*! \brief The array declared to store the constant values. */ - std::string const_array_name_; - /*! \brief The declaration of intermediate buffers. */ - std::vector buf_decl_; -}; // class CodegenCutlass - -class CutlassModuleCodegen : public CSourceModuleCodegenBase { - public: - std::pair> GenCutlassFunc(const Function& func) { - ICHECK(func.defined()) << "Input error: expect a Relay function."; - // Record the external symbol for runtime lookup. 
- auto sid = GetExtSymbol(func); - const auto* attrs = func->attrs.as(); - ICHECK(attrs != nullptr); - const auto dict = attrs->dict; - CodegenCutlass builder(sid, dict); - auto out = builder.VisitExpr(func->body); - code_stream_ << builder.JIT(out); - return {sid, {}}; - } - - runtime::Module CreateCSourceModule(const ObjectRef& ref) override { - // create header - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - // cutlass header - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - - ICHECK(ref->IsInstance()); - auto res = GenCutlassFunc(Downcast(ref)); - std::string code = code_stream_.str(); - String sym = std::get<0>(res); - Array variables = std::get<1>(res); - // Create a CSource module - const auto* pf = runtime::Registry::Get("runtime.CSourceModuleCreate"); - ICHECK(pf != nullptr) << "Cannot find CSource module to create the external runtime module"; - return (*pf)(code, "cu", Array{sym}, variables); - } - - private: - /*! \brief The code stream that will be compiled by NVCC */ - std::ostringstream code_stream_; -}; // CutlassModuleCodegen - -/*! - * \brief The external cutlass compiler/codegen tool. It takes a Relay - * expression/module and compile it into a runtime module. - */ -runtime::Module CutlassCompiler(const ObjectRef& ref) { - CutlassModuleCodegen cutlass; - return cutlass.CreateCSourceModule(ref); -} - -TVM_REGISTER_GLOBAL("relay.ext.cutlass").set_body_typed(CutlassCompiler); - -} // namespace contrib -} // namespace relay -} // namespace tvm diff --git a/src/relay/backend/contrib/dnnl/codegen.cc b/src/relay/backend/contrib/dnnl/codegen.cc index fa1dbc66d8a7c..ae58c2f08e8cf 100644 --- a/src/relay/backend/contrib/dnnl/codegen.cc +++ b/src/relay/backend/contrib/dnnl/codegen.cc @@ -231,6 +231,12 @@ class CodegenDNNL : public MemoizedExprTranslator>, public C } private: + struct GenerateBodyOutput { + std::string decl; + std::vector buffers; + std::vector outputs; + }; + std::vector GetArgumentNames(const CallNode* call) { std::vector arg_names; for (size_t i = 0; i < call->args.size(); ++i) { diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc index 88dee9216a48c..3e675215e7e0b 100644 --- a/src/relay/backend/contrib/ethosn/codegen.cc +++ b/src/relay/backend/contrib/ethosn/codegen.cc @@ -195,20 +195,6 @@ sl::TensorsAndId MakeOps(const sl::TensorAndId& op) { return ops; } -String MakeVariant(auto configuration) { - String variant = configuration.value()->variant; - // Transform variant string to lowercase for comparison - std::string variant_string = variant.c_str(); - std::transform(variant_string.begin(), variant_string.end(), variant_string.begin(), ::tolower); - std::string variant_n78 = "ethos-n78"; - if (variant_string == variant_n78) { - String tops = configuration.value()->tops; - String ple_ratio = configuration.value()->ple_ratio; - variant = "Ethos-N78_" + tops + "TOPS_" + ple_ratio + "PLE_RATIO"; - } - return variant; -} - NetworkWithIDs ConstructNetworkVisitor::Construct(const Function& func) { // Initialise everything auto ctx = transform::PassContext::Current(); @@ -217,9 +203,8 @@ NetworkWithIDs ConstructNetworkVisitor::Construct(const Function& func) 
{ cfg = AttrsWithDefaultValues(); } NetworkWithIDs network_with_ids; - network_ = sl::CreateNetwork( - sl::GetFwAndHwCapabilities(sl::EthosNVariantFromString(MakeVariant(cfg).c_str()), - static_cast(std::stoul(cfg.value()->sram_size)))); + network_ = sl::CreateNetwork(sl::GetFwAndHwCapabilities( + sl::EthosNVariantFromString(cfg.value()->variant.c_str()), cfg.value()->sram_size_bytes)); network_with_ids.network = network_; operand_table_.clear(); @@ -629,9 +614,8 @@ EthosnError EthosnCompiler::SupportedSetup() { auto cfg = ctx->GetConfig("relay.ext.ethos-n.options").defined() ? ctx->GetConfig("relay.ext.ethos-n.options") : AttrsWithDefaultValues(); - m_Queries = std::make_unique( - sl::GetFwAndHwCapabilities(sl::EthosNVariantFromString(cfg.value()->variant.c_str()), - std::stoul(cfg.value()->sram_size))); + m_Queries = std::make_unique(sl::GetFwAndHwCapabilities( + sl::EthosNVariantFromString(cfg.value()->variant.c_str()), cfg.value()->sram_size_bytes)); if (m_Queries == nullptr) { return EthosnError("Could not initialise Ethos-N compiler isSupported"); } diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h index 279569596f1bb..ca2df05e958d2 100644 --- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h +++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h @@ -227,9 +227,7 @@ NetworkWithIDs ConstructNetwork(const IRModule& mod, const GlobalVar& var, const /*! \brief Attributes to store the compiler options for Ethos-N */ struct EthosnCompilerConfigNode : public tvm::AttrsNode { String variant; - String sram_size; - String tops; - String ple_ratio; + int sram_size_bytes; bool strategy0; bool strategy1; bool strategy3; @@ -249,15 +247,9 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode, std::vector> GetInputOutputOrder( NetworkWithIDs network, const std::unique_ptr& compiled_network); - /*! - * \brief Query interface used to determine if the Ethos-N hardware supports an operation - * with the supplied parameters. - */ static std::unique_ptr m_Queries; }; diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index ed774eccd8dd2..a8c27a126032b 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -562,7 +562,7 @@ class LowerTensorExprMutator : public DeviceAwareExprMutator { BaseFunc prim_func = ResolveToPrimitive(new_value); if (prim_func.defined() && !prim_func->IsInstance()) { - // Remember let var is bound to (possibly indirectly) a non-tir primitive. + // Remember let var is bound to (possibly indirectly) to a non-tir primitive. 
Function func = Downcast(prim_func); primitive_functions_.emplace(var, func); } @@ -896,6 +896,8 @@ void UpdateFunctionMetadata(Function relay_func, IRModule LowerTE(const IRModule& module, TargetMap targets, const String& module_name, std::function process_fn) { + DLOG(INFO) << "lowering module:\n" << PrettyPrint(module); + TECompiler compiler; auto updated_module = LowerTensorExpr(targets, module_name, compiler, process_fn)(module); diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index 3970b0e806f03..be5b172e6a7c4 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -466,13 +466,8 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> Array VisitExpr_(const VarNode* var_node) final { auto var = GetRef(var_node); - auto it = param_arg_map_.find(var); - if (it != param_arg_map_.end()) { - // This var is a parameter of a nested function. Visit the corresponding argument in the - // function call site. - return VisitExpr(it->second); - } - if (param_states_.find(var) == param_states_.end()) { + auto it = param_states_.find(var); + if (it == param_states_.end()) { LOG(FATAL) << "Unexpected free variable " << var->name_hint(); return {}; } else { @@ -547,12 +542,6 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> } Array VisitExpr_(const CallNode* call_node) final { - if (auto* func = call_node->op.as()) { - for (size_t i = 0; i < func->params.size(); ++i) { - param_arg_map_[func->params[i]] = call_node->args[i]; - } - return VisitExpr(func->body); - } static auto fshape_func = Op::GetAttrMap("FShapeFunc"); static auto tshape_data_dependent = Op::GetAttrMap("TShapeDataDependent"); ICHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; @@ -612,7 +601,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> } Array VisitExpr_(const FunctionNode* op) final { - LOG(FATAL) << "Nested functions are not allowed to be visited."; + LOG(FATAL) << "Do not support sub function"; return Array(); } @@ -655,10 +644,6 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> std::vector data_dependents_per_input_; /*! \brief Scalars used in the shape function */ Array scalars_; - /*! \brief Map from parameters of a nested function to corresponding arguments in a function - * call site. - */ - std::unordered_map param_arg_map_; }; CachedFunc ShapeFuncFor(const Function& prim_func, const Target& target, diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 6a085adad3d1e..b3c1cd81274fc 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -304,13 +304,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { } VisitExpr(func); } - std::vector params_device_type_index; - params_device_type_index.reserve(params_device_type.size()); - for (auto device_type : params_device_type) { - params_device_type_index.push_back(static_cast(device_type)); - } - return VMFunction(var->name_hint, params_, instructions_, registers_num_, - params_device_type_index); + return VMFunction(var->name_hint, params_, instructions_, registers_num_, params_device_type); } /*! \brief Attrs objects for each op. 
*/ @@ -323,7 +317,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { size_t NewRegister() { return registers_num_++; } inline void Emit(const Instruction& instr) { - VLOG(2) << "VMCompiler::Emit: instr=" << instr; + VLOG(1) << "VMCompiler::Emit: instr=" << instr; ICHECK((int)instr.op < 100) << "Invalid opcode " << (int)instr.op; switch (instr.op) { case Opcode::AllocADT: @@ -709,7 +703,7 @@ class VMFunctionCompiler : DeviceAwareExprFunctor { auto global = GetRef(global_node); auto it = context_->global_map.find(global); ICHECK(it != context_->global_map.end()); - VLOG(2) << "VisitExpr_: generating invoke for " << global->name_hint + VLOG(1) << "VisitExpr_: generating invoke for " << global->name_hint << " with func_index=" << it->second; // TODO(tvm-team): @@ -947,6 +941,12 @@ void VMCompiler::Lower(IRModule mod, const TargetsMap& targets, const tvm::Targe } } +#if USE_RELAY_DEBUG + for (auto vm_func : exec_->functions) { + VLOG(1) << vm_func << "-------------"; + } +#endif // USE_RELAY_DEBUG + // populate constants for (auto data : context_.constants) { exec_->constants.push_back(data); @@ -967,12 +967,6 @@ void VMCompiler::Lower(IRModule mod, const TargetsMap& targets, const tvm::Targe exec_->primitive_map.insert({cfunc->prim_fn_var->name_hint, primitive_index++}); } -#if USE_RELAY_DEBUG - for (const auto& vm_func : exec_->functions) { - VLOG(1) << vm_func << "-------------"; - } -#endif // USE_RELAY_DEBUG - backend::UpdateAutoSchedulerOpWeights(context_.compiler); } @@ -1024,7 +1018,6 @@ transform::Sequential MemoryOpt(tvm::Target host_target, TargetsMap targets) { IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetsMap& targets_arg, const Target& target_host_arg) { - VLOG_CONTEXT << "VMCompiler::OptimizeModule"; TargetsMap targets = targets_arg; Target target_host = target_host_arg; CheckAndUpdateHostConsistency(&targets, &target_host); diff --git a/src/relay/backend/vm/inline_primitives.cc b/src/relay/backend/vm/inline_primitives.cc index 6744248722515..6924f2598f6fe 100644 --- a/src/relay/backend/vm/inline_primitives.cc +++ b/src/relay/backend/vm/inline_primitives.cc @@ -87,7 +87,7 @@ struct PrimitiveInliner : ExprMutator { // in w(...) while ((var_node = op.as())) { auto var = GetRef(var_node); - VLOG(1) << "Var: " << var << std::endl; + DLOG(INFO) << "Var: " << var << std::endl; auto it = var_map.find(GetRef(var_node)); if (it != var_map.end()) { op = it->second; diff --git a/src/relay/op/annotation/annotation.cc b/src/relay/op/annotation/annotation.cc index 27b61333c9eb4..8b00839cda333 100644 --- a/src/relay/op/annotation/annotation.cc +++ b/src/relay/op/annotation/annotation.cc @@ -76,16 +76,6 @@ Expr MaybeOnDevice(Expr expr, DLDeviceType device_type, bool is_fixed) { // by the function's attributes. return expr; } - OnDeviceProps props = GetOnDeviceProps(expr); - if (props.body.defined()) { - // Don't nest on_devices. - // If the inner and outer device types differ then we need to be careful: - // - If the inner on_device is_fixed then it disagrees with the outer. - // - If the outer on_device is_fixed then it implies a hidden device_copy - // Otherwise just use the inner device type and ignore the outer. 
- ICHECK(props.device_type == device_type || (!is_fixed && !props.is_fixed)); - return OnDevice(props.body, device_type, is_fixed || props.is_fixed); - } return OnDevice(expr, device_type, is_fixed); } diff --git a/src/relay/qnn/utils.h b/src/relay/qnn/utils.h index 79d5549d659a6..23759a52ec41e 100644 --- a/src/relay/qnn/utils.h +++ b/src/relay/qnn/utils.h @@ -49,8 +49,7 @@ static inline Array get_shape(const Type& type) { static inline int32_t GetQmin(const DataType& dtype) { ICHECK_LE(dtype.bits(), 32) << "QNN ops support int32 or lower precision"; if (dtype.is_int() || dtype.is_uint()) { - auto min_value_expr = tvm::min_value(dtype); - auto* min_value = tir::as_const_int(min_value_expr); + auto* min_value = tir::as_const_int(tvm::min_value(dtype)); ICHECK(min_value != nullptr); return static_cast(min_value[0]); } else { @@ -62,8 +61,7 @@ static inline int32_t GetQmin(const DataType& dtype) { static inline int32_t GetQmax(const DataType& dtype) { ICHECK_LE(dtype.bits(), 32) << "QNN ops support int32 or lower precision"; if (dtype.is_int() || dtype.is_uint()) { - auto max_value_expr = tvm::max_value(dtype); - auto* max_value = tir::as_const_int(max_value_expr); + auto* max_value = tir::as_const_int(tvm::max_value(dtype)); ICHECK(max_value != nullptr); return static_cast(max_value[0]); } else { diff --git a/src/relay/transforms/device_aware_visitors.cc b/src/relay/transforms/device_aware_visitors.cc index 38c3305d31941..28aeab60539c5 100644 --- a/src/relay/transforms/device_aware_visitors.cc +++ b/src/relay/transforms/device_aware_visitors.cc @@ -262,7 +262,7 @@ Expr DeviceAwareExprMutator::VisitExpr_(const CallNode* call_node) { Expr expr = VisitExpr(props.body); // Leaving lexical scope of "on_device" call. PopDeviceType(); - return MaybeOnDevice(expr, props.device_type, props.is_fixed); + return OnDevice(expr, props.device_type, props.is_fixed); } else { return DeviceAwareVisitExpr_(call_node); } diff --git a/src/relay/transforms/device_planner.cc b/src/relay/transforms/device_planner.cc index 83429a9e616f0..dc61e79226b68 100644 --- a/src/relay/transforms/device_planner.cc +++ b/src/relay/transforms/device_planner.cc @@ -18,7 +18,7 @@ */ /*! - * \file src/relay/transforms/device_planner.cc + * \file src/relay/analysis/device_planner.cc * \brief Determines a unique device to hold the result of every Relay sub-expression. * * We say a Relay expression E is 'on device D' if the result of executing E is stored on D. diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc index c48a9b30967c6..d545518c1c3ce 100644 --- a/src/relay/transforms/fold_constant.cc +++ b/src/relay/transforms/fold_constant.cc @@ -21,7 +21,6 @@ * \file constant_folding.cc */ #include -#include #include #include #include @@ -31,80 +30,68 @@ #include #include -#include "../op/annotation/annotation.h" -#include "./device_aware_visitors.h" -#include "./pattern_utils.h" +#include "pattern_utils.h" namespace tvm { namespace relay { -namespace transform { -namespace { -/*! - * \brief Returns whether \p expr is a literal \p Constant, optionally wrapped by an "on_device" - * annotation CallNode (which serves only to associate a device to the constant and has no - * operational effect). - */ -bool IsSimpleConstant(const Expr& expr) { - return AsIgnoringOnDevice(expr) != nullptr; -} +using FInterpreter = runtime::TypedPackedFunc; -/*! - * \brief Returns whether \p expr \p IsSimpleConstant directly or is a tuple of - * \p IsComplexConstant expressions. 
- */ -bool IsComplexConstant(const Expr& expr) { - if (IsSimpleConstant(expr)) { - return true; - } else if (const auto* tuple_node = AsIgnoringOnDevice(expr)) { - return std::all_of(tuple_node->fields.begin(), tuple_node->fields.end(), IsComplexConstant); - } else { - return false; +class ConstantChecker : private ExprVisitor { + public: + // Check whether an expression is constant. The results are memoized. + bool Check(const Expr& expr) { + // The `ConstantNode` case is common enough that we check directly for the + // case here, to avoid the time overhead of dispatching through the vtable + // and the space overhead of memoizing always-true results. + if (expr.as()) { + return true; + } + const auto it = memo_.find(expr); + if (it != memo_.end()) return it->second; + VisitExpr(expr); + return memo_[expr]; // return memoized result or the default value false } -} + + private: + std::unordered_map memo_; + + void VisitExpr_(const TupleNode* n) final { + bool result = true; + for (const auto& field : n->fields) { + if (!Check(field)) { + result = false; + break; + } + } + memo_[GetRef(n)] = result; + } +}; + +bool ConstantCheck(const Expr& e) { return ConstantChecker().Check(e); } + +TVM_REGISTER_GLOBAL("relay.analysis.check_constant").set_body_typed(ConstantCheck); // TODO(tvm-team) consider combine dead-code with constant folder. // or make a more powerful partial evaluator. class ConstantFolder : public MixedModeMutator { public: explicit ConstantFolder(IRModule module) - : module_(std::move(module)), + : module_(module), device_copy_op_(Op::Get("device_copy")), shape_of_op_(Op::Get("shape_of")), vm_shape_of_op_(Op::Get("vm.shape_of")), cast_op_(Op::Get("cast")), ndarray_size_op_(Op::Get("ndarray_size")) {} - private: - using ExprMutator::VisitExpr_; + using MixedModeMutator::VisitExpr_; - Expr VisitExpr_(const LetNode* let_node) final { + Expr VisitExpr_(const LetNode* op) final { auto pre_visit = [this](const LetNode* op) { // Rely on the Memoizer to cache pre-visit values - Expr new_value = Mutate(op->value); - if (IsSimpleConstant(new_value)) { - // Inline new value (along with any on_device annotation wrapping it) at all occurrences of - // the variable. - // - // We need to retain any "on_device" annotation so that downstream 'device aware' - // passes can still retrieve the device for the constant in its new position(s). Eg: - // def @f(..., result_device_type=D) { - // let %x = on_device(... something we eval to a constant..., device_type=E) - // @f(..., %x, ...) - // } - // Here the default device is D, whereas the argument %x to @f is on E (and @f expects - // that). No on_device annotation is required in the call according to the convention used - // by the device-aware visitors. - // - // However once we've inlined the constant we need to insert an on_device, again to - // respect the convention used by the device-aware visitors. - // def @f(..., result_device_type=D) { - // @f(..., on_device(...the constant..., device_type=E), ...) 
- // } - VLOG(1) << "Replacing let-binding for " << op->var->name_hint() - << " with constant:" << std::endl - << PrettyPrint(new_value); - memo_[op->var] = new_value; + Expr value = this->Mutate(op->value); + if (value.as()) { + this->memo_[op->var] = value; } else { this->Mutate(op->var); } @@ -112,118 +99,117 @@ class ConstantFolder : public MixedModeMutator { auto post_visit = [this](const LetNode* op) { Expr expr = GetRef(op); // Rely on the Memoizer to cache pre-visit values - Expr new_value = this->Mutate(op->value); - if (IsSimpleConstant(new_value)) { - // The let-bound value has been inlined, drop the let-binding itself. - this->memo_[expr] = Mutate(op->body); + Expr value = this->Mutate(op->value); + if (value.as()) { + this->memo_[expr] = this->Mutate(op->body); } else { - Var new_var = Downcast(this->Mutate(op->var)); - Expr new_body = this->Mutate(op->body); - if (new_var.same_as(op->var) && new_value.same_as(op->value) && - new_body.same_as(op->body)) { + Var var = Downcast(this->Mutate(op->var)); + Expr body = this->Mutate(op->body); + if (var.same_as(op->var) && value.same_as(op->value) && body.same_as(op->body)) { this->memo_[expr] = expr; } else { - this->memo_[expr] = Let(new_var, new_value, new_body, op->span); + this->memo_[expr] = Let(var, value, body); } } }; - ExpandANormalForm(let_node, pre_visit, post_visit); - return memo_[GetRef(let_node)]; + ExpandANormalForm(op, pre_visit, post_visit); + return memo_[GetRef(op)]; } - Expr VisitExpr_(const FunctionNode* function_node) final { - if (function_node->HasNonzeroAttr(attr::kPrimitive)) { - ICHECK_EQ(inside_primitive_, false); - inside_primitive_ = true; - auto ret = ExprMutator::VisitExpr_(function_node); - inside_primitive_ = false; + bool inside_primitive = false; + Expr VisitExpr_(const FunctionNode* op) final { + if (op->HasNonzeroAttr(attr::kPrimitive)) { + ICHECK_EQ(inside_primitive, false); + inside_primitive = true; + auto ret = ExprMutator::VisitExpr_(op); + inside_primitive = false; return ret; } else { - return ExprMutator::VisitExpr_(function_node); + return ExprMutator::VisitExpr_(op); } } - Expr Rewrite_(const CallNode* pre_call_node, const Expr& post) final { - Call pre_call = GetRef(pre_call_node); - if (inside_primitive_) { - return pre_call; - } - - Call post_call = Downcast(post); - - if (post_call->args.empty()) { - // We don't constant fold function with zero arguments. - // This is a heuristic that is useful. - // For example it is harmful to fold ones(shape=(4, 5)). - return std::move(pre_call); + Expr VisitExpr_(const IfNode* op) final { + auto new_cond = ExprMutator::VisitExpr(op->cond); + if (auto const_cond = new_cond.as()) { + if (reinterpret_cast(const_cond->data->data)[0]) { + return ExprMutator::VisitExpr(op->true_branch); + } else { + return ExprMutator::VisitExpr(op->false_branch); + } } + return ExprMutator::VisitExpr_(op); + } - static auto fnoncomputational = Op::GetAttrMap("TNonComputational"); - - const auto* op_node = post_call->op.as(); - if (op_node == nullptr) { - // Only evaluate primitives. - return std::move(post_call); + Expr Rewrite_(const CallNode* call, const Expr& post) final { + if (inside_primitive) { + return GetRef(call); } - Op op = GetRef(op_node); static auto op_stateful = Op::GetAttrMap("TOpIsStateful"); - if (op_stateful.get(op, false)) { - // skip stateful ops. - return std::move(post_call); - } - // Try to evaluate shape_of and ndarray_size ops - // Use the original call rather than new_call here since it still has valid checked_type - // fields. 
These operators don't care about the value of their argument anyway. - if (Optional opt_result = EvaluateShapeOf(pre_call)) { - return opt_result.value(); - } - // Use the original call rather than new_call here since it still has valid checked_type - // fields. This operator doesn't care about the value of its argument anyway. - if (Optional opt_result = EvaluateNdarraySize(pre_call)) { - return opt_result.value(); + + auto origin_args = call->args; + call = post.as(); + // We don't constant fold function with zero arguments. + // This is a heuristic that is useful. + // For example it is harmful to fold ones(shape=(4, 5)). + if (call->args.size() == 0) return post; + const OpNode* op = call->op.as(); + if (op == nullptr) return post; + // skip stateful ops. + if (op_stateful.get(GetRef(op), false)) return post; + // Try to evaluate shape_of op + if (call->op == shape_of_op_ || call->op == vm_shape_of_op_) { + return EvaluateShapeOf(post, origin_args, call->attrs); } - if ((fnoncomputational.count(op) && fnoncomputational[op]) || op == device_copy_op_ || - op == shape_of_op_ || op == vm_shape_of_op_ || op == ndarray_size_op_) { - // We should think about potentially constant evaluation over these ops too. - return std::move(post_call); + + if (call->op == ndarray_size_op_) { + return EvaluateNdarraySize(post, origin_args, call->attrs); } - if (!std::all_of(post_call->args.begin(), post_call->args.end(), IsComplexConstant)) { - // At least one non-constant argument. - return std::move(post_call); + + // We should think about potentially constant evaluation over these ops too. + static auto fnoncomputational = Op::GetAttrMap("TNonComputational"); + if (const auto* call_node = call->op.as()) { + Op op = GetRef(call_node); + if ((fnoncomputational.count(op) && fnoncomputational[op]) || (call->op == device_copy_op_)) { + return GetRef(call); + } } - // During evaluation we have obviously lost all on_device annotations. However any - // on_device wrapping this call will be left in place. 
- return ConstEvaluate(post_call); - } - Expr VisitExpr_(const IfNode* if_node) final { - If new_if = Downcast(ExprMutator::VisitExpr_(if_node)); - if (const auto* const_node = AsIgnoringOnDevice(new_if->cond)) { - if (reinterpret_cast(const_node->data->data)[0]) { - return new_if->true_branch; - } else { - return new_if->false_branch; + bool all_const_args = true; + for (Expr arg : call->args) { + if (!checker_.Check(arg)) { + all_const_args = false; } } - return std::move(new_if); + if (all_const_args) { + return ConstEvaluate(post); + } else { + return post; + } } - Expr Rewrite_(const TupleGetItemNode* tuple_get_item_node, - const Expr& post_tuple_get_item) final { - const auto* post_tuple_get_item_node = post_tuple_get_item.as(); - if (const auto* tuple_node = AsIgnoringOnDevice(post_tuple_get_item_node->tuple)) { - Expr result = tuple_node->fields[tuple_get_item_node->index]; - OnDeviceProps props = GetOnDeviceProps(post_tuple_get_item_node->tuple); - if (props.body.defined()) { - // (on_device((x, y, z), device_type=D).1 ==> on_device(y, device_type=D) - return MaybeOnDevice(result, props.device_type, props.is_fixed); - } else { - return result; - } + Expr Rewrite_(const TupleGetItemNode* op, const Expr& post) final { + op = post.as(); + if (const auto* tuple = op->tuple.as()) { + return tuple->fields[op->index]; + } else { + return post; } - return std::move(post_tuple_get_item); } + private: + // Internal constant checker + ConstantChecker checker_; + // Module + IRModule module_; + + // Cache the following ops for equivalence checking in this pass. + const Op& device_copy_op_; + const Op& shape_of_op_; + const Op& vm_shape_of_op_; + const Op& cast_op_; + const Op& ndarray_size_op_; + // Convert value to expression. Expr ObjectToExpr(const ObjectRef& value) { if (value->IsInstance()) { @@ -238,53 +224,35 @@ class ConstantFolder : public MixedModeMutator { return Tuple(fields); } else { LOG(FATAL) << "Cannot handle " << value->GetTypeKey(); - return {}; + return Expr(); } } - // Constant evaluate an expression. - Expr ConstEvaluate(const Expr& expr) { - VLOG_CONTEXT << "ConstEvaluate"; - VLOG(1) << "Evaluating :" << std::endl << PrettyPrint(expr); - - // We'll invoke the interpreter using the generic CPU device and target. Technically there's - // no guarantee the results we bitwise equal what we'd get on the true device, however to - // support cross-compilation we don't want to assume the true device is available. + Expr ConstEvaluate(Expr expr) { Device dev; dev.device_type = kDLCPU; dev.device_id = 0; Target target = Target("llvm"); - // Use a fresh build context in case we are already in a build context. + // use a fresh build context in case we are already in a build context. // needed for both execution and creation(due to JIT) With fresh_build_ctx(transform::PassContext::Create()); - Expr result = - ObjectToExpr(Eval(expr, module_->type_definitions, module_->Imports(), dev, target)); - VLOG(1) << "Evaluated to constant:" << std::endl << PrettyPrint(result); - return result; + return ObjectToExpr(Eval(expr, module_->type_definitions, module_->Imports(), dev, target)); } - /*! - * \brief Returns constant shape result of \p call if it of form \p shape_of(e) and \p e has - * a non-dynamic tensor shape. Returns null otherwise. 
- */ - Optional EvaluateShapeOf(const Call& call) { - if (call->op != shape_of_op_ && call->op != vm_shape_of_op_) { - return {}; - } - - VLOG(1) << "Evaluating for shape_of:" << std::endl << PrettyPrint(call); - ICHECK_EQ(call->args.size(), 1); - const auto* param = call->attrs.as(); + // Evaluate a call to the shape_of operator for tensors with constant + // shapes. + Expr EvaluateShapeOf(Expr expr, Array args, Attrs attrs) { + Expr input = args[0]; + const auto* param = attrs.as(); ICHECK(param != nullptr); - Expr input = call->args[0]; tvm::Array ishape; - if (Optional> opt_shape = GetConstantShape(input)) { - ishape = opt_shape.value(); + if (auto opt = GetConstantShape(input)) { + ishape = opt.value(); } else { - return {}; + return expr; } // Get the constant shape @@ -293,26 +261,26 @@ class ConstantFolder : public MixedModeMutator { dev.device_id = 0; runtime::NDArray value; DLDataType cdtype = DataType::Int(32); - if (ishape.empty()) { + if (ishape.size() == 0) { value = runtime::NDArray::Empty({}, cdtype, dev); } else { ICHECK_NE(ishape.size(), 0); std::vector cshape = {static_cast(ishape.size())}; value = runtime::NDArray::Empty(cshape, cdtype, dev); - auto* dims = static_cast(value->data); + int32_t* dims = static_cast(value->data); using ::tvm::tir::IntImmNode; for (size_t i = 0; i < ishape.size(); ++i) { - if (const auto* dim = ishape[i].as()) { + if (const IntImmNode* dim = ishape[i].as()) { dims[i] = dim->value; } else { - return {}; + return expr; } } } Constant shape = Downcast(ObjectToExpr(value)); - if (shape->data.Shape().empty() && GetScalarFromConstant(shape) == 0) { + if (shape->data.Shape().size() == 0 && GetScalarFromConstant(shape) == 0) { auto ndarray = runtime::NDArray::Empty({}, cdtype, dev); shape = Constant(ndarray); } @@ -320,25 +288,18 @@ class ConstantFolder : public MixedModeMutator { return CastValue(shape, param->dtype); } - /*! - * \brief Returns the constant NDArray size of result of \p call if it is of the form - * \p ndarray_size(e) and \p e has non-dynamic tensor type. Returns null otherwise. - */ - Optional EvaluateNdarraySize(const Call& call) { - if (call->op != ndarray_size_op_) { - return {}; - } - VLOG(1) << "Evaluating for ndarray_size:" << std::endl << PrettyPrint(call); - ICHECK_EQ(call->args.size(), 1); - Expr input = call->args[0]; - const auto* param = call->attrs.as(); + // Evaluate a call to the ndarray_size operator for tensors with constant + // shapes. 
+ Expr EvaluateNdarraySize(Expr expr, Array args, Attrs attrs) { + Expr input = args[0]; + const auto* param = attrs.as(); ICHECK(param != nullptr); tvm::Array ishape; - if (Optional> opt_shape = GetConstantShape(input)) { - ishape = opt_shape.value(); + if (auto opt = GetConstantShape(input)) { + ishape = opt.value(); } else { - return {}; + return expr; } // Get the constant size @@ -348,17 +309,17 @@ class ConstantFolder : public MixedModeMutator { runtime::NDArray value; DLDataType cdtype = DataType::Int(32); value = runtime::NDArray::Empty({}, cdtype, dev); - auto* data = static_cast(value->data); - if (ishape.empty()) { + int32_t* data = static_cast(value->data); + if (ishape.size() == 0) { *data = 0; } else { *data = 1; using ::tvm::tir::IntImmNode; for (size_t i = 0; i < ishape.size(); ++i) { - if (const auto* dim = ishape[i].as()) { + if (const IntImmNode* dim = ishape[i].as()) { *data *= dim->value; } else { - return {}; + return expr; } } } @@ -376,57 +337,31 @@ class ConstantFolder : public MixedModeMutator { } Optional> GetConstantShape(const Expr& input) { - if (const auto* const_node = AsIgnoringOnDevice(input)) { - // TODO(mbs): This is not necessary since we only ever ask for the shapes for - // pre-rewritten expressions which will always have a checked_type. - return const_node->tensor_type()->shape; + tvm::Array ishape; + if (const ConstantNode* op = input.as()) { + ishape = op->tensor_type()->shape; } else if (input->checked_type_.defined()) { - return input->checked_type().as()->shape; + ishape = input->checked_type().as()->shape; } else { - return {}; + return Optional>(nullptr); } - } - // Module - IRModule module_; - - // Cache the following ops for equivalence checking in this pass. - const Op& device_copy_op_; - const Op& shape_of_op_; - const Op& vm_shape_of_op_; - const Op& cast_op_; - const Op& ndarray_size_op_; - - // True if currently within a "primitive" Relay Function. - bool inside_primitive_ = false; + return Optional>(ishape); + } }; -} // namespace - -TVM_REGISTER_GLOBAL("relay.analysis.check_constant").set_body_typed(IsComplexConstant); - -/*! - * \brief Returns \p expr with any constants expressions evaluated and let-bound constants - * inlined. Returns \p expr unchanged if no change. - * - * CAUTION: The importers rely on this function returning \p expr unchanged to preserve sharing - * from their p.o.v. Furthermore, this function can be called before conversion to ANF so - * we must avoid all recursion. 
- */ -Expr FoldConstantExpr(const Expr& expr, const IRModule& mod) { - VLOG_CONTEXT << "FoldConstantExpr"; - VLOG(1) << "folding:" << std::endl << PrettyPrint(expr); - Expr result = ConstantFolder(mod).VisitExpr(expr); - VLOG(1) << "folded to:" << std::endl << PrettyPrint(result); - return result; +Expr FoldConstant(const Expr& expr, const IRModule& mod) { + return ConstantFolder(mod).Mutate(expr); } -TVM_REGISTER_GLOBAL("relay._transform.FoldConstantExpr").set_body_typed(FoldConstantExpr); +TVM_REGISTER_GLOBAL("relay._transform.FoldConstantExpr").set_body_typed(FoldConstant); + +namespace transform { Pass FoldConstant() { runtime::TypedPackedFunc pass_func = [=](Function f, IRModule m, PassContext pc) { - return Downcast(FoldConstantExpr(f, m)); + return Downcast(FoldConstant(f, m)); }; return CreateFunctionPass(pass_func, 2, "FoldConstant", {}); } @@ -434,5 +369,6 @@ Pass FoldConstant() { TVM_REGISTER_GLOBAL("relay._transform.FoldConstant").set_body_typed(FoldConstant); } // namespace transform + } // namespace relay } // namespace tvm diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc index 81d704e2be8ed..71917c31ec007 100644 --- a/src/relay/transforms/memory_alloc.cc +++ b/src/relay/transforms/memory_alloc.cc @@ -62,9 +62,8 @@ inline Constant MakeConstant(const std::vector& value) { } inline Expr AllocTensor(const Expr& storage, tvm::relay::Expr shape, DataType dtype, - Array assert_shape, DLDeviceType offset_device_type) { - auto offset = - OnDevice(MakeConstantScalar(DataType::Int(64), 0), offset_device_type, /*is_fixed=*/true); + Array assert_shape) { + auto offset = MakeConstantScalar(DataType::Int(64), 0); return AllocTensor(storage, offset, shape, dtype, assert_shape); } @@ -268,9 +267,8 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { auto sto = scope->Push(var, value); // TODO(@jroesch): There is a bug with typing based on the constant shape. - auto tensor = OnDevice( - AllocTensor(sto, shape, type->dtype, /*assert_shape=*/type->shape, cpu_device_.device_type), - dev.device_type, /*is_fixed=*/true); + auto tensor = OnDevice(AllocTensor(sto, shape, type->dtype, /*assert_shape=*/type->shape), + dev.device_type, /*is_fixed=*/true); Var tensor_var("tensor_" + name_hint, Type(nullptr)); return scope->Push(tensor_var, tensor); } @@ -369,8 +367,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { auto out_shape = out_shapes[i]; auto out_type = out_types[i]; auto storage = storages[i]; - auto alloc = OnDevice(AllocTensor(storage, out_shape, out_type->dtype, out_type->shape, - cpu_device_.device_type), + auto alloc = OnDevice(AllocTensor(storage, out_shape, out_type->dtype, out_type->shape), dev.device_type, /*is_fixed=*/true); Var out_var("out_" + std::to_string(i), Type(nullptr)); outs.push_back(scope->Push(out_var, alloc)); @@ -397,7 +394,7 @@ class DialectRewriter : public transform::DeviceAwareExprMutator { CHECK(imm) << "expect static int shape"; shape.push_back(imm->value); } - shape_expr = OnDevice(MakeConstant(shape), cpu_device_.device_type, /*is_fixed=*/true); + shape_expr = MakeConstant(shape); } return ReshapeTensor(new_args[0], shape_expr, ret_ty->shape); } @@ -418,6 +415,7 @@ Pass ManifestAlloc(Target target_host, Map targets) { CheckAndUpdateHostConsistency(&targets, &target_host); return tvm::transform::CreateModulePass( [=](IRModule mod, const PassContext& pass_ctx) { + DLOG(INFO) << "tvm::relay::transform::ManifestAlloc"; // We need to mutate module, therefore making a copy of it. 
mod.CopyOnWrite(); mod->ImportFromStd("core.rly"); diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index b4d7b41b7f4ae..33a87c9a2be27 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -112,14 +112,14 @@ class CUDADeviceAPI final : public DeviceAPI { ICHECK_EQ(256 % alignment, 0U) << "CUDA space is aligned at 256 bytes"; void* ret; if (dev.device_type == kDLCUDAHost) { - VLOG(1) << "allocating " << nbytes << "bytes on host"; + DLOG(INFO) << "allocating " << nbytes << "bytes on host"; CUDA_CALL(cudaMallocHost(&ret, nbytes)); } else { CUDA_CALL(cudaSetDevice(dev.device_id)); size_t free_mem, total_mem; CUDA_CALL(cudaMemGetInfo(&free_mem, &total_mem)); - VLOG(1) << "allocating " << nbytes << " bytes on device, with " << free_mem - << " bytes currently free out of " << total_mem << " bytes available"; + DLOG(INFO) << "allocating " << nbytes << " bytes on device, with " << free_mem + << " bytes currently free out of " << total_mem << " bytes available"; CUDA_CALL(cudaMalloc(&ret, nbytes)); } return ret; @@ -127,11 +127,11 @@ class CUDADeviceAPI final : public DeviceAPI { void FreeDataSpace(Device dev, void* ptr) final { if (dev.device_type == kDLCUDAHost) { - VLOG(1) << "freeing host memory"; + DLOG(INFO) << "freeing host memory"; CUDA_CALL(cudaFreeHost(ptr)); } else { CUDA_CALL(cudaSetDevice(dev.device_id)); - VLOG(1) << "freeing device memory"; + DLOG(INFO) << "freeing device memory"; CUDA_CALL(cudaFree(ptr)); } } diff --git a/src/runtime/dso_library.cc b/src/runtime/dso_library.cc index 81eb30ee12d25..c439bde824972 100644 --- a/src/runtime/dso_library.cc +++ b/src/runtime/dso_library.cc @@ -37,102 +37,62 @@ namespace tvm { namespace runtime { -/*! - * \brief Dynamic shared library object used to load - * and retrieve symbols by name. This is the default - * module TVM uses for host-side AOT compilation. - */ +// Dynamic shared libary. +// This is the default module TVM used for host-side AOT class DSOLibrary final : public Library { public: - ~DSOLibrary(); - /*! - * \brief Initialize by loading and storing - * a handle to the underlying shared library. - * \param name The string name/path to the - * shared library over which to initialize. - */ - void Init(const std::string& name); - /*! - * \brief Returns the symbol address within - * the shared library for a given symbol name. - * \param name The name of the symbol. - * \return The symbol. - */ - void* GetSymbol(const char* name) final; + ~DSOLibrary() { + if (lib_handle_) Unload(); + } + void Init(const std::string& name) { Load(name); } - private: - /*! \brief Private implementation of symbol lookup. - * Implementation is operating system dependent. - * \param The name of the symbol. - * \return The symbol. - */ - void* GetSymbol_(const char* name); - /*! \brief Implementation of shared library load. - * Implementation is operating system dependent. - * \param The name/path of the shared library. - */ - void Load(const std::string& name); - /*! \brief Implementation of shared library unload. - * Implementation is operating system dependent. - */ - void Unload(); + void* GetSymbol(const char* name) final { return GetSymbol_(name); } + private: + // Platform dependent handling. #if defined(_WIN32) - //! 
\brief Windows library handle + // library handle HMODULE lib_handle_{nullptr}; + + void* GetSymbol_(const char* name) { + return reinterpret_cast(GetProcAddress(lib_handle_, (LPCSTR)name)); // NOLINT(*) + } + + // Load the library + void Load(const std::string& name) { + // use wstring version that is needed by LLVM. + std::wstring wname(name.begin(), name.end()); + lib_handle_ = LoadLibraryW(wname.c_str()); + ICHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name; + } + + void Unload() { + FreeLibrary(lib_handle_); + lib_handle_ = nullptr; + } #else - // \brief Linux library handle + // Library handle void* lib_handle_{nullptr}; + // load the library + void Load(const std::string& name) { + lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); + ICHECK(lib_handle_ != nullptr) + << "Failed to load dynamic shared library " << name << " " << dlerror(); + } + + void* GetSymbol_(const char* name) { return dlsym(lib_handle_, name); } + + void Unload() { + dlclose(lib_handle_); + lib_handle_ = nullptr; + } #endif }; -DSOLibrary::~DSOLibrary() { - if (lib_handle_) Unload(); -} - -void DSOLibrary::Init(const std::string& name) { Load(name); } - -void* DSOLibrary::GetSymbol(const char* name) { return GetSymbol_(name); } - -#if defined(_WIN32) - -void* DSOLibrary::GetSymbol_(const char* name) { - return reinterpret_cast(GetProcAddress(lib_handle_, (LPCSTR)name)); // NOLINT(*) -} - -void DSOLibrary::Load(const std::string& name) { - // use wstring version that is needed by LLVM. - std::wstring wname(name.begin(), name.end()); - lib_handle_ = LoadLibraryW(wname.c_str()); - ICHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name; -} - -void DSOLibrary::Unload() { - FreeLibrary(lib_handle_); - lib_handle_ = nullptr; -} - -#else - -void DSOLibrary::Load(const std::string& name) { - lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); - ICHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name << " " - << dlerror(); -} - -void* DSOLibrary::GetSymbol_(const char* name) { return dlsym(lib_handle_, name); } - -void DSOLibrary::Unload() { - dlclose(lib_handle_); - lib_handle_ = nullptr; -} - -#endif - -ObjectPtr CreateDSOLibraryObject(std::string library_path) { +TVM_REGISTER_GLOBAL("runtime.module.loadfile_so").set_body([](TVMArgs args, TVMRetValue* rv) { auto n = make_object(); - n->Init(library_path); - return n; -} + n->Init(args[0]); + *rv = CreateModuleFromLibrary(n); +}); } // namespace runtime } // namespace tvm diff --git a/src/runtime/hexagon/android/hexagon_device.h b/src/runtime/hexagon/android/hexagon_device.h deleted file mode 100644 index 552b8f9713698..0000000000000 --- a/src/runtime/hexagon/android/hexagon_device.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
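A minimal, illustrative sketch (not part of this patch) of the load path that the runtime.module.loadfile_so registration above now serves directly; the library file name and entry-point name below are placeholders.

#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>

void RunExportedEntry() {
  // LoadFromFile dispatches on the ".so" extension to runtime.module.loadfile_so,
  // which constructs the DSOLibrary shown above and wraps it in a Module.
  tvm::runtime::Module mod = tvm::runtime::Module::LoadFromFile("deploy_lib.so");
  tvm::runtime::PackedFunc entry = mod.GetFunction("main");  // "main" is hypothetical
  if (entry != nullptr) {
    entry();
  }
}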
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef TVM_RUNTIME_HEXAGON_ANDROID_HEXAGON_DEVICE_H_ -#define TVM_RUNTIME_HEXAGON_ANDROID_HEXAGON_DEVICE_H_ - -#include -#include - -#include -#include - -#include "../../meta_data.h" - -namespace tvm { -namespace runtime { -namespace hexagon { - -/*! - * \brief Low-level interface for communicating with Hexagon devices. - */ -class Device { - public: - /*! - * \brief Allocate memory on device. - * \param size Requested size. - * \param align Requested alignment. - * \return Pointer (local to the device) of the allocated memory, - * or nullptr if allocation failed. - */ - virtual void* Alloc(unsigned size, unsigned align) = 0; - /*! - * \brief Release allocated memory on device. - * \param ptr Pointer to memory previously allocated by \ref Alloc. - */ - virtual void Free(void* ptr) = 0; - /*! - * \brief Allocate VTCM memory on device. - * \param size Requested size. - * \param align Requested alignment. - * \return Pointer (local to the device) of the allocated memory, - * or nullptr if allocation failed. - */ - virtual void* AllocVtcm(unsigned size, unsigned align) = 0; - /*! - * \brief Release allocated VTCM memory on device. - * \param ptr Pointer to memory previously allocated by \ref AllocVtcm. - */ - virtual void FreeVtcm(void* ptr) = 0; - /*! - * \brief Copy a block of data on device to another location on the device. - * \param dst Pointer (local to device) to the destination buffer. - * \param src Pointer (local to device) of the source buffer. - * \param len Number of bytes to copy. - */ - virtual void CopyDeviceToDevice(void* dst, const void* src, unsigned len) = 0; - /*! - * \brief Copy a block of data from device to host. - * \param host_dst Pointer (local to host) to the destination buffer. - * \param src Pointer (local to device) to the source buffer. - * \param len Number of bytes to copy. - */ - virtual void CopyDeviceToHost(void* host_dst, const void* src, unsigned len) = 0; - /*! - * \brief Copy a block of data from host to device. - * \param dst Pointer (local to device) to the destination buffer. - * \param host_src Pointer (local to host) to the source buffer. - * \param len Number of bytes to copy. - */ - virtual void CopyHostToDevice(void* dst, const void* host_src, unsigned len) = 0; - /*! - * \brief Load a module (typically a shared library) into device. - * \param data Name of the shared library. - * \param fmt Format of the library (currently ignored). - * \return Pointer to the loaded module. - * \note Currently only one module can be loaded at any given time. - */ - virtual void* Load(const std::string& data, const std::string& fmt) = 0; - /*! - * \brief Unload a module from device. - * \param mod Pointer to a loaded module returned by \ref Load. - */ - virtual void Unload(void* mod) = 0; - /*! - * \brief Find the address of an object in the currently loaded module. - * \param sym Name of the object. - * \return Address of the located object, or nullptr if object was - * not found. - */ - virtual void* Resolve(const std::string& sym) = 0; - /*! - * \brief Invoke a function on device with given arguments. - * \param func Address (local to device) of the function to call. - * \param scalar Pointer to an array of 32-bit values that will be - * passed via consecutive registers: r0..r5. This array - * includes dummy values for skipped registers. - * \param sc_num Number of values in the "scalar" array. 
- * \param stack Pointer to an array of 32-bit values that will be - * passed on the stack. This array includes dummy values - * for padding. - * \param st_num Number of values in the "stack" array. - */ - virtual void Call(void* func, uint32_t* scalar, unsigned sc_num, uint32_t* stack, - unsigned st_num) = 0; - - virtual ~Device() = 0; - - static std::shared_ptr Global(); - static bool ValidateDeviceId(decltype(DLDevice::device_id) device_id) { - // Only supporting a single device for now. - return device_id == 0; - } -}; - -} // namespace hexagon - -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_HEXAGON_ANDROID_HEXAGON_DEVICE_H_ diff --git a/src/runtime/hexagon/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon/hexagon_buffer.cc deleted file mode 100644 index 0760bab6c5822..0000000000000 --- a/src/runtime/hexagon/hexagon/hexagon_buffer.cc +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#include "hexagon_buffer.h" - -#include - -#include -#include - -#include "hexagon_common.h" - -namespace tvm { -namespace runtime { -namespace hexagon { - -static size_t GetDataAlignment(const DLDataType dtype) { - size_t align = (dtype.bits / 8) * dtype.lanes; - if (align < kAllocAlignment) return kAllocAlignment; - return align; -} - -HexagonBuffer::HexagonBuffer(int ndim, const int64_t* shape, DLDataType dtype, - Optional scope) { - ICHECK_LE(ndim, 1) << "Hexagon currently only supports flat allocations " - << "and arrays of flat allocations."; - - size_t alignment = GetDataAlignment(dtype); - // TODO(csullivan): Extend to support arrays of allocations. - // Move assignment from r-value constructed flat allocation. 
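A hedged usage sketch for the low-level hexagon::Device interface deleted here (the same interface is re-added later in this patch under hexagon_module.h); the alignment value and helper name are illustrative, not code from the tree.

// Stage a block of host data on the device and read it back through the
// Alloc / CopyHostToDevice / CopyDeviceToHost / Free methods documented above.
void RoundTripThroughDevice(tvm::runtime::hexagon::Device* dev,
                            const void* host_src, void* host_dst, unsigned len) {
  void* dev_buf = dev->Alloc(len, /*align=*/128);
  dev->CopyHostToDevice(dev_buf, host_src, len);
  dev->CopyDeviceToHost(host_dst, dev_buf, len);
  dev->Free(dev_buf);
}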
- *this = HexagonBuffer(shape[0] * (dtype.bits / 8) * dtype.lanes, alignment, scope); -} - -HexagonBuffer::HexagonBuffer(size_t nbytes, size_t alignment, Optional scope) { - void* ptr = nullptr; - int ret = posix_memalign(&ptr, alignment, nbytes); - if (ret != 0) { - throw std::bad_alloc(); - } - allocations_.push_back(ptr); - SetStorageScope(scope); -} - -HexagonBuffer::HexagonBuffer(void* data, Optional scope) : managed_{false} { - SetStorageScope(scope); - allocations_.push_back(data); -} - -HexagonBuffer::~HexagonBuffer() { - if (managed_) { - for (auto& ptr : allocations_) { - free(ptr); - } - } -} - -HexagonBuffer::HexagonBuffer(HexagonBuffer&& other) - : allocations_(other.allocations_), - managed_(other.managed_), - storage_scope_(other.storage_scope_) { - other.allocations_.clear(); - other.managed_ = false; - other.storage_scope_ = StorageScope::kDDR; -} - -HexagonBuffer& HexagonBuffer::operator=(HexagonBuffer&& other) { - std::swap(allocations_, other.allocations_); - std::swap(managed_, other.managed_); - std::swap(storage_scope_, other.storage_scope_); - return *this; -} - -void* HexagonBuffer::GetPointer() { - if (!allocations_.size()) { - return nullptr; - } - return (allocations_.size() > 1) ? allocations_.data() : allocations_[0]; -} - -HexagonBuffer::StorageScope HexagonBuffer::GetStorageScope() const { return storage_scope_; } - -void HexagonBuffer::SetStorageScope(Optional scope) { - if (!scope.defined()) { - storage_scope_ = StorageScope::kDDR; - } else { - if (scope.value() == "global") { - storage_scope_ = StorageScope::kDDR; - } else if (scope.value() == "global.vtcm") { - storage_scope_ = StorageScope::kVTCM; - } else { - CHECK(false) << "Encountered unknown HexagonBuffer storage scope: " - << std::string(scope.value()); - } - } -} - -HexagonBuffer* IsHexagonBuffer(DLTensor* tensor) { - if (TVMDeviceExtType(tensor->device.device_type) == kDLHexagon) { - return static_cast(tensor->data); - } - return nullptr; -} - -} // namespace hexagon -} // namespace runtime -} // namespace tvm diff --git a/src/runtime/hexagon/hexagon/hexagon_buffer.h b/src/runtime/hexagon/hexagon/hexagon_buffer.h deleted file mode 100644 index c62cee66b0d8c..0000000000000 --- a/src/runtime/hexagon/hexagon/hexagon_buffer.h +++ /dev/null @@ -1,135 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_BUFFER_H_ -#define TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_BUFFER_H_ - -#include -#include -#include -#include -#include - -#include - -namespace tvm { -namespace runtime { -namespace hexagon { - -class HexagonBuffer { - public: - /* \brief Allocate memory within hexagon accessible memory - * scopes. - * - * \param ndim The number of dimensions of physical storage - * to allocate. 
- * - * \param shape The shape of the ndarray for which to allocate - * physical storage. - * - * \param dtype The data type of the physical storage. - * - * \param scope Optional storage scope indicating the memory - * space in which to allocate. Defaults to global system - * memory (DDR). - */ - HexagonBuffer(int ndim, const int64_t* shape, DLDataType dtype, Optional scope); - - /* \brief Allocate memory within hexagon accessible memory - * scopes. - * - * \param nbytes The number of bytes of flat physical storage - * to allocate. - * - * \param alignment The byte alignment to be used when allocating. - * - * \param scope Optional storage scope indicating the memory - * space in which to allocate. Defaults to global system - * memory (DDR). - */ - HexagonBuffer(size_t nbytes, size_t alignment, Optional scope); - - /* \brief Construct a hexagon buffer from externally allocated storage. - * - * \param data The externally allocated storage. - * - * \param scope Optional storage scope indicating the memory - * space in the external allocation belongs. Assumes global system - * memory if not provided. - */ - explicit HexagonBuffer(void* data, Optional scope = Optional()); - - //! \brief Destruction deallocates the underlying allocations. - ~HexagonBuffer(); - - //! \brief Prevent copy construction of HexagonBuffers. - HexagonBuffer(const HexagonBuffer&) = delete; - - //! \brief Prevent copy assignment with HexagonBuffers. - HexagonBuffer& operator=(const HexagonBuffer&) = delete; - - //! \brief Allow move construction. - HexagonBuffer(HexagonBuffer&&); - - //! \brief Allow move assignment. - HexagonBuffer& operator=(HexagonBuffer&&); - - //! \brief Return pointer to allocation or allocations. - void* GetPointer(); - - //! \brief Memory scopes managed by a Hexagon Buffer. - enum class StorageScope { - //! \brief System DDR corresponding to global storage. - kDDR, - /*! \brief Vector tightly coupled memory corresponding to - * global.vtcm storage. - */ - kVTCM, - }; - - //! \brief Return storage scope of underlying allocation. - StorageScope GetStorageScope() const; - - private: - //! \brief Assign a storage scope to the buffer. - void SetStorageScope(Optional scope); - /*! \brief Array of allocations required by the buffer. - * - * For a 1d (flat) storage, a single contiguous allocation will - * result. For 2d storage, (count, nbytes) = shape, which will - * result in `count` discrete allocations. - */ - std::vector allocations_; - /*! \brief Whether the allocation(s) present are managed - * and should be deallocated upon destruction. - */ - bool managed_{true}; - /*! \brief The underlying storage type in which the allocation - * resides. - */ - StorageScope storage_scope_; -}; - -HexagonBuffer* IsHexagonBuffer(DLTensor* tensor); - -} // namespace hexagon -} // namespace runtime -} // namespace tvm - -#endif // TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_BUFFER_H_ diff --git a/src/runtime/hexagon/hexagon/hexagon_common.cc b/src/runtime/hexagon/hexagon/hexagon_common.cc deleted file mode 100644 index 260b105ac43ab..0000000000000 --- a/src/runtime/hexagon/hexagon/hexagon_common.cc +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
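A hedged sketch written against the HexagonBuffer API documented in the header being removed above; the sizes, alignment, and helper name are assumptions for illustration only.

#include "hexagon_buffer.h"

void AllocateScratchBuffers() {
  using tvm::runtime::String;
  using tvm::runtime::hexagon::HexagonBuffer;
  // Flat nbytes/alignment allocations in the two documented storage scopes.
  HexagonBuffer ddr(/*nbytes=*/4096, /*alignment=*/128, String("global"));
  HexagonBuffer vtcm(/*nbytes=*/2048, /*alignment=*/128, String("global.vtcm"));
  void* data = ddr.GetPointer();  // raw pointer backing the flat DDR allocation
  (void)data;
}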
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file hexagon_common.cc - */ - -#include "hexagon_common.h" - -#include -#include - -#include -#include -#include -#include - -#include "hexagon_buffer.h" - -namespace tvm { -namespace runtime { -namespace hexagon { - -void HexagonLookupLinkedParam(TVMArgs args, TVMRetValue* rv) { - Module mod = args[0]; - int64_t storage_id = args[1]; - DLTensor* template_tensor = args[2]; - Device dev = args[3]; - auto lookup_linked_param = mod.GetFunction(::tvm::runtime::symbol::tvm_lookup_linked_param, true); - if (lookup_linked_param == nullptr) { - *rv = nullptr; - return; - } - - TVMRetValue opaque_handle = lookup_linked_param(storage_id); - if (opaque_handle.type_code() == kTVMNullptr) { - *rv = nullptr; - return; - } - - std::vector shape_vec{template_tensor->shape, - template_tensor->shape + template_tensor->ndim}; - - auto* param_buffer = new HexagonBuffer(static_cast(opaque_handle)); - auto* container = new NDArray::Container(static_cast(param_buffer), shape_vec, - template_tensor->dtype, dev); - container->SetDeleter([](Object* container) { - // The NDArray::Container needs to be deleted - // along with the HexagonBuffer wrapper. However the - // buffer's data points to global const memory and - // so should not be deleted. - auto* ptr = static_cast(container); - delete static_cast(ptr->dl_tensor.data); - delete ptr; - }); - *rv = NDArray(GetObjectPtr(container)); -} - -PackedFunc WrapPackedFunc(TVMBackendPackedCFunc faddr, const ObjectPtr& sptr_to_self) { - return PackedFunc([faddr, sptr_to_self](TVMArgs args, TVMRetValue* rv) { - TVMValue ret_value; - int ret_type_code = kTVMNullptr; - - TVMValue* arg_values = const_cast(args.values); - std::vector> buffer_args; - for (size_t i = 0; i < args.num_args; i++) { - if (args.type_codes[i] == kTVMDLTensorHandle) { - DLTensor* tensor = static_cast(arg_values[i].v_handle); - buffer_args.emplace_back(i, static_cast(tensor->data)); - tensor->data = buffer_args.back().second->GetPointer(); - } - } - int ret = (*faddr)(const_cast(args.values), const_cast(args.type_codes), - args.num_args, &ret_value, &ret_type_code, nullptr); - ICHECK_EQ(ret, 0) << TVMGetLastError(); - - for (auto& arg : buffer_args) { - DLTensor* tensor = static_cast(arg_values[arg.first].v_handle); - tensor->data = arg.second; - } - - if (ret_type_code != kTVMNullptr) { - *rv = TVMRetValue::MoveFromCHost(ret_value, ret_type_code); - } - }); -} -} // namespace hexagon - -namespace { -std::vector SplitString(const std::string& str, char delim) { - std::vector lines; - auto ss = std::stringstream{str}; - for (std::string line; std::getline(ss, line, delim);) { - lines.push_back(line); - } - return lines; -} -void HexagonLog(const std::string& file, int lineno, const std::string& message) { - HEXAGON_PRINT(ALWAYS, "%s:%d:", file.c_str(), lineno); - std::vector err_lines = SplitString(message, '\n'); - for (auto& line : err_lines) { - HEXAGON_PRINT(ALWAYS, "%s", line.c_str()); - } -} -} // namespace - -namespace detail { -void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { - HexagonLog(file, lineno, message); - throw 
InternalError(file, lineno, message); -} -void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { - HexagonLog(file, lineno, message); -} -} // namespace detail - -TVM_REGISTER_GLOBAL("tvm.runtime.hexagon.lookup_linked_params") - .set_body(hexagon::HexagonLookupLinkedParam); -} // namespace runtime -} // namespace tvm diff --git a/src/runtime/hexagon/hexagon/hexagon_common.h b/src/runtime/hexagon/hexagon/hexagon_common.h deleted file mode 100644 index 87d36c9865e82..0000000000000 --- a/src/runtime/hexagon/hexagon/hexagon_common.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file hexagon_utils.h - */ -#ifndef TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_COMMON_H_ -#define TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_COMMON_H_ - -#include -#include -#include -#include - -#if defined(__hexagon__) -#include -#define HEXAGON_PRINT(level, ...) FARF(level, __VA_ARGS__) -#else -#include -#define HEXAGON_PRINT(level, ...) printf(__VA_ARGS__) -#endif - -#define HEXAGON_SAFE_CALL(api_call) \ - do { \ - int result = api_call; \ - if (result != 0) { \ - HEXAGON_PRINT(ERROR, "ERROR: " #api_call " failed with error %d.", result); \ - } \ - } while (0) - -namespace tvm { -namespace runtime { -namespace hexagon { - -/*! \brief Unpack HexagonBuffers in packed functions - * prior to invoking. - * \param faddr The function address. - * \param mptr The module pointer node. - * \return A packed function wrapping the requested function. - */ -PackedFunc WrapPackedFunc(TVMBackendPackedCFunc faddr, const ObjectPtr& mptr); -} // namespace hexagon -} // namespace runtime -} // namespace tvm -inline bool IsHexagonDevice(DLDevice dev) { - return TVMDeviceExtType(dev.device_type) == kDLHexagon; -} - -#endif // TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_COMMON_H_ diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc deleted file mode 100644 index 9c1f6ebd7d70e..0000000000000 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ /dev/null @@ -1,130 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file hexagon_device_api_v2.cc - */ - -#include "hexagon_device_api_v2.h" - -#include -#include -#include -#include - -#include -#include - -#include "../../workspace_pool.h" -#include "hexagon_buffer.h" -#include "hexagon_common.h" - -namespace tvm { -namespace runtime { -namespace hexagon { - -HexagonDeviceAPIv2* HexagonDeviceAPIv2::Global() { - static auto* inst = new HexagonDeviceAPIv2(); - return inst; -} - -void HexagonDeviceAPIv2::GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) { - if (kind == kExist) { - *rv = 1; - } -} - -void* HexagonDeviceAPIv2::AllocDataSpace(Device dev, int ndim, const int64_t* shape, - DLDataType dtype, Optional mem_scope) { - return new HexagonBuffer(ndim, shape, dtype, mem_scope.defined() ? mem_scope : String("global")); -} - -void* HexagonDeviceAPIv2::AllocDataSpace(Device dev, size_t nbytes, size_t alignment, - DLDataType type_hint) { - return new HexagonBuffer(nbytes, alignment, String("global")); -} - -void HexagonDeviceAPIv2::FreeDataSpace(Device dev, void* ptr) { - auto* pbuf = static_cast(ptr); - delete pbuf; -} - -struct HexagonWorkspacePool : public WorkspacePool { - HexagonWorkspacePool() : WorkspacePool(kDLCPU, HexagonDeviceAPIv2::Global()) {} -}; - -void* HexagonDeviceAPIv2::AllocWorkspace(Device dev, size_t size, DLDataType type_hint) { - auto* buffer = static_cast( - dmlc::ThreadLocalStore::Get()->AllocWorkspace(dev, size)); - void* ptr = buffer->GetPointer(); - workspace_allocations_.insert({ptr, buffer}); - return ptr; -} - -void HexagonDeviceAPIv2::FreeWorkspace(Device dev, void* data) { - auto it = workspace_allocations_.find(data); - ICHECK(it != workspace_allocations_.end()) - << "Attempt made to free unknown or already freed workspace allocation"; - dmlc::ThreadLocalStore::Get()->FreeWorkspace(dev, it->second); - workspace_allocations_.erase(it); -} - -void HexagonDeviceAPIv2::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { - if (IsHexagonDevice(from->device) && IsHexagonDevice(to->device)) { - HexagonBuffer* buffer_src = static_cast(from->data); - HexagonBuffer* buffer_dst = static_cast(to->data); - // Check storage scopes - if (buffer_src->GetStorageScope() == HexagonBuffer::StorageScope::kDDR && - buffer_dst->GetStorageScope() == HexagonBuffer::StorageScope::kDDR) { - memcpy(static_cast(buffer_dst->GetPointer()) + to->byte_offset, - static_cast(buffer_src->GetPointer()) + from->byte_offset, - GetDataSize(*from)); - } else { - ICHECK(false) << "Currently only copying between DDR storage is supported."; - } - } else if (IsHexagonDevice(from->device) && to->device.device_type == kDLCPU) { - HexagonBuffer* buffer_src = static_cast(from->data); - memcpy(static_cast(to->data) + to->byte_offset, - static_cast(buffer_src->GetPointer()) + from->byte_offset, - GetDataSize(*from)); - } else if (from->device.device_type == kDLCPU && IsHexagonDevice(to->device)) { - HexagonBuffer* buffer_dst = static_cast(to->data); - memcpy(static_cast(buffer_dst->GetPointer()) + to->byte_offset, - static_cast(from->data) + from->byte_offset, GetDataSize(*from)); - } else { - CHECK(false) - << 
"Expect copy between DLTensor devices of types kDLHexagon and kDLCPU (external) only."; - } -} - -void HexagonDeviceAPIv2::CopyDataFromTo(const void* from, size_t from_offset, void* to, - size_t to_offset, size_t size, Device dev_from, - Device dev_to, DLDataType type_hint, - TVMStreamHandle stream) { - memcpy(static_cast(to) + to_offset, static_cast(from) + from_offset, size); -} - -TVM_REGISTER_GLOBAL("device_api.hexagon.v2").set_body([](TVMArgs args, TVMRetValue* rv) { - DeviceAPI* ptr = HexagonDeviceAPIv2::Global(); - *rv = static_cast(ptr); -}); - -} // namespace hexagon -} // namespace runtime -} // namespace tvm diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h deleted file mode 100644 index 3d866307f17c1..0000000000000 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.h +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#ifndef TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_DEVICE_API_V2_H_ -#define TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_DEVICE_API_V2_H_ - -#include - -#include - -namespace tvm { -namespace runtime { -namespace hexagon { - -class HexagonBuffer; - -/*! - * \brief Hexagon Device API that is compiled and run on Hexagon. - */ -class HexagonDeviceAPIv2 final : public DeviceAPI { - public: - //! \brief Retrieve the global singleton instance of the HexagonDeviceAPIv2. - static HexagonDeviceAPIv2* Global(); - - //! \brief Constructor - HexagonDeviceAPIv2() {} - - //! \brief Destructor - ~HexagonDeviceAPIv2() {} - - /*! \brief Currently unimplemented interface to specify the active - * Hexagon device. - */ - void SetDevice(Device dev) final{}; - - //! \brief Return the queried Hexagon device attribute. - void GetAttr(Device dev, DeviceAttrKind kind, TVMRetValue* rv) final; - - //! \brief Currently unimplemented interface to synchronize a device stream. - void StreamSync(Device dev, TVMStreamHandle stream) final {} - - //! \note Standard memory allocation methods of the DeviceAPI interface. - //! \brief Allocate a flat allocation of global memory wrapped in a HexagonBuffer. - void* AllocDataSpace(Device dev, size_t nbytes, size_t alignment, DLDataType type_hint) final; - - //! \brief Free the allocated HexagonBuffer. - void FreeDataSpace(Device dev, void* ptr) final; - - /*! \brief Request a dynamically allocated HexagonBuffer from a workspace pool. - * \returns The underlying allocation pointer. - */ - void* AllocWorkspace(Device dev, size_t size, DLDataType type_hint) final; - - //! Dereference workspace pool and erase from tracked workspace_allocations_. - void FreeWorkspace(Device dev, void* data) final; - - /*! - * \brief Allocate an Nd data space on device with memory scope support. 
- * \param dev The device to perform the operation. - * \param ndim The number of dimensions of allocated tensor. - * \param shape The shape of allocated tensor. - * \param dtype The element type. - * \param mem_scope The memory scope of the allocated tensor. - * \return The allocated HexagonBuffer pointer. - */ - void* AllocDataSpace(Device dev, int ndim, const int64_t* shape, DLDataType dtype, - Optional mem_scope) final; - - /*! - * \brief Copy data from one storage to another. - * \note This API is designed to support special memory with shape dependent layout. - * DLTensor's are passed with shape information to support these cases. - * \param from The source array. - * \param to The target array. - * \param stream Optional stream object. - */ - void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final; - - protected: - //! Standard Device API interface to copy data from one storage to another. - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - Device dev_from, Device dev_to, DLDataType type_hint, - TVMStreamHandle stream) final; - - private: - //! Lookup table for the HexagonBuffer managing a workspace allocation. - std::unordered_map workspace_allocations_; -}; -} // namespace hexagon -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_HEXAGON_HEXAGON_HEXAGON_DEVICE_API_V2_H_ diff --git a/src/runtime/hexagon/hexagon/hexagon_module.cc b/src/runtime/hexagon/hexagon/hexagon_module.cc deleted file mode 100644 index a4919ce874e28..0000000000000 --- a/src/runtime/hexagon/hexagon/hexagon_module.cc +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file hexagon_module.cc - * \brief The HexagonLibraryModuleNode - */ -#include "../hexagon_module.h" - -#include -#include -#include - -#include -#include -#include - -#include "../../library_module.h" -#include "hexagon_buffer.h" -#include "hexagon_common.h" - -namespace tvm { -namespace runtime { - -Module HexagonModuleCreate(std::string data, std::string fmt, - std::unordered_map fmap, std::string asm_str, - std::string obj_str, std::string ir_str, std::string bc_str, - const std::set& packed_c_abi) { - CHECK(fmt == "so") << "Invalid format provided when constructing Hexagon runtime module: " << fmt - << ". 
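A hedged sketch of how a caller could obtain the device API registered above under "device_api.hexagon.v2" through the global registry; the helper name is illustrative.

#include <tvm/runtime/device_api.h>
#include <tvm/runtime/logging.h>
#include <tvm/runtime/registry.h>

tvm::runtime::DeviceAPI* GetHexagonDeviceAPIv2() {
  // The registered body returns the singleton as an opaque handle.
  const tvm::runtime::PackedFunc* f = tvm::runtime::Registry::Get("device_api.hexagon.v2");
  ICHECK(f != nullptr) << "device_api.hexagon.v2 is not registered";
  void* handle = (*f)();
  return static_cast<tvm::runtime::DeviceAPI*>(handle);
}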
Valid formats are: 'so'."; - ObjectPtr n = CreateDSOLibraryObject(data); - return CreateModuleFromLibrary(n, hexagon::WrapPackedFunc); -} - -TVM_REGISTER_GLOBAL("runtime.module.loadfile_hexagon").set_body([](TVMArgs args, TVMRetValue* rv) { - ObjectPtr n = CreateDSOLibraryObject(args[0]); - *rv = CreateModuleFromLibrary(n, hexagon::WrapPackedFunc); -}); -} // namespace runtime -} // namespace tvm diff --git a/src/runtime/hexagon/android/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc similarity index 99% rename from src/runtime/hexagon/android/hexagon_device_api.cc rename to src/runtime/hexagon/hexagon_device_api.cc index ec50b4bf93a56..a07a7c683026b 100644 --- a/src/runtime/hexagon/android/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -24,7 +24,7 @@ #include #include -#include "hexagon_device.h" +#include "hexagon_module.h" namespace tvm { namespace runtime { diff --git a/src/runtime/hexagon/android/hexagon_module.cc b/src/runtime/hexagon/hexagon_module.cc similarity index 99% rename from src/runtime/hexagon/android/hexagon_module.cc rename to src/runtime/hexagon/hexagon_module.cc index e386daf7dc7c4..41aa5855ceeb8 100644 --- a/src/runtime/hexagon/android/hexagon_module.cc +++ b/src/runtime/hexagon/hexagon_module.cc @@ -17,7 +17,7 @@ * under the License. */ -#include "../hexagon_module.h" +#include "hexagon_module.h" #ifdef __ANDROID__ #include @@ -31,8 +31,8 @@ #include #include -#include "../../file_utils.h" -#include "hexagon_device.h" +#include "../file_utils.h" +#include "../meta_data.h" namespace tvm { namespace runtime { diff --git a/src/runtime/hexagon/hexagon_module.h b/src/runtime/hexagon/hexagon_module.h index 887d9bb30ecb0..1288b933410c6 100644 --- a/src/runtime/hexagon/hexagon_module.h +++ b/src/runtime/hexagon/hexagon_module.h @@ -50,6 +50,107 @@ Module HexagonModuleCreate(std::string data, std::string fmt, std::unordered_map fmap, std::string asm_str, std::string obj_str, std::string ir_str, std::string bc_str, const std::set& packed_c_abi); + +namespace hexagon { + +/*! + * \brief Low-level interface for communicating with Hexagon devices. + */ +class Device { + public: + /*! + * \brief Allocate memory on device. + * \param size Requested size. + * \param align Requested alignment. + * \return Pointer (local to the device) of the allocated memory, + * or nullptr if allocation failed. + */ + virtual void* Alloc(unsigned size, unsigned align) = 0; + /*! + * \brief Release allocated memory on device. + * \param ptr Pointer to memory previously allocated by \ref Alloc. + */ + virtual void Free(void* ptr) = 0; + /*! + * \brief Allocate VTCM memory on device. + * \param size Requested size. + * \param align Requested alignment. + * \return Pointer (local to the device) of the allocated memory, + * or nullptr if allocation failed. + */ + virtual void* AllocVtcm(unsigned size, unsigned align) = 0; + /*! + * \brief Release allocated VTCM memory on device. + * \param ptr Pointer to memory previously allocated by \ref AllocVtcm. + */ + virtual void FreeVtcm(void* ptr) = 0; + /*! + * \brief Copy a block of data on device to another location on the device. + * \param dst Pointer (local to device) to the destination buffer. + * \param src Pointer (local to device) of the source buffer. + * \param len Number of bytes to copy. + */ + virtual void CopyDeviceToDevice(void* dst, const void* src, unsigned len) = 0; + /*! + * \brief Copy a block of data from device to host. + * \param host_dst Pointer (local to host) to the destination buffer. 
+ * \param src Pointer (local to device) to the source buffer. + * \param len Number of bytes to copy. + */ + virtual void CopyDeviceToHost(void* host_dst, const void* src, unsigned len) = 0; + /*! + * \brief Copy a block of data from host to device. + * \param dst Pointer (local to device) to the destination buffer. + * \param host_src Pointer (local to host) to the source buffer. + * \param len Number of bytes to copy. + */ + virtual void CopyHostToDevice(void* dst, const void* host_src, unsigned len) = 0; + /*! + * \brief Load a module (typically a shared library) into device. + * \param data Name of the shared library. + * \param fmt Format of the library (currently ignored). + * \return Pointer to the loaded module. + * \note Currently only one module can be loaded at any given time. + */ + virtual void* Load(const std::string& data, const std::string& fmt) = 0; + /*! + * \brief Unload a module from device. + * \param mod Pointer to a loaded module returned by \ref Load. + */ + virtual void Unload(void* mod) = 0; + /*! + * \brief Find the address of an object in the currently loaded module. + * \param sym Name of the object. + * \return Address of the located object, or nullptr if object was + * not found. + */ + virtual void* Resolve(const std::string& sym) = 0; + /*! + * \brief Invoke a function on device with given arguments. + * \param func Address (local to device) of the function to call. + * \param scalar Pointer to an array of 32-bit values that will be + * passed via consecutive registers: r0..r5. This array + * includes dummy values for skipped registers. + * \param sc_num Number of values in the "scalar" array. + * \param stack Pointer to an array of 32-bit values that will be + * passed on the stack. This array includes dummy values + * for padding. + * \param st_num Number of values in the "stack" array. + */ + virtual void Call(void* func, uint32_t* scalar, unsigned sc_num, uint32_t* stack, + unsigned st_num) = 0; + + virtual ~Device() = 0; + + static std::shared_ptr Global(); + static bool ValidateDeviceId(decltype(DLDevice::device_id) device_id) { + // Only supporting a single device for now. + return device_id == 0; + } +}; + +} // namespace hexagon + } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_ diff --git a/src/runtime/hexagon/android/hexagon_posix.cc b/src/runtime/hexagon/hexagon_posix.cc similarity index 100% rename from src/runtime/hexagon/android/hexagon_posix.cc rename to src/runtime/hexagon/hexagon_posix.cc diff --git a/src/runtime/hexagon/android/sim/driver/CMakeLists.txt b/src/runtime/hexagon/sim/driver/CMakeLists.txt similarity index 95% rename from src/runtime/hexagon/android/sim/driver/CMakeLists.txt rename to src/runtime/hexagon/sim/driver/CMakeLists.txt index ddcec9169211a..dbac995343832 100644 --- a/src/runtime/hexagon/android/sim/driver/CMakeLists.txt +++ b/src/runtime/hexagon/sim/driver/CMakeLists.txt @@ -61,10 +61,10 @@ add_executable(sim_dev ${SOURCE_FILES}) target_include_directories(sim_dev PUBLIC "." PUBLIC ".." 
- PUBLIC "../../../../../../include" + PUBLIC "../../../../../include" ) target_include_directories(sim_dev SYSTEM - PUBLIC "../../../../../../3rdparty/dlpack/include" + PUBLIC "../../../../../3rdparty/dlpack/include" ) target_link_libraries(sim_dev "-ldl") diff --git a/src/runtime/hexagon/android/sim/driver/README.md b/src/runtime/hexagon/sim/driver/README.md similarity index 100% rename from src/runtime/hexagon/android/sim/driver/README.md rename to src/runtime/hexagon/sim/driver/README.md diff --git a/src/runtime/hexagon/android/sim/driver/fake_pthread.cc b/src/runtime/hexagon/sim/driver/fake_pthread.cc similarity index 100% rename from src/runtime/hexagon/android/sim/driver/fake_pthread.cc rename to src/runtime/hexagon/sim/driver/fake_pthread.cc diff --git a/src/runtime/hexagon/android/sim/driver/pthread.h b/src/runtime/hexagon/sim/driver/pthread.h similarity index 94% rename from src/runtime/hexagon/android/sim/driver/pthread.h rename to src/runtime/hexagon/sim/driver/pthread.h index b4d559c44f8e8..7ec74b4f99f53 100644 --- a/src/runtime/hexagon/android/sim/driver/pthread.h +++ b/src/runtime/hexagon/sim/driver/pthread.h @@ -17,8 +17,8 @@ * under the License. */ -#ifndef TVM_RUNTIME_HEXAGON_ANDROID_SIM_DRIVER_PTHREAD_H_ -#define TVM_RUNTIME_HEXAGON_ANDROID_SIM_DRIVER_PTHREAD_H_ +#ifndef TVM_RUNTIME_HEXAGON_SIM_DRIVER_PTHREAD_H_ +#define TVM_RUNTIME_HEXAGON_SIM_DRIVER_PTHREAD_H_ #define _PROVIDE_POSIX_TIME_DECLS 1 #include @@ -89,4 +89,4 @@ pthread_t pthread_self(void); } #endif -#endif // TVM_RUNTIME_HEXAGON_ANDROID_SIM_DRIVER_PTHREAD_H_ +#endif // TVM_RUNTIME_HEXAGON_SIM_DRIVER_PTHREAD_H_ diff --git a/src/runtime/hexagon/android/sim/driver/sched.h b/src/runtime/hexagon/sim/driver/sched.h similarity index 84% rename from src/runtime/hexagon/android/sim/driver/sched.h rename to src/runtime/hexagon/sim/driver/sched.h index 621ef218b795b..cc63630f20723 100644 --- a/src/runtime/hexagon/android/sim/driver/sched.h +++ b/src/runtime/hexagon/sim/driver/sched.h @@ -17,8 +17,8 @@ * under the License. 
*/ -#ifndef TVM_RUNTIME_HEXAGON_ANDROID_SIM_DRIVER_SCHED_H_ -#define TVM_RUNTIME_HEXAGON_ANDROID_SIM_DRIVER_SCHED_H_ +#ifndef TVM_RUNTIME_HEXAGON_SIM_DRIVER_SCHED_H_ +#define TVM_RUNTIME_HEXAGON_SIM_DRIVER_SCHED_H_ #ifdef __cplusplus extern "C" { @@ -28,4 +28,4 @@ int sched_yield(void); } #endif -#endif // TVM_RUNTIME_HEXAGON_ANDROID_SIM_DRIVER_SCHED_H_ +#endif // TVM_RUNTIME_HEXAGON_SIM_DRIVER_SCHED_H_ diff --git a/src/runtime/hexagon/android/sim/driver/sim_device.cc b/src/runtime/hexagon/sim/driver/sim_device.cc similarity index 100% rename from src/runtime/hexagon/android/sim/driver/sim_device.cc rename to src/runtime/hexagon/sim/driver/sim_device.cc diff --git a/src/runtime/hexagon/android/sim/hexagon_device_sim.cc b/src/runtime/hexagon/sim/hexagon_device_sim.cc similarity index 99% rename from src/runtime/hexagon/android/sim/hexagon_device_sim.cc rename to src/runtime/hexagon/sim/hexagon_device_sim.cc index 250259832597e..14ab4c30e2f28 100644 --- a/src/runtime/hexagon/android/sim/hexagon_device_sim.cc +++ b/src/runtime/hexagon/sim/hexagon_device_sim.cc @@ -32,7 +32,7 @@ #include #include -#include "../hexagon_device.h" +#include "../hexagon_module.h" #include "HexagonWrapper.h" #include "hexagon_sim_proto.h" @@ -121,7 +121,7 @@ struct non_const_str { ICHECK_EQ(pointers_.size(), 1); return pointers_[0]; } - operator char**() { return pointers_.data(); } + operator char* *() { return pointers_.data(); } private: std::vector pointers_; diff --git a/src/runtime/hexagon/android/sim/hexagon_sim_proto.h b/src/runtime/hexagon/sim/hexagon_sim_proto.h similarity index 90% rename from src/runtime/hexagon/android/sim/hexagon_sim_proto.h rename to src/runtime/hexagon/sim/hexagon_sim_proto.h index 8887526232629..2a41536037dfa 100644 --- a/src/runtime/hexagon/android/sim/hexagon_sim_proto.h +++ b/src/runtime/hexagon/sim/hexagon_sim_proto.h @@ -17,8 +17,8 @@ * under the License. 
*/ -#ifndef TVM_RUNTIME_HEXAGON_ANDROID_SIM_HEXAGON_SIM_PROTO_H_ -#define TVM_RUNTIME_HEXAGON_ANDROID_SIM_HEXAGON_SIM_PROTO_H_ +#ifndef TVM_RUNTIME_HEXAGON_SIM_HEXAGON_SIM_PROTO_H_ +#define TVM_RUNTIME_HEXAGON_SIM_HEXAGON_SIM_PROTO_H_ // Protocol: @@ -70,4 +70,4 @@ struct MsgCall { uint32_t data[]; // 12 } __attribute__((packed)); -#endif // TVM_RUNTIME_HEXAGON_ANDROID_SIM_HEXAGON_SIM_PROTO_H_ +#endif // TVM_RUNTIME_HEXAGON_SIM_HEXAGON_SIM_PROTO_H_ diff --git a/src/runtime/hexagon/android/target/fastrpc/CMakeLists.txt b/src/runtime/hexagon/target/fastrpc/CMakeLists.txt similarity index 100% rename from src/runtime/hexagon/android/target/fastrpc/CMakeLists.txt rename to src/runtime/hexagon/target/fastrpc/CMakeLists.txt diff --git a/src/runtime/hexagon/android/target/fastrpc/README.md b/src/runtime/hexagon/target/fastrpc/README.md similarity index 100% rename from src/runtime/hexagon/android/target/fastrpc/README.md rename to src/runtime/hexagon/target/fastrpc/README.md diff --git a/src/runtime/hexagon/android/target/fastrpc/include/tvm_remote.idl b/src/runtime/hexagon/target/fastrpc/include/tvm_remote.idl similarity index 100% rename from src/runtime/hexagon/android/target/fastrpc/include/tvm_remote.idl rename to src/runtime/hexagon/target/fastrpc/include/tvm_remote.idl diff --git a/src/runtime/hexagon/android/target/fastrpc/include/tvm_remote_nd.idl b/src/runtime/hexagon/target/fastrpc/include/tvm_remote_nd.idl similarity index 100% rename from src/runtime/hexagon/android/target/fastrpc/include/tvm_remote_nd.idl rename to src/runtime/hexagon/target/fastrpc/include/tvm_remote_nd.idl diff --git a/src/runtime/hexagon/android/target/fastrpc/src/tvm_hvx.cc b/src/runtime/hexagon/target/fastrpc/src/tvm_hvx.cc similarity index 100% rename from src/runtime/hexagon/android/target/fastrpc/src/tvm_hvx.cc rename to src/runtime/hexagon/target/fastrpc/src/tvm_hvx.cc diff --git a/src/runtime/hexagon/android/target/fastrpc/src/tvm_hvx.h b/src/runtime/hexagon/target/fastrpc/src/tvm_hvx.h similarity index 95% rename from src/runtime/hexagon/android/target/fastrpc/src/tvm_hvx.h rename to src/runtime/hexagon/target/fastrpc/src/tvm_hvx.h index 3d14252ad6481..2fe947574bbb7 100644 --- a/src/runtime/hexagon/android/target/fastrpc/src/tvm_hvx.h +++ b/src/runtime/hexagon/target/fastrpc/src/tvm_hvx.h @@ -17,8 +17,8 @@ * under the License. */ -#ifndef TVM_RUNTIME_HEXAGON_ANDROID_TARGET_FASTRPC_SRC_TVM_HVX_H_ -#define TVM_RUNTIME_HEXAGON_ANDROID_TARGET_FASTRPC_SRC_TVM_HVX_H_ +#ifndef TVM_RUNTIME_HEXAGON_TARGET_FASTRPC_SRC_TVM_HVX_H_ +#define TVM_RUNTIME_HEXAGON_TARGET_FASTRPC_SRC_TVM_HVX_H_ // Utility providing functions for accessing the Hexagon Vector Extensions // (HVX) hardware. 
@@ -150,4 +150,4 @@ int cleanup_mt_job(const config_t* hvx_config); } // namespace hvx -#endif // TVM_RUNTIME_HEXAGON_ANDROID_TARGET_FASTRPC_SRC_TVM_HVX_H_ +#endif // TVM_RUNTIME_HEXAGON_TARGET_FASTRPC_SRC_TVM_HVX_H_ diff --git a/src/runtime/hexagon/android/target/fastrpc/src/tvm_remote_imp.cc b/src/runtime/hexagon/target/fastrpc/src/tvm_remote_imp.cc similarity index 100% rename from src/runtime/hexagon/android/target/fastrpc/src/tvm_remote_imp.cc rename to src/runtime/hexagon/target/fastrpc/src/tvm_remote_imp.cc diff --git a/src/runtime/hexagon/android/target/fastrpc/src/tvm_remote_nd_imp.cc b/src/runtime/hexagon/target/fastrpc/src/tvm_remote_nd_imp.cc similarity index 100% rename from src/runtime/hexagon/android/target/fastrpc/src/tvm_remote_nd_imp.cc rename to src/runtime/hexagon/target/fastrpc/src/tvm_remote_nd_imp.cc diff --git a/src/runtime/hexagon/android/target/fastrpc/src/tvm_wrap_pthread.cc b/src/runtime/hexagon/target/fastrpc/src/tvm_wrap_pthread.cc similarity index 100% rename from src/runtime/hexagon/android/target/fastrpc/src/tvm_wrap_pthread.cc rename to src/runtime/hexagon/target/fastrpc/src/tvm_wrap_pthread.cc diff --git a/src/runtime/hexagon/android/target/hexagon_device_target.cc b/src/runtime/hexagon/target/hexagon_device_target.cc similarity index 99% rename from src/runtime/hexagon/android/target/hexagon_device_target.cc rename to src/runtime/hexagon/target/hexagon_device_target.cc index a542c5a3e3a22..ee326ca0b159b 100644 --- a/src/runtime/hexagon/android/target/hexagon_device_target.cc +++ b/src/runtime/hexagon/target/hexagon_device_target.cc @@ -27,7 +27,7 @@ #include #include -#include "../hexagon_device.h" +#include "../hexagon_module.h" #include "AEEStdErr.h" #include "fastrpc/include/tvm_remote.h" #include "hexagon_dsprpcapi.h" diff --git a/src/runtime/hexagon/android/target/hexagon_dsprpcapi.cc b/src/runtime/hexagon/target/hexagon_dsprpcapi.cc similarity index 100% rename from src/runtime/hexagon/android/target/hexagon_dsprpcapi.cc rename to src/runtime/hexagon/target/hexagon_dsprpcapi.cc diff --git a/src/runtime/hexagon/android/target/hexagon_dsprpcapi.h b/src/runtime/hexagon/target/hexagon_dsprpcapi.h similarity index 96% rename from src/runtime/hexagon/android/target/hexagon_dsprpcapi.h rename to src/runtime/hexagon/target/hexagon_dsprpcapi.h index a3d186e302e3f..e4711e3da5843 100644 --- a/src/runtime/hexagon/android/target/hexagon_dsprpcapi.h +++ b/src/runtime/hexagon/target/hexagon_dsprpcapi.h @@ -17,8 +17,8 @@ * under the License. 
*/ -#ifndef TVM_RUNTIME_HEXAGON_ANDROID_TARGET_HEXAGON_DSPRPCAPI_H_ -#define TVM_RUNTIME_HEXAGON_ANDROID_TARGET_HEXAGON_DSPRPCAPI_H_ +#ifndef TVM_RUNTIME_HEXAGON_TARGET_HEXAGON_DSPRPCAPI_H_ +#define TVM_RUNTIME_HEXAGON_TARGET_HEXAGON_DSPRPCAPI_H_ #ifdef __ANDROID__ #include @@ -189,4 +189,4 @@ class DspRpcAPI { } // namespace tvm #endif // __ANDROID__ -#endif // TVM_RUNTIME_HEXAGON_ANDROID_TARGET_HEXAGON_DSPRPCAPI_H_ +#endif // TVM_RUNTIME_HEXAGON_TARGET_HEXAGON_DSPRPCAPI_H_ diff --git a/src/runtime/hexagon/android/target/hexagon_stubapi.cc b/src/runtime/hexagon/target/hexagon_stubapi.cc similarity index 100% rename from src/runtime/hexagon/android/target/hexagon_stubapi.cc rename to src/runtime/hexagon/target/hexagon_stubapi.cc diff --git a/src/runtime/hexagon/android/target/hexagon_stubapi.h b/src/runtime/hexagon/target/hexagon_stubapi.h similarity index 98% rename from src/runtime/hexagon/android/target/hexagon_stubapi.h rename to src/runtime/hexagon/target/hexagon_stubapi.h index feb329f5cef26..fba22b10247c9 100644 --- a/src/runtime/hexagon/android/target/hexagon_stubapi.h +++ b/src/runtime/hexagon/target/hexagon_stubapi.h @@ -17,8 +17,8 @@ * under the License. */ -#ifndef TVM_RUNTIME_HEXAGON_ANDROID_TARGET_HEXAGON_STUBAPI_H_ -#define TVM_RUNTIME_HEXAGON_ANDROID_TARGET_HEXAGON_STUBAPI_H_ +#ifndef TVM_RUNTIME_HEXAGON_TARGET_HEXAGON_STUBAPI_H_ +#define TVM_RUNTIME_HEXAGON_TARGET_HEXAGON_STUBAPI_H_ #ifdef __ANDROID__ #include @@ -312,4 +312,4 @@ class StubAPI { } // namespace tvm #endif // __ANDROID__ -#endif // TVM_RUNTIME_HEXAGON_ANDROID_TARGET_HEXAGON_STUBAPI_H_ +#endif // TVM_RUNTIME_HEXAGON_TARGET_HEXAGON_STUBAPI_H_ diff --git a/src/runtime/hexagon/android/target/hexagon_target_log.h b/src/runtime/hexagon/target/hexagon_target_log.h similarity index 87% rename from src/runtime/hexagon/android/target/hexagon_target_log.h rename to src/runtime/hexagon/target/hexagon_target_log.h index f8ba6a74e3b97..c7684fc561970 100644 --- a/src/runtime/hexagon/android/target/hexagon_target_log.h +++ b/src/runtime/hexagon/target/hexagon_target_log.h @@ -17,8 +17,8 @@ * under the License. */ -#ifndef TVM_RUNTIME_HEXAGON_ANDROID_TARGET_HEXAGON_TARGET_LOG_H_ -#define TVM_RUNTIME_HEXAGON_ANDROID_TARGET_HEXAGON_TARGET_LOG_H_ +#ifndef TVM_RUNTIME_HEXAGON_TARGET_HEXAGON_TARGET_LOG_H_ +#define TVM_RUNTIME_HEXAGON_TARGET_HEXAGON_TARGET_LOG_H_ #ifdef __ANDROID__ #include @@ -31,4 +31,4 @@ #define TVM_LOGF(...) __android_log_print(ANDROID_LOG_FATAL, "TVM", ##__VA_ARGS__) #endif // __ANDROID__ -#endif // TVM_RUNTIME_HEXAGON_ANDROID_TARGET_HEXAGON_TARGET_LOG_H_ +#endif // TVM_RUNTIME_HEXAGON_TARGET_HEXAGON_TARGET_LOG_H_ diff --git a/src/runtime/library_module.cc b/src/runtime/library_module.cc index 7efa91d912eb3..5dfd5e8ad7d5a 100644 --- a/src/runtime/library_module.cc +++ b/src/runtime/library_module.cc @@ -37,8 +37,7 @@ namespace runtime { // Library module that exposes symbols from a library. 
class LibraryModuleNode final : public ModuleNode { public: - explicit LibraryModuleNode(ObjectPtr lib, PackedFuncWrapper wrapper) - : lib_(lib), packed_func_wrapper_(wrapper) {} + explicit LibraryModuleNode(ObjectPtr lib) : lib_(lib) {} const char* type_key() const final { return "library"; } @@ -54,12 +53,11 @@ class LibraryModuleNode final : public ModuleNode { faddr = reinterpret_cast(lib_->GetSymbol(name.c_str())); } if (faddr == nullptr) return PackedFunc(); - return packed_func_wrapper_(faddr, sptr_to_self); + return WrapPackedFunc(faddr, sptr_to_self); } private: ObjectPtr lib_; - PackedFuncWrapper packed_func_wrapper_; }; /*! @@ -130,8 +128,7 @@ Module LoadModuleFromBinary(const std::string& type_key, dmlc::Stream* stream) { * \param root_module the output root module * \param dso_ctx_addr the output dso module */ -void ProcessModuleBlob(const char* mblob, ObjectPtr lib, - PackedFuncWrapper packed_func_wrapper, runtime::Module* root_module, +void ProcessModuleBlob(const char* mblob, ObjectPtr lib, runtime::Module* root_module, runtime::ModuleNode** dso_ctx_addr = nullptr) { ICHECK(mblob != nullptr); uint64_t nbytes = 0; @@ -155,7 +152,7 @@ void ProcessModuleBlob(const char* mblob, ObjectPtr lib, // "_lib" serves as a placeholder in the module import tree to indicate where // to place the DSOModule if (tkey == "_lib") { - auto dso_module = Module(make_object(lib, packed_func_wrapper)); + auto dso_module = Module(make_object(lib)); *dso_ctx_addr = dso_module.operator->(); ++num_dso_module; modules.emplace_back(dso_module); @@ -173,7 +170,7 @@ void ProcessModuleBlob(const char* mblob, ObjectPtr lib, // if we are using old dll, we don't have import tree // so that we can't reconstruct module relationship using import tree if (import_tree_row_ptr.empty()) { - auto n = make_object(lib, packed_func_wrapper); + auto n = make_object(lib); auto module_import_addr = ModuleInternal::GetImportsAddr(n.operator->()); for (const auto& m : modules) { module_import_addr->emplace_back(m); @@ -197,9 +194,9 @@ void ProcessModuleBlob(const char* mblob, ObjectPtr lib, } } -Module CreateModuleFromLibrary(ObjectPtr lib, PackedFuncWrapper packed_func_wrapper) { +Module CreateModuleFromLibrary(ObjectPtr lib) { InitContextFunctions([lib](const char* fname) { return lib->GetSymbol(fname); }); - auto n = make_object(lib, packed_func_wrapper); + auto n = make_object(lib); // Load the imported modules const char* dev_mblob = reinterpret_cast(lib->GetSymbol(runtime::symbol::tvm_dev_mblob)); @@ -207,7 +204,7 @@ Module CreateModuleFromLibrary(ObjectPtr lib, PackedFuncWrapper packed_ Module root_mod; runtime::ModuleNode* dso_ctx_addr = nullptr; if (dev_mblob != nullptr) { - ProcessModuleBlob(dev_mblob, lib, packed_func_wrapper, &root_mod, &dso_ctx_addr); + ProcessModuleBlob(dev_mblob, lib, &root_mod, &dso_ctx_addr); } else { // Only have one single DSO Module root_mod = Module(n); @@ -221,10 +218,5 @@ Module CreateModuleFromLibrary(ObjectPtr lib, PackedFuncWrapper packed_ return root_mod; } - -TVM_REGISTER_GLOBAL("runtime.module.loadfile_so").set_body([](TVMArgs args, TVMRetValue* rv) { - ObjectPtr n = CreateDSOLibraryObject(args[0]); - *rv = CreateModuleFromLibrary(n); -}); } // namespace runtime } // namespace tvm diff --git a/src/runtime/library_module.h b/src/runtime/library_module.h index b5780975f43a3..00c79e8248f47 100644 --- a/src/runtime/library_module.h +++ b/src/runtime/library_module.h @@ -78,35 +78,16 @@ PackedFunc WrapPackedFunc(TVMBackendPackedCFunc faddr, const ObjectPtr& */ void 
InitContextFunctions(std::function fgetsymbol); -/*! - * \brief Type alias for funcion to wrap a TVMBackendPackedCFunc. - * \param The function address imported from a module. - * \param mptr The module pointer node. - * \return Packed function that wraps the invocation of the function at faddr. - */ -using PackedFuncWrapper = - std::function& mptr)>; - -/*! \brief Return a library object interface over dynamic shared - * libraries in Windows and Linux providing support for - * loading/unloading and symbol lookup. - * \param Full path to shared library. - * \return Returns pointer to the Library providing symbol lookup. - */ -ObjectPtr CreateDSOLibraryObject(std::string library_path); - /*! * \brief Create a module from a library. * * \param lib The library. - * \param wrapper Optional function used to wrap a TVMBackendPackedCFunc, - * by default WrapPackedFunc is used. * \return The corresponding loaded module. * * \note This function can create multiple linked modules * by parsing the binary blob section of the library. */ -Module CreateModuleFromLibrary(ObjectPtr lib, PackedFuncWrapper wrapper = WrapPackedFunc); +Module CreateModuleFromLibrary(ObjectPtr lib); } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_LIBRARY_MODULE_H_ diff --git a/src/runtime/opencl/opencl_device_api.cc b/src/runtime/opencl/opencl_device_api.cc index f12a143ab0cc6..26eddb40a7d5d 100644 --- a/src/runtime/opencl/opencl_device_api.cc +++ b/src/runtime/opencl/opencl_device_api.cc @@ -196,10 +196,6 @@ void* OpenCLWorkspace::AllocDataSpace(Device dev, size_t size, size_t alignment, ICHECK(context != nullptr) << "No OpenCL device"; cl_int err_code; cl::BufferDescriptor* desc = new cl::BufferDescriptor; - // CL_INVALID_BUFFER_SIZE if size is 0. - if (size == 0) { - size = 1; - } desc->buffer = clCreateBuffer(this->context, CL_MEM_READ_WRITE, size, nullptr, &err_code); desc->layout = cl::BufferDescriptor::MemoryLayout::kBuffer1D; OPENCL_CHECK_ERROR(err_code); diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 4d7ee457e1e66..a5e7d253f3cd3 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -61,8 +61,6 @@ PackedFunc Executable::GetFunction(const std::string& name, const ObjectPtrGetBytecode(); }); - } else if (name == "get_constants") { - return PackedFunc([this](TVMArgs args, TVMRetValue* rv) { *rv = this->GetConstants(); }); } else if (name == "get_stats") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->Stats(); }); } else if (name == "save") { @@ -148,34 +146,6 @@ std::string Executable::GetBytecode() const { return oss.str(); } -namespace { -String ShapeString(const ShapeTuple& shape_tuple, DLDataType dtype) { - std::stringstream sizes; - sizes << DLDataType2String(dtype) << "["; - for (size_t i = 0; i < shape_tuple.size(); i++) { - if (i != 0) { - sizes << ", "; - } - sizes << shape_tuple.data()[i]; - } - sizes << "]"; - return String(sizes.str()); -} -} // namespace - -std::string Executable::GetConstants() const { - std::ostringstream oss; - - for (size_t i = 0; i < constants.size(); ++i) { - const auto& constant = constants[i]; - auto ndarray = Downcast(constant); - DLDeviceType device_type = static_cast(const_device_type[i]); - oss << "VM Constant[" << i << "]: has shape " << ShapeString(ndarray.Shape(), ndarray->dtype) - << " on device of type " << device_type << std::endl; - } - return oss.str(); -} - std::string Executable::Stats() const { std::ostringstream oss; oss << "Relay VM executable statistics:" 
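A hedged sketch of calling the introspection entry point kept above ("get_stats") on a VM executable module; illustrative only.

#include <string>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>

std::string DumpExecutableStats(tvm::runtime::Module exec_mod) {
  // Forwards to Executable::Stats(); returns an empty string if the function is absent.
  tvm::runtime::PackedFunc get_stats = exec_mod.GetFunction("get_stats");
  if (get_stats == nullptr) return "";
  std::string stats = get_stats();
  return stats;
}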
<< std::endl; @@ -338,7 +308,7 @@ void Executable::SavePrimitiveOpNames(dmlc::Stream* strm) { VMInstructionSerializer SerializeInstruction(const Instruction& instr) { std::vector fields; // Save the opcode. - VLOG(1) << "Serializing: " << instr << std::endl; + DLOG(INFO) << "Serializing: " << instr << std::endl; switch (instr.op) { case Opcode::Move: { // Number of fields = 2 diff --git a/src/runtime/vm/memory_manager.cc b/src/runtime/vm/memory_manager.cc index 22afcce6a01e5..410f6c2a042d8 100644 --- a/src/runtime/vm/memory_manager.cc +++ b/src/runtime/vm/memory_manager.cc @@ -119,14 +119,14 @@ Allocator* MemoryManager::GetOrCreateAllocator(Device dev, AllocatorType type) { std::unique_ptr alloc; switch (type) { case kNaive: { - VLOG(1) << "New naive allocator for " << DeviceName(dev.device_type) << "(" << dev.device_id - << ")"; + DLOG(INFO) << "New naive allocator for " << DeviceName(dev.device_type) << "(" + << dev.device_id << ")"; alloc.reset(new NaiveAllocator(dev)); break; } case kPooled: { - VLOG(1) << "New pooled allocator for " << DeviceName(dev.device_type) << "(" - << dev.device_id << ")"; + DLOG(INFO) << "New pooled allocator for " << DeviceName(dev.device_type) << "(" + << dev.device_id << ")"; alloc.reset(new PooledAllocator(dev)); break; } diff --git a/src/runtime/vm/pooled_allocator.h b/src/runtime/vm/pooled_allocator.h index e5f236983a735..c282eb006f92a 100644 --- a/src/runtime/vm/pooled_allocator.h +++ b/src/runtime/vm/pooled_allocator.h @@ -67,7 +67,7 @@ class PooledAllocator final : public Allocator { } used_memory_.fetch_add(size, std::memory_order_relaxed); - VLOG(1) << "allocate " << size << " B, used memory " << used_memory_ << " B"; + DLOG(INFO) << "allocate " << size << " B, used memory " << used_memory_ << " B"; return buf; } @@ -77,7 +77,7 @@ class PooledAllocator final : public Allocator { memory_pool_.emplace(buffer.size, std::vector{}); } memory_pool_.at(buffer.size).push_back(buffer); - VLOG(1) << "reclaim buffer " << buffer.size; + DLOG(INFO) << "reclaim buffer " << buffer.size; } size_t UsedMemory() const override { return used_memory_.load(std::memory_order_relaxed); } @@ -93,7 +93,7 @@ class PooledAllocator final : public Allocator { } memory_pool_.clear(); used_memory_ = 0; - VLOG(1) << "release all buffers"; + DLOG(INFO) << "release all buffers"; } private: diff --git a/src/runtime/vm/serialize_utils.h b/src/runtime/vm/serialize_utils.h index b4a10806caaf5..cbcdb1bdfa161 100644 --- a/src/runtime/vm/serialize_utils.h +++ b/src/runtime/vm/serialize_utils.h @@ -59,13 +59,13 @@ struct VMFunctionSerializer { /*! \brief The parameters of the VMFunction. */ std::vector params; /*! \brief The device type of each parameter of the VMFunction. 
*/ - std::vector params_device_type; + std::vector params_device_type; VMFunctionSerializer() = default; VMFunctionSerializer(const std::string& name, Index register_file_size, size_t num_instructions, const std::vector& params, - const std::vector& params_device_type) + const std::vector& params_device_type) : name(name), register_file_size(register_file_size), num_instructions(num_instructions), diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index b903f793d799f..addd5ca5d8618 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -236,7 +236,7 @@ void VirtualMachine::SetInput(std::string func_name, TVMArgs args, int offset) { << "The number of provided parameters doesn't match the number of assigned devices"; std::vector func_args(param_names.size()); for (int i = offset; i < args.size(); ++i) { - Index device_type = vm_func.params_device_type[i - offset]; + DLDeviceType device_type = vm_func.params_device_type[i - offset]; Device dev = GetDevice(device_type); if (args[i].type_code() == kTVMDLTensorHandle) { @@ -284,20 +284,20 @@ Index VirtualMachine::PopFrame() { } void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector& args) { - VLOG(2) << "Invoking global " << func.name << " " << args.size(); + DLOG(INFO) << "Invoking global " << func.name << " " << args.size(); PushFrame(func.params.size(), this->pc_ + 1, func); for (size_t i = 0; i < args.size(); ++i) { WriteRegister(i, args[i]); } - VLOG(2) << "func.params= " << func.params.size(); + DLOG(INFO) << "func.params= " << func.params.size(); code_ = func.instructions.data(); pc_ = 0; } ObjectRef VirtualMachine::Invoke(const VMFunction& func, const std::vector& args) { - VLOG(2) << "Executing Function: " << std::endl << func; + DLOG(INFO) << "Executing Function: " << std::endl << func; InvokeGlobal(func, args); RunLoop(); @@ -309,7 +309,7 @@ ObjectRef VirtualMachine::Invoke(const std::string& name, const std::vectorglobal_map.find(name); ICHECK(it != exec_->global_map.end()) << "Cannot find function " << name << " in the executable"; auto func_index_ = it->second; - VLOG(2) << "Invoke Global " << name << " at index " << func_index_; + DLOG(INFO) << "Invoke Global " << name << " at index " << func_index_; return Invoke(exec_->functions[func_index_], args); } @@ -445,7 +445,7 @@ void VirtualMachine::RunLoop() { while (true) { main_loop: auto const& instr = code_[this->pc_]; - VLOG(2) << "Executing(" << pc_ << "): " << instr; + DLOG(INFO) << "Executing(" << pc_ << "): " << instr; switch (instr.op) { case Opcode::Move: { @@ -500,13 +500,13 @@ void VirtualMachine::RunLoop() { goto main_loop; } case Opcode::InvokePacked: { - VLOG(2) << "InvokedPacked " << instr.packed_index << " arity=" << instr.arity; + DLOG(INFO) << "InvokedPacked " << instr.packed_index << " arity=" << instr.arity; ICHECK_LE(instr.packed_index, packed_funcs_.size()); const auto& func = packed_funcs_[instr.packed_index]; const auto& arity = instr.arity; std::vector args; for (Index i = 0; i < arity; ++i) { - VLOG(2) << "arg" << i << " $" << instr.packed_args[i]; + DLOG(INFO) << "arg" << i << " $" << instr.packed_args[i]; auto arg = ReadRegister(instr.packed_args[i]); args.push_back(arg); } @@ -579,18 +579,6 @@ void VirtualMachine::RunLoop() { auto storage_obj = ReadRegister(instr.alloc_tensor.storage); auto offset = LoadScalarInt(instr.alloc_tensor.offset); auto storage = Downcast(storage_obj); -#if TVM_LOG_DEBUG - std::ostringstream os; - os << "AllocTensor: "; - os << "offset=" << offset; - os << ", shape=["; - for (auto i : shape) { 
- os << i << ","; - } - os << "]"; - os << ", dtype=" << DLDataType2String(instr.alloc_tensor.dtype); - VLOG(2) << os.str(); -#endif auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor.dtype); WriteRegister(instr.dst, obj); @@ -637,15 +625,17 @@ void VirtualMachine::RunLoop() { OpStartHook(instr); auto size = LoadScalarInt(instr.alloc_storage.allocation_size); auto alignment = instr.alloc_storage.alignment; + + DLOG(INFO) << "AllocStorage: allocation_size=" << size << ", alignment=" << alignment + << ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint) + << ", device_type=" << instr.alloc_storage.device_type; + auto storage_obj = SimpleObjAllocator().make_object(); auto dev_type = instr.alloc_storage.device_type; ICHECK_LT(static_cast(dev_type), allocators_.size()) << "Memory allocator for device " << dev_type << " has not been initialized"; auto* alloc = allocators_[dev_type]; ICHECK(alloc) << "Did you forget to init the VirtualMachine with devices?"; - VLOG(2) << "AllocStorage: allocation_size=" << size << ", alignment=" << alignment - << ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint) - << ", device_type=" << instr.alloc_storage.device_type; storage_obj->buffer = alloc->Alloc(size, alignment, instr.alloc_storage.dtype_hint); Storage storage(storage_obj); WriteRegister(instr.dst, storage); diff --git a/src/support/utils.h b/src/support/utils.h index 3bb8700222147..d8e3bf5f30ab8 100644 --- a/src/support/utils.h +++ b/src/support/utils.h @@ -145,7 +145,7 @@ inline bool StartsWith(const String& str, const char* prefix) { if (str.data()[i] != prefix[i]) return false; } // return true if the str is equal to the prefix - return prefix[n] == '\0'; + return prefix[n + 1] == '\0'; } /*! diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc index 8c9ad11e8f29b..49a451c178321 100644 --- a/src/target/source/codegen_cuda.cc +++ b/src/target/source/codegen_cuda.cc @@ -530,7 +530,7 @@ void CodeGenCUDA::PrintStorageSync(const CallNode* op) { const std::string& sync = op->args[0].as()->value; if (sync == "warp") { // DO nothing. 
- } else if (sync == "shared" || sync == "shared.dyn") { + } else if (sync == "shared") { this->PrintIndent(); this->stream << "__syncthreads();\n"; } else if (sync == "global") { diff --git a/src/target/source/codegen_opencl.cc b/src/target/source/codegen_opencl.cc index 507a6243cb0c6..d93a7fde639a4 100644 --- a/src/target/source/codegen_opencl.cc +++ b/src/target/source/codegen_opencl.cc @@ -478,31 +478,6 @@ void CodeGenOpenCL::VisitExpr_(const FloatImmNode* op, std::ostream& os) { // N } } -template -inline void PrintBinaryExpr(const T* op, const char* opstr, std::ostream& os, CodeGenOpenCL* p) { - if (op->dtype.lanes() == 1) { - os << opstr << "(("; - p->PrintType(op->a->dtype, os); - os << ")"; - p->PrintExpr(op->a, os); - os << ", ("; - p->PrintType(op->b->dtype, os); - os << ")"; - p->PrintExpr(op->b, os); - os << ')'; - } else { - p->PrintVecBinaryOp(opstr, op->dtype, op->a, op->b, os); - } -} - -void CodeGenOpenCL::VisitExpr_(const MinNode* op, std::ostream& os) { - PrintBinaryExpr(op, "min", os, this); -} - -void CodeGenOpenCL::VisitExpr_(const MaxNode* op, std::ostream& os) { - PrintBinaryExpr(op, "max", os, this); -} - void CodeGenOpenCL::SetTextureScope( const std::unordered_map& scope) { // NOLINT(*) for (auto& texture : scope) { diff --git a/src/target/source/codegen_opencl.h b/src/target/source/codegen_opencl.h index 8c36a817753cd..a8c293c030561 100644 --- a/src/target/source/codegen_opencl.h +++ b/src/target/source/codegen_opencl.h @@ -65,10 +65,6 @@ class CodeGenOpenCL final : public CodeGenC { void VisitExpr_(const FloatImmNode* op, std::ostream& os) final; // NOLINT(*) void VisitStmt_(const StoreNode* op) final; // NOLINT(*) - // overload min and max to avoid ambiguous call errors - void VisitExpr_(const MinNode* op, std::ostream& os) final; - void VisitExpr_(const MaxNode* op, std::ostream& os) final; - private: // whether enable fp16 and fp64 extension bool enable_fp16_{false}; diff --git a/src/tir/analysis/verify_gpu_code.cc b/src/tir/analysis/verify_gpu_code.cc index dc1ed1c193e80..efffa9031ac05 100644 --- a/src/tir/analysis/verify_gpu_code.cc +++ b/src/tir/analysis/verify_gpu_code.cc @@ -198,12 +198,12 @@ class GPUCodeVerifier : public StmtExprVisitor { } void VisitStmt_(const StoreNode* op) { - if (op->value->dtype.lanes() > 1) { - if (static_cast(op->value->dtype.lanes() * op->value->dtype.bytes()) > + if (op->index->dtype.lanes() > 1) { + if (static_cast(op->index->dtype.lanes() * op->index->dtype.bytes()) > max_vector_bytes_) { std::stringstream s; - s << "Number of lanes (" << op->value->dtype.lanes() << ") times number of bytes (" - << op->value->dtype.bytes() << ") for dtype " << op->value->dtype + s << "Number of lanes (" << op->index->dtype.lanes() << ") times number of bytes (" + << op->index->dtype.bytes() << ") for dtype " << op->index->dtype << " is greater than the maximum number of vector bytes (" << max_vector_bytes_ << ")"; errors_.push_back(s.str()); } diff --git a/src/tir/transforms/merge_dynamic_shared_memory_allocations.cc b/src/tir/transforms/merge_dynamic_shared_memory_allocations.cc index f3ff1f37a5da7..e8865b260dc13 100644 --- a/src/tir/transforms/merge_dynamic_shared_memory_allocations.cc +++ b/src/tir/transforms/merge_dynamic_shared_memory_allocations.cc @@ -31,256 +31,51 @@ #include #include "../../runtime/thread_storage_scope.h" -#include "../../support/arena.h" #include "ir_utils.h" namespace tvm { namespace tir { -using runtime::StorageRank; -using runtime::StorageScope; - bool IsDynamicSharedMemory(Var buffer_var) { - StorageScope 
storage_scope = runtime::StorageScope::Create(GetPtrStorageScope(buffer_var)); + auto storage_scope = runtime::StorageScope::Create(GetPtrStorageScope(buffer_var)); return storage_scope.rank == runtime::StorageRank::kShared && storage_scope.tag == ".dyn"; } -/*! - * \brief collect the mapping from the buffer var to its allocate - */ class AllocateCollector : public StmtExprVisitor { public: void VisitStmt_(const AllocateNode* op) final { if (IsDynamicSharedMemory(op->buffer_var)) { - dyn_shmem_allocs_[op->buffer_var.get()] = op; - } - StmtExprVisitor::VisitStmt_(op); - } - // The mapping from the original buffer var to its allocate - std::unordered_map dyn_shmem_allocs_; -}; - -// Find a linear pattern of storage access -// Used for liveness analysis. -// "linear" means fitting a complex access pattern into an array of StmtEntry -// -// Define "scope" as the body of For/thread_launch/IfThenElse -// Composite scopes(loop/thread_launch/IfThen) is represented by three StmtEntry: -// before_scope -> scope_body -> after_scope -// -// This pass tries to detect last point that we need to keep memory -// alive under the same scope as Allocate. -// The storage need to be kept alive between Allocate and last access. -// The free point is only inserted at the same scope of Allocate. -// -class DynSharedMemLinearAccessPatternFinder final : public StmtExprVisitor { - public: - /*! \brief record the touch list of statement. */ - struct StmtEntry { - // The statement - const Object* stmt; - // The index in the linear_seq_ to point to end of the nested scope. - // This is only set to non-zero if stmt is a nested scope. - // if offset > 0, means this is the begin, the end entry is current_index + offset - // if offset < 0, means this is the end, the begin entry is current_index + offset - int64_t scope_pair_offset{0}; - // The buffer variables this statement touched. - std::vector touched; - }; - // The scope of each allocation - struct AllocEntry { - // the level in the scope stack - size_t level{0}; - // allocation stmt - const AllocateNode* alloc{nullptr}; - }; - - void VisitStmt_(const AllocateNode* op) final { - size_t level = scope_.size(); - const VarNode* buf = op->buffer_var.get(); - alloc_info_[buf].alloc = op; - alloc_info_[buf].level = level; - StmtExprVisitor::VisitStmt_(op); - } - void VisitStmt_(const StoreNode* op) final { - scope_.push_back(StmtEntry()); - // visit subexpr - StmtExprVisitor::VisitStmt_(op); - // Add write access. - const VarNode* buf = op->buffer_var.get(); - auto it = alloc_info_.find(buf); - if (it != alloc_info_.end() && it->second.alloc) { - ICHECK_LT(it->second.level, scope_.size()); - if (IsDynamicSharedMemory(GetRef(buf))) { - scope_[it->second.level].touched.push_back(buf); - } - } - StmtEntry e = scope_.back(); - scope_.pop_back(); - if (e.touched.size() != 0) { - e.stmt = op; - linear_seq_.push_back(e); - } - } - void VisitStmt_(const EvaluateNode* op) final { - scope_.push_back(StmtEntry()); - // visit subexpr - StmtExprVisitor::VisitStmt_(op); - StmtEntry e = scope_.back(); - scope_.pop_back(); - if (e.touched.size() != 0) { - e.stmt = op; - linear_seq_.push_back(e); - } - } - void VisitExpr_(const LoadNode* op) final { - // Add write access. 
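Both the collector and the rewriter in this pass hinge on the storage-scope predicate shown above: a buffer counts as dynamic shared memory when its pointer scope has the shared rank and the ".dyn" tag. A rough standalone Python sketch of that classification, with the scope-string parsing inlined instead of going through runtime::StorageScope::Create (illustrative only, not TVM code):

def is_dynamic_shared(scope: str) -> bool:
    # "shared.dyn" -> rank "shared" with tag ".dyn"; plain "shared" carries no tag.
    rank, dot, tag = scope.partition(".")
    return rank == "shared" and dot + tag == ".dyn"

assert is_dynamic_shared("shared.dyn")
assert not is_dynamic_shared("shared")
assert not is_dynamic_shared("global")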
- StmtExprVisitor::VisitExpr_(op); - const VarNode* buf = op->buffer_var.get(); - auto it = alloc_info_.find(buf); - if (it != alloc_info_.end() && it->second.alloc) { - ICHECK_LT(it->second.level, scope_.size()) << "Load memory in places other than store."; - if (IsDynamicSharedMemory(GetRef(buf))) { - scope_[it->second.level].touched.push_back(buf); - } - } - } - void VisitExpr_(const CallNode* op) final { - if (op->op.same_as(builtin::address_of())) { - const LoadNode* l = op->args[0].as(); - this->VisitExpr(l->index); - } else { - StmtExprVisitor::VisitExpr_(op); + dyn_shmem_allocs_.insert(op); } - } - void VisitExpr_(const VarNode* buf) final { - // Directly reference to the variable count as a read. - auto it = alloc_info_.find(buf); - if (it != alloc_info_.end() && it->second.alloc) { - ICHECK_LT(it->second.level, scope_.size()); - if (IsDynamicSharedMemory(GetRef(buf))) { - scope_[it->second.level].touched.push_back(buf); - } - } - } - template - void VisitNewScope(const T* op) { - scope_.push_back(StmtEntry()); - StmtEntry e; - e.stmt = op; - int64_t begin_index = static_cast(linear_seq_.size()); - // before scope. - linear_seq_.push_back(e); StmtExprVisitor::VisitStmt_(op); - // after scope. - e.touched = std::move(scope_.back().touched); - scope_.pop_back(); - int64_t end_index = static_cast(linear_seq_.size()); - ICHECK_GT(end_index, begin_index); - e.scope_pair_offset = begin_index - end_index; - linear_seq_.push_back(e); - // record the pointer to end index. - ICHECK_NE(end_index, 0U); - linear_seq_[begin_index].scope_pair_offset = end_index - begin_index; } - void VisitStmt_(const AttrStmtNode* op) final { - // Only record the outer most thread extent. - if (op->attr_key == attr::thread_extent && !in_thread_env_) { - in_thread_env_ = true; - VisitNewScope(op); - in_thread_env_ = false; - } else if (op->attr_key == attr::extern_scope) { - VisitNewScope(op); - } else if (op->attr_key == attr::virtual_thread) { - VisitNewScope(op); - } else { - StmtExprVisitor::VisitStmt_(op); - } - } - void VisitStmt_(const IfThenElseNode* op) final { VisitNewScope(op); } - - void VisitStmt_(const ForNode* op) final { VisitNewScope(op); } - - void VisitStmt_(const WhileNode* op) final { VisitNewScope(op); } - - void VisitStmt_(const AssertStmtNode* op) final { VisitNewScope(op); } - // linearized access sequence. - std::vector linear_seq_; - // The storage scope of each buffer - std::unordered_map alloc_info_; - - private: - // Whether already in thread env. - bool in_thread_env_{false}; - // The scope stack. - std::vector scope_; + std::unordered_set dyn_shmem_allocs_; }; -/*! - * \brief merge the buffers whose live range has no intersection and rewrite the body - */ class DynamicSharedMemoryRewriter : public StmtExprMutator { public: explicit DynamicSharedMemoryRewriter( - const std::unordered_map& dyn_shmem_allocs) + const std::unordered_set& dyn_shmem_allocs) : dyn_shmem_allocs_{dyn_shmem_allocs} {} - /*! 
- * \brief plan the memory reuse for all the buffer allocated in the statement - * \param stmt the statement - */ - void PlanReuse(const Stmt& stmt) { - DynSharedMemLinearAccessPatternFinder finder; - finder(stmt); - this->LivenessAnalysis(finder.linear_seq_); - this->PlanMemory(finder.linear_seq_); - } - - private: Stmt VisitStmt_(const AttrStmtNode* op) final { - if (op->attr_key == attr::thread_extent && !allocated_) { + if (op->attr_key == attr::thread_extent && !allocated) { // Allocate one dynamic shared memory allocation at the beginning of thread scope - int max_layer_num = 0; - std::vector all_entry; - for (const auto& e : const_free_map_) { - all_entry.push_back(e.second); - } - for (const StorageEntry* e : sym_free_list_) { - all_entry.push_back(e); - } - for (const StorageEntry* e : all_entry) { - max_layer_num = std::max(max_layer_num, static_cast(e->allocs.size())); - } - // calculate align for each layer of each storage entry. - std::vector align(max_layer_num, 0); - for (const StorageEntry* e : all_entry) { - for (int i = 0; i < static_cast(e->allocs.size()); i++) { - for (const VarNode* buffer : e->allocs[i]) { - const AllocateNode* alloc = dyn_shmem_allocs_[buffer]; - align[i] = std::max(align[i], alloc->dtype.bytes()); - } - } + int align = 1; + for (const auto& alloc : dyn_shmem_allocs_) { + ICHECK_EQ(alloc->dtype.lanes(), 1) << "vector dtype allocation not supported."; + align = std::max(align, alloc->dtype.bytes()); } - // calculate offset for each buffer based on the align of each layer - for (const StorageEntry* e : all_entry) { - PrimExpr max_inner_offset = 0; - for (int i = 0; i < static_cast(e->allocs.size()); i++) { - PrimExpr inner_offset = 0; - for (const VarNode* buffer : e->allocs[i]) { - const AllocateNode* alloc = dyn_shmem_allocs_[buffer]; - buffer_byte_offsets_[buffer] = merged_alloc_size_ + inner_offset; - inner_offset += alloc->extents[0] * alloc->dtype.bytes(); - inner_offset += indexmod(align[i] - indexmod(inner_offset, align[i]), align[i]); - } - max_inner_offset = max(max_inner_offset, inner_offset); - } - merged_alloc_size_ += max_inner_offset; + for (const auto& alloc : dyn_shmem_allocs_) { + ICHECK_EQ(alloc->extents.size(), 1); + buffer_byte_offsets_[alloc->buffer_var.get()] = merged_alloc_size_; + merged_alloc_size_ += alloc->extents[0] * align; } - allocated_ = true; - Allocate new_body(merged_buf_var_, DataType::UInt(8), {merged_alloc_size_}, const_true(), - StmtExprMutator::VisitStmt(op->body)); + allocated = true; + auto new_body = Allocate(merged_buf_var_, DataType::UInt(8), {merged_alloc_size_}, + const_true(), StmtExprMutator::VisitStmt(op->body)); return AttrStmt(op->node, op->attr_key, op->value, new_body, op->span); } return StmtMutator::VisitStmt_(op); @@ -295,8 +90,8 @@ class DynamicSharedMemoryRewriter : public StmtExprMutator { PrimExpr VisitExpr_(const LoadNode* op) final { if (IsDynamicSharedMemory(op->buffer_var)) { - PrimExpr offset = GetBufferOffset(op->buffer_var, op->dtype); - PrimExpr index = StmtExprMutator::VisitExpr(op->index); + auto offset = GetBufferOffset(op->buffer_var, op->dtype); + auto index = StmtExprMutator::VisitExpr(op->index); return Load(op->dtype, merged_buf_var_, offset + index, op->predicate, op->span); } return StmtExprMutator::VisitExpr_(op); @@ -304,265 +99,33 @@ class DynamicSharedMemoryRewriter : public StmtExprMutator { Stmt VisitStmt_(const StoreNode* op) final { if (IsDynamicSharedMemory(op->buffer_var)) { - PrimExpr offset = GetBufferOffset(op->buffer_var, op->value->dtype); - PrimExpr index = 
StmtExprMutator::VisitExpr(op->index); - PrimExpr value = StmtExprMutator::VisitExpr(op->value); + auto offset = GetBufferOffset(op->buffer_var, op->value->dtype); + auto index = StmtExprMutator::VisitExpr(op->index); + auto value = StmtExprMutator::VisitExpr(op->value); return Store(merged_buf_var_, value, offset + index, op->predicate, op->span); } return StmtExprMutator::VisitStmt_(op); } - PrimExpr VisitExpr_(const CallNode* op) final { - if (op->op.same_as(builtin::tvm_access_ptr())) { - ICHECK_EQ(op->args.size(), 5U); - DataType dtype = op->args[0].dtype(); - Var buffer = Downcast(op->args[1]); - if (!IsDynamicSharedMemory(buffer)) { - return StmtExprMutator::VisitExpr_(op); - } - PrimExpr extra_offset = GetBufferOffset(buffer, dtype); - - PrimExpr offset = this->VisitExpr(op->args[2]); - PrimExpr extent = this->VisitExpr(op->args[3]); - return Call(op->dtype, op->op, - {op->args[0], merged_buf_var_, extra_offset + offset, extent, op->args[4]}); - } else { - return StmtExprMutator::VisitExpr_(op); - } - } - + private: PrimExpr GetBufferOffset(Var buffer_var, DataType dtype) { auto it = buffer_byte_offsets_.find(buffer_var.get()); ICHECK(it != buffer_byte_offsets_.end()); return indexdiv(it->second, dtype.bytes()); } - using StmtEntry = DynSharedMemLinearAccessPatternFinder::StmtEntry; - struct StorageEntry { - // The constant size of the buffer in bits, only used if it is constant - uint64_t const_nbits{0}; - // Allocs that shares this entry. - // The inner vector means a "layer" - // For example, it we need to allocate C in the memory of A and B: - // | A: 4096 bytes | B: 4096 bytes | - // | C: 8192 bytes | - // Then the allocs = {{A, B}, {C}} - std::vector> allocs; - }; - - // Event entry in liveness analysis - struct EventEntry { - // variables we generate - std::vector gen; - // variables we kill - std::vector kill; - }; - - /*! - * \brief Liveness analysis to find gen and kill point of each variable. - * \param seq the linear pattern of storage access - */ - void LivenessAnalysis(const std::vector& seq) { - // find kill point, do a reverse linear scan. - std::unordered_set touched; - for (size_t i = seq.size(); i != 0; --i) { - const StmtEntry& s = seq[i - 1]; - for (const VarNode* buffer : s.touched) { - if (!touched.count(buffer)) { - touched.insert(buffer); - event_map_[s.stmt].kill.push_back(buffer); - } - } - } - // find gen point, do forward scan - touched.clear(); - for (size_t i = 0; i < seq.size(); ++i) { - int64_t offset = seq[i].scope_pair_offset; - if (offset < 0) continue; - const StmtEntry& s = seq[i + offset]; - for (const VarNode* buffer : s.touched) { - if (!touched.count(buffer)) { - touched.insert(buffer); - event_map_[s.stmt].gen.push_back(buffer); - } - } - } - } - - /*! 
- * \brief Memory plan algorithm - * \param seq the linear pattern of storage access - * \param alloc_info - */ - void PlanMemory(const std::vector& seq) { - std::unordered_set inplace_flag; - - for (size_t i = 0; i < seq.size(); ++i) { - auto it = event_map_.find(seq[i].stmt); - // scope_pair_offset <= 0 means it is either - // - leaf stmt(offset = 0) - // - end of scope(offset < 0) - // In both cases, we need to handle the kill event correctly - if (it != event_map_.end() && seq[i].scope_pair_offset <= 0) { - for (const VarNode* var : it->second.kill) { - this->Free(var); - } - } - // scope_pair_offset >= 0 means it is either - // - leaf stmt(offset = 0) - // - beginning of scope(offset < 0) - // In both cases, we need to handle the gen event correctly - if (it != event_map_.end() && seq[i].scope_pair_offset >= 0) { - for (const VarNode* var : it->second.gen) { - ICHECK(dyn_shmem_allocs_.count(var)); - const AllocateNode* alloc = dyn_shmem_allocs_[var]; - StorageEntry* dst_entry = FindAlloc(alloc); - alloc_map_[var] = dst_entry; - } - } - } - } - /*! - * \brief Allocate new storage entry. - * \param op the allocate node - * \param the size of the allocation in bits - * \return the new storage entry - */ - StorageEntry* NewAlloc(const AllocateNode* op, size_t const_nbits) { - ICHECK(op != nullptr); - // Re-use not successful, allocate a new buffer. - StorageEntry* entry = arena_.make(); - entry->allocs.push_back({op->buffer_var.get()}); - entry->const_nbits = const_nbits; - return entry; - } - /*! - * \brief find the storage entry in the free list for the allocate - * \param op the allocate node - * \return the storage entry - */ - StorageEntry* FindAlloc(const AllocateNode* op) { - ICHECK(op != nullptr); - // skip plan for local variable, - // compiler can do a better job with register allocation. - const uint64_t match_range = 16; - uint64_t op_elem_bits = op->dtype.bits() * op->dtype.lanes(); - uint64_t const_nbits = static_cast(op->constant_allocation_size() * op_elem_bits); - // disable reuse of small arrays, they will be lowered to registers in LLVM - // This rules only apply if we are using non special memory - if (const_nbits > 0 && const_nbits <= 32) { - return NewAlloc(op, const_nbits); - } - - if (const_nbits != 0) { - // constant allocation. - auto begin = const_free_map_.lower_bound(0); - auto mid = const_free_map_.lower_bound(const_nbits); - auto end = const_free_map_.upper_bound(const_nbits * match_range); - // Start looking at the buffer that is bigger than the required size first. - // If we find one, directly allocate the buffer in its location and remove its entry in the - // free list - for (auto it = mid; it != end; ++it) { - StorageEntry* e = it->second; - e->const_nbits = std::max(const_nbits, e->const_nbits); - const_free_map_.erase(it); - return e; - } - // Then start looking at smaller buffers. 
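For contrast with the free-list planner being removed in this hunk, the simplified DynamicSharedMemoryRewriter retained by the revert lays every dynamic shared buffer out back-to-back inside the single merged "buf_dyn_shmem" allocation. A rough standalone Python sketch of that layout rule, assuming (as the pass's ICHECKs require) scalar dtypes and one 1-D extent per allocation; the names and tuple format are illustrative only:

def merge_dyn_shared(allocs):
    """allocs: list of (name, extent_in_elements, dtype_bytes) tuples."""
    # One alignment for the whole merged buffer: the widest element size seen.
    align = max([1] + [dtype_bytes for _, _, dtype_bytes in allocs])
    offsets, merged_size = {}, 0
    for name, extent, _ in allocs:
        offsets[name] = merged_size    # byte offset assigned to this buffer
        merged_size += extent * align  # mirrors `merged_alloc_size_ += extents[0] * align`
    return offsets, merged_size

# Example: a 256-element float32 buffer followed by a 128-element float16 buffer.
print(merge_dyn_shared([("A", 256, 4), ("B", 128, 2)]))  # ({'A': 0, 'B': 1024}, 1536)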
- // Keep collecting the buffer until the sum of their size exceeds the buffer to allocate - // and finally free all these entry in the free list - std::vector::iterator> delete_it; - // the alloc list for the new entry - std::vector> reuse_allocs; - uint64_t mem_ct = 0; - for (auto it = mid; it != begin;) { - --it; - delete_it.push_back(it); - mem_ct += it->second->const_nbits; - int n = it->second->allocs.size(); - if (n > static_cast(reuse_allocs.size())) { - reuse_allocs.resize(n, {}); - } - for (int i = 0; i < n; i++) { - for (const VarNode* alloc : it->second->allocs[i]) { - reuse_allocs[i].push_back(alloc); - } - } - if (mem_ct >= const_nbits) { - break; - } - } - reuse_allocs.push_back({op->buffer_var.get()}); - if (mem_ct != 0) { - StorageEntry* e = arena_.make(); - e->const_nbits = std::max(const_nbits, mem_ct); - e->allocs = reuse_allocs; - for (auto it : delete_it) { - const_free_map_.erase(it); - } - return e; - } - } else { - // if its symbolic allocation, just arbitrarily choose one entry to fit in because we don't - // know its actual size - for (auto it = sym_free_list_.begin(); it != sym_free_list_.end(); ++it) { - StorageEntry* e = *it; - sym_free_list_.erase(it); - return e; - } - } - return NewAlloc(op, const_nbits); - } - - /*! - * \brief add the storage entry to the buffer var into the free list. - * \param var the buffer var - */ - void Free(const VarNode* var) { - auto it = alloc_map_.find(var); - ICHECK(it != alloc_map_.end()); - StorageEntry* e = it->second; - ICHECK_NE(e->allocs.size(), 0U); - - // disable reuse of small arrays - if (e->const_nbits > 0 && e->const_nbits <= 32) return; - - // normal free. - if (e->const_nbits != 0) { - const_free_map_.insert({e->const_nbits, e}); - } else { - sym_free_list_.push_back(e); - } - } - // The var for the merged buffer Var merged_buf_var_{"buf_dyn_shmem", PointerType(PrimType(DataType::UInt(8)), "shared.dyn")}; - // The mapping from the original buffer var to its allocate - std::unordered_map dyn_shmem_allocs_; - // The size of the merged buffer + std::unordered_set dyn_shmem_allocs_; PrimExpr merged_alloc_size_{0}; - // The mapping from the original buffer var to its offset in the merged buffer std::unordered_map buffer_byte_offsets_; - // The flag indicating whether the merged buffer has been allocated - bool allocated_{false}; - // Locations of free ops. - std::unordered_map event_map_; - // constant size free map. - std::multimap const_free_map_; - // symbolic free list, for non constant items. - std::list sym_free_list_; - // The allocation assign map - std::unordered_map alloc_map_; - /*! 
\brief allocator of all the StorageEntry*/ - support::Arena arena_; + bool allocated{false}; }; Stmt MergeDynamicSharedMemoryAllocations(Stmt stmt) { AllocateCollector collector; collector(stmt); if (collector.dyn_shmem_allocs_.size() > 1) { - DynamicSharedMemoryRewriter rewriter(collector.dyn_shmem_allocs_); - rewriter.PlanReuse(stmt); - return rewriter(std::move(stmt)); + return DynamicSharedMemoryRewriter(collector.dyn_shmem_allocs_)(std::move(stmt)); } return stmt; } diff --git a/tests/cpp/support_test.cc b/tests/cpp/support_test.cc index 01111d9102460..df9271f4b49c4 100644 --- a/tests/cpp/support_test.cc +++ b/tests/cpp/support_test.cc @@ -56,11 +56,5 @@ TEST(HashTests, HashStability) { EXPECT_EQ(::tvm::support::HashCombine(e, f), 2722928432); } -TEST(StartsWithTests, Basic) { - EXPECT_TRUE(::tvm::support::StartsWith("abc", "abc")); - EXPECT_TRUE(::tvm::support::StartsWith("abcd", "abc")); - EXPECT_FALSE(::tvm::support::StartsWith("abc", "abcd")); -} - } // namespace test } // namespace tvm diff --git a/tests/micro/arduino/conftest.py b/tests/micro/arduino/conftest.py index 8625b4a453642..73361774821b9 100644 --- a/tests/micro/arduino/conftest.py +++ b/tests/micro/arduino/conftest.py @@ -24,10 +24,16 @@ from tvm.micro import project from tvm import micro, relay -TEMPLATE_PROJECT_DIR = pathlib.Path(tvm.micro.get_microtvm_template_projects("arduino")) - - -BOARDS = TEMPLATE_PROJECT_DIR / "boards.json" +TEMPLATE_PROJECT_DIR = ( + pathlib.Path(__file__).parent + / ".." + / ".." + / ".." + / "apps" + / "microtvm" + / "arduino" + / "template_project" +).resolve() BOARDS = TEMPLATE_PROJECT_DIR / "boards.json" diff --git a/tests/micro/zephyr/test_utils.py b/tests/micro/zephyr/test_utils.py index e4a22d2be6478..c27c869509d74 100644 --- a/tests/micro/zephyr/test_utils.py +++ b/tests/micro/zephyr/test_utils.py @@ -18,9 +18,8 @@ import os import json import pathlib +import logging import tarfile -import tempfile -from typing import Union import numpy as np @@ -30,10 +29,18 @@ import requests import tvm.micro -from tvm.micro import export_model_library_format -from tvm.micro.testing import mlf_extract_workspace_size_bytes -TEMPLATE_PROJECT_DIR = pathlib.Path(tvm.micro.get_microtvm_template_projects("zephyr")) + +TEMPLATE_PROJECT_DIR = ( + pathlib.Path(__file__).parent + / ".." + / ".." + / ".." + / "apps" + / "microtvm" + / "zephyr" + / "template_project" +).resolve() BOARDS = TEMPLATE_PROJECT_DIR / "boards.json" @@ -70,29 +77,19 @@ def has_fpu(board: str): def build_project(temp_dir, zephyr_board, west_cmd, mod, build_config, extra_files_tar=None): project_dir = temp_dir / "project" - - with tempfile.TemporaryDirectory() as tar_temp_dir: - model_tar_path = pathlib.Path(tar_temp_dir) / "model.tar" - export_model_library_format(mod, model_tar_path) - - workspace_size = mlf_extract_workspace_size_bytes(model_tar_path) - project = tvm.micro.project.generate_project_from_mlf( - str(TEMPLATE_PROJECT_DIR), - project_dir, - model_tar_path, - { - "extra_files_tar": extra_files_tar, - "project_type": "aot_demo", - "west_cmd": west_cmd, - "verbose": bool(build_config.get("debug")), - "zephyr_board": zephyr_board, - "compile_definitions": [ - # TODO(mehrdadh): It fails without offset. 
- f"-DWORKSPACE_SIZE={workspace_size + 128}", - ], - }, - ) - project.build() + project = tvm.micro.generate_project( + str(TEMPLATE_PROJECT_DIR), + mod, + project_dir, + { + "extra_files_tar": extra_files_tar, + "project_type": "aot_demo", + "west_cmd": west_cmd, + "verbose": bool(build_config.get("debug")), + "zephyr_board": zephyr_board, + }, + ) + project.build() return project, project_dir @@ -132,6 +129,31 @@ def create_header_file(tensor_name, npy_data, output_path, tar_file): tar_file.addfile(ti, io.BytesIO(header_file_bytes)) +def _read_line(fd, timeout_sec: int): + data = "" + new_line = False + while True: + if new_line: + break + new_data = fd.read(1, timeout_sec=timeout_sec) + logging.debug(f"read data: {new_data}") + for item in new_data: + new_c = chr(item) + data = data + new_c + if new_c == "\n": + new_line = True + break + return data + + +def get_message(fd, expr: str, timeout_sec: int): + while True: + data = _read_line(fd, timeout_sec) + logging.debug(f"new line: {data}") + if expr in data: + return data + + # TODO move CMSIS integration to microtvm_api_server.py # see https://discuss.tvm.apache.org/t/tvm-capturing-dependent-libraries-of-code-generated-tir-initially-for-use-in-model-library-format/11080 def loadCMSIS(temp_dir): diff --git a/tests/micro/zephyr/test_zephyr.py b/tests/micro/zephyr/test_zephyr.py index 10759c3790db4..0895980076511 100644 --- a/tests/micro/zephyr/test_zephyr.py +++ b/tests/micro/zephyr/test_zephyr.py @@ -374,8 +374,8 @@ def test_tensors(sess): @tvm.testing.requires_micro def test_autotune_conv2d(temp_dir, board, west_cmd, tvm_debug): """Test AutoTune for microTVM Zephyr""" - if board != "qemu_x86": - pytest.xfail(f"Autotune fails on {board}.") + if board in ["qemu_riscv32", "qemu_riscv64"]: + pytest.xfail(f"Autotune fails on {board}.") model = test_utils.ZEPHYR_BOARDS[board] build_config = {"debug": tvm_debug} diff --git a/tests/micro/zephyr/test_zephyr_aot.py b/tests/micro/zephyr/test_zephyr_aot.py index 7cd32f4e1879f..f79aa8bd70d2b 100644 --- a/tests/micro/zephyr/test_zephyr_aot.py +++ b/tests/micro/zephyr/test_zephyr_aot.py @@ -33,7 +33,6 @@ from tvm.contrib.download import download_testdata from tvm.micro.model_library_format import generate_c_interface_header -from tvm.micro.testing import aot_transport_init_wait, aot_transport_find_message import test_utils @@ -41,13 +40,25 @@ @tvm.testing.requires_micro def test_tflite(temp_dir, board, west_cmd, tvm_debug): """Testing a TFLite model.""" + + if board not in [ + "qemu_x86", + "mps2_an521", + "nrf5340dk_nrf5340_cpuapp", + "nucleo_l4r5zi", + "qemu_cortex_r5", + "qemu_riscv32", + "qemu_riscv64", + ]: + pytest.skip(msg="Model does not fit.") + model = test_utils.ZEPHYR_BOARDS[board] - input_shape = (1, 49, 10, 1) - output_shape = (1, 12) + input_shape = (1, 32, 32, 3) + output_shape = (1, 10) build_config = {"debug": tvm_debug} - model_url = "https://github.com/tlc-pack/web-data/raw/25fe99fb00329a26bd37d3dca723da94316fd34c/testdata/microTVM/model/keyword_spotting_quant.tflite" - model_path = download_testdata(model_url, "keyword_spotting_quant.tflite", module="model") + model_url = "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/image_classification_fp32.tflite" + model_path = download_testdata(model_url, "image_classification_fp32.tflite", module="model") # Import TFLite model tflite_model_buf = open(model_path, "rb").read() @@ -62,25 +73,20 @@ def test_tflite(temp_dir, board, west_cmd, tvm_debug): # Load TFLite model and convert to Relay 
relay_mod, params = relay.frontend.from_tflite( - tflite_model, shape_dict={"input_1": input_shape}, dtype_dict={"input_1 ": "int8"} + tflite_model, shape_dict={"input_1": input_shape}, dtype_dict={"input_1 ": "float32"} ) target = tvm.target.target.micro( - model, - options=[ - "-link-params=1", - "--executor=aot", - "--unpacked-api=1", - "--interface-api=c", - "--workspace-byte-alignment=4", - ], + model, options=["-link-params=1", "--executor=aot", "--unpacked-api=1", "--interface-api=c"] ) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): lowered = relay.build(relay_mod, target, params=params) # Load sample and generate input/output header files - sample_url = "https://github.com/tlc-pack/web-data/raw/967fc387dadb272c5a7f8c3461d34c060100dbf1/testdata/microTVM/data/keyword_spotting_int8_6.pyc.npy" - sample_path = download_testdata(sample_url, "keyword_spotting_int8_6.pyc.npy", module="data") + sample_url = "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/data/testdata_image_classification_fp32_8.npy" + sample_path = download_testdata( + sample_url, "testdata_image_classification_fp32_8.npy", module="data" + ) sample = np.load(sample_path) with tempfile.NamedTemporaryFile() as tar_temp_file: @@ -95,7 +101,7 @@ def test_tflite(temp_dir, board, west_cmd, tvm_debug): test_utils.create_header_file("input_data", sample, "include", tf) test_utils.create_header_file( - "output_data", np.zeros(shape=output_shape, dtype="int8"), "include", tf + "output_data", np.zeros(shape=output_shape, dtype="float32"), "include", tf ) project, _ = test_utils.build_project( @@ -109,16 +115,17 @@ def test_tflite(temp_dir, board, west_cmd, tvm_debug): project.flash() with project.transport() as transport: - aot_transport_init_wait(transport) - transport.write(b"infer%", timeout_sec=5) - result_line = aot_transport_find_message(transport, "result", timeout_sec=60) + timeout_read = 60 + test_utils.get_message(transport, "#wakeup", timeout_sec=timeout_read) + transport.write(b"start\n", timeout_sec=5) + result_line = test_utils.get_message(transport, "#result", timeout_sec=timeout_read) result_line = result_line.strip("\n") result_line = result_line.split(":") result = int(result_line[1]) time = int(result_line[2]) logging.info(f"Result: {result}\ttime: {time} ms") - assert result == 6 + assert result == 8 @tvm.testing.requires_micro diff --git a/tests/micro/zephyr/test_zephyr_armv7m.py b/tests/micro/zephyr/test_zephyr_armv7m.py index 2366bad203be8..972ffe2bda357 100644 --- a/tests/micro/zephyr/test_zephyr_armv7m.py +++ b/tests/micro/zephyr/test_zephyr_armv7m.py @@ -25,6 +25,8 @@ import pytest import numpy as np +import test_utils + import tvm import tvm.rpc import tvm.micro @@ -33,17 +35,18 @@ from tvm.contrib.download import download_testdata from tvm.micro.model_library_format import generate_c_interface_header -from tvm.micro.testing import aot_transport_init_wait, aot_transport_find_message -import test_utils +import conftest + _LOG = logging.getLogger(__name__) +logging.basicConfig(level=logging.INFO) def _open_tflite_model(): # Import TFLite model - model_url = "https://github.com/tlc-pack/web-data/raw/b2f3c02427b67267a00fd968ba1fce28fc833028/testdata/microTVM/model/mnist_model_quant.tflite" + model_url = "https://github.com/tlc-pack/web-data/raw/main/testdata/microTVM/model/mnist_model_quant.tflite" model_path = download_testdata(model_url, "mnist_model_quant.tflite", module="model") tflite_model_buf = open(model_path, 
"rb").read() @@ -142,15 +145,15 @@ def _run_model(temp_dir, board, west_cmd, lowered, build_config, sample, output_ project.flash() with project.transport() as transport: - aot_transport_init_wait(transport) - transport.write(b"infer%", timeout_sec=5) - result_line = aot_transport_find_message(transport, "result", timeout_sec=60) + timeout_read = 60 + transport.write(b"start\n", timeout_sec=5) + result_line = test_utils.get_message(transport, "#result", timeout_sec=timeout_read) result_line = result_line.strip("\n") result_line = result_line.split(":") result = int(result_line[1]) time = int(result_line[2]) - _LOG.info(f"Result: {result}\ttime: {time} ms") + logging.info(f"Result: {result}\ttime: {time} ms") return result, time @@ -183,17 +186,6 @@ def test_armv7m_intrinsic(temp_dir, board, west_cmd, tvm_debug): relay_mod_no_simd = _apply_desired_layout_no_simd(relay_mod) target = tvm.target.target.micro( - model, - options=[ - "-keys=cpu", - "-link-params=1", - "--executor=aot", - "--unpacked-api=1", - "--interface-api=c", - ], - ) - - target_simd = tvm.target.target.micro( model, options=[ "-keys=arm_cpu,cpu", @@ -211,7 +203,7 @@ def test_armv7m_intrinsic(temp_dir, board, west_cmd, tvm_debug): os.makedirs(temp_dir_no_simd, exist_ok=True) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): - lowered_simd = relay.build(relay_mod_simd, target_simd, params=params) + lowered_simd = relay.build(relay_mod_simd, target, params=params) lowered_no_simd = relay.build(relay_mod_no_simd, target, params=params) result_simd, time_simd = _run_model( temp_dir_simd, board, west_cmd, lowered_simd, build_config, sample, output_shape diff --git a/tests/python/contrib/test_cutlass.py b/tests/python/contrib/test_cutlass.py deleted file mode 100644 index 0927c41981bd7..0000000000000 --- a/tests/python/contrib/test_cutlass.py +++ /dev/null @@ -1,222 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
-import logging -import math -import pytest -import tvm -from tvm import relay -import numpy as np -from tvm.runtime.vm import VirtualMachine -from tvm.relay.op.contrib.cutlass import partition_for_cutlass -from tvm.contrib.cutlass import ( - tune_cutlass_kernels, - build_cutlass_kernels, - build_cutlass_kernels_vm, -) - -logging.basicConfig(level=logging.INFO) - - -def has_cublas(): - return tvm.get_global_func("tvm.contrib.cublas.matmul", True) != None - - -def has_cutlass(): - return tvm.get_global_func("relay.ext.cutlass", True) != None - - -def get_ref_rt_mod(mod, params, target="cuda"): - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, params=params) - dev = tvm.device(target, 0) - rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) - return rt_mod, dev - - -def get_ref_vm(mod, params, target="cuda"): - with tvm.transform.PassContext(opt_level=3): - vm_exec = relay.vm.compile(mod, target=target, params=params) - code, lib = vm_exec.save() - dev = tvm.device(target, 0) - vm_exec = tvm.runtime.vm.Executable.load_exec(code, lib) - return VirtualMachine(vm_exec, dev), dev - - -def get_output(rt_mod, x): - rt_mod.set_input("data", x) - rt_mod.run() - return rt_mod.get_output(0).asnumpy() - - -def get_output_vm(vm, x): - return vm.invoke("main", data=x).numpy() - - -def get_dense_with_shape(data_shape, weight_shape, out_dtype="float16"): - data = relay.var("data", shape=data_shape, dtype="float16") - weight = relay.var("weight", shape=weight_shape, dtype="float16") - return relay.nn.dense(data, weight, out_dtype=out_dtype) - - -def get_dense(M, N, K, out_dtype="float16"): - return get_dense_with_shape((M, K), (N, K), out_dtype) - - -def get_dense_bias(M, N, K, out_dtype="float16"): - dense = get_dense(M, N, K, out_dtype=out_dtype) - bias = relay.var("bias", shape=(N,), dtype=out_dtype) - return relay.nn.bias_add(dense, bias) - - -def get_dense_bias_relu(M, N, K, out_dtype="float16"): - return relay.nn.relu(get_dense_bias(M, N, K, out_dtype="float16")) - - -def get_dense_bias_gelu(M, N, K, out_dtype="float16"): - bias_add = get_dense_bias(M, N, K, out_dtype) - mul = bias_add * relay.const((1.0 / math.sqrt(2.0)), dtype=out_dtype) - if out_dtype == "float16": - erf = relay.cast(relay.op.erf(relay.cast(mul, "float32")), "float16") - else: - erf = relay.op.erf(mul) - mul_half = erf * relay.const(0.5, dtype=out_dtype) - add = mul_half + relay.const(0.5, dtype=out_dtype) - return add * bias_add - - -def profile_and_build(mod, params, sm, tmp_dir="./tmp", lib_path="compile.so"): - mod = partition_for_cutlass(mod) - mod, num_cutlass_partition = tune_cutlass_kernels( - mod, sm, profile_all=False, use_multiprocessing=False, tmp_dir=tmp_dir - ) - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target="cuda", params=params) - lib = build_cutlass_kernels(lib, sm, tmp_dir, lib_path) - dev = tvm.device("cuda", 0) - rt_mod = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) - return rt_mod, dev, num_cutlass_partition - - -def profile_and_build_vm( - mod, params, sm, tmp_dir="./tmp", lib_path="compile.so", vmcode_path="vmcode.ro" -): - mod = partition_for_cutlass(mod) - mod, num_cutlass_partition = tune_cutlass_kernels(mod, sm, tmp_dir=tmp_dir) - with tvm.transform.PassContext(opt_level=3): - vm_exec = relay.vm.compile(mod, target="cuda", params=params) - vm_exec = build_cutlass_kernels_vm(vm_exec, sm, tmp_dir, lib_path, vmcode_path) - dev = tvm.device("cuda", 0) - return VirtualMachine(vm_exec, dev), dev, 
num_cutlass_partition - - -def verify(func, M, N, K, ref_target="cuda", sm=80, atol=1e-5, rtol=1e-5, run_benchmark=False): - if not has_cutlass(): - return - mod = tvm.IRModule.from_expr(func) - typ = relay.transform.InferType()(mod)["main"].body.checked_type - out_dtype = typ.dtype - use_vm = any(isinstance(s, tvm.tir.Any) for s in typ.shape) - np_data = np.random.uniform(-1, 1, (M, K)).astype("float16") - np_weight = np.random.uniform(-1, 1, (N, K)).astype("float16") - np_bias = np.random.uniform(-1, 1, (N,)).astype(out_dtype) - - params = {"weight": np_weight, "bias": np_bias} - - if use_vm: - if ref_target == "cuda" and out_dtype == "float16": - # Uncomment "return" below to see the accuracy difference of static vs dynamic TVM native fp16 dense - # The static one can use a tensorcore schedule, but the dynamic one cannot - rt_mod, dev = get_ref_vm(tvm.IRModule.from_expr(get_dense(M, N, K)), params) - num_partition = 1 - logging.warning( - "The reference fp16 dense with dynamic shape using fp16 accumulation has accuracy issues." - ) - return - else: - rt_mod, dev, num_partition = profile_and_build_vm(mod, params, sm) - - rt_mod_ref, dev = get_ref_vm(mod, params, target=ref_target) - x = tvm.nd.array(np_data, device=dev) - out = get_output_vm(rt_mod, x) - ref_out = get_output_vm(rt_mod_ref, x) - else: - rt_mod_ref, dev = get_ref_rt_mod(mod, params, target=ref_target) - rt_mod, dev, num_partition = profile_and_build(mod, params, sm) - x = tvm.nd.array(np_data, device=dev) - out = get_output(rt_mod, x) - ref_out = get_output(rt_mod_ref, x) - - assert num_partition > 0 - np.testing.assert_allclose(out, ref_out, atol=atol, rtol=rtol) - - if run_benchmark: - print("CUTLASS:", rt_mod.benchmark(dev, number=1, repeat=600)) - print("TVM with target %s:" % ref_target, rt_mod_ref.benchmark(dev, number=1, repeat=600)) - - -M = 1820 -N = 768 -K = 768 - - -def test_dense(): - verify(get_dense(M, N, K), M, N, K) - verify(get_dense(M, N, K, out_dtype="float32"), M, N, K) - - -def test_dense_bias(): - verify(get_dense_bias(M, N, K), M, N, K) - verify(get_dense_bias(M, N, K, out_dtype="float32"), M, N, K) - - -def test_dense_bias_relu(): - verify(get_dense_bias_relu(M, N, K), M, N, K) - verify(get_dense_bias_relu(M, N, K, out_dtype="float32"), M, N, K) - - -def test_dense_bias_gelu(): - verify(get_dense_bias_gelu(M, N, K), M, N, K, atol=1e-3, rtol=1e-3) - verify(get_dense_bias_gelu(M, N, K, out_dtype="float32"), M, N, K, atol=1e-3, rtol=1e-3) - - -def test_dense_dynamic(): - data_shape = (relay.Any(), K) - weight_shape = (relay.Any(), K) - - if has_cublas(): - # TVM native fp16 dense (without tensorcore), using fp16 accum, seems to have accuracy issues - # Use cublas as a reference - verify( - get_dense_with_shape(data_shape, weight_shape), - M, - N, - K, - ref_target="cuda -libs=cublas", - ) - - verify( - get_dense_with_shape(data_shape, weight_shape, out_dtype="float32"), - M, - N, - K, - atol=1e-4, - rtol=1e-4, - ) - - -if __name__ == "__main__": - pytest.main([__file__]) diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py index c284f6488bfb0..c5ebde4b9c613 100644 --- a/tests/python/contrib/test_ethosn/infrastructure.py +++ b/tests/python/contrib/test_ethosn/infrastructure.py @@ -254,9 +254,7 @@ def inference_result(outputs): def test_error(mod, params, err_msg): caught = None - with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.ethos-n.options": {"variant": get_ethosn_variant()}} - ): + with 
tvm.transform.PassContext(opt_level=3): with tvm.target.Target("llvm"): try: mod = relay.transform.InferType()(mod) @@ -326,4 +324,7 @@ def get_ethosn_api_version(): def get_ethosn_variant(): - return os.getenv("ETHOSN_VARIANT_CONFIG", default="Ethos-N77") + ethosn_variant_config = os.getenv("ETHOSN_VARIANT_CONFIG") + if ethosn_variant_config is not None: + return "Ethos-N78_1TOPS_2PLE_RATIO" + return "Ethos-N77" diff --git a/tests/python/contrib/test_ethosn/test_partition_params.py b/tests/python/contrib/test_ethosn/test_partition_params.py deleted file mode 100644 index da1750a7e4cb7..0000000000000 --- a/tests/python/contrib/test_ethosn/test_partition_params.py +++ /dev/null @@ -1,123 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""Ethos(TM)-N partition parameter tests""" - -import pytest -import tvm -from tvm import relay -import numpy as np - -from tvm.relay.op.contrib.ethosn import partition_for_ethosn77 -from tvm.relay.op.contrib.ethosn import partition_for_ethosn78 -from tvm.testing import requires_ethosn - - -@requires_ethosn -def test_ethosn78_partition_no_error(): - a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8") - w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8")) - res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8") - b = relay.var("b", shape=[8], dtype="uint8") - res = relay.nn.bias_add(res, b, axis=1) - - mod = tvm.IRModule.from_expr(res) - opts = {"variant": "Ethos-N78"} - partition_for_ethosn78(mod, **opts) - - -@requires_ethosn -def test_ethosn78_partition_undefined_variant(): - with pytest.raises( - ValueError, match=r".*When targeting Ethos\(TM\)-N78, -variant=Ethos-N78 should be set.*" - ): - a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8") - w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8")) - res = relay.nn.conv2d( - a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8" - ) - b = relay.var("b", shape=[8], dtype="uint8") - res = relay.nn.bias_add(res, b, axis=1) - - mod = tvm.IRModule.from_expr(res) - partition_for_ethosn78(mod) - - -@requires_ethosn -def test_ethosn78_partition_invalid_variant(): - with pytest.raises( - ValueError, match=r".*When targeting Ethos\(TM\)-N78, -variant=Ethos-N78 should be set.*" - ): - a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8") - w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8")) - res = relay.nn.conv2d( - a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8" - ) - b = relay.var("b", shape=[8], dtype="uint8") - res = relay.nn.bias_add(res, b, axis=1) - - mod = tvm.IRModule.from_expr(res) - opts = {"variant": "Ethos-N"} - partition_for_ethosn78(mod, **opts) - - -@requires_ethosn -def test_ethosn78_partition_error(): - with 
pytest.raises( - ValueError, match=r".*When targeting Ethos\(TM\)-N78, -variant=Ethos-N78 should be set.*" - ): - a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8") - w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8")) - res = relay.nn.conv2d( - a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8" - ) - b = relay.var("b", shape=[8], dtype="uint8") - res = relay.nn.bias_add(res, b, axis=1) - - mod = tvm.IRModule.from_expr(res) - opts = {"variant": "Ethos-N77"} - partition_for_ethosn78(mod, **opts) - - -@requires_ethosn -def test_ethosn77_partition_no_error(): - a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8") - w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8")) - res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8") - b = relay.var("b", shape=[8], dtype="uint8") - res = relay.nn.bias_add(res, b, axis=1) - - mod = tvm.IRModule.from_expr(res) - partition_for_ethosn77(mod) - - -@requires_ethosn -def test_ethosn77_partition_error(): - with pytest.raises( - ValueError, - match=r".*Setting tops, ple_ratio or sram_size has no effect when targeting Ethos\(TM\)-N77.*", - ): - a = relay.var("a", shape=[2, 7, 8, 8], dtype="uint8") - w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype("uint8")) - res = relay.nn.conv2d( - a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype="uint8" - ) - b = relay.var("b", shape=[8], dtype="uint8") - res = relay.nn.bias_add(res, b, axis=1) - - mod = tvm.IRModule.from_expr(res) - opts = {"tops": 4} - partition_for_ethosn77(mod, **opts) diff --git a/tests/python/contrib/test_ethosu/test_replace_copy.py b/tests/python/contrib/test_ethosu/test_replace_copy.py index 9590db57dd328..76b7ef2a70ee4 100644 --- a/tests/python/contrib/test_ethosu/test_replace_copy.py +++ b/tests/python/contrib/test_ethosu/test_replace_copy.py @@ -22,7 +22,7 @@ from tvm import relay from tvm.relay.testing import run_opt_pass from tvm.relay.backend.contrib.ethosu.tir.compiler import lower_to_tir -from tvm.relay.backend.contrib.ethosu.tir.scheduler import copy_constants, Convolution2DCompute +from tvm.relay.backend.contrib.ethosu.tir.scheduler import copy_constants from .infra import make_ethosu_conv2d @@ -73,67 +73,5 @@ def _get_func(): tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) -# fmt: off -@tvm.script.ir_module -class WeightStream: - @T.prim_func - def main(placeholder: T.handle, ethosu_write: T.handle, placeholder_1: T.handle, placeholder_2: T.handle, placeholder_3: T.handle, placeholder_4: T.handle) -> None: - # function attr dict - T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) - placeholder_5 = T.match_buffer(placeholder, [1, 16, 16, 32], dtype="int8") - ethosu_write_1 = T.match_buffer(ethosu_write, [1, 16, 16, 16], dtype="int8") - buffer = T.match_buffer(placeholder_1, [416], dtype="uint8") - buffer_1 = T.match_buffer(placeholder_2, [112], dtype="uint8") - buffer_2 = T.match_buffer(placeholder_3, [272], dtype="uint8") - buffer_3 = T.match_buffer(placeholder_4, [64], dtype="uint8") - # body - placeholder_global = T.allocate([416], "uint8", "global") - placeholder_d_global = T.allocate([112], "uint8", "global") - T.evaluate(T.call_extern("ethosu_copy", T.load("uint8", buffer.data, 0), 416, T.load("uint8", placeholder_global, 0), dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", T.load("uint8", buffer_1.data, 0), 112, T.load("uint8", placeholder_d_global, 0), 
dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, T.load("int8", placeholder_5.data, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 10, 16, 0, 16, T.load("int8", ethosu_write_1.data, 0), 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", placeholder_global, 0), 416, 12, T.load("uint8", placeholder_d_global, 0), 112, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", T.load("uint8", buffer_2.data, 0), 272, T.load("uint8", placeholder_global, 0), dtype="handle")) - T.evaluate(T.call_extern("ethosu_copy", T.load("uint8", buffer_3.data, 0), 64, T.load("uint8", placeholder_d_global, 0), dtype="handle")) - T.evaluate(T.call_extern("ethosu_conv2d", "int8", 16, 16, 32, 16, 0, 16, T.load("int8", placeholder_5.data, 0), 0, 0, 0, T.float32(0.5), 10, "NHWC", 512, 32, 1, "int8", 16, 16, 6, 16, 0, 16, T.load("int8", ethosu_write_1.data, 10), 0, 0, 0, T.float32(0.25), 14, "NHWC", 256, 16, 1, 1, 1, 1, 1, 1, 1, T.load("uint8", placeholder_global, 0), 272, 12, T.load("uint8", placeholder_d_global, 0), 64, 0, 0, 0, 0, "NONE", 0, 0, "NONE", dtype="handle")) - __tvm_meta__ = None -# fmt: on - - -def test_weight_stream(): - def _cascader(cached_func, const_dict, sch): - weight = cached_func.inputs[1] - scale_bias = cached_func.inputs[2] - out = cached_func.outputs[0] - conv_compute = Convolution2DCompute.from_output(out) - co = conv_compute.split(sch, 3, 10) - cache_weight = sch.cache_read(weight, "global", [conv_compute.conv2d]) - cache_scale_bias = sch.cache_read(scale_bias, "global", [conv_compute.conv2d]) - sch[cache_weight].compute_at(sch[out], co) - sch[cache_scale_bias].compute_at(sch[out], co) - - def _get_func(): - ifm = relay.var("ifm", shape=(1, 16, 16, 32), dtype="int8") - conv = make_ethosu_conv2d( - ifm, - 32, - 16, - (1, 1), - (0, 0), - (1, 1), - (1, 1), - ) - func = relay.Function(relay.analysis.free_vars(conv), conv) - func = run_opt_pass(func, relay.transform.InferType()) - return func - - func = _get_func() - mod, _ = lower_to_tir(func, cascader=_cascader) - - script = mod.script(show_meta=True) - test_mod = tvm.script.from_source(script) - reference_mod = WeightStream - tvm.ir.assert_structural_equal(test_mod["main"], reference_mod["main"], True) - - if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/contrib/test_hexagon/README.md b/tests/python/contrib/test_hexagon/README.md index 44f22011de3f9..a47c3438bf57c 100644 --- a/tests/python/contrib/test_hexagon/README.md +++ b/tests/python/contrib/test_hexagon/README.md @@ -44,7 +44,7 @@ This is a baseline 1x1 conv2d schedule for Hexagon. ## Command -pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_nhwc8h8w32c-1-1-0-float32-1-1-1-64-64-128-llvm]" +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_packed_filter-1-1-0-float32-1-1-1-64-64-128-llvm]" ## Parameters @@ -195,7 +195,7 @@ The key changes in TIR versus the above are... 
## Command -pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_nhwc8h8w32c-1-1-0-float32-2-2-1-64-64-128-llvm]" +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_packed_filter-1-1-0-float32-2-2-1-64-64-128-llvm]" ## Parameters @@ -371,7 +371,7 @@ The `if` statement above indicates NOT to prefetch the vertically adjacent slice ## Command -pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_nhwc8h8w32c-3-1-0-float32-2-2-1-64-64-128-llvm]" +pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_packed_filter-3-1-0-float32-2-2-1-64-64-128-llvm]" ## Parameters diff --git a/tests/python/contrib/test_hexagon/infrastructure.py b/tests/python/contrib/test_hexagon/infrastructure.py index 4befcc62556f1..193a8630c3d2f 100644 --- a/tests/python/contrib/test_hexagon/infrastructure.py +++ b/tests/python/contrib/test_hexagon/infrastructure.py @@ -39,25 +39,16 @@ def get_packed_activation_layout(shape_nhwc, block_shape, packed_C=True): return shape -def get_block_shape(): - return 8, 8, 32 - - -def get_filter_block_shape(): - return 8, 32, 4 - - def get_packed_filter_layout(out_channel, in_channel, kernel_h, kernel_w): - filter_Cio, filter_Ki, filter_Cii = get_filter_block_shape() - filter_Ci = filter_Cio * filter_Cii + out_factor, in_first_factor, in_second_factor = 32, 32, 4 return ( - int(ceildiv(out_channel, filter_Ki)), - int(ceildiv(in_channel, filter_Ci)), + int(ceildiv(out_channel, out_factor)), + int(ceildiv(in_channel, in_first_factor)), kernel_h, kernel_w, - filter_Cio, - filter_Ki, - filter_Cii, + in_first_factor // in_second_factor, + out_factor, + in_second_factor, ) @@ -80,6 +71,10 @@ def build_and_run(inputs, func, target, target_host, *args, **kwargs): return tensors[-1].asnumpy() +def get_block_shape(): + return 8, 8, 32 + + def get_conv2d_nhwc_shape(shape_nhwc, kernel_size, strides, padding, dilation, out_channels): assert len(shape_nhwc) == 4 kernel = [] @@ -91,41 +86,3 @@ def get_conv2d_nhwc_shape(shape_nhwc, kernel_size, strides, padding, dilation, o (shape_nhwc[2] - kernel[1] + padding[2] + padding[3]) // strides[1] + 1, out_channels, ) - - -def verify_conv2d(output, ref_output, dtype): - # nhwc8h8w32c - if len(output.shape) == 7: - # nhwc8h8w32c -> nhwc - output = output.transpose(0, 1, 4, 2, 5, 3, 6).reshape( - output.shape[0], - output.shape[1] * output.shape[4], - output.shape[2] * output.shape[5], - output.shape[3] * output.shape[6], - ) - - # nhwhwc - else: - # nhwhwc -> nhwc - output = output.transpose(0, 1, 3, 2, 4, 5).reshape( - output.shape[0], - output.shape[1] * output.shape[3], - output.shape[2] * output.shape[4], - output.shape[5], - ) - - # slice output to match ref_output shape - # e.g. 
8x8 spatial 3x3 filter = 6x6 ref output - # but still 8x8 output given the blocked layout - output = output[ - 0 : ref_output.shape[0] : 1, - 0 : ref_output.shape[1] : 1, - 0 : ref_output.shape[2] : 1, - 0 : ref_output.shape[3] : 1, - ] - - if "int" in dtype: - tol = {"atol": 0, "rtol": 0} - elif dtype == "float32": - tol = {"rtol": 1e-4, "atol": 2e-4} - tvm.testing.assert_allclose(output, ref_output, **tol) diff --git a/tests/python/contrib/test_hexagon/test_conv2d_blocked.py b/tests/python/contrib/test_hexagon/test_conv2d_blocked.py index f3da3e1f8c09d..07696b51a327e 100644 --- a/tests/python/contrib/test_hexagon/test_conv2d_blocked.py +++ b/tests/python/contrib/test_hexagon/test_conv2d_blocked.py @@ -26,19 +26,138 @@ build_and_run, get_block_shape, get_conv2d_nhwc_shape, - get_filter_block_shape, get_packed_filter_layout, get_packed_activation_layout, - verify_conv2d, ) import numpy as np import pytest -def conv2d_nhwc8h8w32c( +def conv2d_logical( shape_nhwc, - shape_filter, + shape_oihw, + kernel_size, + stride, + padding, + dtype, + storage_scope="global", +): + """ + Conv2d TE wherein both input activation and filter tensors + are defined with their logical NHWC/OIHW shapes, respectively. + The packed physical layout for the activation and filter are: + Activation: nhwc8h8w32c + Filter: oihw8i32o4i + """ + assert kernel_size == tuple(shape_oihw[2:]) + + block_shape = get_block_shape() + block_H, block_W, block_C = block_shape + shape = get_packed_activation_layout(shape_nhwc, block_shape) + logical_output_shape = get_conv2d_nhwc_shape( + shape_nhwc, kernel_size, stride, padding, [1, 1], shape_oihw[0] + ) + output_shape = get_packed_activation_layout(logical_output_shape, block_shape) + + N, H, W, C = shape_nhwc + X = te.placeholder(shape_nhwc, dtype=dtype) + # Combination of padding required by conv2d operator and padding to evenly divisible + # number of blocks. Note that this padding should be inlined in the schedule so + # as to avoid input copying. 
+ pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H + pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W + X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0) + # Calculate packed layout + X_packed = te.compute( + shape, + lambda n, ho, wo, co, hi, wi, ci: X_pad[ + n, ho * block_H + hi, wo * block_W + wi, co * block_C + ci + ], + ) + + # Filter shape using KCRS (OIHW) notation + K, C, R, S = shape_oihw + filter_Ki, filter_Ci, filter_Cii = 32, 32, 4 + shape_filter = get_packed_filter_layout(K, C, R, S) + filt = te.placeholder(shape_oihw, dtype=dtype) + # Channel padding to multiples of 32 + pad_c = (filter_Ci - (C % filter_Ci)) % filter_Ci + pad_k = (filter_Ki - (K % filter_Ki)) % filter_Ki + filt_pad = topi.nn.pad( + filt, [0, 0, 0, 0], [pad_k, pad_c, R, S], pad_value=0, name="padded_filter" + ) + filt_packed = te.compute( + shape_filter, + lambda ko, co, r, s, cio, ki, cii: filt_pad[ + ko * filter_Ki + ki, co * filter_Ci + cio * filter_Cii + cii, r, s + ], + name="packed_filter", + ) + + rh = te.reduce_axis((0, kernel_size[0]), name="rh") + rw = te.reduce_axis((0, kernel_size[1]), name="rw") + rc = te.reduce_axis((0, C), name="rc") + + def compute(n, ho, wo, ko, hi, wi, ki): + # Construct blockized strided conv2d height index + h = ho * block_H + hi + h_contig = h * stride[0] + rh + h_block_id = h_contig // block_H + h_block_offset = h_contig % block_H + + # Construct blockized strided conv2d width index + w = wo * block_W + wi + w_contig = w * stride[1] + rw + w_block_id = w_contig // block_W + w_block_offset = w_contig % block_W + + # Construct blockized conv2d channel index + c_block_id = rc // block_C + c_block_offset = rc % block_C + + # Construct flat filter input channel indices + rco = rc // filter_Ci + rcio = (rc % filter_Ci) // filter_Cii + rcii = rc % filter_Cii + + return te.sum( + X_packed[ + n, + h_block_id, + w_block_id, + c_block_id, + h_block_offset, + w_block_offset, + c_block_offset, + ] + * filt_packed[ko, rco, rh, rw, rcio, ki, rcii], + axis=[rh, rw, rc], + ) + + Y = te.compute(output_shape, compute) + s = te.create_schedule(Y.op) + + # Ensure the padding and array packing is performed inline + s[X_pad].compute_inline() + s[X_packed].compute_inline() + + s[filt_pad].compute_inline() + s[filt_packed].compute_inline() + + binds = {} + if storage_scope and storage_scope != "global": + with tvm.transform.PassContext(): + Xb = tvm.tir.decl_buffer(shape, name="Xb", dtype=dtype, scope=storage_scope) + Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope) + binds = {X: Xb, Y: Yb} + + return (s, [X, filt, Y], binds) + + +def conv2d_packed_filter( + shape_nhwc, + shape_oihw8i32o4i, kernel_size, stride, padding, @@ -49,19 +168,11 @@ def conv2d_nhwc8h8w32c( ): """ Conv2d TE wherein the input activation is defined by its - logical NHWC shape. The filter is provided in either its - logical (OIHW) or physical packed (oihw8i32o4i) shape. The - physical packed layout for the input / output is nhwc8h8w32c. + logical NHWC shape, but the filter is provided in the + packed layout oihw8i32o4i. 
The physical packed layout used + for the activation is: nhwc8h8w32c """ - - # oihw8i32o41 - if len(shape_filter) == 7: - assert kernel_size == tuple(shape_filter[2:4]) - out_channels = shape_filter[0] * shape_filter[5] - # oihw - else: - assert kernel_size == tuple(shape_filter[2:]) - out_channels = shape_filter[0] + assert kernel_size == tuple(shape_oihw8i32o4i[2:4]) block_shape = get_block_shape() block_H, block_W, block_C = block_shape @@ -72,7 +183,7 @@ def conv2d_nhwc8h8w32c( stride, padding, [1, 1], - out_channels, + shape_oihw8i32o4i[0] * shape_oihw8i32o4i[5], ) output_shape = get_packed_activation_layout(logical_output_shape, block_shape) @@ -84,10 +195,11 @@ def conv2d_nhwc8h8w32c( # as to avoid input copying. pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W - X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0) + X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0) # Calculate packed layout packed_shape = get_packed_activation_layout(X_pad.shape, block_shape) + X_packed = te.compute( packed_shape, lambda n, ho, wo, co, hi, wi, ci: X_pad[ @@ -95,38 +207,13 @@ def conv2d_nhwc8h8w32c( ], ) - filter_Cio, filter_Ki, filter_Cii = get_filter_block_shape() - filter_Ci = filter_Cio * filter_Cii - - if len(shape_filter) == 7: - assert shape_filter[-1] == filter_Cii - assert shape_filter[-2] == filter_Ki - assert shape_filter[-3] == filter_Cio + # Filter shape using KCRS (OIHW) notation + filter_Ki, filter_Ci, filter_Cii = 32, 32, 4 + assert shape_oihw8i32o4i[-1] == filter_Cii + assert shape_oihw8i32o4i[-2] == filter_Ki + assert shape_oihw8i32o4i[-3] == filter_Ci // filter_Cii - filt = te.placeholder(shape_filter, dtype=dtype) - filt_packed = filt - - else: - filt = te.placeholder(shape_filter, dtype=dtype) - - # get logical filter shape KCRS (OIHW) - K, C, R, S = shape_filter - - # Channel padding to multiples of 32 - pad_c = (filter_Ci - (C % filter_Ci)) % filter_Ci - pad_k = (filter_Ki - (K % filter_Ki)) % filter_Ki - filt_pad = topi.nn.pad( - filt, [0, 0, 0, 0], [pad_k, pad_c, R, S], pad_value=0, name="padded_filter" - ) - - shape_packed_filter = get_packed_filter_layout(K, C, R, S) - filt_packed = te.compute( - shape_packed_filter, - lambda ko, co, r, s, cio, ki, cii: filt_pad[ - ko * filter_Ki + ki, co * filter_Ci + cio * filter_Cii + cii, r, s - ], - name="packed_filter", - ) + filt_packed = te.placeholder(shape_oihw8i32o4i, dtype=dtype) rh = te.reduce_axis((0, kernel_size[0]), name="rh") rw = te.reduce_axis((0, kernel_size[1]), name="rw") @@ -175,11 +262,6 @@ def compute(n, ho, wo, ko, hi, wi, ki): s[X_pad].compute_inline() s[X_packed].compute_inline() - # if we did filter padding, packing - if filt != filt_packed: - s[filt_pad].compute_inline() - s[filt_packed].compute_inline() - # cache read for the input / activation (X) Xl = s.cache_read(X_packed, storage_scope, [Y]) Fl = s.cache_read(filt_packed, storage_scope, [Y]) @@ -232,12 +314,12 @@ def compute(n, ho, wo, ko, hi, wi, ki): Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope) binds = {X: Xb, Y: Yb} - return (s, [X, filt, Y], binds) + return (s, [X, filt_packed, Y], binds) -def conv2d_nhw8h8wc( +def conv2d_packed_filter_nhwhwc( shape_nhwc, - shape_filter, + shape_oihw8i32o4i, kernel_size, stride, padding, @@ -248,19 +330,12 @@ def conv2d_nhw8h8wc( ): """ Conv2d TE wherein the input activation is defined by its - logical NHWC shape. 
The filter is provided in either its - logical (OIHW) or physical packed (oihw8i32o4i) shape. The - physical packed layout for the input / output is nhw8h8wc. - """ + logical NHWC shape, but the filter is provided in the + packed layout oihw8i32o4i. The physical packed layout used + for the activation is: nhw8h8wc - # oihw8i32o41 - if len(shape_filter) == 7: - assert kernel_size == tuple(shape_filter[2:4]) - out_channels = shape_filter[0] * shape_filter[5] - # oihw - else: - assert kernel_size == tuple(shape_filter[2:]) - out_channels = shape_filter[0] + """ + assert kernel_size == tuple(shape_oihw8i32o4i[2:4]) block_shape = get_block_shape() block_H, block_W, block_C = block_shape @@ -271,9 +346,8 @@ def conv2d_nhw8h8wc( stride, padding, [1, 1], - out_channels, + shape_oihw8i32o4i[0] * shape_oihw8i32o4i[5], ) - output_shape = get_packed_activation_layout(logical_output_shape, block_shape, packed_C=False) N, H, W, C = shape_nhwc @@ -284,45 +358,19 @@ def conv2d_nhw8h8wc( pad_h = (block_H - ((H + padding[1]) % block_H)) % block_H pad_w = (block_W - ((W + padding[3]) % block_W)) % block_W X_pad = topi.nn.pad(X, [0, padding[0], padding[2], 0], [0, pad_h, pad_w, 0], pad_value=0) - # Calculate packed layout packed_shape = get_packed_activation_layout(X_pad.shape, block_shape, packed_C=False) X_packed = te.compute( packed_shape, lambda n, ho, wo, hi, wi, c: X_pad[n, ho * block_H + hi, wo * block_W + wi, c] ) - filter_Cio, filter_Ki, filter_Cii = get_filter_block_shape() - filter_Ci = filter_Cio * filter_Cii - - if len(shape_filter) == 7: - assert shape_filter[-1] == filter_Cii - assert shape_filter[-2] == filter_Ki - assert shape_filter[-3] == filter_Cio - - filt = te.placeholder(shape_filter, dtype=dtype) - filt_packed = filt - - else: - filt = te.placeholder(shape_filter, dtype=dtype) - - # get logical filter shape KCRS (OIHW) - K, C, R, S = shape_filter - - # Channel padding to multiples of 32 - pad_c = (filter_Ci - (C % filter_Ci)) % filter_Ci - pad_k = (filter_Ki - (K % filter_Ki)) % filter_Ki - filt_pad = topi.nn.pad( - filt, [0, 0, 0, 0], [pad_k, pad_c, R, S], pad_value=0, name="padded_filter" - ) + # Filter shape using KCRS (OIHW) notation + filter_Ki, filter_Ci, filter_Cii = 32, 32, 4 + assert shape_oihw8i32o4i[-1] == filter_Cii + assert shape_oihw8i32o4i[-2] == filter_Ki + assert shape_oihw8i32o4i[-3] == filter_Ci // filter_Cii - shape_packed_filter = get_packed_filter_layout(K, C, R, S) - filt_packed = te.compute( - shape_packed_filter, - lambda ko, co, r, s, cio, ki, cii: filt_pad[ - ko * filter_Ki + ki, co * filter_Ci + cio * filter_Cii + cii, r, s - ], - name="packed_filter", - ) + filt_packed = te.placeholder(shape_oihw8i32o4i, dtype=dtype) rh = te.reduce_axis((0, kernel_size[0]), name="rh") rw = te.reduce_axis((0, kernel_size[1]), name="rw") @@ -363,11 +411,6 @@ def compute(n, ho, wo, hi, wi, k): s[X_pad].compute_inline() s[X_packed].compute_inline() - # if we did filter padding, packing - if filt != filt_packed: - s[filt_pad].compute_inline() - s[filt_packed].compute_inline() - # cache read for the input / activation (X) Xl = s.cache_read(X_packed, storage_scope, [Y]) Fl = s.cache_read(filt_packed, storage_scope, [Y]) @@ -436,7 +479,7 @@ def compute(n, ho, wo, hi, wi, k): Yb = tvm.tir.decl_buffer(output_shape, name="Yb", dtype=dtype, scope=storage_scope) binds = {X: Xb, Y: Yb} - return (s, [X, filt, Y], binds) + return (s, [X, filt_packed, Y], binds) class BaseConv2d: @@ -452,23 +495,9 @@ class BaseConv2d: h_split_factor = tvm.testing.parameter(1, 2) -class 
TestConv2dLogicalFilter(BaseConv2d): - conv2d_impl = tvm.testing.parameter(conv2d_nhwc8h8w32c, conv2d_nhw8h8wc) - +class TestConv2dLogical(BaseConv2d): @tvm.testing.parametrize_targets("llvm") - def test_conv2d( - self, - conv2d_impl, - shape_nhwc, - shape_oihw, - kernel, - stride, - pad, - dtype, - target, - k_split_factor, - h_split_factor, - ): + def test_conv2d(self, shape_nhwc, shape_oihw, kernel, stride, pad, dtype, target): inputs = [ np.random.uniform(0, 255, size=shape_nhwc).astype(dtype), np.random.uniform(0, 255, size=shape_oihw).astype(dtype), @@ -477,24 +506,44 @@ def test_conv2d( ref_output = testing.conv2d_nhwc_python(inputs[0], np_filter, stride, pad) output = build_and_run( inputs, - conv2d_impl, + conv2d_logical, target, target, shape_nhwc=shape_nhwc, - shape_filter=shape_oihw, + shape_oihw=shape_oihw, kernel_size=(kernel, kernel), stride=(stride, stride), padding=(pad, pad, pad, pad), dtype=dtype, - k_split_factor=k_split_factor, - h_split_factor=h_split_factor, ) - verify_conv2d(output, ref_output, dtype) + # nhwc8h8w32c -> nhwc + output = output.transpose(0, 1, 4, 2, 5, 3, 6).reshape( + output.shape[0], + output.shape[1] * output.shape[4], + output.shape[2] * output.shape[5], + output.shape[3] * output.shape[6], + ) + + # slice output to match ref_output shape + # e.g. 8x8 spatial 3x3 filter = 6x6 ref output + # but still 8x8 output given the blocked layout + output = output[ + 0 : ref_output.shape[0] : 1, + 0 : ref_output.shape[1] : 1, + 0 : ref_output.shape[2] : 1, + 0 : ref_output.shape[3] : 1, + ] + + if "int" in dtype: + tol = {"atol": 0, "rtol": 0} + elif dtype == "float32": + tol = {"rtol": 1e-4, "atol": 2e-4} + tvm.testing.assert_allclose(output, ref_output, **tol) class TestConv2dPackedFilter(BaseConv2d): - conv2d_impl = tvm.testing.parameter(conv2d_nhwc8h8w32c, conv2d_nhw8h8wc) + conv2d_impl = tvm.testing.parameter(conv2d_packed_filter, conv2d_packed_filter_nhwhwc) @tvm.testing.parametrize_targets("llvm") @pytest.mark.skip("Skip due to being flaky on i386.") @@ -526,7 +575,7 @@ def test_conv2d( target, target, shape_nhwc=shape_nhwc, - shape_filter=shape_oihw8i32o4i, + shape_oihw8i32o4i=shape_oihw8i32o4i, kernel_size=(kernel, kernel), stride=(stride, stride), padding=(pad, pad, pad, pad), @@ -535,7 +584,41 @@ def test_conv2d( h_split_factor=h_split_factor, ) - verify_conv2d(output, ref_output, dtype) + # nhwc8h8w32c + if len(output.shape) == 7: + # nhwc8h8w32c -> nhwc + output = output.transpose(0, 1, 4, 2, 5, 3, 6).reshape( + output.shape[0], + output.shape[1] * output.shape[4], + output.shape[2] * output.shape[5], + output.shape[3] * output.shape[6], + ) + + # nhwhwc + else: + # nhwhwc -> nhwc + output = output.transpose(0, 1, 3, 2, 4, 5).reshape( + output.shape[0], + output.shape[1] * output.shape[3], + output.shape[2] * output.shape[4], + output.shape[5], + ) + + # slice output to match ref_output shape + # e.g. 
8x8 spatial 3x3 filter = 6x6 ref output + # but still 8x8 output given the blocked layout + output = output[ + 0 : ref_output.shape[0] : 1, + 0 : ref_output.shape[1] : 1, + 0 : ref_output.shape[2] : 1, + 0 : ref_output.shape[3] : 1, + ] + + if "int" in dtype: + tol = {"atol": 0, "rtol": 0} + elif dtype == "float32": + tol = {"rtol": 1e-4, "atol": 2e-4} + tvm.testing.assert_allclose(output, ref_output, **tol) if __name__ == "__main__": diff --git a/tests/python/contrib/test_onnx.py b/tests/python/contrib/test_onnx.py index 6f23228be68c2..121edc4b8c609 100644 --- a/tests/python/contrib/test_onnx.py +++ b/tests/python/contrib/test_onnx.py @@ -47,11 +47,12 @@ def run_onnx(onnx_model, input_data): return res -def run_relay(func, data_tuple, is_dyn=False): +def run_relay(func, data_tuple): target = "llvm" dev = tvm.device("llvm", 0) - kind = "graph" if not is_dyn else "vm" - relay_res = relay.create_executor(kind, device=dev, target=target).evaluate(func)(*data_tuple) + relay_res = relay.create_executor("graph", device=dev, target=target).evaluate(func)( + *data_tuple + ) result = [] relay_res = relay_res if isinstance(relay_res, list) else [relay_res] @@ -61,8 +62,8 @@ def run_relay(func, data_tuple, is_dyn=False): return result -def verify_results(relay_func, indata, test_name, rtol=1e-7, atol=0, is_dyn=False): - relay_results = run_relay(relay_func, indata, is_dyn) +def verify_results(relay_func, indata, test_name, rtol=1e-7, atol=0): + relay_results = run_relay(relay_func, indata) onnx_results = run_onnx(func_to_onnx(relay_func, test_name), indata) for relay_res, onnx_res in zip(relay_results, onnx_results): @@ -110,7 +111,7 @@ def verify_conv2d( func = relay.Function([x, w], y) data = np.random.uniform(-scale, scale, size=dshape).astype(dtype) kernel = np.random.uniform(-scale, scale, size=kshape).astype(dtype) - verify_results(func, [data, kernel], "test_conv2d", rtol=1e-5, atol=1e-5, is_dyn=True) + verify_results(func, [data, kernel], "test_conv2d", rtol=1e-5, atol=1e-5) dshape = (1, 32, 18, 18) kshape = (32, 1, 3, 3) @@ -699,26 +700,6 @@ def verify_resize(dshape, outsize, method, coord_trans, rounding_method, dtype=" verify_resize(isize, osize, method=i, coord_trans=j, rounding_method=k) -def test_dyn(): - """Dynamic unit test.""" - - def verify_dyn_bcast(lhs_shape, rhs_shape, dtype): - lhs_dyn_shape = tuple(relay.Any() for i in range(len(lhs_shape))) - rhs_dyn_shape = tuple(relay.Any() for i in range(len(rhs_shape))) - x = relay.var("x", shape=lhs_dyn_shape, dtype=dtype) - y = relay.var("y", shape=rhs_dyn_shape, dtype=dtype) - z = relay.add(x, y) - func = relay.Function([x, y], z) - lhs_data = np.random.uniform(size=lhs_shape).astype(dtype) - rhs_data = np.random.uniform(size=rhs_shape).astype(dtype) - verify_results( - func, [lhs_data, rhs_data], "test_dyn_bcast", rtol=1e-5, atol=1e-5, is_dyn=True - ) - - verify_dyn_bcast((1, 3, 32, 1), (1, 3, 1, 3), "float32") - verify_dyn_bcast((1, 13), (4, 3, 5, 1), "float32") - - if __name__ == "__main__": test_add() test_bias_add() @@ -749,4 +730,3 @@ def verify_dyn_bcast(lhs_shape, rhs_shape, dtype): test_round() test_cast() test_resize() - test_dyn() diff --git a/tests/python/contrib/test_rpc_server_device.py b/tests/python/contrib/test_rpc_server_device.py deleted file mode 100644 index f1b8647683acf..0000000000000 --- a/tests/python/contrib/test_rpc_server_device.py +++ /dev/null @@ -1,440 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. 
See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -"""iOS RPC Server tests.""" -# pylint: disable=invalid-name, no-value-for-parameter, missing-function-docstring, import-error -import sys -import multiprocessing -import pytest -import numpy as np - -import tvm.testing -import tvm.relay.testing -from tvm import te -from tvm import rpc -from tvm import relay, auto_scheduler -from tvm.contrib import utils, xcode, graph_executor -from tvm.autotvm.measure import request_remote -from tvm.auto_scheduler.measure_record import load_records -from tvm.auto_scheduler.measure import MeasureErrorNo -from tvm.auto_scheduler.utils import call_func_with_timeout -from tvm.contrib.popen_pool import PopenWorker, StatusKind -from tvm.rpc import tracker, proxy, server_ios_launcher - - -HOST_URL = "0.0.0.0" -HOST_PORT = 9190 -DEVICE_KEY = "ios_mobile_device" - - -TEMPORARY_DIRECTORY = utils.tempdir() -ARCH = "x86_64" -SDK = "iphonesimulator" -DSO_NAME = "lib.dylib" -DTYPE = "float32" - - -np.random.seed(0) - - -ios_rpc_bundle_description_required = pytest.mark.skipif( - not server_ios_launcher.ServerIOSLauncher.is_compatible_environment(), - reason="To run this test, you need to set environment variables required in ServerIOSLauncher.", -) - - -@pytest.fixture(scope="session", autouse=True) -def setup_and_teardown_actions(): - """Setup and teardown actions for pytest.""" - - # No setup actions - yield - # Teardown actions: - server_ios_launcher.ServerIOSLauncher.shutdown_booted_devices() - - -def setup_rpc_standalone_configuration(f): - """ - Host -- RPC server - """ - - def wrapper(): - with server_ios_launcher.ServerIOSContextManager( - mode=server_ios_launcher.RPCServerMode.standalone.value, - host=HOST_URL, - port=HOST_PORT, - key=DEVICE_KEY, - ) as ios_server: - f(host=ios_server.host, port=ios_server.port) - - return wrapper - - -def setup_rpc_proxy_configuration(f): - """ - Host -- Proxy -- RPC server - """ - - def wrapper(): - proxy_server = proxy.Proxy(host=HOST_URL, port=HOST_PORT) - with server_ios_launcher.ServerIOSContextManager( - mode=server_ios_launcher.RPCServerMode.proxy.value, - host=proxy_server.host, - port=proxy_server.port, - key=DEVICE_KEY, - ): - f(host=proxy_server.host, port=proxy_server.port) - proxy_server.terminate() - - return wrapper - - -def setup_rpc_tracker_configuration(f): - """ - tracker - / \ - Host -- RPC server - """ - - def wrapper(): - tracker_server = tracker.Tracker(host=HOST_URL, port=HOST_PORT, silent=True) - with server_ios_launcher.ServerIOSContextManager( - mode=server_ios_launcher.RPCServerMode.tracker.value, - host=tracker_server.host, - port=tracker_server.port, - key=DEVICE_KEY, - ): - f(host=tracker_server.host, port=tracker_server.port) - tracker_server.terminate() - - return wrapper - - -def setup_rpc_tracker_via_proxy_configuration(f): - """ - tracker - / \ - Host -- Proxy -- RPC server - """ - - def wrapper(): - 
tracker_server = tracker.Tracker(host=HOST_URL, port=HOST_PORT, silent=True) - proxy_server_tracker = proxy.Proxy( - host=HOST_URL, port=8888, tracker_addr=(tracker_server.host, tracker_server.port) - ) - with server_ios_launcher.ServerIOSContextManager( - mode=server_ios_launcher.RPCServerMode.proxy.value, - host=proxy_server_tracker.host, - port=proxy_server_tracker.port, - key=DEVICE_KEY, - ): - f(host=tracker_server.host, port=tracker_server.port) - proxy_server_tracker.terminate() - tracker_server.terminate() - - return wrapper - - -def wrapper_for_call_function_with_timeout(timeout, func, args=(), kwargs=None): - """Wrapper for call_func_with_timeout.""" - - def wrapper(*_args, **_kwargs): - """ - This wrapper is needed because the cloudpicle - cannot serialize objects that contain pointers (RPCSession) - """ - func(*_args, **_kwargs) - return StatusKind.COMPLETE - - worker = PopenWorker() - ret = call_func_with_timeout(worker, timeout=timeout, func=wrapper, args=args, kwargs=kwargs) - if isinstance(ret, Exception): - raise ret - return ret - - -def try_create_remote_session(session_factory, args=(), kwargs=None): - """Deadlock-safe RPC Session creation.""" - - try: - successful_attempt = True - results = [] - for _ in range(2): - ret = wrapper_for_call_function_with_timeout( - timeout=10, func=session_factory, args=args, kwargs=kwargs - ) - results.append(ret) - if not np.all(np.array(results) == StatusKind.COMPLETE): - raise ValueError("One or more sessions ended incorrectly.") - except Exception as e: # pylint: disable=broad-except - successful_attempt = False - print(e) - return successful_attempt - - -def ios_create_dylib(output, objects, **kwargs): # pylint: disable=unused-argument - xcode.create_dylib(output, objects, arch=ARCH, sdk=SDK) - - -ios_create_dylib.output_format = "dylib" - - -def export_lib(lib): - """Export lib to temporary directory.""" - - path_dso = TEMPORARY_DIRECTORY.relpath(DSO_NAME) - lib.export_library(path_dso, fcompile=ios_create_dylib) - return path_dso - - -def get_add_relay_module(a_numpy, b_numpy): - """Get simple relay module that add two tensors.""" - - a = relay.var("a", shape=a_numpy.shape, dtype=DTYPE) - b = relay.var("b", shape=b_numpy.shape, dtype=DTYPE) - params = {} - out = tvm.IRModule.from_expr(relay.add(a, b)) - return out, params - - -def get_add_module(target): - """Get simple module that add two tensors.""" - - n = te.var("n") - A = te.placeholder((n,), name="A") - B = te.placeholder((n,), name="B") - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - s = te.create_schedule(C.op) - return tvm.build(s, [A, B, C], target=target, target_host=target, name="simple_add") - - -@pytest.mark.dependency() -@ios_rpc_bundle_description_required -@setup_rpc_standalone_configuration -def test_rpc_standalone(host, port): - status_ok = try_create_remote_session(session_factory=rpc.connect, args=(host, port)) - assert status_ok - - -@pytest.mark.dependency() -@ios_rpc_bundle_description_required -@setup_rpc_proxy_configuration -def test_rpc_proxy(host, port): - status_ok = try_create_remote_session( - session_factory=rpc.connect, args=(host, port, DEVICE_KEY) - ) - assert status_ok - - -@pytest.mark.dependency() -@ios_rpc_bundle_description_required -@setup_rpc_tracker_configuration -def test_rpc_tracker(host, port): - status_ok = try_create_remote_session( - session_factory=request_remote, args=(DEVICE_KEY, host, port) - ) - assert status_ok - - -@pytest.mark.dependency() -@ios_rpc_bundle_description_required 
-@setup_rpc_tracker_via_proxy_configuration -def test_rpc_tracker_via_proxy(host, port): - status_ok = try_create_remote_session( - session_factory=request_remote, args=(DEVICE_KEY, host, port) - ) - assert status_ok - - -@pytest.mark.dependency(depends=["test_rpc_standalone"]) -@ios_rpc_bundle_description_required -@setup_rpc_standalone_configuration -def test_can_call_remote_function_with_rpc_standalone(host, port): - remote_session = rpc.connect(host, port) - f = remote_session.get_function("runtime.GetFFIString") - assert f("hello") == "hello" - - -@pytest.mark.dependency(depends=["test_rpc_proxy"]) -@ios_rpc_bundle_description_required -@setup_rpc_proxy_configuration -def test_can_call_remote_function_with_rpc_proxy(host, port): - remote_session = rpc.connect(host, port, key=DEVICE_KEY) - f = remote_session.get_function("runtime.GetFFIString") - assert f("hello") == "hello" - - -@pytest.mark.dependency(depends=["test_rpc_tracker"]) -@ios_rpc_bundle_description_required -@setup_rpc_tracker_configuration -def test_can_call_remote_function_with_rpc_tracker(host, port): - remote_session = request_remote(DEVICE_KEY, host, port) - f = remote_session.get_function("runtime.GetFFIString") - assert f("hello") == "hello" - - -@pytest.mark.dependency(depends=["test_rpc_tracker_via_proxy"]) -@ios_rpc_bundle_description_required -@setup_rpc_tracker_via_proxy_configuration -def test_can_call_remote_function_with_rpc_tracker_via_proxy(host, port): - remote_session = request_remote(DEVICE_KEY, host, port) - f = remote_session.get_function("runtime.GetFFIString") - assert f("hello") == "hello" - - -@pytest.mark.dependency(depends=["test_rpc_standalone"]) -@ios_rpc_bundle_description_required -@setup_rpc_standalone_configuration -def test_basic_functionality_of_rpc_session(host, port): - remote_session = rpc.connect(host, port) - device = remote_session.cpu(0) - - target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin") - lib = get_add_module(target) - path_dso = export_lib(lib) - - # Check correct upload - remote_session.upload(path_dso) - - # Check correct download - downloaded_lib = remote_session.download(DSO_NAME) - with open(path_dso, "rb") as source_lib_file: - assert downloaded_lib == bytearray( - source_lib_file.read() - ), "The downloaded module does not match the loaded module" - - # Check correct remote computing - lib = remote_session.load_module(DSO_NAME) - n = 100 - a = tvm.nd.array(np.random.uniform(size=n).astype(DTYPE), device) - b = tvm.nd.array(np.random.uniform(size=n).astype(DTYPE), device) - c = tvm.nd.array(np.zeros(n, dtype=DTYPE), device) - lib(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy()) - - # Check correct remove - remote_session.remove(DSO_NAME) - - -@pytest.mark.dependency(depends=["test_rpc_standalone"]) -@pytest.mark.xfail(reason="Not implemented functionality") -@ios_rpc_bundle_description_required -@setup_rpc_standalone_configuration -def test_cleanup_workspace_after_session_end(host, port): - # Arrange - remote_session = rpc.connect(host, port) - target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin") - lib = get_add_module(target) - path_dso = export_lib(lib) - remote_session.upload(path_dso) - - # Act - del remote_session - remote_session = rpc.connect(host, port) - try: - remote_session.download(DSO_NAME) - status_ok = False - except Exception as _: # pylint: disable=broad-except - status_ok = True - - # Assert - assert status_ok, "Workspace not cleared after RPC Session termination." 
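For reference, the upload/load/run round-trip exercised by the standalone-RPC tests above can be reproduced outside pytest in a few lines. This is a minimal sketch rather than the deleted test itself: the host, port, and library path are placeholders, and it assumes an RPC server is already listening and that lib.dylib was built for a matching target.

    import numpy as np
    import tvm
    import tvm.testing
    from tvm import rpc

    # Placeholder address/path; assumes a running RPC server and a prebuilt lib.dylib.
    remote = rpc.connect("127.0.0.1", 9190)
    remote.upload("/tmp/lib.dylib")        # push the compiled module to the device
    lib = remote.load_module("lib.dylib")  # load it inside the remote runtime
    dev = remote.cpu(0)

    n = 100
    a = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev)
    b = tvm.nd.array(np.random.uniform(size=n).astype("float32"), dev)
    c = tvm.nd.array(np.zeros(n, dtype="float32"), dev)
    lib(a, b, c)  # invoke the module's entry function (the simple add kernel)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())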
- - -@pytest.mark.dependency(depends=["test_rpc_standalone"]) -@ios_rpc_bundle_description_required -@setup_rpc_standalone_configuration -def test_graph_executor_remote_run(host, port): - remote_session = rpc.connect(host, port) - target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin") - device = remote_session.cpu(0) - - size = 100 - a = np.random.uniform(size=size).astype(DTYPE) - b = np.random.uniform(size=size).astype(DTYPE) - mod, params = get_add_relay_module(a, b) - with tvm.transform.PassContext(opt_level=3): - lib = relay.build(mod, target=target, target_host=target, params=params) - - path_dso = export_lib(lib) - remote_session.upload(path_dso) - lib = remote_session.load_module(DSO_NAME) - - gen_module = graph_executor.GraphModule(lib["default"](device)) - - # Check set input - gen_module.set_input("a", tvm.nd.array(a)) - gen_module.set_input("b", tvm.nd.array(b)) - tvm.testing.assert_allclose(gen_module.get_input(0).numpy(), a) - tvm.testing.assert_allclose(gen_module.get_input(1).numpy(), b) - - # Check run - gen_module.run() - out = gen_module.get_output(0) - tvm.testing.assert_allclose(out.numpy(), a + b) - - -@pytest.mark.dependency(depends=["test_rpc_tracker"]) -@ios_rpc_bundle_description_required -@setup_rpc_tracker_configuration -def test_check_auto_schedule_tuning(host, port): # pylint: disable=too-many-locals - log_file = TEMPORARY_DIRECTORY.relpath("ios_tuning_stat.log") - target = tvm.target.Target(target=f"llvm -mtriple={ARCH}-apple-darwin") - mod, params = relay.testing.mlp.get_workload(batch_size=4, image_shape=(1, 4, 4)) - - try: - status_ok = True - measure_runner = auto_scheduler.RPCRunner( - DEVICE_KEY, - host, - port, - min_repeat_ms=1, - timeout=10, - n_parallel=multiprocessing.cpu_count(), - ) - builder = auto_scheduler.LocalBuilder(timeout=10, build_func=ios_create_dylib) - tune_option = auto_scheduler.TuningOptions( - builder=builder, - num_measure_trials=2, - num_measures_per_round=1, - runner=measure_runner, - measure_callbacks=[auto_scheduler.RecordToFile(log_file)], - verbose=0, - ) - - tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) - tasks, task_weights = tasks[:2], task_weights[:2] - tuner = auto_scheduler.TaskScheduler(tasks, task_weights) - tuner.tune(tune_option, search_policy="sketch.random") - - # Check tuning log - tuning_statistic = list(load_records(log_file)) - for _, measure_result in tuning_statistic: - if measure_result.error_no != MeasureErrorNo.NO_ERROR: - raise ValueError( - f"Error for MeasureResult. Error code: {measure_result.error_no}," - f" for details see MeasureErrorNO." - ) - - except Exception as e: # pylint: disable=broad-except - status_ok = False - print(e) - - assert status_ok, "Tuning failed, see logs." 
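The record check at the end of the tuning test above can also be run standalone against any auto_scheduler log. A minimal sketch, assuming the log path is a placeholder and the file already contains measurement records:

    from tvm.auto_scheduler.measure import MeasureErrorNo
    from tvm.auto_scheduler.measure_record import load_records

    # Placeholder path; load_records yields (MeasureInput, MeasureResult) pairs.
    for _inp, result in load_records("/tmp/ios_tuning_stat.log"):
        if result.error_no != MeasureErrorNo.NO_ERROR:
            raise ValueError(f"Measurement failed with error code {result.error_no}")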
- - -if __name__ == "__main__": - sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index 0ee5ce3118f2e..df4234e7e605f 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -1526,7 +1526,4 @@ def test_empty_subgraph(run_module): if __name__ == "__main__": - import sys - - # sys.exit(pytest.main([__file__] + sys.argv[1:])) - test_maskrcnn_resnet50(run_module) + pytest.main([__file__]) diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py index 147c420cc9029..9d44d8f22f411 100644 --- a/tests/python/driver/tvmc/test_compiler.py +++ b/tests/python/driver/tvmc/test_compiler.py @@ -24,7 +24,7 @@ import pytest import tvm -from tvm.testing.utils import ethosn_available +import tvm.testing from tvm.contrib.target.vitis_ai import vitis_ai_available @@ -370,11 +370,8 @@ def test_compile_opencl(tflite_mobilenet_v1_0_25_128): assert os.path.exists(dumps_path) -@pytest.mark.skipif( - not ethosn_available(), - reason="--target=Ethos(TM)-N77 is not available. TVM built with 'USE_ETHOSN OFF'", -) -def test_compile_tflite_module_with_external_codegen_ethos_n77(tflite_mobilenet_v1_1_quant): +@tvm.testing.requires_ethosn +def test_compile_tflite_module_with_external_codegen(tflite_mobilenet_v1_1_quant): pytest.importorskip("tflite") tvmc_model = tvmc.load(tflite_mobilenet_v1_1_quant) tvmc_package = tvmc.compile(tvmc_model, target="ethos-n77, llvm", dump_code="relay") @@ -419,26 +416,6 @@ def test_compile_tflite_module_with_external_codegen_cmsisnn( assert len(c_source_files) == 3 -@pytest.mark.skipif( - not ethosn_available(), - reason="--target=Ethos(TM)-N78 is not available. TVM built with 'USE_ETHOSN OFF'", -) -def test_compile_tflite_module_with_external_codegen_ethos_n78(tflite_mobilenet_v1_1_quant): - pytest.importorskip("tflite") - tvmc_model = tvmc.load(tflite_mobilenet_v1_1_quant) - tvmc_package = tvmc.compile( - tvmc_model, target="ethos-n78 -variant=ethos-n78, llvm", dump_code="relay" - ) - dumps_path = tvmc_package.package_path + ".relay" - - # check for output types - assert type(tvmc_package) is TVMCPackage - assert type(tvmc_package.graph) is str - assert type(tvmc_package.lib_path) is str - assert type(tvmc_package.params) is bytearray - assert os.path.exists(dumps_path) - - @pytest.mark.skipif( not vitis_ai_available(), reason="--target=vitis-ai is not available. 
TVM built with 'USE_VITIS_AI OFF'", @@ -523,9 +500,3 @@ def test_compile_check_configs_composite_target(mock_pkg, mock_pc, mock_fe, mock config={"relay.ext.mock.options": {"testopt": "value"}}, disabled_pass=None, ) - - -if __name__ == "__main__": - import sys - - sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/driver/tvmc/test_composite_target.py b/tests/python/driver/tvmc/test_composite_target.py index 80b4d1be93d54..0a0b45eeb9702 100644 --- a/tests/python/driver/tvmc/test_composite_target.py +++ b/tests/python/driver/tvmc/test_composite_target.py @@ -34,7 +34,6 @@ def test_get_codegen_names(): names = tvmc.composite_target.get_codegen_names() assert "ethos-n77" in names - assert "ethos-n78" in names assert "vitis-ai" in names assert len(names) > 0 diff --git a/tests/python/driver/tvmc/test_target.py b/tests/python/driver/tvmc/test_target.py index 001ac18ca6d95..afb099f3add65 100644 --- a/tests/python/driver/tvmc/test_target.py +++ b/tests/python/driver/tvmc/test_target.py @@ -118,6 +118,16 @@ def test_parse_multiple_target(): assert "llvm" == targets[1]["name"] +def test_parse_multiple_target_with_opts(): + targets = tvmc.common.parse_target("ethos-n77 -myopt=value, llvm -device=arm_cpu --system-lib") + + assert len(targets) == 2 + assert "ethos-n77" == targets[0]["name"] + assert "myopt" in targets[0]["opts"] + assert "value" == targets[0]["opts"]["myopt"] + assert "llvm" == targets[1]["name"] + + def test_parse_quotes_and_separators_on_options(): targets_no_quote = tvmc.common.parse_target("foo -option1=+v1.0x,+value,+bar") targets_single_quote = tvmc.common.parse_target("foo -option1='+v1.0x,+value'") @@ -131,23 +141,3 @@ def test_parse_quotes_and_separators_on_options(): assert len(targets_double_quote) == 1 assert "+v1.0x,+value" == targets_double_quote[0]["opts"]["option1"] - - -def test_parse_multiple_target_with_opts_ethos_n77(): - targets = tvmc.common.parse_target("ethos-n77 -myopt=value, llvm -device=arm_cpu --system-lib") - - assert len(targets) == 2 - assert "ethos-n77" == targets[0]["name"] - assert "myopt" in targets[0]["opts"] - assert "value" == targets[0]["opts"]["myopt"] - assert "llvm" == targets[1]["name"] - - -def test_parse_multiple_target_with_opts_ethos_n78(): - targets = tvmc.common.parse_target("ethos-n78 -myopt=value, llvm -device=arm_cpu --system-lib") - - assert len(targets) == 2 - assert "ethos-n78" == targets[0]["name"] - assert "myopt" in targets[0]["opts"] - assert "value" == targets[0]["opts"]["myopt"] - assert "llvm" == targets[1]["name"] diff --git a/tests/python/frontend/paddlepaddle/test_forward.py b/tests/python/frontend/paddlepaddle/test_forward.py index b8d4c1150238c..b274d178c9c29 100644 --- a/tests/python/frontend/paddlepaddle/test_forward.py +++ b/tests/python/frontend/paddlepaddle/test_forward.py @@ -125,33 +125,6 @@ def add_subtract3(inputs1, inputs2): verify_model(add_subtract3, [input_data, input_data2]) -@tvm.testing.uses_gpu -def test_forward_addmm(): - class Addmm(nn.Layer): - def __init__(self, alpha=1.0, beta=1.0): - super(Addmm, self).__init__() - self.alpha = alpha - self.beta = beta - - @paddle.jit.to_static - def forward(self, inputs, x, y): - return paddle.addmm(inputs, x, y, self.alpha, self.beta) - - input_shapes = [[10, 10], [1, 1], [7, 1]] - x_shapes = [[10, 3], [5, 6], [7, 7]] - y_shapes = [[3, 10], [6, 2], [7, 3]] - input_shapes = [[10, 10]] - x_shapes = [[10, 3]] - y_shapes = [[3, 10]] - - for i in range(len(input_shapes)): - input_data = paddle.rand(input_shapes[i], dtype="float32") - x_data = 
paddle.rand(x_shapes[i], dtype="float32") - y_data = paddle.rand(y_shapes[i], dtype="float32") - verify_model(Addmm(), input_data=[input_data, x_data, y_data]) - verify_model(Addmm(0.5, 0.3), input_data=[input_data, x_data, y_data]) - - @tvm.testing.uses_gpu def test_forward_arg_max_min(): class ArgMax(nn.Layer): @@ -306,24 +279,6 @@ def forward(self, input_data): verify_model(BatchNorm3D(), input_data=input_data) -@tvm.testing.uses_gpu -def test_forward_bmm(): - class Bmm(nn.Layer): - def __init__(self): - super(Bmm, self).__init__() - - @paddle.jit.to_static - def forward(self, x, y): - return paddle.bmm(x, y) - - x_shapes = [[10, 3, 4], [5, 6, 2], [1, 7, 7]] - y_shapes = [[10, 4, 5], [5, 2, 7], [1, 7, 3]] - for i in range(len(x_shapes)): - x_data = paddle.rand(x_shapes[i], dtype="float32") - y_data = paddle.rand(y_shapes[i], dtype="float32") - verify_model(Bmm(), input_data=[x_data, y_data]) - - @tvm.testing.uses_gpu def test_forward_cast(): @paddle.jit.to_static @@ -506,25 +461,13 @@ def forward(self, input1, input2): api_list = [ "equal", - "floor_divide", - "greater_equal", - "greater_than", - "less_equal", - "less_than", - "maximum", - "minimum", - "pow", ] x_shapes = [[128], [8, 20], [4, 20, 3], [2, 3, 8, 8], [2, 3, 3, 9, 9]] y_shapes = [[1], [8, 20], [4, 1, 1], [2, 3, 8, 8], [2, 3, 3, 9, 1]] for x_shape, y_shape in zip(x_shapes, y_shapes): - x_data = paddle.randint(1, 10, x_shape, dtype="int32") - y_data = paddle.randint(1, 10, y_shape, dtype="int32") + x_data = paddle.randint(1, 1000, x_shape, dtype="int32") + y_data = paddle.randint(1, 1000, y_shape, dtype="int32") for api_name in api_list: - if api_name == "pow": - # only support float for pow - x_data = x_data.astype("float32") - y_data = y_data.astype("float32") verify_model(ElemwiseAPI(api_name), [x_data, y_data]) @@ -585,100 +528,6 @@ def forward(self, x, y): verify_model(ExpandAs(), [x_data, y_data]) -@tvm.testing.uses_gpu -def test_forward_gather(): - class Gather(nn.Layer): - def __init__(self, axis=None): - super(Gather, self).__init__() - self.axis = axis - - @paddle.jit.to_static - def forward(self, x, index): - return paddle.gather(x, index, axis=self.axis) - - x_shapes = [[20, 10], [10, 10, 8]] - index = paddle.to_tensor(np.array([1, 3, 5]).astype("int64")) - for x_shape in x_shapes: - x_data = paddle.rand(x_shape, dtype="float32") - verify_model(Gather(), [x_data, index]) - verify_model(Gather(axis=0), [x_data, index]) - verify_model(Gather(axis=1), [x_data, index]) - - -@tvm.testing.uses_gpu -def test_forward_gather_nd(): - class GatherNd(nn.Layer): - @paddle.jit.to_static - def forward(self, x, index): - return paddle.gather_nd(x, index) - - x_shapes = [[20], [8, 8], [4, 5, 6], [3, 4, 3, 5]] - y_shapes = [[2, 1], [2], [1, 2, 3], [3]] - for x_shape, y_shape in zip(x_shapes, y_shapes): - x_data = paddle.rand(x_shape, dtype="float32") - y_data = paddle.randint(low=0, high=3, shape=y_shape, dtype="int64") - verify_model(GatherNd(), [x_data, y_data]) - - -@tvm.testing.uses_gpu -def test_forward_group_norm(): - class GroupNorm(nn.Layer): - def __init__(self, channels, groups): - super(GroupNorm, self).__init__() - self.group_norm = paddle.nn.GroupNorm(num_channels=channels, num_groups=groups) - - def forward(self, inputs): - return self.group_norm(inputs) - - input_shapes = [[1, 4, 6, 6], [2, 2, 4, 7], [2, 8, 1, 1]] - for input_shape in input_shapes: - num_channels = input_shape[1] - input_data = paddle.uniform(input_shape) - verify_model(GroupNorm(num_channels, 1), input_data) - verify_model(GroupNorm(num_channels, 
2), input_data) - - -@tvm.testing.uses_gpu -def test_forward_scatter(): - class Scatter(nn.Layer): - def __init__(self, overwrite=True): - super(Scatter, self).__init__() - self.overwrite = overwrite - - @paddle.jit.to_static - def forward(self, x, index, updates): - return paddle.scatter(x, index, updates, overwrite=self.overwrite) - - x_shapes = [[10], [4, 5], [6, 4, 5], [4, 5, 6, 4]] - index_shapes = [[10], [4], [6], [4]] - for x_shape, index_shape in zip(x_shapes, index_shapes): - x_data = paddle.rand(x_shape, dtype="float32") - updates = paddle.rand(x_shape, dtype="float32") + 1.0 - index = paddle.randint(low=0, high=3, shape=index_shape) - verify_model(Scatter(), [x_data, index, updates]) - verify_model(Scatter(False), [x_data, index, updates]) - - -def test_forward_scatter_nd(): - @paddle.jit.to_static - def scatter_nd(index, updates): - shape = [3, 5, 9, 10] - return paddle.scatter_nd(index, updates, shape) - - @paddle.jit.to_static - def scatter_nd_add(x, index, updates): - return paddle.scatter_nd_add(x, index, updates) - - index_data = np.array([[1, 1], [0, 1], [1, 3]]).astype(np.int64) - index = paddle.to_tensor(index_data) - updates = paddle.rand(shape=[3, 9, 10], dtype="float32") - verify_model(scatter_nd, [index, updates]) - x = paddle.rand(shape=[3, 5, 4, 9, 10], dtype="float32") - updates = paddle.rand(shape=[3, 2, 9, 10], dtype="float32") - index = paddle.randint(0, 3, shape=[3, 2, 3]) - verify_model(scatter_nd_add, [x, index, updates]) - - @tvm.testing.uses_gpu def test_forward_shape_full(): @paddle.jit.to_static @@ -825,22 +674,6 @@ def forward(self, x, y): verify_model(LogicalAPI("logical_xor"), [x_data, y_data]) -@tvm.testing.uses_gpu -def test_forward_logical_not(): - class LogicalNot(nn.Layer): - def __init__(self): - super(LogicalNot, self).__init__() - - @paddle.jit.to_static - def forward(self, x): - return paddle.logical_not(x).astype("int32") - - input_shapes = [[128], [8, 20], [4, 20, 3], [2, 3, 8, 8], [2, 3, 3, 9, 9]] - for input_shape in input_shapes: - input_data = paddle.randint(-2, 2, input_shape).astype("bool") - verify_model(LogicalNot(), input_data) - - @tvm.testing.uses_gpu def test_forward_look_up(): @paddle.jit.to_static @@ -945,48 +778,6 @@ def forward(self, inputs): verify_model(Pool2D3(), input_data=input_data) -@tvm.testing.uses_gpu -def test_forward_pad1d(): - class Pad1D(nn.Layer): - def __init__(self, padding=0, mode="constant", value=0.0, data_format="NCL"): - super(Pad1D, self).__init__() - self.pad1d = paddle.nn.Pad1D(padding, mode=mode, value=value, data_format=data_format) - - @paddle.jit.to_static - def forward(self, inputs): - return self.pad1d(inputs) - - input_shapes = [[1, 2, 5], [2, 5, 9]] - for input_shape in input_shapes: - input_data = paddle.rand(input_shape, dtype="float32") - verify_model(Pad1D(padding=2), input_data=input_data) - verify_model(Pad1D(padding=[1, 2], data_format="NLC"), input_data=input_data) - verify_model(Pad1D(padding=[0, 2], value=0.3), input_data=input_data) - verify_model(Pad1D(padding=[2, 2], mode="reflect"), input_data=input_data) - verify_model(Pad1D(padding=3, mode="replicate"), input_data=input_data) - - -@tvm.testing.uses_gpu -def test_forward_pad2d(): - class Pad2D(nn.Layer): - def __init__(self, padding=0, mode="constant", value=0.0, data_format="NCHW"): - super(Pad2D, self).__init__() - self.pad2d = paddle.nn.Pad2D(padding, mode=mode, value=value, data_format=data_format) - - @paddle.jit.to_static - def forward(self, inputs): - return self.pad2d(inputs) - - input_shapes = [[1, 2, 5, 5], [2, 2, 5, 
9]] - for input_shape in input_shapes: - input_data = paddle.rand(input_shape, dtype="float32") - verify_model(Pad2D(padding=2), input_data=input_data) - verify_model(Pad2D(padding=[1, 2, 0, 2], data_format="NHWC"), input_data=input_data) - verify_model(Pad2D(padding=[1, 2, 0, 2], value=0.3), input_data=input_data) - verify_model(Pad2D(padding=[1, 2, 0, 2], mode="reflect"), input_data=input_data) - verify_model(Pad2D(padding=3, mode="replicate"), input_data=input_data) - - @tvm.testing.uses_gpu def test_forward_pad3d(): class Pad3D(nn.Layer): @@ -1001,55 +792,11 @@ def forward(self, inputs): input_shapes = [[1, 2, 2, 5, 5], [1, 2, 2, 5, 9]] for input_shape in input_shapes: input_data = paddle.rand(input_shape, dtype="float32") - verify_model(Pad3D(padding=2), input_data=input_data) - verify_model(Pad3D(padding=[1, 2, 0, 2, 1, 1], data_format="NDHWC"), input_data=input_data) - verify_model(Pad3D(padding=[1, 2, 0, 2, 1, 1], value=0.3), input_data=input_data) - verify_model(Pad3D(padding=[1, 2, 0, 2, 1, 1], mode="reflect"), input_data=input_data) - verify_model(Pad3D(padding=3, mode="replicate"), input_data=input_data) - - -@tvm.testing.uses_gpu -def test_forward_transpose(): - class Transpose(nn.Layer): - def __init__(self, perm): - super(Transpose, self).__init__() - self.perm = perm - - @paddle.jit.to_static - def forward(self, inputs): - inputs = inputs + inputs.size() - return paddle.transpose(inputs, perm=self.perm) - - input_data = paddle.rand([1, 3, 5, 4, 3], dtype="float32") - verify_model(Transpose([0, 1, 2, 3, 4]), input_data=input_data) - verify_model(Transpose([4, 3, 2, 0, 1]), input_data=input_data) - - -@tvm.testing.uses_gpu -def test_forward_reduce(): - class Reduce(nn.Layer): - def __init__(self, op_name, axis=None, keepdim=False): - super(Reduce, self).__init__() - self.op_name = op_name - self.axis = axis - self.keepdim = keepdim - - @paddle.jit.to_static - def forward(self, inputs): - result = getattr(paddle, self.op_name)(inputs, axis=self.axis, keepdim=self.keepdim) - result = result.astype("float32") - return result - - input_shapes = [[1, 2, 2, 5, 5], [2, 3, 4], [4, 20], [2, 3, 30, 30]] - for input_shape in input_shapes: - input_data = paddle.uniform(min=-3, max=3, shape=input_shape, dtype="float32") - verify_model(Reduce("all"), input_data=input_data.astype("bool")) - verify_model(Reduce("any", 1), input_data=input_data.astype("bool")) - verify_model(Reduce("max", 0, True), input_data=input_data) - verify_model(Reduce("min", 1, True), input_data=input_data) - verify_model(Reduce("prod", 0), input_data=input_data) - verify_model(Reduce("sum", 0, True), input_data=input_data) - verify_model(Reduce("mean", -1, True), input_data=input_data) + verify_model(Pad3D(padding=2), input_data=input_data) + verify_model(Pad3D(padding=[1, 2, 0, 2, 1, 1]), input_data=input_data) + verify_model(Pad3D(padding=[1, 2, 0, 2, 1, 1], value=0.3), input_data=input_data) + verify_model(Pad3D(padding=[1, 2, 0, 2, 1, 1], mode="reflect"), input_data=input_data) + verify_model(Pad3D(padding=3, mode="replicate"), input_data=input_data) @tvm.testing.uses_gpu @@ -1152,46 +899,14 @@ def forward(self, inputs): return self.func(inputs) api_list = [ - "abs", - "acos", - "asin", - "atan", - "ceil", - "cos", - "cosh", - "erf", "exp", - "floor", - "hardshrink", - "hardtanh", - "log", - "log2", - "log10", - "reciprocal", "relu", - "relu6", - "round", - "rsqrt", - "selu", - "sigmoid", - "sign", - "sin", - "sinh", - "softplus", - "softsign", - "sqrt", - "square", - "swish", - "tan", "tanh", ] input_shapes = 
[[128], [2, 100], [10, 2, 5], [7, 3, 4, 1]] for input_shape in input_shapes: input_data = paddle.rand(input_shape, dtype="float32") for api_name in api_list: - if api_name in ["log", "log2", "log10", "reciprocal", "sqrt", "rsqrt"]: - # avoid illegal input, all elements should be positive - input_data = paddle.uniform(input_shape, min=0.01, max=0.99) verify_model(MathAPI(api_name), input_data=input_data) diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 5057f0d2b6b8a..0031f4143fabc 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -735,30 +735,13 @@ def test_forward_log_sigmoid(): @tvm.testing.uses_gpu -def test_forward_adaptive_avgpool(): +def test_forward_adaptiveavgpool(): torch.set_grad_enabled(False) input_shape = [1, 3, 10, 10] input_data = torch.rand(input_shape).float() verify_model(torch.nn.AdaptiveAvgPool2d([1, 1]).eval(), input_data=input_data) verify_model(torch.nn.AdaptiveAvgPool2d([10, 10]).eval(), input_data=input_data) - input_data = torch.rand([1, 3, 10]).float() - verify_model(torch.nn.AdaptiveAvgPool1d([1]).eval(), input_data=input_data) - verify_model(torch.nn.AdaptiveAvgPool1d([5]).eval(), input_data=input_data) - - -@tvm.testing.uses_gpu -def test_forward_adaptive_maxpool(): - torch.set_grad_enabled(False) - input_shape = [1, 3, 10, 10] - input_data = torch.rand(input_shape).float() - verify_model(torch.nn.AdaptiveMaxPool2d([1, 1]).eval(), input_data=input_data) - verify_model(torch.nn.AdaptiveMaxPool2d([10, 10]).eval(), input_data=input_data) - - input_data = torch.rand([1, 3, 10]).float() - verify_model(torch.nn.AdaptiveMaxPool1d([1]).eval(), input_data=input_data) - verify_model(torch.nn.AdaptiveMaxPool1d([5]).eval(), input_data=input_data) - @tvm.testing.uses_gpu def test_forward_maxpool2d(): @@ -4009,16 +3992,5 @@ def test_fn(out_int32=False, right=False): verify_model(test_fn(out_int32=True, right=True), [values, boundaries]) -@tvm.testing.uses_gpu -def test_roll(): - def test_fn(shifts, dims): - return lambda x: torch.roll(x, shifts, dims) - - x = torch.tensor([1, 2, 3, 4, 5, 6, 7, 8]).view(4, 2) - verify_model(test_fn(1, 0), [x]) - verify_model(test_fn(-1, 0), [x]) - verify_model(test_fn(shifts=(2, 1), dims=(0, 1)), [x]) - - if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index f8a603c878009..754976ca8c133 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -25,7 +25,6 @@ import pytest import numpy as np import tvm -import tempfile from tvm import te from tvm import relay @@ -1770,6 +1769,16 @@ def _test_unary_elemwise(math_op, data): compare_tflite_with_tvm(data, ["in:0"], [in_data], [out]) +####################################################################### +# Abs +# --- + + +def _test_abs(data): + """One iteration of abs""" + return _test_unary_elemwise(math_ops.abs, data) + + ####################################################################### # Ceil # ---- @@ -1860,6 +1869,16 @@ def _test_sqrt(data): return _test_unary_elemwise(math_ops.sqrt, data) +####################################################################### +# Neg +# --- + + +def _test_neg(data): + """One iteration of neg""" + return _test_unary_elemwise(math_ops.neg, data) + + ####################################################################### # Square # ------ @@ -1889,11 +1908,13 @@ def 
_test_forward_unary_elemwise(test_op): def test_all_unary_elemwise(): + _test_forward_unary_elemwise(_test_abs) _test_forward_unary_elemwise(_test_floor) _test_forward_unary_elemwise(_test_exp) _test_forward_unary_elemwise(_test_log) _test_forward_unary_elemwise(_test_sin) _test_forward_unary_elemwise(_test_sqrt) + _test_forward_unary_elemwise(_test_neg) _test_forward_unary_elemwise(_test_square) # ceil and cos come with TFLite 1.14.0.post1 fbs schema if package_version.parse(tf.VERSION) >= package_version.parse("1.14.0"): @@ -3360,110 +3381,6 @@ def test_forward_rsqrt(): _test_rsqrt(np.arange(1, 240, 40, dtype=np.uint8).reshape((2, 1, 3)), quantized=True) -####################################################################### -# NEG -# ---- - - -def _test_neg(data, quantized=False): - """One iteration of NEG""" - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype="float32", name="in_0") - - if quantized: - inq_data = tf.quantization.fake_quant_with_min_max_args( - in_data, min=1, max=6, name="inq_0" - ) - input_range = {"inq_0": (1, 6)} - out = math_ops.neg(inq_data) - out = tf.quantization.fake_quant_with_min_max_args(out, min=1, max=6, name="out") - compare_tflite_with_tvm( - data, - "inq_0:0", - [inq_data], - [out], - quantized=True, - input_range=input_range, - experimental_new_converter=True, - ) - else: - out = math_ops.neg(in_data) - compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) - - -def test_forward_neg(): - """NEG""" - _test_neg(np.arange(-2.0, 4.0, dtype=np.float32), quantized=False) - _test_neg(np.arange(-2.0, 4.0, dtype=np.float32).reshape((2, 1, 3)), quantized=False) - _test_neg(np.arange(1, 240, 40, dtype=np.uint8), quantized=True) - _test_neg(np.arange(1, 240, 40, dtype=np.uint8).reshape((2, 1, 3)), quantized=True) - - -####################################################################### -# ABS -# ---- - - -def _test_abs(data, quantized=False): - """One iteration of ABS""" - if quantized: - - def _create_model(): - class Model(tf.Module): - @tf.function - def tf_function(self, x): - op = tf.math.abs(x) - return op - - dtype = "int8" - model = Model() - - # Save the model - export_dir = tempfile.gettempdir() + "/tf_model" - tf.saved_model.save( - model, - export_dir, - signatures=model.tf_function.get_concrete_function( - tf.TensorSpec(data.shape, tf.float32, name="input"), - ), - ) - - # Convert the model - def representative_dataset(): - for _ in range(100): - tmp_data = np.random.rand(*tuple(data.shape)) - yield [tmp_data.astype(np.float32) * 2 - 1] - - converter = tf.lite.TFLiteConverter.from_saved_model(export_dir) - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - tflite_model = converter.convert() - return tflite_model - - tflite_model_quant = _create_model() - tflite_output = run_tflite_graph(tflite_model_quant, data) - in_node = ["serving_default_input_int8"] - tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) - tvm.testing.assert_allclose( - np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 - ) - else: - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="in_0") - out = math_ops.abs(in_data) - compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) - - -def test_forward_abs(): - """ABS""" - 
_test_abs(np.arange(-3.0, 3.0, dtype=np.float32), quantized=False) - _test_abs(np.arange(-3.0, 3.0, dtype=np.float32).reshape((2, 1, 3)), quantized=False) - _test_abs(np.arange(-128, 127, 45, dtype=np.int8), quantized=True) - - ####################################################################### # ReLu # ---- @@ -4740,8 +4657,6 @@ def test_prevent_tensorflow_dynamic_range(): test_forward_softmax() test_forward_tanh() test_forward_rsqrt() - test_forward_neg() - test_forward_abs() test_forward_relu() test_forward_relu6() test_forward_leaky_relu() diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py index 278a95f60b6c4..276cad3753570 100644 --- a/tests/python/relay/aot/aot_test_utils.py +++ b/tests/python/relay/aot/aot_test_utils.py @@ -39,7 +39,7 @@ from tvm.relay.backend.te_compiler import TECompiler from tvm.relay.backend.utils import mangle_module_name from tvm.micro import export_model_library_format -from tvm.micro.testing import mlf_extract_workspace_size_bytes + _LOG = logging.getLogger(__name__) @@ -538,6 +538,12 @@ def create_header_file(tensor_name, npy_data, output_path, data_linkage): header_file.write("};\n\n") +def extract_main_workspace_size_bytes(extract_dir): + with open(os.path.join(extract_dir, "metadata.json")) as json_f: + metadata = json.load(json_f) + return metadata["memory"]["functions"]["main"][0]["workspace_size_bytes"] + + def compile_models( models: Union[List[AOTTestModel], AOTTestModel], interface_api: str, @@ -620,7 +626,7 @@ def run_and_check( t.extractall(base_path) workspace_bytes += model.extra_memory_in_bytes - workspace_bytes += mlf_extract_workspace_size_bytes(tar_file) + workspace_bytes += extract_main_workspace_size_bytes(base_path) for key in model.inputs: sanitized_tensor_name = re.sub(r"\W", "_", key) diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py index 3a5f458d59705..7b4eb5231a2c2 100644 --- a/tests/python/relay/test_pass_fold_constant.py +++ b/tests/python/relay/test_pass_fold_constant.py @@ -22,16 +22,6 @@ from tvm.relay.testing import run_infer_type, create_workload -def annot_func(f): - """Returns f with arg/result device attributes for the argument and result.""" - return relay.op.annotation.function_on_device(f, [tvm.cpu()], tvm.cpu()) - - -def annot_expr(e): - """Returns e wrapped with an on_device annotation.""" - return relay.op.annotation.on_device(e, tvm.cpu(), is_fixed=True) - - def run_opt_pass(expr, opt_pass): assert isinstance(opt_pass, tvm.transform.Pass) @@ -85,35 +75,7 @@ def expected(): with tvm.target.Target("cuda"): zz = run_opt_pass(before(), transform.FoldConstant()) zexpected = run_opt_pass(expected(), transform.InferType()) - tvm.ir.assert_structural_equal(zz, zexpected) - - -def test_fold_const_with_on_device(): - """Make sure on_device annotations don't get in the way of constant folding""" - c_data = np.array([1, 2, 3]).astype("float32") - t = relay.TensorType([1, 2, 3], "float32") - - def before(): - c = relay.const(c_data) - x = relay.var("x", t) - y = relay.add(c, c) - y = relay.multiply(y, relay.const(2, "float32")) - y = relay.add(x, y) - z = relay.add(y, c) - f = relay.Function([x], z) - return annot_func(f) - - def expected(): - x = relay.var("x", t) - c_folded = (c_data + c_data) * 2 - y = relay.add(x, relay.const(c_folded)) - z = relay.add(y, relay.const(c_data)) - f = relay.Function([x], z) - return annot_func(f) - - zz = run_opt_pass(before(), transform.FoldConstant()) - zexpected = 
run_opt_pass(expected(), transform.InferType()) - tvm.ir.assert_structural_equal(zz, zexpected) + assert tvm.ir.structural_equal(zz, zexpected) def test_fold_let(): @@ -139,37 +101,7 @@ def expected(): zz = run_opt_pass(before(), transform.FoldConstant()) zexpected = run_opt_pass(expected(), transform.InferType()) - tvm.ir.assert_structural_equal(zz, zexpected) - - -def test_fold_let_with_on_device(): - """Make sure on_device annotations don't get in the way of constant folding, - and inlined constants bring their annotations with them.""" - c_data = np.array(1).astype("float32") - t = relay.TensorType([1], "float32") - - def before(): - sb = relay.ScopeBuilder() - x = relay.var("x", t) - t1 = sb.let("t1", annot_expr(relay.const(c_data))) - t2 = sb.let("t2", annot_expr(relay.add(t1, t1))) - t3 = sb.let("t3", annot_expr(relay.add(t2, x))) - sb.ret(t3) - f = relay.Function([x], sb.get()) - return annot_func(f) - - def expected(): - sb = relay.ScopeBuilder() - x = relay.var("x", t) - c_folded = c_data + c_data - t3 = sb.let("t3", annot_expr(relay.add(annot_expr(relay.const(c_folded)), x))) - sb.ret(t3) - f = relay.Function([x], sb.get()) - return annot_func(f) - - zz = run_opt_pass(before(), transform.FoldConstant()) - zexpected = run_opt_pass(expected(), transform.InferType()) - tvm.ir.assert_structural_equal(zz, zexpected) + assert tvm.ir.structural_equal(zz, zexpected) def test_fold_tuple(): @@ -192,7 +124,7 @@ def expected(): zz = run_opt_pass(before(), transform.FoldConstant()) zexpected = run_opt_pass(expected(), transform.InferType()) - tvm.ir.assert_structural_equal(zz, zexpected) + assert tvm.ir.structural_equal(zz, zexpected) def test_fold_concat(): @@ -211,7 +143,7 @@ def expected(): zz = run_opt_pass(before(), transform.FoldConstant()) zexpected = run_opt_pass(expected(), transform.InferType()) - tvm.ir.assert_structural_equal(zz, zexpected) + assert tvm.ir.structural_equal(zz, zexpected) def test_fold_if(): @@ -232,7 +164,7 @@ def expected(): zz = run_opt_pass(before(), transform.FoldConstant()) zexpected = run_opt_pass(expected(), transform.InferType()) - tvm.ir.assert_structural_equal(zz, zexpected) + assert tvm.ir.structural_equal(zz, zexpected) cond_data = np.array(0).astype("bool") @@ -250,7 +182,7 @@ def expected(): zz = run_opt_pass(before(), transform.FoldConstant()) zexpected = run_opt_pass(expected(), transform.InferType()) - tvm.ir.assert_structural_equal(zz, zexpected) + assert tvm.ir.structural_equal(zz, zexpected) def test_fold_shape_of(): @@ -272,7 +204,7 @@ def expected(dtype): for dtype in ["int32", "float32"]: zz = run_opt_pass(before(dtype), transform.FoldConstant()) zexpected = run_opt_pass(expected(dtype), transform.InferType()) - tvm.ir.assert_structural_equal(zz, zexpected) + assert tvm.ir.structural_equal(zz, zexpected) def test_fold_ndarray_size(): @@ -295,7 +227,7 @@ def expected(dtype): for dtype in ["int32", "float32"]: zz = run_opt_pass(before(dtype), transform.FoldConstant()) zexpected = run_opt_pass(expected(dtype), transform.InferType()) - tvm.ir.assert_structural_equal(zz, zexpected) + assert tvm.ir.structural_equal(zz, zexpected) def test_fold_batch_norm(): @@ -340,7 +272,7 @@ def initializer(_, param): mod = remove_bn_pass(mod) expect = run_infer_type(expected()) - tvm.ir.assert_structural_equal(mod["main"], expect) + assert tvm.ir.structural_equal(mod["main"], expect) def test_fold_dropout(): @@ -363,11 +295,15 @@ def before(): with tvm.transform.PassContext(opt_level=3): after_mod = passes(before_mod) - 
tvm.ir.assert_structural_equal(run_infer_type(before_mod["main"]), after_mod["main"]) + assert tvm.ir.structural_equal(run_infer_type(before_mod["main"]), after_mod["main"]) if __name__ == "__main__": - import sys - import pytest - - sys.exit(pytest.main([__file__] + sys.argv[1:])) + test_fold_const() + test_fold_let() + test_fold_tuple() + test_fold_concat() + test_fold_shape_of() + test_fold_batch_norm() + test_fold_ndarray_size() + test_fold_dropout() diff --git a/tests/python/relay/test_pass_instrument.py b/tests/python/relay/test_pass_instrument.py index 58baad2b0e8f1..321c74f4bbd8d 100644 --- a/tests/python/relay/test_pass_instrument.py +++ b/tests/python/relay/test_pass_instrument.py @@ -59,11 +59,9 @@ def test_pass_timing_instrument(): assert profiles == "" -instrument_definition_type = tvm.testing.parameter("decorator", "subclass") - - -def test_custom_instrument(instrument_definition_type): - class BaseTest: +def test_custom_instrument(): + @pass_instrument + class MyTest: def __init__(self): self.events = [] @@ -79,16 +77,6 @@ def run_before_pass(self, mod, info): def run_after_pass(self, mod, info): self.events.append("run after " + info.name) - if instrument_definition_type == "decorator": - MyTest = pass_instrument(BaseTest) - - elif instrument_definition_type == "subclass": - - class MyTest(BaseTest, tvm.ir.instrument.PassInstrument): - def __init__(self): - BaseTest.__init__(self) - tvm.ir.instrument.PassInstrument.__init__(self) - mod = get_test_model() my_test = MyTest() with tvm.transform.PassContext(instruments=[my_test]): diff --git a/tests/python/relay/test_prng.py b/tests/python/relay/test_prng.py index 29e271b1c4d7f..79ed014c5503e 100644 --- a/tests/python/relay/test_prng.py +++ b/tests/python/relay/test_prng.py @@ -166,6 +166,7 @@ def test_threefry_generate_out_size(): if __name__ == "__main__": - import sys - - sys.exit(pytest.main([__file__] + sys.argv[1:])) + test_threefry_repeatability(tvm.target.Target("llvm"), tvm.device("cpu")) + test_threefry_split(tvm.target.Target("llvm"), tvm.device("cpu")) + test_threefry_sequential_generate(tvm.target.Target("llvm"), tvm.device("cpu")) + test_threefry_sequential_generate_remaining(tvm.target.Target("llvm"), tvm.device("cpu")) diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 79979747dfd8a..8ec41523f9dc8 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -32,8 +32,6 @@ import tvm.testing from tvm.relay.transform import InferType from tvm.relay.testing import mlp -from tvm.relay.dataflow_pattern import wildcard, is_op -from tvm.relay.backend.vm import VMCompiler def check_result(target, dev, args, expected_result, mod=None): @@ -975,91 +973,6 @@ def test_benchmark_end_to_end_rpc(): assert result.mean > 0 -def test_shape_func_nested_function(): - data_shape = (relay.Any(), 16) - weight_shape = (relay.Any(), 16) - - dense = relay.nn.dense( - relay.var("data", shape=data_shape), relay.var("weight", shape=weight_shape) - ) - mod = tvm.IRModule.from_expr(dense) - - patterns = [("test.dense", is_op("nn.dense")(wildcard(), wildcard()))] - passes = tvm.transform.Sequential( - [ - relay.transform.MergeComposite(patterns), - relay.transform.AnnotateTarget(["test"]), - relay.transform.PartitionGraph(), - ] - ) - - mod = passes(mod) - - compiler = VMCompiler() - compiler.lower(mod, "llvm") - - -@tvm.testing.requires_cuda -def test_storage_size_and_offset_on_cpu(): - """Tests allocations place sizes and offsets on the CPU host even if the rest - of the computation is on a 
different device type.""" - # TODO(mbs): Better would be to test ManifestAlloc independently. - - # CPU = device type 1 - # GPU = device type 2 - def input(): - return tvm.parser.fromtext( - """ - #[version = "0.0.5"] - def @main(%a: Tensor[(5, 7), float32], - param_device_types=[2], result_device_type=2) { - add(%a, %a) - } - """ - ) - - exe = relay.vm.compile( - input(), - tvm.target.Target("cuda"), - ) - - # This program needs two constants: - # - The size of the tensor's storage (first arg) to alloc_storage - # - The offset of the tensor within the storage (second arg) to alloc_tensor - # Both should be on the CPU - assert not "on device of type 2" in exe.constants - assert "on device of type 1" in exe.constants - - -@tvm.testing.requires_cuda -def test_reshape_shape_on_cpu(): - """Tests the argument to a reshape places the shape on the CPU host even if the rest - of the computation is on a different device type.""" - # TODO(mbs): Better would be to test ManifestAlloc independently. - - # CPU = device type 1 - # GPU = device type 2 - def input(): - return tvm.parser.fromtext( - """ - #[version = "0.0.5"] - def @main(%x: Tensor[(2, 8), float32], - param_device_types=[2], result_device_type=2) { - reshape(%x, newshape=[2, 4, 2]) - } - """ - ) - - exe = relay.vm.compile( - input(), - tvm.target.Target("cuda"), - ) - - # The newshape annotation should have been turned into a constant on the CPU. - assert not "on device of type 2" in exe.constants - assert "on device of type 1" in exe.constants - - if __name__ == "__main__": import sys diff --git a/tests/python/unittest/test_target_codegen_opencl.py b/tests/python/unittest/test_target_codegen_opencl.py index 2ac2ec9dd9e9c..56392ec8cccc9 100644 --- a/tests/python/unittest/test_target_codegen_opencl.py +++ b/tests/python/unittest/test_target_codegen_opencl.py @@ -142,5 +142,4 @@ def check_erf(dev, n, dtype): if __name__ == "__main__": test_opencl_ternary_expression() test_opencl_inf_nan() - test_opencl_max() test_opencl_erf() diff --git a/tests/python/unittest/test_target_codegen_vulkan.py b/tests/python/unittest/test_target_codegen_vulkan.py index 7b708cbe0c128..1edc5d3117599 100644 --- a/tests/python/unittest/test_target_codegen_vulkan.py +++ b/tests/python/unittest/test_target_codegen_vulkan.py @@ -17,6 +17,7 @@ import random import re +import sys import threading import numpy as np @@ -556,6 +557,4 @@ def do_compute(ins, outs): if __name__ == "__main__": - import sys - - sys.exit(pytest.main([__file__] + sys.argv[1:])) + sys.exit(pytest.main(sys.argv)) diff --git a/tests/python/unittest/test_tir_analysis_verify_gpu_code.py b/tests/python/unittest/test_tir_analysis_verify_gpu_code.py index b7d78aad140d3..9e9563a66a5d0 100644 --- a/tests/python/unittest/test_tir_analysis_verify_gpu_code.py +++ b/tests/python/unittest/test_tir_analysis_verify_gpu_code.py @@ -343,34 +343,6 @@ def test_vectorize(): assert not valid[0] -@tvm.testing.requires_gpu -def test_vectorize_half(): - N = 1024 - - A = te.placeholder((N, N), name="A", dtype="float16") - B = te.compute((N, N), lambda i, j: A[i, j]) - - s = te.create_schedule([B.op]) - - i, j = s[B].op.axis - - s[B].bind(i, te.thread_axis("blockIdx.x")) - jo, ji = s[B].split(j, factor=8) - s[B].bind(jo, te.thread_axis("threadIdx.x")) - s[B].vectorize(ji) - - for target in ["opencl", "cuda"]: - if not tvm.testing.device_enabled(target): - continue - - valid = [None] - with tvm.transform.PassContext( - config={"tir.add_lower_pass": [(2, get_verify_pass(valid, max_vector_bytes=16))]} - ): - tvm.lower(s, [A, 
B]) - assert valid[0] - - @tvm.testing.requires_gpu def test_vthread(): N = 1024 @@ -437,6 +409,5 @@ def test_redundant_kernels(): test_multiple_kernels() test_wrong_bind() test_vectorize() - test_vectorize_half() test_vthread() test_redundant_kernels() diff --git a/tests/python/unittest/test_tir_transform_merge_dynamic_shared_memory_allocations.py b/tests/python/unittest/test_tir_transform_merge_dynamic_shared_memory_allocations.py index 46d39c034454a..cc78b84f9b4e2 100644 --- a/tests/python/unittest/test_tir_transform_merge_dynamic_shared_memory_allocations.py +++ b/tests/python/unittest/test_tir_transform_merge_dynamic_shared_memory_allocations.py @@ -82,14 +82,13 @@ def test_matmul_ir(A, B, C): # Create a dynamic shared memory for the accumulation. # This is for testing merging dynamic shared memory alloctions with different data type. # In practice, there is no need to allocate a shared memory for C. - C_local = ib.allocate(C.dtype, (1,), scope="local", name="C_local") C_sh = ib.allocate(C.dtype, (block, block), scope="shared.dyn", name="C_sh") # fp32 A_ptr = ib.buffer_ptr(A) B_ptr = ib.buffer_ptr(B) C_ptr = ib.buffer_ptr(C) - C_local[0] = 0.0 + C_sh[ty, tx] = 0.0 with ib.for_range(0, n // block, name="i") as i: A_sh[ty, tx] = A_ptr[by * block + ty, i * block + tx] @@ -97,10 +96,10 @@ def test_matmul_ir(A, B, C): ib.emit(syncthread()) with ib.for_range(0, block, name="k") as k: - C_local[0] += cast(A_sh[ty, k] * B_sh[k, tx], "float32") + C_sh[ty, tx] += cast(A_sh[ty, k] * B_sh[k, tx], "float32") + ib.emit(syncthread()) - C_sh[ty, tx] = C_local[0] C_ptr[by * block + ty, bx * block + tx] = C_sh[ty, tx] return ib.get() @@ -114,8 +113,7 @@ def test_matmul_ir(A, B, C): ) s = te.create_schedule(C.op) mod = run_passes(s, [A, B, C]) - # C can be allocated at the start of A, so we only need to allocate 2 block * block memory with dtype = float16 - expected_alloc_size = block * block * 4 + expected_alloc_size = block * block * 3 * 4 verify_single_allocation(mod["main"].body, expected_alloc_size) def check_target(target): @@ -251,83 +249,8 @@ def test_device_ir(A, B, C, D): # allocate(buf_dyn_shmem: Pointer(shared.dyn uint8), uint8, [((n_dyn*4) + 256)]); verify_single_allocation(mod["main"].body) - def check_target(target): - if not tvm.testing.device_enabled(target): - return - - fadd = tvm.build(s, [A, B, C, D], target) - dev = tvm.device(target, 0) - - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.random.uniform(size=n).astype(C.dtype), dev) - d = tvm.nd.array(np.zeros((n,), dtype=D.dtype), dev) - fadd(a, b, c, d) - tvm.testing.assert_allclose(d.numpy(), a.numpy() + b.numpy() + c.numpy(), 1e-4, 1e-4) - - for target in ["cuda", "nvptx"]: - check_target(target) - - -def test_dyn_shared_more_dtype(): - """Test vectorized store into dynamic shared memory""" - n = 512 - A = te.placeholder((n,), name="A", dtype="int8") - B = te.placeholder((n,), name="B", dtype="int16") - - def test_device_ir(A, B, C): - n = A.shape[0] - ib = tvm.tir.ir_builder.create() - - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", n) - - A_sh = ib.allocate(A.dtype, (n,), scope="shared.dyn") # i8 - B_sh = ib.allocate(B.dtype, (n,), scope="shared.dyn") # i16 - C_sh = ib.allocate(C.dtype, (n,), scope="shared.dyn") # i32 - - Aptr = ib.buffer_ptr(A) - Bptr = ib.buffer_ptr(B) - Cptr = ib.buffer_ptr(C) - - A_sh[tx] = Aptr[tx] - B_sh[tx] = Bptr[tx] - - C_sh[tx] = cast(A_sh[tx], "int32") + cast(B_sh[tx], 
"int32") - Cptr[tx] = C_sh[tx] - return ib.get() - - C = te.extern( - (n,), - [A, B], - lambda ins, outs: test_device_ir(ins[0], ins[1], outs[0]), - name="vadd", - dtype="int32", - ) - s = te.create_schedule(C.op) - - mod = run_passes(s, [A, B, C]) - verify_single_allocation(mod["main"].body, n * 4) - - def check_target(target): - if not tvm.testing.device_enabled(target): - return - - fadd = tvm.build(s, [A, B, C], target) - dev = tvm.device(target, 0) - - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev) - c = tvm.nd.array(np.zeros((n,), dtype=C.dtype), dev) - fadd(a, b, c) - tvm.testing.assert_allclose(c.numpy(), a.numpy().astype("float32") + b.numpy(), 1e-4, 1e-4) - - for target in ["cuda", "nvptx"]: - check_target(target) - if __name__ == "__main__": test_matmul_dyn_shared() test_dyn_shared_vectorized_store() test_dyn_shared_reuse_and_merge() - test_dyn_shared_more_dtype() diff --git a/tests/scripts/task_python_ethosn_tests.sh b/tests/scripts/task_python_ethosn_tests.sh index 525cc26d743ee..ae9b82b679ef4 100755 --- a/tests/scripts/task_python_ethosn_tests.sh +++ b/tests/scripts/task_python_ethosn_tests.sh @@ -29,6 +29,6 @@ make cython3 # Note: Default behaviour is to assume the test target is Ethos-N77 # but setting ETHOSN_VARIANT_CONFIG appropriately -# (e.g. ETHOSN_VARIANT_CONFIG=Ethos-N78_1TOPS_2PLE_RATIO) +# (e.g. ETHOSN_VARIANT_CONFIG=ETHOSN78_1TOPS_4PLE_448KSRAM) # switches the target to an Ethos-N78 configuration. run_pytest ctypes python-ethosn tests/python/contrib/test_ethosn diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index 8618619d65ad3..00b63af486460 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -60,8 +60,8 @@ run_pytest cython ${TVM_INTEGRATION_TESTSUITE_NAME}-dso_plugin_module apps/dso_p # TVM_FFI=ctypes sh prepare_and_test_tfop_module.sh run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME} tests/python/integration -if python3 -c "import tvm; from tvm.relay.op.contrib.ethosn import ethosn_available; print(ethosn_available().name)" -eq "SW_ONLY"; then - ETHOSN_VARIANT_CONFIG=Ethos-N78_1TOPS_2PLE_RATIO run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib-test_ethosn tests/python/contrib/test_ethosn +if python -c "import tvm; from tvm.relay.op.contrib.ethosn import ethosn_available; print(ethosn_available().name)" -eq "SW_ONLY"; then + ETHOSN_VARIANT_CONFIG=ETHOSN78_1TOPS_4PLE_448KSRAM run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib-test_ethosn tests/python/contrib/test_ethosn fi run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib tests/python/contrib diff --git a/tests/scripts/task_python_integration_i386only.sh b/tests/scripts/task_python_integration_i386only.sh deleted file mode 100755 index 9c378a647e3e2..0000000000000 --- a/tests/scripts/task_python_integration_i386only.sh +++ /dev/null @@ -1,23 +0,0 @@ -#!/bin/bash -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. 
You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -set -e -set -u - -export TVM_INTEGRATION_I386_ONLY=1 - -./tests/scripts/task_python_integration.sh