From f83a80b9e852b49eb5538e48ae9ebeb7bddbde67 Mon Sep 17 00:00:00 2001
From: Christopher Sidebottom <chris.sidebottom@arm.com>
Date: Tue, 3 Aug 2021 00:04:55 +0100
Subject: [PATCH] Introduce --interface-api={c,packed} parameter (#8280)

* Introduce --interface-api={c,packed} parameter

This introduces generated structures that provide a documented, stable and
user-friendly interface to a TVM-generated model, as can be seen in the AOT
demo application:
```
struct tvmgen_default_inputs inputs = {
  .input_1 = input_data,
};
struct tvmgen_default_outputs outputs = {
  .output = output_data,
};
int ret_val = tvmgen_default_run(&inputs, &outputs);
```

To facilitate this, some other changes are included:
* Removed dependency on `aot_executor.{c,h}` in tests, pending the
discussion in the interface RFC as to whether we keep them.
* Moved creation of test DLTensors into the AOT test utils; in future this
can be replaced by loading via the Python API or otherwise
* Introduce `parametrize_aot_options` which can be used to test
combinations of AOT options which work together - for now this filters out
the C interface combined with packed operators
* Updated demo application to generate the header for demonstration
purposes. Should we consider porting the demo application to Model
Library Format and using the toolchain in the Zephyr App via CMake
instead?

This patch builds upon the improvements @giuseros made to AOT testing
and name mangling from #8014

* Tweak metadata variable description and MLF target loop

* Remove direct usage of `relay::Var` in meta_data.h

This looks like the only place that could be causing the Windows CI failures, so try removing the additional header in meta_data.h

* Linting fix

* Post-rebase files fixing

These tests were somehow transmuted in transit; I've updated them to the
most recent variant of the test helpers.

* Strip back interface API to just inputs and outputs

This removes any speculative structures from the generated code and cleans up some of the documentation.

* Add header guards and tweak documentation
---
 apps/microtvm/zephyr/aot_demo/src/main.c      |  11 +-
 include/tvm/runtime/module.h                  |   4 +-
 python/tvm/micro/interface_api.py             |  79 +++++
 python/tvm/micro/model_library_format.py      |  42 ++-
 src/relay/backend/aot_executor_codegen.cc     |  10 +-
 src/runtime/meta_data.h                       |   8 +-
 src/target/source/source_module.cc            |  89 +++--
 src/target/target_kind.cc                     |   2 +
 tests/micro/zephyr/test_zephyr_aot.py         |   4 +-
 tests/python/relay/aot/aot_test.mk            |   3 +-
 tests/python/relay/aot/aot_test_utils.py      | 238 ++++++++++---
 tests/python/relay/aot/test_crt_aot.py        | 327 ++++++++++++------
 .../test_micro_model_library_format.py        |  38 +-
 13 files changed, 637 insertions(+), 218 deletions(-)
 create mode 100644 python/tvm/micro/interface_api.py

diff --git a/apps/microtvm/zephyr/aot_demo/src/main.c b/apps/microtvm/zephyr/aot_demo/src/main.c
index 43cc7b33987b..0c16572fc744 100644
--- a/apps/microtvm/zephyr/aot_demo/src/main.c
+++ b/apps/microtvm/zephyr/aot_demo/src/main.c
@@ -32,6 +32,7 @@
 
 #include "input_data.h"
 #include "output_data.h"
+#include "tvmgen_default.h"
 #include "zephyr_uart.h"
 
 #ifdef CONFIG_ARCH_POSIX
@@ -194,18 +195,18 @@ void main(void) {
   }
   TVMLogf("Zephyr AOT Runtime\n");
 
-  void* inputs[1] = {
-      input_data,
+  struct tvmgen_default_inputs inputs = {
+      .input_1 = input_data,
   };
-  void* outputs[1] = {
-      output_data,
+  struct tvmgen_default_outputs outputs = {
+      .output = output_data,
   };
 
   StackMemoryManager_Init(&app_workspace, g_aot_memory, WORKSPACE_SIZE);
 
   double elapsed_time = 0;
   TVMPlatformTimerStart();
-  int ret_val = tvm_runtime_run(&tvmgen_default_network, inputs, outputs);
+  int ret_val = tvmgen_default_run(&inputs, &outputs);
   TVMPlatformTimerStop(&elapsed_time);
 
   if (ret_val != 0) {
diff --git a/include/tvm/runtime/module.h b/include/tvm/runtime/module.h
index 9dd7423c6679..71be8d218d2d 100644
--- a/include/tvm/runtime/module.h
+++ b/include/tvm/runtime/module.h
@@ -230,8 +230,10 @@ constexpr const char* tvm_module_main = "__tvm_main__";
 constexpr const char* tvm_param_prefix = "__tvm_param__";
 /*! \brief A PackedFunc that looks up linked parameters by storage_id. */
 constexpr const char* tvm_lookup_linked_param = "_lookup_linked_param";
-/*! \brief The main AOT executor function */
+/*! \brief The main AOT executor function generated from TIR */
 constexpr const char* tvm_run_func_suffix = "run_model";
+/*! \brief Model entrypoint generated as an interface to the AOT function outside of TIR */
+constexpr const char* tvm_entrypoint_suffix = "run";
 }  // namespace symbol
 
 // implementations of inline functions.
diff --git a/python/tvm/micro/interface_api.py b/python/tvm/micro/interface_api.py
new file mode 100644
index 000000000000..915bee08175c
--- /dev/null
+++ b/python/tvm/micro/interface_api.py
@@ -0,0 +1,104 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Defines functions for generating a C interface header"""
+
+import os
+
+from tvm.relay.backend.utils import mangle_module_name
+
+
+def _emit_brief(header_file, module_name, description):
+    """Writes a Doxygen-style brief comment block into the header
+
+    Parameters
+    ----------
+    header_file : file object
+        Writable text file that receives the comment
+    module_name : str
+        Name of the module quoted at the end of the brief
+    description : str
+        Text placed at the start of the brief
+    """
+    header_file.write("/*!\n")
+    header_file.write(f' * \\brief {description} for TVM module "{module_name}" \n')
+    header_file.write(" */\n")
+
+
+def generate_c_interface_header(module_name, inputs, outputs, output_path):
+    """Generates a C interface header for a given module's inputs and outputs
+
+    The header declares a struct of void pointers for the inputs, another for
+    the outputs, and the prototype of the module's run entrypoint.
+
+    Parameters
+    ----------
+    module_name : str
+        Name of the module to be used in defining structs and naming the header
+    inputs : list[str]
+        List of module input names to be placed in generated structs
+    outputs : list[str]
+        List of module output names to be placed in generated structs
+    output_path : str
+        Path to the output folder to generate the header into
+    """
+
+    mangled_name = mangle_module_name(module_name)
+    header_guard = f"{mangled_name.upper()}_H_"
+    metadata_header = os.path.join(output_path, f"{mangled_name}.h")
+    with open(metadata_header, "w") as header_file:
+        # The include guard comes first so that it wraps the whole file, and the
+        # extern "C" block keeps the entrypoint's C linkage for C++ callers.
+        header_file.write(
+            f"#ifndef {header_guard}\n"
+            f"#define {header_guard}\n"
+            "#include <stdint.h>\n\n"
+            "#ifdef __cplusplus\n"
+            'extern "C" {\n'
+            "#endif\n\n"
+        )
+
+        _emit_brief(header_file, module_name, "Input tensor pointers")
+        header_file.write(f"struct {mangled_name}_inputs {{\n")
+        for input_name in inputs:
+            header_file.write(f"  void* {input_name};\n")
+        header_file.write("};\n\n")
+
+        _emit_brief(header_file, module_name, "Output tensor pointers")
+        header_file.write(f"struct {mangled_name}_outputs {{\n")
+        for output_name in outputs:
+            header_file.write(f"  void* {output_name};\n")
+        header_file.write("};\n\n")
+
+        header_file.write(
+            "/*!\n"
+            f' * \\brief entrypoint function for TVM module "{module_name}"\n'
+            " * \\param inputs Input tensors for the module \n"
+            " * \\param outputs Output tensors for the module \n"
+            " */\n"
+            f"int32_t {mangled_name}_run(\n"
+            f"  struct {mangled_name}_inputs* inputs,\n"
+            f"  struct {mangled_name}_outputs* outputs\n"
+            ");\n"
+        )
+
+        header_file.write(
+            "\n#ifdef __cplusplus\n"
+            "}\n"
+            "#endif\n\n"
+            f"#endif // {header_guard}\n"
+        )
diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py
index ad49ee7d9578..5e682c72ed73 100644
--- a/python/tvm/micro/model_library_format.py
+++ b/python/tvm/micro/model_library_format.py
@@ -25,7 +25,9 @@
 import tarfile
 import typing
 
+from tvm.ir.type import TupleType
 from .._ffi import get_global_func
+from .interface_api import generate_c_interface_header
 from ..contrib import utils
 from ..driver import build_module
 from ..runtime import ndarray as _nd
@@ -55,7 +57,6 @@ def _populate_codegen_dir(mod, codegen_dir: str, module_name: str = None):
 
     """
     dso_modules = mod._collect_dso_modules()
-    dso_module_handles = [m.handle.value for m in dso_modules]
     non_dso_modules = mod._collect_from_import_tree(lambda m: m not in dso_modules)
     if non_dso_modules:
         raise UnsupportedInModelLibraryFormatError(
@@ -213,6 +214,63 @@ def _build_function_memory_map(function_metadata):
     return ret
 
 
+def _get_main_relay_func(mod: executor_factory.ExecutorFactoryModule):
+    """Returns the Relay function stored for the main function under its first target key"""
+    main_func = mod.function_metadata[MAIN_FUNC_NAME_STR]
+    target = list(main_func.relay_primfuncs.keys())[0]
+    return main_func.relay_primfuncs[target]
+
+
+def _convert_tuple_to_outputs(ret_type, offset=0):
+    """Flattens a possibly nested tuple type into numbered output names
+
+    Parameters
+    ----------
+    ret_type : TupleType
+        Tuple type whose fields are visited in order
+    offset : int
+        Number given to the first name generated by this call
+
+    Returns
+    -------
+    list[str]
+        One "output<N>" name per non-tuple field, nested fields included
+    """
+    outputs = []
+    for field in ret_type.fields:
+        next_output = offset + len(outputs)
+        if isinstance(field, TupleType):
+            outputs.extend(_convert_tuple_to_outputs(field, next_output))
+        else:
+            outputs.append(f"output{next_output}")
+    return outputs
+
+
+def _get_inputs_and_outputs_from_module(mod):
+    """Collects the input and output names to place in the interface header
+
+    Inputs take the names of the main Relay function's parameters. A single
+    output is named "output" and multiple outputs are named "output<N>".
+    """
+    main_func = _get_main_relay_func(mod)
+    inputs = [argument.name_hint for argument in main_func.params]
+
+    outputs = ["output"]
+    if isinstance(main_func.ret_type, TupleType):
+        flattened = _convert_tuple_to_outputs(main_func.ret_type)
+        # A tuple holding one tensor is still a single output, which the
+        # generated C entrypoint accesses as "outputs->output".
+        if len(flattened) != 1:
+            outputs = flattened
+
+    return inputs, outputs
+
+
+def _should_generate_interface_header(mod):
+    """Returns whether any target of the module requests the "c" interface-api"""
+    return any(target.attrs.get("interface-api") == "c" for target in mod.target.values())
+
+
 def _make_tar(source_dir, tar_file_path):
     """Build a tar file from source_dir."""
     with tarfile.open(tar_file_path, "w") as tar_f:
@@ -260,6 +294,12 @@ def _export_graph_model_library_format(
     codegen_dir.mkdir()
     _populate_codegen_dir(mod.lib, codegen_dir, mod.libmod_name)
 
+    if _should_generate_interface_header(mod):
+        include_path = codegen_dir / "host" / "include"
+        include_path.mkdir()
+        inputs, outputs = _get_inputs_and_outputs_from_module(mod)
+        generate_c_interface_header(mod.libmod_name, inputs, outputs, include_path)
+
     parameters_dir = tempdir / "parameters"
     parameters_dir.mkdir()
     param_filename = parameters_dir / f"{mod.libmod_name}.params"
diff --git a/src/relay/backend/aot_executor_codegen.cc b/src/relay/backend/aot_executor_codegen.cc
index fd6ee27eb6be..221df958a8cb 100644
--- a/src/relay/backend/aot_executor_codegen.cc
+++ b/src/relay/backend/aot_executor_codegen.cc
@@ -650,7 +650,7 @@ class AOTExecutorCodegen : public ExprVisitor {
   /*! \brief mod */
   runtime::Module* mod_;
   /*! \brief list of input expressions (i.e., variable passed by the user) */
-  std::vector<Expr> input_vars_;
+  std::vector<Var> input_vars_;
   /*! \brief input and output variables belonging to the main function signature */
   Array<tir::Var> main_signature_;
   /*! \brief target device */
@@ -782,8 +782,12 @@ class AOTExecutorCodegen : public ExprVisitor {
       ret.lowered_funcs.Set(target_host_str, mod_run);
     }
     ret.function_metadata = std::move(function_metadata_);
-    ret.metadata = runtime::Metadata(input_vars_.size(), return_sid_.size(),
-                                     runtime::kTvmExecutorAot, mod_name);
+
+    std::vector<String> input_var_names(input_vars_.size());
+    std::transform(input_vars_.begin(), input_vars_.end(), input_var_names.begin(),
+                   [](Var input_var) -> String { return input_var->name_hint(); });
+    ret.metadata =
+        runtime::Metadata(input_var_names, return_sid_.size(), runtime::kTvmExecutorAot, mod_name);
     return ret;
   }
 };
diff --git a/src/runtime/meta_data.h b/src/runtime/meta_data.h
index 002012a1e1cc..66d9a44099da 100644
--- a/src/runtime/meta_data.h
+++ b/src/runtime/meta_data.h
@@ -54,8 +54,8 @@ inline String get_name_mangled(const String& module_name, const String& name) {
  */
 class MetadataNode : public Object {
  public:
-  /*! \brief number of inputs of the main function */
-  int num_inputs = 1;
+  /*! \brief input information for the main function */
+  Array<String> inputs;
   /*! \brief number of outputs of the main function */
   int num_outputs = 1;
   /*! \brief the executor to be used to run the model */
@@ -73,9 +73,9 @@ class MetadataNode : public Object {
  */
 class Metadata : public ObjectRef {
  public:
-  TVM_DLL Metadata(int num_inputs, int num_outputs, String executor, String mod_name) {
+  TVM_DLL Metadata(Array<String> inputs, int num_outputs, String executor, String mod_name) {
     auto n = make_object<MetadataNode>();
-    n->num_inputs = num_inputs;
+    n->inputs = inputs;
     n->num_outputs = num_outputs;
     n->executor = executor;
     n->mod_name = mod_name;
diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc
index ac4d7e3666ea..7728773b13d7 100644
--- a/src/target/source/source_module.cc
+++ b/src/target/source/source_module.cc
@@ -192,25 +192,26 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode {
           << "}\n";
   }
 
-  void GenerateEntrypointForUnpackedAPI(const std::string& run_func) {
+  void GenerateEntrypointForUnpackedAPI(const std::string& entrypoint_name,
+                                        const std::string& run_func) {
     code_ << "TVM_DLL int32_t " << run_func << "(";
-    int total_args = (metadata_->num_inputs + metadata_->num_outputs);
-    for (int i = 0; i < total_args; ++i) {
-      code_ << "arg" << i;
+    unsigned int total_args = (metadata_->inputs.size() + metadata_->num_outputs);
+    for (unsigned int i = 0; i < total_args; ++i) {
+      code_ << "void* arg" << i;
       if (i + 1 != total_args) {
         code_ << ",";
       }
     }
     code_ << ");\n";
-    code_ << "static int32_t " << ::tvm::runtime::symbol::tvm_module_main;
+    code_ << "int32_t " << entrypoint_name;
     code_ << "(void* args, void* type_code, int num_args, void* out_value, void* "
              "out_type_code, void* resource_handle) {\n";
     code_ << "return " << run_func << "(";
-    for (int i = 0; i < metadata_->num_inputs; ++i) {
+    for (unsigned int i = 0; i < metadata_->inputs.size(); ++i) {
       code_ << "((DLTensor*)(((TVMValue*)args)[" << i << "].v_handle))[0].data,";
     }
     for (int i = 0; i < metadata_->num_outputs; ++i) {
-      int j = metadata_->num_inputs + i;
+      int j = metadata_->inputs.size() + i;
       code_ << "((DLTensor*)(((TVMValue*)args)[" << j << "].v_handle))[0].data";
       if (i + 1 != metadata_->num_outputs) {
         code_ << ",";
@@ -220,11 +221,12 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode {
     code_ << "}\n";
   }
 
-  void GenerateEntrypointForPackedAPI(const std::string& run_func) {
+  void GenerateEntrypointForPackedAPI(const std::string& entrypoint_name,
+                                      const std::string& run_func) {
     code_ << "TVM_DLL int32_t " << run_func;
     code_ << "(void* args, void* type_code, int num_args, void* out_value, void* "
              "out_type_code, void* resource_handle);\n";
-    code_ << "static int32_t " << ::tvm::runtime::symbol::tvm_module_main;
+    code_ << "int32_t " << entrypoint_name;
     code_ << "(void* args, void* type_code, int num_args, void* out_value, void* "
              "out_type_code, void* resource_handle) {\n";
     code_ << "return " << run_func;
@@ -232,25 +234,70 @@ class CSourceCrtMetadataModuleNode : public runtime::ModuleNode {
     code_ << "}\n";
   }
 
+  void GenerateCInterfaceEntrypoint(const std::string& entrypoint_name, const std::string& run_func,
+                                    const std::string& mod_name) {
+    code_ << "#include <" << mod_name << ".h>\n";
+    code_ << "TVM_DLL int32_t " << run_func << "(";
+    unsigned int total_args = (metadata_->inputs.size() + metadata_->num_outputs);
+    for (unsigned int i = 0; i < total_args; ++i) {
+      code_ << "void* arg" << i;
+      if (i + 1 != total_args) {
+        code_ << ",";
+      }
+    }
+    code_ << ");\n";
+    code_ << "int32_t " << entrypoint_name << "(";
+    code_ << "struct " << runtime::get_name_mangled(mod_name, "inputs") << "* inputs,"
+          << "struct " << runtime::get_name_mangled(mod_name, "outputs") << "* outputs"
+          << ") {";
+    code_ << "return " << run_func << "(";
+    for (const auto& input : metadata_->inputs) {
+      code_ << "inputs->" << input << ",";
+    }
+    if (metadata_->num_outputs == 1) {
+      code_ << "outputs->output";
+    } else {
+      for (int i = 0; i < metadata_->num_outputs; ++i) {
+        code_ << "outputs->output" << i;
+        if (i + 1 != metadata_->num_outputs) {
+          code_ << ",";
+        }
+      }
+    }
+    code_ << ");\n";
+    code_ << "}\n";
+  }
+
   void GenerateAOTDescriptor() {
-    const std::string run_func = ::tvm::runtime::symbol::tvm_run_func_suffix;
-    const std::string run_func_mangled = runtime::get_name_mangled(metadata_->mod_name, run_func);
+    const std::string run_func_suffix = ::tvm::runtime::symbol::tvm_run_func_suffix;
+    const std::string tvm_entrypoint_suffix = ::tvm::runtime::symbol::tvm_entrypoint_suffix;
+    const std::string run_func_mangled =
+        runtime::get_name_mangled(metadata_->mod_name, run_func_suffix);
+    const std::string entrypoint_mangled =
+        runtime::get_name_mangled(metadata_->mod_name, tvm_entrypoint_suffix);
     const std::string network_mangled = runtime::get_name_mangled(metadata_->mod_name, "network");
-    code_ << "#include \"tvm/runtime/crt/internal/aot_executor/aot_executor.h\"\n";
+    auto unpacked_api = target_->GetAttr<Bool>("unpacked-api").value_or(Bool(false));
+    auto interface_api = target_->GetAttr<String>("interface-api").value_or(String("packed"));
+
     code_ << "#include \"tvm/runtime/c_runtime_api.h\"\n";
     code_ << "#ifdef __cplusplus\n";
-    code_ << "extern \"C\"\n";
+    code_ << "extern \"C\" {\n";
     code_ << "#endif\n";
-    if (target_->GetAttr<Bool>("unpacked-api").value_or(Bool(false))) {
-      GenerateEntrypointForUnpackedAPI(run_func_mangled);
+
+    if (unpacked_api) {
+      if (interface_api == "c") {
+        GenerateCInterfaceEntrypoint(entrypoint_mangled, run_func_mangled, metadata_->mod_name);
+      } else {
+        GenerateEntrypointForUnpackedAPI(entrypoint_mangled, run_func_mangled);
+      }
     } else {
-      GenerateEntrypointForPackedAPI(run_func_mangled);
+      ICHECK_EQ(interface_api, "packed") << "Packed interface required for packed operators";
+      GenerateEntrypointForPackedAPI(entrypoint_mangled, run_func_mangled);
     }
-    code_ << "const tvm_model_t " << network_mangled << " = {\n"
-          << "    .run_func = &" << ::tvm::runtime::symbol::tvm_module_main << ",\n"
-          << "    .num_input_tensors = " << metadata_->num_inputs << ",\n"
-          << "    .num_output_tensors = " << metadata_->num_outputs << ", \n"
-          << "};\n";
+
+    code_ << "#ifdef __cplusplus\n";
+    code_ << "}\n";
+    code_ << "#endif\n";
   }
 
   void CreateSource() {
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index a56916248858..3ad04eb3d577 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -299,6 +299,7 @@ TVM_REGISTER_TARGET_KIND("llvm", kDLCPU)
     .add_attr_option<String>("runtime")
     .add_attr_option<Bool>("link-params", Bool(false))
     .add_attr_option<Bool>("unpacked-api")
+    .add_attr_option<String>("interface-api")
     .set_default_keys({"cpu"});
 
 TVM_REGISTER_TARGET_KIND("c", kDLCPU)
@@ -310,6 +311,7 @@ TVM_REGISTER_TARGET_KIND("c", kDLCPU)
     .add_attr_option<String>("executor")
     .add_attr_option<Integer>("workspace-byte-alignment")
     .add_attr_option<Bool>("unpacked-api")
+    .add_attr_option<String>("interface-api")
     .set_default_keys({"cpu"});
 
 TVM_REGISTER_TARGET_KIND("cuda", kDLCUDA)
diff --git a/tests/micro/zephyr/test_zephyr_aot.py b/tests/micro/zephyr/test_zephyr_aot.py
index 48bdc5d3a283..d1c9d393770a 100644
--- a/tests/micro/zephyr/test_zephyr_aot.py
+++ b/tests/micro/zephyr/test_zephyr_aot.py
@@ -35,6 +35,7 @@
 from tvm.micro.contrib import zephyr
 from tvm.contrib import utils
 from tvm.contrib.download import download_testdata
+from tvm.micro.interface_api import generate_c_interface_header
 
 import conftest
 
@@ -184,7 +185,7 @@ def test_tflite(platform, west_cmd, skip_build, tvm_debug):
     )
 
     target = tvm.target.target.micro(
-        model, options=["-link-params=1", "--executor=aot", "--unpacked-api=1"]
+        model, options=["-link-params=1", "--executor=aot", "--unpacked-api=1", "--interface-api=c"]
     )
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         lowered = relay.build(relay_mod, target, params=params)
@@ -196,6 +197,7 @@ def test_tflite(platform, west_cmd, skip_build, tvm_debug):
     )
     sample = np.load(sample_path)
     model_files_path = os.path.join(runtime_path, "include")
+    generate_c_interface_header(lowered.libmod_name, ["input_1"], ["output"], model_files_path)
     _create_header_file((f"input_data"), sample, model_files_path)
     _create_header_file(
         "output_data", np.zeros(shape=output_shape, dtype="float32"), model_files_path
diff --git a/tests/python/relay/aot/aot_test.mk b/tests/python/relay/aot/aot_test.mk
index 2426d9fd2963..81e31762611f 100644
--- a/tests/python/relay/aot/aot_test.mk
+++ b/tests/python/relay/aot/aot_test.mk
@@ -34,7 +34,8 @@ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \
 	-I$(DMLC_CORE)/include \
 	-I$(TVM_ROOT)/3rdparty/dlpack/include \
 	-I$(AOT_ROOT)\
-	-I$(build_dir)
+	-I$(build_dir) \
+	-I$(CODEGEN_ROOT)/host/include
 
 $(ifeq VERBOSE,1)
 QUIET ?=
diff --git a/tests/python/relay/aot/aot_test_utils.py b/tests/python/relay/aot/aot_test_utils.py
index 1c4dddc4c718..900eb67e2b48 100644
--- a/tests/python/relay/aot/aot_test_utils.py
+++ b/tests/python/relay/aot/aot_test_utils.py
@@ -16,24 +16,20 @@
 # under the License.
 
 import os
-import io
-import struct
-import numpy as np
+import itertools
 import pathlib
-import shutil
 import subprocess
-import tempfile
 import tarfile
 import json
 
+import pytest
+import numpy as np
 
 import tvm
 from tvm import relay
-from tvm.relay import transform
 from tvm.contrib import utils, graph_executor
 from tvm.relay.backend import compile_engine
 from tvm.relay.backend.utils import mangle_module_name
-from tvm.contrib import utils
 from tvm.micro import export_model_library_format
 
 
@@ -82,6 +78,40 @@ def convert_to_list(x):
     return mod, params
 
 
+def parametrize_aot_options(test):
+    """Parametrize a test over the valid AOT option combinations
+
+    Combinations pairing the "c" interface with packed operators are left out.
+
+    Parameters
+    ----------
+    test : function
+        Test accepting interface_api, use_unpacked_api and use_calculated_workspaces
+
+    Returns
+    -------
+    function
+        The test wrapped by pytest.mark.parametrize
+    """
+
+    interface_api = ["packed", "c"]
+    use_unpacked_api = [True, False]
+    use_calculated_workspaces = [True, False]
+
+    all_combinations = itertools.product(interface_api, use_unpacked_api, use_calculated_workspaces)
+    # Filter out packed operators with c interface
+    valid_combinations = [
+        (api, unpacked, calculated)
+        for api, unpacked, calculated in all_combinations
+        if not (api == "c" and not unpacked)
+    ]
+
+    return pytest.mark.parametrize(
+        ["interface_api", "use_unpacked_api", "use_calculated_workspaces"],
+        valid_combinations,
+    )(test)
+
+
 def subprocess_with_stdout_and_log(cmd, cwd, logfile, stdout):
     """
     This method runs a process and logs the output to both a log file and stdout
@@ -102,10 +118,6 @@ def subprocess_with_stdout_and_log(cmd, cwd, logfile, stdout):
                     print(text, end="")
 
 
-def emit_main_network_definition(main_file, mod_name):
-    main_file.write(f'extern tvm_model_t {mangle_name(mod_name,"network")};\n')
-
-
 def emit_main_prologue(main_file, workspace_bytes):
     # Add TVM_RUNTIME_ALLOC_ALIGNMENT_BYTES because of memory alignment.
     main_file.write(
@@ -133,46 +145,121 @@ def emit_main_prologue(main_file, workspace_bytes):
     )
 
 
-def emit_main_data(main_file, input_list, output_list, mod_name):
-    for i in range(0, len(input_list)):
-        main_file.write(f'#include "{mangle_name(mod_name,"input_data")}{i}.h"\n')
+def emit_main_data(main_file, input_map, output_list, mod_name):
+    for key in input_map:
+        main_file.write(f'#include "{mangle_name(mod_name,"input_data")}_{key}.h"\n')
 
     for i in range(0, len(output_list)):
         main_file.write(f'#include "{mangle_name(mod_name,"expected_output_data")}{i}.h"\n')
         main_file.write(f'#include "{mangle_name(mod_name,"output_data")}{i}.h"\n')
 
 
-def emit_main_run(main_file, input_list, output_list, mod_name):
+def emit_main_data_structs(main_file, input_map, output_list, mod_name):
+    main_file.write(
+        f"struct {mangle_name(mod_name, 'inputs')} {mangle_name(mod_name, 'inputs')} = {{"
+    )
+    for key in input_map:
+        main_file.write(f"\t.{key} = {mangle_name(mod_name, 'input_data')}_{key},\n")
+    main_file.write("};\n")
+
+    main_file.write(
+        f"struct {mangle_name(mod_name, 'outputs')} {mangle_name(mod_name, 'outputs')} = {{"
+    )
     num_outputs = len(output_list)
-    num_inputs = len(input_list)
+    if num_outputs == 1:
+        main_file.write(f"\t.output = {mangle_name(mod_name, 'output_data')}0,\n")
+    else:
+        for i in range(0, num_outputs):
+            main_file.write(f"\t.output{i} = {mangle_name(mod_name, 'output_data')}{i},\n")
+    main_file.write("};\n")
 
-    main_file.write(f'void* {mangle_name(mod_name,"inputs")}[{num_inputs}] = {{ ')
 
-    for i in range(0, len(input_list)):
-        main_file.write(f'{mangle_name(mod_name,"input_data")}{i}, ')
+def emit_main_data_setup(main_file, input_map, output_list, mod_name):
+    num_outputs = len(output_list)
+    num_inputs = len(input_map)
+
+    main_file.write(f'void* {mangle_name(mod_name,"inputs")}[{num_inputs}] = {{ ')
+    for key in input_map:
+        main_file.write(f'{mangle_name(mod_name,"input_data")}_{key}, ')
     main_file.write("};\n")
 
     main_file.write(f'void* {mangle_name(mod_name,"outputs")}[{num_outputs}]  = {{ ')
-    for i in range(0, len(output_list)):
+    for i in range(0, num_outputs):
         main_file.write(f'{mangle_name(mod_name,"output_data")}{i}, ')
     main_file.write("};\n")
+
+
+def emit_main_c_interface_call(main_file, mod_name):
+    main_file.write(
+        f'{mangle_name(mod_name,"run")}(&{mangle_name(mod_name,"inputs")}, &{mangle_name(mod_name,"outputs")});\n'
+    )
+
+
+def emit_main_fake_packed_values(main_file):
+    main_file.write(
+        """
+    static DLDevice fake_device = {kDLCPU, 0};
+    static int64_t fake_dims = 0;
+    static int64_t fake_shape = {0};
+    """
+    )
+
+
+def emit_main_packed_call(main_file, input_map, output_list, mod_name):
+    tensors_name = mangle_name(mod_name, "tensors")
+    values_name = mangle_name(mod_name, "values")
+    typeids_name = mangle_name(mod_name, "typeids")
+
+    def fake_tensor(source, source_index, packed_index):
+        main_file.write(
+            f"""
+        {tensors_name}[{packed_index}].device = fake_device;
+        {tensors_name}[{packed_index}].data = {source}[{source_index}];
+        {tensors_name}[{packed_index}].shape = &fake_shape;
+        {tensors_name}[{packed_index}].ndim = fake_dims;
+        {tensors_name}[{packed_index}].byte_offset = 0;
+        {tensors_name}[{packed_index}].strides = NULL;
+        {values_name}[{packed_index}].v_handle = &{tensors_name}[{packed_index}];
+        """
+        )
+
+    num_outputs = len(output_list)
+    num_inputs = len(input_map)
+    num_tensors = num_inputs + num_outputs
     main_file.write(
-        f'tvm_runtime_run(&{mangle_name(mod_name,"network")}, {mangle_name(mod_name,"inputs")}, {mangle_name(mod_name,"outputs")});'
+        f"""
+    DLTensor {tensors_name}[{num_tensors}];
+    TVMValue {values_name}[{num_tensors}];
+    int32_t {typeids_name}[{num_tensors}];
+    """
+    )
+
+    for i in range(0, num_inputs):
+        fake_tensor(mangle_name(mod_name, "inputs"), i, i)
+    for i in range(0, num_outputs):
+        fake_tensor(mangle_name(mod_name, "outputs"), i, i + num_inputs)
+
+    main_file.write(
+        f'{mangle_name(mod_name, "run")}({values_name}, {typeids_name}, 0, NULL, 0, NULL);\n'
     )
     main_file.write("\n")
 
 
 def emit_main_compare(main_file, output_list, mod_name):
-    for i in range(0, len(output_list)):
+    num_outputs = len(output_list)
+    actual_data_name = mangle_name(mod_name, "output_data")
+    expected_data_name = mangle_name(mod_name, "expected_output_data")
+
+    for i in range(0, num_outputs):
         is_float_dtype = output_list[i].dtype == "float32"
-        main_file.write(f'for (int i = 0; i<{mangle_name(mod_name,"output_data")}{i}_len; i++){{\n')
+        main_file.write(f"for (int i = 0; i<{actual_data_name}{i}_len; i++){{\n")
         if is_float_dtype:
             main_file.write(
-                f'if (fabs({mangle_name(mod_name,"output_data")}{i}[i]-{mangle_name(mod_name,"expected_output_data")}{i}[i]) > 0.001f){{\n\tprintf("ko\\n");\n\treturn -1;}}\n'
+                f'if (fabs({actual_data_name}{i}[i]-{expected_data_name}{i}[i]) > 0.001f){{\n\tprintf("ko\\n");\n\treturn -1;}}\n'
             )
         else:
             main_file.write(
-                f'if ({mangle_name(mod_name,"output_data")}{i}[i]!={mangle_name(mod_name, "expected_output_data")}{i}[i]){{\n\tprintf("ko\\n");\n\treturn -1;}}\n'
+                f'if ({actual_data_name}{i}[i]!={expected_data_name}{i}[i]){{\n\tprintf("ko\\n");\n\treturn -1;}}\n'
             )
         main_file.write("}\n")
 
@@ -191,33 +278,48 @@ def emit_main_epilogue(main_file):
 def emit_main_common_includes(main_file):
     main_file.write("#include <stdio.h>\n")
     main_file.write("#include <math.h>\n")
-    main_file.write('#include "tvm/runtime/crt/internal/aot_executor/aot_executor.h"\n')
+    main_file.write('#include "tvm/runtime/c_runtime_api.h"\n')
     main_file.write('#include "tvm/runtime/crt/stack_allocator.h"\n')
 
 
-def create_main(test_name, input_list_map, output_list_map, output_path, workspace_bytes):
+def emit_main_micro_include(main_file, mod_name):
+    main_file.write(f"#include <{mangle_module_name(mod_name)}.h>\n")
+
+
+def create_main(test_name, input_map, output_list_map, output_path, interface_api, workspace_bytes):
     file_path = pathlib.Path(f"{output_path}/" + test_name).resolve()
     # create header file
     raw_path = file_path.with_suffix(".c").resolve()
     with open(raw_path, "w") as main_file:
         emit_main_common_includes(main_file)
 
-        for k in input_list_map:
-            emit_main_network_definition(main_file, k)
+        if interface_api == "c":
+            for mod_name in input_map:
+                emit_main_micro_include(main_file, mod_name)
 
         emit_main_prologue(main_file, workspace_bytes)
-
-        for k in input_list_map:
-            emit_main_data(main_file, input_list_map[k], output_list_map[k], k)
-
+        for mod_name in input_map:
+            emit_main_data(main_file, input_map[mod_name], output_list_map[mod_name], mod_name)
         emit_main_init_memory_manager(main_file)
 
-        for k in input_list_map:
-            emit_main_run(main_file, input_list_map[k], output_list_map[k], k)
-
-        for k in input_list_map:
-            emit_main_compare(main_file, output_list_map[k], k)
-
+        if interface_api == "c":
+            for mod_name in input_map:
+                emit_main_data_structs(
+                    main_file, input_map[mod_name], output_list_map[mod_name], mod_name
+                )
+                emit_main_c_interface_call(main_file, mod_name)
+        else:
+            emit_main_fake_packed_values(main_file)
+            for mod_name in input_map:
+                emit_main_data_setup(
+                    main_file, input_map[mod_name], output_list_map[mod_name], mod_name
+                )
+                emit_main_packed_call(
+                    main_file, input_map[mod_name], output_list_map[mod_name], mod_name
+                )
+
+        for mod_name in input_map:
+            emit_main_compare(main_file, output_list_map[mod_name], mod_name)
         emit_main_epilogue(main_file)
 
 
@@ -258,19 +360,22 @@ def extract_main_workspace_sizebytes(extract_dir):
 
 def compile_and_run(
     mod,
-    input_list,
+    inputs,
     output_list,
-    target_options,
+    interface_api,
+    use_unpacked_api,
     use_calculated_workspaces,
     params=None,
     workspace_byte_alignment=8,
-    mod_name=None,
+    mod_name="default",
     enable_op_fusion=True,
 ):
     """
     This method verifies the generated source
     """
-    target = f"c -runtime=c --link-params --executor=aot --workspace-byte-alignment={workspace_byte_alignment} {target_options}"
+    base_target = "c -runtime=c --link-params --executor=aot"
+    extra_target = f"--workspace-byte-alignment={workspace_byte_alignment} --interface-api={interface_api} --unpacked-api={int(use_unpacked_api)}"
+    target = f"{base_target} {extra_target}"
     cflags = f"-DTVM_RUNTIME_ALLOC_ALIGNMENT_BYTES={workspace_byte_alignment} "
 
     # The calculated workspaces will not account for stack allocator tags used for debugging
@@ -300,8 +405,8 @@ def compile_and_run(
     else:
         workspace_bytes = 16384 * 1024
 
-    for i in range(len(input_list)):
-        create_header_file((f'{mangle_name(mod_name, "input_data")}{i}'), input_list[i], build_path)
+    for key in inputs:
+        create_header_file(f'{mangle_name(mod_name, "input_data")}_{key}', inputs[key], build_path)
 
     for i in range(len(output_list)):
         create_header_file(
@@ -314,16 +419,23 @@ def compile_and_run(
         )
 
     create_main(
-        "test.c", {mod_name: input_list}, {mod_name: output_list}, build_path, workspace_bytes
+        "test.c",
+        {mod_name: inputs},
+        {mod_name: output_list},
+        build_path,
+        interface_api,
+        workspace_bytes,
     )
 
     # Verify that compiles fine
     file_dir = os.path.dirname(os.path.abspath(__file__))
+    codegen_path = os.path.join(base_path, "codegen")
     makefile = os.path.join(file_dir, "aot_test.mk")
     make_cmd = (
         f"make CFLAGS='{cflags}' -f {makefile} build_dir="
         + build_path
         + f" TVM_ROOT={file_dir}/../../../.."
+        + f" CODEGEN_ROOT={codegen_path}"
     )
 
     compile_log_path = os.path.join(build_path, "test_compile.log")
@@ -337,12 +449,21 @@ def compile_and_run(
 
 
 def compile_and_run_multiple_models(
-    mod_map, input_list_map, output_list_map, target_options, param_map
+    mod_map,
+    input_list_map,
+    output_list_map,
+    interface_api,
+    use_unpacked_api,
+    use_calculated_workspaces,
+    param_map,
+    workspace_byte_alignment=8,
 ):
     """
     This method verifies the generated source
     """
-    target = f"c -runtime=c --link-params --executor=aot {target_options}"
+    base_target = "c -runtime=c --link-params --executor=aot"
+    extra_target = f"--workspace-byte-alignment={workspace_byte_alignment} --interface-api={interface_api} --unpacked-api={int(use_unpacked_api)}"
+    target = f"{base_target} {extra_target}"
     tmp_path = utils.tempdir()
     tmp_dir = tmp_path.temp_dir
 
@@ -364,9 +485,9 @@ def compile_and_run_multiple_models(
         input_list = input_list_map[mod_name]
         output_list = output_list_map[mod_name]
 
-        for i in range(len(input_list_map[mod_name])):
+        for key in input_list:
             create_header_file(
-                (f'{mangle_name(mod_name,"input_data")}{i}'), input_list[i], build_path
+                (f'{mangle_name(mod_name,"input_data")}_{key}'), input_list[key], build_path
             )
 
         for i in range(len(output_list_map[mod_name])):
@@ -379,12 +500,25 @@ def compile_and_run_multiple_models(
                 (f'{mangle_name(mod_name,"expected_output_data")}{i}'), output_list[i], build_path
             )
 
-    create_main("test.c", input_list_map, output_list_map, build_path, workspace_bytes=16384 * 1024)
+    create_main(
+        "test.c",
+        input_list_map,
+        output_list_map,
+        build_path,
+        interface_api,
+        workspace_bytes=16384 * 1024,
+    )
 
     # Verify that compiles fine
     file_dir = os.path.dirname(os.path.abspath(__file__))
+    codegen_path = os.path.join(base_path, "codegen")
     makefile = os.path.join(file_dir, "aot_test.mk")
-    make_cmd = f"make -f {makefile} build_dir=" + build_path + f" TVM_ROOT={file_dir}/../../../.."
+    make_cmd = (
+        f"make -f {makefile} build_dir="
+        + build_path
+        + f" TVM_ROOT={file_dir}/../../../.."
+        + f" CODEGEN_ROOT={codegen_path}"
+    )
 
     compile_log_path = os.path.join(build_path, "test_compile.log")
     ret = subprocess_with_stdout_and_log(make_cmd, ".", compile_log_path, False)
diff --git a/tests/python/relay/aot/test_crt_aot.py b/tests/python/relay/aot/test_crt_aot.py
index 13cbfa71b6ae..26eca2688436 100644
--- a/tests/python/relay/aot/test_crt_aot.py
+++ b/tests/python/relay/aot/test_crt_aot.py
@@ -15,37 +15,48 @@
 # specific language governing permissions and limitations
 # under the License.
 
-import os
-import io
-import struct
+from collections import OrderedDict
+
 import numpy as np
-import pathlib
-import shutil
-import subprocess
-import tempfile
-import tarfile
 import pytest
 
 import tvm
 from tvm import relay
-from tvm.relay import transform
-from tvm.relay.op.contrib import get_pattern_table
-from tvm.contrib import utils
-from tvm.relay.backend import compile_engine
-from tvm.contrib import utils
-from tvm.contrib import graph_executor
-from tvm.micro import export_model_library_format
-from tvm.relay import testing
+from tvm.relay import testing, transform
 from tvm.relay.op.annotation import compiler_begin, compiler_end
-from tvm.contrib import utils
 from tvm.relay.expr_functor import ExprMutator
+from aot_test_utils import (
+    generate_ref_data,
+    convert_to_relay,
+    compile_and_run,
+    compile_and_run_multiple_models,
+    parametrize_aot_options,
+)
 
-from aot_test_utils import *
 
+def test_error_c_interface_with_packed_api():
+    interface_api = "c"
+    use_unpacked_api = False  # invalid together with the C interface
+    use_calculated_workspaces = True
 
+    two = relay.add(relay.const(1), relay.const(1))
+    func = relay.Function([], two)
+    output_list = generate_ref_data(func, {})
+    inputs = {}  # mapping of input name -> data; this function has none
+
+    with pytest.raises(tvm.TVMError, match="Packed interface required for packed operators"):
+        compile_and_run(
+            func,
+            inputs,
+            output_list,
+            interface_api,
+            use_unpacked_api,
+            use_calculated_workspaces,
+        )
+
+
+@parametrize_aot_options
+def test_conv_with_params(interface_api, use_unpacked_api, use_calculated_workspaces):
     RELAY_MODEL = """
 #[version = "0.0.5"]
 def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5), int8]) {
@@ -73,13 +84,19 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5),
     inputs = {"data": input_data}
     output_list = generate_ref_data(mod, inputs, params)
 
-    input_list = [input_data]
-    compile_and_run(mod, input_list, output_list, target_options, use_calculated_workspaces, params)
+    compile_and_run(
+        mod,
+        inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+        params,
+    )
 
 
-@pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_add_with_params(use_calculated_workspaces, target_options):
+@parametrize_aot_options
+def test_add_with_params(interface_api, use_unpacked_api, use_calculated_workspaces):
     x = relay.var("x", shape=(1, 10))
     y = relay.var("y", shape=(1, 10))
     z = relay.add(x, y)
@@ -92,15 +109,19 @@ def test_add_with_params(use_calculated_workspaces, target_options):
     inputs = {"y": y_in}
     output_list = generate_ref_data(func, inputs, params)
 
-    input_list = [y_in]
     compile_and_run(
-        func, input_list, output_list, target_options, use_calculated_workspaces, params
+        func,
+        inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+        params,
     )
 
 
-@pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_conv2d(use_calculated_workspaces, target_options):
+@parametrize_aot_options
+def test_conv2d(use_calculated_workspaces, interface_api, use_unpacked_api):
     """Test a subgraph with a single conv2d operator."""
 
     def conv2d_direct():
@@ -119,7 +140,8 @@ def conv2d_direct():
         i_data = np.random.uniform(0, 1, ishape).astype(dtype)
         w1_data = np.random.uniform(0, 1, w1shape).astype(dtype)
 
-        return mod, {"data": i_data, "weight": w1_data}, (1, 32, 14, 14)
+        inputs = OrderedDict([("data", i_data), ("weight", w1_data)])
+        return mod, inputs, (1, 32, 14, 14)
 
     def group_conv2d():
         dtype = "float32"
@@ -137,17 +159,23 @@ def group_conv2d():
         i_data = np.random.uniform(0, 1, ishape).astype(dtype)
         w_data = np.random.uniform(0, 1, w2shape).astype(dtype)
 
-        return mod, {"data": i_data, "weight": w_data}, (1, 32, 14, 14)
+        inputs = OrderedDict([("data", i_data), ("weight", w_data)])
+        return mod, inputs, (1, 32, 14, 14)
 
     for mod, inputs, out_shape in [conv2d_direct(), group_conv2d()]:
         output_list = generate_ref_data(mod, inputs)
-        input_list = [inputs["data"], inputs["weight"]]
-        compile_and_run(mod, input_list, output_list, target_options, use_calculated_workspaces)
-
-
-@pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_concatenate(use_calculated_workspaces, target_options):
+        compile_and_run(
+            mod,
+            inputs,
+            output_list,
+            interface_api,
+            use_unpacked_api,
+            use_calculated_workspaces,
+        )
+
+
+@parametrize_aot_options
+def test_concatenate(interface_api, use_unpacked_api, use_calculated_workspaces):
     dtype = "float32"
     x = relay.var("x", shape=(10, 5), dtype=dtype)
     y = relay.var("y", shape=(10, 5), dtype=dtype)
@@ -159,16 +187,21 @@ def test_concatenate(use_calculated_workspaces, target_options):
     x_data = np.random.rand(10, 5).astype(dtype)
     y_data = np.random.rand(10, 5).astype(dtype)
     t_data = np.random.uniform(size=()).astype(dtype)
-    inputs = {"x": x_data, "y": y_data, "z": t_data}
+    inputs = OrderedDict([("x", x_data), ("y", y_data), ("z", t_data)])
 
     output_list = generate_ref_data(func, inputs)
-    input_list = [inputs["x"], inputs["y"], inputs["z"]]
-    compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces)
+    compile_and_run(
+        func,
+        inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+    )
 
 
-@pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_nested_tuples(use_calculated_workspaces, target_options):
+@parametrize_aot_options
+def test_nested_tuples(interface_api, use_unpacked_api, use_calculated_workspaces):
     x = relay.var("x", shape=(10,))
     x1 = x + relay.const(1.0)
     x2 = x1 + relay.const(1.0)
@@ -180,71 +213,109 @@ def test_nested_tuples(use_calculated_workspaces, target_options):
     x_data = np.random.uniform(size=(10,)).astype(np.float32)
     inputs = {"x": x_data}
     output_list = generate_ref_data(func, inputs)
-    input_list = [x_data]
-    compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces)
 
+    compile_and_run(
+        func,
+        inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+    )
 
-@pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_tuple_getitem(use_calculated_workspaces, target_options):
+
+@parametrize_aot_options
+def test_tuple_getitem(interface_api, use_unpacked_api, use_calculated_workspaces):
     func = relay.Function([], relay.TupleGetItem(relay.Tuple([relay.const(1), relay.const(2)]), 0))
     output_list = generate_ref_data(func, {})
-    input_list = []
-    compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces)
+    inputs = {}
+
+    compile_and_run(
+        func,
+        inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+    )
 
 
-@pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_id(use_calculated_workspaces, target_options):
+@parametrize_aot_options
+def test_id(interface_api, use_unpacked_api, use_calculated_workspaces):
     x = relay.var("x", "float32")
     ident = relay.Function([x], x)
     one = np.array(1.0, "float32")
     inputs = {"x": one}
     output_list = generate_ref_data(ident, inputs)
-    input_list = [one]
-    compile_and_run(ident, input_list, output_list, target_options, use_calculated_workspaces)
 
+    compile_and_run(
+        ident,
+        inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+    )
 
-@pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_add_const(use_calculated_workspaces, target_options):
+
+@parametrize_aot_options
+def test_add_const(interface_api, use_unpacked_api, use_calculated_workspaces):
     two = relay.add(relay.const(1), relay.const(1))
     func = relay.Function([], two)
     output_list = generate_ref_data(func, {})
-    input_list = []
-    compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces)
+    inputs = {}
 
+    compile_and_run(
+        func,
+        inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+    )
 
-@pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_mul_param(use_calculated_workspaces, target_options):
+
+@parametrize_aot_options
+def test_mul_param(interface_api, use_unpacked_api, use_calculated_workspaces):
     x = relay.var("x", shape=(10, 10))
     y = relay.var("y", shape=(1, 10))
     func = relay.Function([x, y], relay.multiply(x, y))
     x_data = np.random.rand(10, 10).astype("float32")
     y_data = np.random.rand(1, 10).astype("float32")
-    inputs = {"x": x_data, "y": y_data}
+
+    inputs = OrderedDict([("x", x_data), ("y", y_data)])
     output_list = generate_ref_data(func, inputs)
-    input_list = [inputs["x"], inputs["y"]]
-    compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces)
 
+    compile_and_run(
+        func,
+        inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+    )
 
-@pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_subtract(use_calculated_workspaces, target_options):
+
+@parametrize_aot_options
+def test_subtract(interface_api, use_unpacked_api, use_calculated_workspaces):
     i = relay.var("i", shape=[], dtype="int32")
     sub = relay.subtract(i, relay.const(1, dtype="int32"))
     func = relay.Function([i], sub, ret_type=relay.TensorType([], "int32"))
     i_data = np.array(1, dtype="int32")
     inputs = {"i": i_data}
     output_list = generate_ref_data(func, inputs)
-    input_list = [inputs["i"]]
-    compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces)
+    compile_and_run(
+        func,
+        inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+    )
 
 
-@pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_tuple_output(use_calculated_workspaces, target_options):
+@parametrize_aot_options
+def test_tuple_output(interface_api, use_unpacked_api, use_calculated_workspaces):
     x = relay.var("x", shape=(6, 9))
     y = relay.split(x, 3).astuple()
     a = relay.TupleGetItem(y, 0)
@@ -255,29 +326,34 @@ def test_tuple_output(use_calculated_workspaces, target_options):
     x_data = np.random.rand(6, 9).astype("float32")
     inputs = {"x": x_data}
     output_list = generate_ref_data(func, inputs)
-    input_list = [inputs["x"]]
-    compile_and_run(func, input_list, output_list, target_options, use_calculated_workspaces)
+    compile_and_run(
+        func,
+        inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+    )
 
 
 @pytest.mark.parametrize(
-    "use_calculated_workspaces_and_alignment", [(True, 1), (True, 16), (False, 1)]
+    ["use_calculated_workspaces", "workspace_byte_alignment"], [(True, 1), (True, 16), (False, 1)]
 )
-@pytest.mark.parametrize("target_options", ["--unpacked-api"])
-def test_mobilenet(use_calculated_workspaces_and_alignment, target_options):
-    use_calculated_workspaces = use_calculated_workspaces_and_alignment[0]
-    workspace_byte_alignment = use_calculated_workspaces_and_alignment[1]
+def test_mobilenet(use_calculated_workspaces, workspace_byte_alignment):
+    use_unpacked_api = True
+    interface_api = "c"
 
     mod, params = testing.mobilenet.get_workload(batch_size=1)
     data_shape = [int(x) for x in mod["main"].checked_type.arg_types[0].shape]
     data = np.random.uniform(size=data_shape).astype("float32")
     inputs = {"data": data}
     output_list = generate_ref_data(mod, inputs, params)
-    input_list = [inputs["data"]]
     compile_and_run(
         mod,
-        input_list,
+        inputs,
         output_list,
-        target_options,
+        interface_api,
+        use_unpacked_api,
         use_calculated_workspaces,
         params,
         workspace_byte_alignment,
@@ -339,9 +415,11 @@ def visit_call(self, call):
 
 
 @pytest.mark.parametrize("use_calculated_workspaces", [True, False])
-@pytest.mark.parametrize("target_options", [""])
-def test_byoc_microtvm(use_calculated_workspaces, target_options):
+def test_byoc_microtvm(use_calculated_workspaces):
     """This is a simple test case to check BYOC capabilities of AOT"""
+    use_unpacked_api = False
+    interface_api = "packed"
+
     x = relay.var("x", shape=(10, 10))
     w0 = relay.var("w0", shape=(10, 10))
     w1 = relay.var("w1", shape=(10, 10))
@@ -379,18 +457,23 @@ def test_byoc_microtvm(use_calculated_workspaces, target_options):
     for _ in range(8):
         w_data.append(np.random.rand(10, 10).astype("float32"))
 
-    map_inputs = {"w{}".format(i): w_data[i] for i in range(8)}
-    map_inputs["x"] = x_data
+    map_inputs = OrderedDict([("x", x_data)] + [("w{}".format(i), w_data[i]) for i in range(8)])
     output_list = generate_ref_data(mod, map_inputs)
     input_list = [map_inputs["x"]]
     input_list.extend([map_inputs["w{}".format(i)] for i in range(8)])
     compile_and_run(
-        mod, input_list, output_list, target_options, use_calculated_workspaces, mod_name="my_mod"
+        mod,
+        map_inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+        mod_name="my_mod",
     )
 
 
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_add_name_mangling_with_params(target_options):
+@parametrize_aot_options
+def test_add_name_mangling_with_params(interface_api, use_unpacked_api, use_calculated_workspaces):
     x = relay.var("x", shape=(1, 10))
     y = relay.var("y", shape=(1, 10))
     z = relay.add(x, y)
@@ -403,27 +486,26 @@ def test_add_name_mangling_with_params(target_options):
     inputs = {"y": y_in}
     output_list = generate_ref_data(func, inputs, params)
 
-    input_list = [y_in]
     compile_and_run(
         func,
-        input_list,
+        inputs,
         output_list,
-        target_options,
-        use_calculated_workspaces=False,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
         params=params,
         mod_name="my_mod",
     )
 
 
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_multiple_models(target_options):
+@parametrize_aot_options
+def test_multiple_models(interface_api, use_unpacked_api, use_calculated_workspaces):
     # Identity model without params
     x = relay.var("x", "float32")
     mod1 = relay.Function([x], x)
     one = np.array(1.0, "float32")
     inputs1 = {"x": one}
     output_list1 = generate_ref_data(mod1, inputs1)
-    input_list1 = [one]
     params1 = None
 
     # Convolution model
@@ -453,15 +535,20 @@ def @main(%data : Tensor[(1, 3, 64, 64), uint8], %weight : Tensor[(8, 3, 5, 5),
     params2 = {"weight": weight_data}
     inputs2 = {"data": input_data}
     output_list2 = generate_ref_data(mod2, inputs2, params2)
-    input_list2 = [input_data]
 
-    input_list_map = {"mod1": input_list1, "mod2": input_list2}
+    input_list_map = {"mod1": inputs1, "mod2": inputs2}
     output_list_map = {"mod1": output_list1, "mod2": output_list2}
     mod_map = {"mod1": mod1, "mod2": mod2}
     param_map = {"mod1": params1, "mod2": params2}
 
     compile_and_run_multiple_models(
-        mod_map, input_list_map, output_list_map, target_options, param_map
+        mod_map,
+        input_list_map,
+        output_list_map,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+        param_map,
     )
 
 
@@ -473,6 +560,10 @@ def test_quant_mobilenet_tfl():
 
     import tvm.relay.testing.tf as tf_testing
 
+    interface_api = "packed"
+    use_unpacked_api = False
+    use_calculated_workspaces = True
+
     tflite_model_file = tf_testing.get_workload_official(
         "https://storage.googleapis.com/download.tensorflow.org/"
         "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz",
@@ -486,12 +577,19 @@ def test_quant_mobilenet_tfl():
     mod, params = convert_to_relay(tflite_model_buf, data, "input")
     inputs = {"input": data}
     output_list = generate_ref_data(mod, inputs, params)
-    input_list = [inputs["input"]]
-    compile_and_run(mod, input_list, output_list, "--unpacked-api=0", True, params)
+    compile_and_run(
+        mod,
+        inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+        params=params,
+    )
 
 
-@pytest.mark.parametrize("target_options", ["--unpacked-api=0", "--unpacked-api=1"])
-def test_transpose(target_options):
+@parametrize_aot_options
+def test_transpose(interface_api, use_unpacked_api, use_calculated_workspaces):
     """Test that non-inpleaceable operations (e.g., transpose) do not happen in-place."""
 
     dtype = "float32"
@@ -506,11 +604,18 @@ def test_transpose(target_options):
     x_data = np.random.rand(10, 5).astype(dtype)
     y_data = np.random.rand(10, 5).astype(dtype)
     t_data = np.random.uniform(size=()).astype(dtype)
-    inputs = {"x": x_data, "y": y_data, "z": t_data}
 
+    inputs = {"x": x_data, "y": y_data, "z": t_data}
     output_list = generate_ref_data(func, inputs)
-    input_list = [inputs["x"], inputs["y"], inputs["z"]]
-    compile_and_run(func, input_list, output_list, target_options, True, enable_op_fusion=False)
+    compile_and_run(
+        func,
+        inputs,
+        output_list,
+        interface_api,
+        use_unpacked_api,
+        use_calculated_workspaces,
+        enable_op_fusion=False,
+    )
 
 
 if __name__ == "__main__":
diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py
index a15e37925eea..5a32385632fc 100644
--- a/tests/python/unittest/test_micro_model_library_format.py
+++ b/tests/python/unittest/test_micro_model_library_format.py
@@ -102,14 +102,20 @@ def validate_graph_json(extract_dir, factory):
 
 @tvm.testing.requires_micro
 @pytest.mark.parametrize(
-    "target",
+    "executor,target,should_generate_interface",
     [
-        ("graph", tvm.target.target.micro("host")),
-        ("aot", tvm.target.target.micro("host", options="-executor=aot")),
+        ("graph", tvm.target.target.micro("host"), False),
+        ("aot", tvm.target.target.micro("host", options="-executor=aot"), False),
+        (
+            "aot",
+            tvm.target.target.micro(
+                "host", options="-executor=aot --unpacked-api=1 --interface-api=c"
+            ),
+            True,
+        ),
     ],
 )
-def test_export_model_library_format_c(target):
-    executor, _target = target
+def test_export_model_library_format_c(executor, target, should_generate_interface):
     with utils.TempDirectory.set_keep_for_debug(True):
         with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
             relay_mod = tvm.parser.fromtext(
@@ -122,8 +128,8 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[
             )
             factory = tvm.relay.build(
                 relay_mod,
-                _target,
-                target_host=_target,
+                target,
+                target_host=target,
                 mod_name="add",
                 params={"c": numpy.array([[2.0, 4.0]], dtype="float32")},
             )
@@ -147,7 +153,7 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[
                 metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ"
             )
             assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5)
-            assert metadata["target"] == {"1": str(_target)}
+            assert metadata["target"] == {"1": str(target)}
             if executor == "graph":
                 assert metadata["memory"]["sids"] == [
                     {"storage_id": 0, "size_bytes": 2, "input_binding": "a"},
@@ -173,6 +179,9 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[
 
         assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "add_lib0.c"))
         assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "add_lib1.c"))
+        assert should_generate_interface == os.path.exists(
+            os.path.join(extract_dir, "codegen", "host", "include", "tvmgen_add.h")
+        )
 
         if executor == "graph":
             validate_graph_json(extract_dir, factory)
@@ -265,13 +274,9 @@ def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[
 @tvm.testing.requires_micro
 @pytest.mark.parametrize(
     "target",
-    [
-        ("graph", tvm.target.target.micro("host")),
-        ("aot", tvm.target.target.micro("host", options="-executor=aot")),
-    ],
+    [tvm.target.target.micro("host"), tvm.target.target.micro("host", options="-executor=aot")],
 )
 def test_export_model_library_format_workspace(target):
-    executor, _target = target
     with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
         relay_mod = tvm.parser.fromtext(
             """
@@ -285,7 +290,7 @@ def @main(%p0: Tensor[(1, 56, 56, 128), int16], %p1: Tensor[(3, 3, 128, 1), int1
             }
             """
         )
-        factory = tvm.relay.build(relay_mod, _target, target_host=_target, mod_name="qnn_conv2d")
+        factory = tvm.relay.build(relay_mod, target, target_host=target, mod_name="qnn_conv2d")
 
     temp_dir = utils.tempdir()
     mlf_tar_path = temp_dir.relpath("lib.tar")
@@ -306,7 +311,7 @@ def @main(%p0: Tensor[(1, 56, 56, 128), int16], %p1: Tensor[(3, 3, 128, 1), int1
             metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ"
         )
         assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5)
-        assert metadata["target"] == {"1": str(_target)}
+        assert metadata["target"] == {"1": str(target)}
         assert metadata["memory"]["functions"]["main"] == [
             {
                 "constants_size_bytes": 0,
@@ -327,9 +332,6 @@ def @main(%p0: Tensor[(1, 56, 56, 128), int16], %p1: Tensor[(3, 3, 128, 1), int1
 @tvm.testing.requires_micro
 def test_export_non_dso_exportable():
     module = tvm.support.FrontendTestModule()
-    factory = executor_factory.GraphExecutorFactoryModule(
-        None, tvm.target.target.micro("host"), '"graph_json"', module, "test_module", {}, {}
-    )
 
     temp_dir = utils.tempdir()
     import tvm.micro as micro