From 89b16763d5057c49c9b6818678639475f54828bd Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Fri, 17 Dec 2021 17:32:45 -0800 Subject: [PATCH] [Relay] Fix invalid shape function for "copy" operator (#9749) The 'script' form of the shape function was ill-formed, resulting in a TIR shape function which did not assign to its output, which in turn caused either OOMs or assertion failures as uninitialized dimensions worked their way downstream. The fix is in python/tvm/relay/op/tensor.py. Everything else is for testing and debugging as I tracked this down. Special thanks to Lily for helping me with the scalar vs tensor switch in the copy shape function. [This is CORE-112 in OctoML JIRA.] --- include/tvm/runtime/debug.h | 54 ++++++++ python/tvm/relay/op/tensor.py | 19 ++- src/relay/backend/te_compiler.cc | 2 +- src/relay/backend/te_compiler_cache.cc | 31 +++-- src/runtime/debug.cc | 128 ++++++++++++++++++ src/runtime/vm/executable.cc | 19 +-- src/runtime/vm/vm.cc | 62 +++++---- src/target/compilation_config.cc | 50 +++---- .../relay/dyn/test_dynamic_op_level3.py | 66 ++++++++- tests/python/relay/test_vm.py | 46 +++---- 10 files changed, 364 insertions(+), 113 deletions(-) create mode 100644 include/tvm/runtime/debug.h create mode 100644 src/runtime/debug.cc diff --git a/include/tvm/runtime/debug.h b/include/tvm/runtime/debug.h new file mode 100644 index 000000000000..29d812b74dd8 --- /dev/null +++ b/include/tvm/runtime/debug.h @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/runtime/debug.h + * \brief Helpers for debugging at runtime. + */ +#ifndef TVM_RUNTIME_DEBUG_H_ +#define TVM_RUNTIME_DEBUG_H_ +
+#include +#include + +#include +#include + +namespace tvm { +namespace runtime { + +/*! + * \brief Helpers to describe runtime objects in human-friendly form. For \p nd_arrays we show their + * shapes and dtypes, but also their contents if 'small' and on the \p host_device (mostly so that + * we can see dynamic shapes as they are computed). For \p adts we show the ADT fields. For + * \p objects we dispatch to one of the above as appropriate.
+ */ +void AppendNDArray(std::ostream& os, const NDArray& nd_array, const DLDevice& host_device, + bool show_content = true); +void AppendADT(std::ostream& os, const ADT& adt, const DLDevice& host_device, + bool show_content = true); +void AppendRuntimeObject(std::ostream& os, const ObjectRef& object, const DLDevice& host_device, + bool show_content = true); +std::string RuntimeObject2String(const ObjectRef& object, const DLDevice& host_device, + bool show_content = true); + +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_DEBUG_H_ diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index 20b883ba2616..963bb3d55693 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -1178,8 +1178,18 @@ def copy(data): @script -def _copy_shape_func(data_shape): - return data_shape +def _copy_shape_func_tensor(data_shape): + ndim = data_shape.shape[0] + out = output_tensor((ndim,), "int64") + for i in const_range(ndim): + out[i] = data_shape[i] + return out + + +@script +def _copy_shape_func_scalar(data_shape): + out = output_tensor((), "int64") + return out @reg.register_shape_func("copy", False) @@ -1187,7 +1197,10 @@ def copy_shape_func(attrs, inputs, _): """ Shape function for copy op. """ - return [_copy_shape_func(inputs[0])] + input = inputs[0] + if len(input.shape) == 0: + return [_copy_shape_func_scalar(input)] + return [_copy_shape_func_tensor(input)] def device_copy(data, src_device, dst_device): diff --git a/src/relay/backend/te_compiler.cc b/src/relay/backend/te_compiler.cc index 901661dd87a3..3ff6076473f1 100644 --- a/src/relay/backend/te_compiler.cc +++ b/src/relay/backend/te_compiler.cc @@ -350,7 +350,7 @@ class TECompilerImpl : public TECompilerNode { // implement lowered shape func CCacheValue LowerShapeFuncInternal(const CCacheKey& key) { - VLOG(1) << "lowering dynamic shape function:" << std::endl + VLOG(1) << "lowering dynamic shape function for:" << std::endl << PrettyPrint(key->source_func) << std::endl << "for target:" << std::endl << key->target->ToDebugString(); diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index f028c3da02ab..32164f3fdf20 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -145,7 +145,7 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator candidate_name = truncated_name.str(); } - // TODO(mbs): This should be the definititive global by which the PrimFunc is known and + // TODO(mbs): This should be the definitive global by which the PrimFunc is known and // no other GlobalVar ctors should appear inside the lowering machinery. auto prim_fn_var = GlobalVar(renamer(candidate_name)); prim_fn_var->checked_type_ = relay_func->checked_type(); @@ -371,6 +371,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> CachedFunc Create(const Function& prim_func, const Target& target, std::function renamer) { + VLOG_CONTEXT << "MakeShapeFunc"; TShapeDataDependent shape_func_param_states; for (auto param : prim_func->params) { @@ -399,11 +400,12 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> // Setup the name; readable_name_stream_ << "shape_func"; - // Create the `te::Tensor`s which represent the output. - auto outputs = VisitExpr(prim_func->body); + // Create the tensor expressions representing the output shapes. + Array outputs = VisitExpr(prim_func->body); // Generate a name. 
auto candidate_name = readable_name_stream_.str(); + constexpr static size_t kMaxFuncNameLength = 80; // WARNING: Please make sure to also update TVM_CRT_MAX_STRLEN_FUNCTION_NAME // whenever the value of kMaxFuncNameLength changes @@ -463,7 +465,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> for (auto t : outputs) { out_ops.push_back(t->op); } - auto schedule = te::create_schedule(out_ops); + te::Schedule schedule = te::create_schedule(out_ops); tvm::te::AutoInlineInjective(schedule); for (const auto& scalar : scalars_) { auto scalar_op = scalar->op; @@ -589,12 +591,15 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> } Array VisitExpr_(const CallNode* call_node) final { + VLOG(1) << "considering call:" << std::endl << PrettyPrint(GetRef(call_node)); if (auto* func = call_node->op.as()) { + VLOG(1) << "user function"; for (size_t i = 0; i < func->params.size(); ++i) { param_arg_map_[func->params[i]] = call_node->args[i]; } return VisitExpr(func->body); } + static auto fshape_func = Op::GetAttrMap("FShapeFunc"); static auto tshape_data_dependent = Op::GetAttrMap("TShapeDataDependent"); ICHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; @@ -635,20 +640,16 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> // Get output ndims auto ret_type = call_node->checked_type(); Array out_ndims; - if (const auto* ttype = ret_type.as()) { + for (const auto& ttype : FlattenTupleType(ret_type)) { out_ndims.push_back(IntImm(DataType::Int(32), ttype->shape.size())); - } else { - auto rtype = ret_type.as(); - // TODO(@icemelon): Allow recursive tuple - ICHECK(rtype); - for (size_t i = 0; i < rtype->fields.size(); ++i) { - auto ttype = rtype->fields[i].as(); - ICHECK(ttype); - out_ndims.push_back(IntImm(DataType::Int(32), ttype->shape.size())); - } } + // Call shape function - auto outputs = fshape_func[op](call_node->attrs, inputs, out_ndims); + Array outputs = fshape_func[op](call_node->attrs, inputs, out_ndims); + VLOG(1) << "shape function for '" << op->name << "' with inputs:" << std::endl + << inputs << std::endl + << "yielded outputs:" << std::endl + << outputs; readable_name_stream_ << "_" << op->name; return outputs; } diff --git a/src/runtime/debug.cc b/src/runtime/debug.cc new file mode 100644 index 000000000000..e5d9f0ead09e --- /dev/null +++ b/src/runtime/debug.cc @@ -0,0 +1,128 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/debug.cc + * \brief Helpers for debugging at runtime. 
+ */ + +#include + +namespace tvm { +namespace runtime { + +template +void AppendMembers(std::ostream& os, const NDArray& nd_array, int64_t dim0) { + os << "=["; + for (int64_t i = 0; i < dim0; ++i) { + if (i > 0) { + os << ","; + } + os << reinterpret_cast(nd_array->data)[i]; + } + os << "]"; +} + +void AppendNDArray(std::ostream& os, const NDArray& nd_array, const DLDevice& host_device, + bool show_contents) { + os << "NDArray["; + os << "("; + for (int dim = 0; dim < nd_array->ndim; ++dim) { + if (dim > 0) { + os << ","; + } + os << nd_array->shape[dim]; + } + std::string basic_type = DLDataType2String(nd_array->dtype); + os << ")," << basic_type; + os << ",(" << nd_array->device.device_type; + os << "," << nd_array->device.device_id; + os << ")]"; + if (show_contents && nd_array->device.device_type == host_device.device_type && + nd_array->device.device_id == host_device.device_id) { + int64_t dim0; + if (nd_array->ndim == 0) { + dim0 = 1; + } else if (nd_array->ndim == 1) { + dim0 = nd_array->shape[0]; + if (dim0 > 10) { + // Too large. + dim0 = 0; + } + } else { + // Not rank-1. + dim0 = 0; + } + if (dim0 > 0) { + if (basic_type == "bool") { + AppendMembers(os, nd_array, dim0); + } else if (basic_type == "int8") { + AppendMembers(os, nd_array, dim0); + } else if (basic_type == "int16") { + AppendMembers(os, nd_array, dim0); + } else if (basic_type == "int32") { + AppendMembers(os, nd_array, dim0); + } else if (basic_type == "int64") { + AppendMembers(os, nd_array, dim0); + } else if (basic_type == "uint8") { + AppendMembers(os, nd_array, dim0); + } else if (basic_type == "uint16") { + AppendMembers(os, nd_array, dim0); + } else if (basic_type == "uint32") { + AppendMembers(os, nd_array, dim0); + } else if (basic_type == "uint64") { + AppendMembers(os, nd_array, dim0); + } else if (basic_type == "float32") { + AppendMembers(os, nd_array, dim0); + } else if (basic_type == "float64") { + AppendMembers(os, nd_array, dim0); + } + } + } +} + +void AppendADT(std::ostream& os, const ADT& adt, const DLDevice& host_device, bool show_contents) { + os << "ADT(" << adt->tag; + for (size_t i = 0; i < adt->size; ++i) { + os << ","; + AppendRuntimeObject(os, adt[i], host_device, show_contents); + } + os << ")"; +} + +void AppendRuntimeObject(std::ostream& os, const ObjectRef& object, const DLDevice& host_device, + bool show_contents) { + if (const auto* adt_obj = object.as()) { + AppendADT(os, GetRef(adt_obj), host_device, show_contents); + } else if (const auto* nd_array_cont = object.as()) { + AppendNDArray(os, GetRef(nd_array_cont), host_device, show_contents); + } else { + os << "?"; + } +} + +std::string RuntimeObject2String(const ObjectRef& object, const DLDevice& host_device, + bool show_contents) { + std::ostringstream os; + AppendRuntimeObject(os, object, host_device, show_contents); + return os.str(); +} + +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index 76c385ae9918..e2fe867630b0 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -24,6 +24,7 @@ #include #include +#include #include #include #include @@ -171,27 +172,13 @@ std::string Executable::GetBytecode() const { return oss.str(); } -namespace { -String ShapeString(const ShapeTuple& shape_tuple, DLDataType dtype) { - std::stringstream sizes; - sizes << DLDataType2String(dtype) << "["; - for (size_t i = 0; i < shape_tuple.size(); i++) { - if (i != 0) { - sizes << ", "; - } - sizes << shape_tuple.data()[i]; - } - sizes << "]"; - return 
String(sizes.str()); -} -} // namespace - std::string Executable::GetConstants() const { std::ostringstream oss; for (size_t i = 0; i < constants.size(); ++i) { const auto& constant = constants[i]; auto ndarray = Downcast(constant); - oss << "VM Const[" << i << "]: has shape " << ShapeString(ndarray.Shape(), ndarray->dtype) + oss << "VM Const[" << i + << "]: " << RuntimeObject2String(ndarray, virtual_devices[host_device_index]) << " on device index " << const_device_indexes[i] << std::endl; } return oss.str(); diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index acbbec0d2991..7a83c9acb906 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -24,6 +24,8 @@ #include #include +#include +#include #include #include #include @@ -292,13 +294,14 @@ Index VirtualMachine::PopFrame() { } void VirtualMachine::InvokeGlobal(const VMFunction& func, const std::vector& args) { - VLOG(2) << "Invoking global " << func.name << " " << args.size(); + VLOG(2) << "Invoking global " << func.name << " with " << args.size() << " args"; PushFrame(func.params.size(), this->pc_ + 1, func); for (size_t i = 0; i < args.size(); ++i) { WriteRegister(i, args[i]); + VLOG(2) << "arg " << i << " = " + << RuntimeObject2String(args[i], GetDevice(exec_->host_device_index)); } - VLOG(2) << "func.params= " << func.params.size(); code_ = func.instructions.data(); pc_ = 0; @@ -527,20 +530,35 @@ void VirtualMachine::RunLoop() { goto main_loop; } case Opcode::InvokePacked: { - VLOG(2) << "InvokedPacked " << instr.packed_index << " arity=" << instr.arity; ICHECK_LE(instr.packed_index, packed_funcs_.size()); const auto& func = packed_funcs_[instr.packed_index]; const auto& arity = instr.arity; std::vector args; for (Index i = 0; i < arity; ++i) { - VLOG(2) << "arg" << i << " $" << instr.packed_args[i]; auto arg = ReadRegister(instr.packed_args[i]); args.push_back(arg); +#if TVM_LOG_DEBUG + if (i < arity) { + const bool is_input = i < arity - instr.output_size; + VLOG(2) << (is_input ? "input" : "placeholder") << " arg " << i << " = " + << RuntimeObject2String(arg, GetDevice(exec_->host_device_index), + /*show_contents=*/is_input); + } +#endif } // We no longer need to write the registers back, we write directly // through the registers mutably. 
InvokePacked(instr.packed_index, func, arity, instr.output_size, args); + +#if TVM_LOG_DEBUG + for (Index i = arity - instr.output_size; i < arity; ++i) { + auto arg = ReadRegister(instr.packed_args[i]); + VLOG(2) << "output arg " << i << " = " + << RuntimeObject2String(arg, GetDevice(exec_->host_device_index)); + } +#endif + pc_++; goto main_loop; } @@ -606,19 +624,10 @@ void VirtualMachine::RunLoop() { auto storage_obj = ReadRegister(instr.alloc_tensor.storage); auto offset = LoadScalarInt(instr.alloc_tensor.offset); auto storage = Downcast(storage_obj); -#if TVM_LOG_DEBUG - std::ostringstream os; - os << "AllocTensor: "; - os << "offset=" << offset; - os << ", shape=["; - for (auto i : shape) { - os << i << ","; - } - os << "]"; - os << ", dtype=" << DLDataType2String(instr.alloc_tensor.dtype); - VLOG(2) << os.str(); -#endif auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor.dtype); + VLOG(2) << "allocated " + << RuntimeObject2String(obj, GetDevice(exec_->host_device_index), + /*show_contents=*/false); WriteRegister(instr.dst, obj); OpStopHook(); @@ -635,6 +644,9 @@ void VirtualMachine::RunLoop() { auto storage = Downcast(storage_obj); auto offset = LoadScalarInt(instr.alloc_tensor.offset); auto obj = storage->AllocNDArray(offset, shape, instr.alloc_tensor_reg.dtype); + VLOG(2) << "allocated " + << RuntimeObject2String(obj, GetDevice(exec_->host_device_index), + /*show_contents=*/false); WriteRegister(instr.dst, obj); OpStopHook(); @@ -668,7 +680,7 @@ void VirtualMachine::RunLoop() { auto storage_obj = SimpleObjAllocator().make_object(); Allocator* allocator = GetAllocator(instr.alloc_storage.device_index); ICHECK(allocator) << "Did you forget to init the VirtualMachine with devices?"; - VLOG(2) << "AllocStorage: allocation_size=" << size << ", alignment=" << alignment + VLOG(2) << "allocating with allocation_size=" << size << ", alignment=" << alignment << ", dtype_hint=" << DLDataType2String(instr.alloc_storage.dtype_hint) << ", device_index=" << instr.alloc_storage.device_index; @@ -688,6 +700,8 @@ void VirtualMachine::RunLoop() { for (int i = 0; i < ndim; ++i) { reinterpret_cast(out_tensor->data)[i] = input_array->shape[i]; } + VLOG(2) << "shape = " + << RuntimeObject2String(out_tensor, GetDevice(exec_->host_device_index)); WriteRegister(instr.dst, out_tensor); pc_++; goto main_loop; @@ -722,18 +736,10 @@ void VirtualMachine::RunLoop() { int64_t ndim = shape_tensor->shape[0]; std::vector shape(dims, dims + ndim); // Reshape the input tensor -#if TVM_LOG_DEBUG - std::ostringstream os; - os << "ReshapeTensor: "; - os << "shape=["; - for (auto i : shape) { - os << i << ","; - } - os << "]"; - os << ", dtype=" << DLDataType2String(tensor_arr->dtype); - VLOG(2) << os.str(); -#endif auto out_tensor = tensor_arr.CreateView(shape, tensor_arr->dtype); + VLOG(2) << "reshaped " + << RuntimeObject2String(tensor_obj, GetDevice(exec_->host_device_index)) << " to " + << RuntimeObject2String(out_tensor, GetDevice(exec_->host_device_index)); WriteRegister(instr.dst, out_tensor); OpStopHook(); pc_++; diff --git a/src/target/compilation_config.cc b/src/target/compilation_config.cc index 0401eebe51ef..a56e0ad0777c 100644 --- a/src/target/compilation_config.cc +++ b/src/target/compilation_config.cc @@ -62,31 +62,31 @@ void CompilationConfigNode::EstablishDefaultVirtualDevices(const transform::Pass if (host_target.defined()) { CHECK(!host_target->host.defined()) << "Host targets are not expected to have hosts"; host_device_type = static_cast(host_target->kind->device_type); - DLOG(INFO) 
<< "Using the given host target " << host_target->ToDebugString() - << " of device type " << host_device_type << " for the host target"; + VLOG(1) << "Using the given host target " << host_target->ToDebugString() << " of device type " + << host_device_type << " for the host target"; for (const auto& primitive_target : primitive_targets) { if (primitive_target->host.defined() && !StructuralEqual()(primitive_target->host, host_target)) { - DLOG(WARNING) << "The primitive target " << primitive_target->ToDebugString() - << " already has a host which disagrees with the desired host target. It " - << "will be ignored."; + VLOG(1) << "The primitive target " << primitive_target->ToDebugString() + << " already has a host which disagrees with the desired host target. It " + << "will be ignored."; } } } else if (primitive_targets.size() == 1 && primitive_targets.front()->host.defined()) { host_target = primitive_targets.front()->GetHost().value(); CHECK(!host_target->host.defined()) << "Host targets are not expected to have hosts"; host_device_type = static_cast(host_target->kind->device_type); - DLOG(INFO) << "Using the host of the unique primitive target, namely " - << host_target->ToDebugString() << " of device type " << host_device_type - << " for the host target"; + VLOG(1) << "Using the host of the unique primitive target, namely " + << host_target->ToDebugString() << " of device type " << host_device_type + << " for the host target"; } else if (primitive_targets.size() == 1 && primitive_targets.front()->kind->device_type == kDLCPU) { // In the homogenous case without an explicit host target just use the given target so long as // it's a CPU. host_device_type = kDLCPU; host_target = primitive_targets.front(); - DLOG(INFO) << "Using the unique primitive target " << host_target->ToDebugString() - << " of device type " << host_device_type << " for the host target"; + VLOG(1) << "Using the unique primitive target " << host_target->ToDebugString() + << " of device type " << host_device_type << " for the host target"; } else { // Fallback. host_device_type = kDLCPU; @@ -94,15 +94,15 @@ void CompilationConfigNode::EstablishDefaultVirtualDevices(const transform::Pass // in the hetrogeneous case since its options may not be appropriate for host code // (eg shape functions). Instead, create a fresh default Target. host_target = MakeDefaultTarget(host_device_type); - DLOG(WARNING) << "Using the default target " << host_target->ToDebugString() - << " of device type " << host_device_type << " for the host target"; + VLOG(1) << "Using the default target " << host_target->ToDebugString() << " of device type " + << host_device_type << " for the host target"; } ICHECK(host_target.defined()); ICHECK(!host_target->host.defined()); if (host_device_type != kDLCPU) { // I think we're on thin ice here until we've audited the code base for assumed kDLCPU. 
- LOG(WARNING) << "The host target is not a CPU."; + VLOG(1) << "The host target is not a CPU."; } // @@ -132,22 +132,22 @@ void CompilationConfigNode::EstablishDefaultVirtualDevices(const transform::Pass CHECK_GT(v, 0) << "The 'relay.fallback_device_type' pass attribute is set to an invalid device type " << v; default_primitive_device_type = static_cast(v); - DLOG(INFO) << "Using the 'relay.fallback_device_type' pass attribute " - << default_primitive_device_type - << " as the default device type for all primitive operations"; + VLOG(1) << "Using the 'relay.fallback_device_type' pass attribute " + << default_primitive_device_type + << " as the default device type for all primitive operations"; } else if (primitive_targets.size() == 1) { // In the homogeneous case there's no free choice. default_primitive_device_type = static_cast(primitive_targets.front()->kind->device_type); - DLOG(INFO) << "Using the device type " << default_primitive_device_type - << " of the unique primitive target as the default device type for all primitive " - << "operations"; + VLOG(1) << "Using the device type " << default_primitive_device_type + << " of the unique primitive target as the default device type for all primitive " + << "operations"; } else { // Fallback. Note that we'll require a primitive Target of kDLCPU device_type to be given // and won't manufacture one out of thin air. default_primitive_device_type = kDLCPU; - DLOG(WARNING) << "Using " << default_primitive_device_type - << " as the default device type for all primitive operations"; + VLOG(1) << "Using " << default_primitive_device_type + << " as the default device type for all primitive operations"; } // @@ -227,11 +227,11 @@ CompilationConfig::CompilationConfig(const transform::PassContext& pass_ctx, node->legacy_target_map.size() == 1 ? (*node->legacy_target_map.begin()).second : Target(); for (const auto& target : node->primitive_targets) { - DLOG(INFO) << "Target " << target->ToDebugString() << " of device type " - << target->kind->device_type << " is available for primitives"; + VLOG(1) << "Target " << target->ToDebugString() << " of device type " + << target->kind->device_type << " is available for primitives"; } - DLOG(INFO) << "Using default primitive virtual device " << node->default_primitive_virtual_device; - DLOG(INFO) << "Using host virtual device " << node->host_virtual_device; + VLOG(1) << "Using default primitive virtual device " << node->default_primitive_virtual_device; + VLOG(1) << "Using host virtual device " << node->host_virtual_device; data_ = std::move(node); } diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py index 7669d02cd536..0456401e8ad2 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level3.py +++ b/tests/python/relay/dyn/test_dynamic_op_level3.py @@ -21,7 +21,6 @@ import tvm import tvm.testing from tvm import relay, te -from tvm.relay import create_executor, transform from tvm.relay.testing import check_grad, run_infer_type @@ -44,6 +43,15 @@ def verify_func(func, data, ref_res, target_device=tvm.testing.enabled_targets() relay.backend.te_compiler.get().clear() +def check_on_vm(target, dev, args, expected_result, mod): + """ + Check that evaluating `expr` applied to the arguments produces + `result` on Relay VM. 
+ """ + rts_result = relay.create_executor("vm", device=dev, target=target, mod=mod).evaluate()(*args) + tvm.testing.assert_allclose(expected_result, rts_result.numpy()) + + @tvm.testing.uses_gpu def test_dyn_reshape(): def verify_reshape(shape, newshape, oshape): @@ -410,5 +418,59 @@ def verify_sparse_fill_empty_rows( ) +def test_dyn_copy(): + target = tvm.target.Target("llvm") + dev = tvm.cpu() + mod = tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(?, 3), int64]) -> Tensor[(?, 3), int64] { + copy(%x) + } + """ + ) + x_data = np.random.rand(15, 3).astype("int64") + expected = x_data + check_on_vm(target, dev, [x_data], expected, mod) + + +def test_dyn_copy_scalar(): + target = tvm.target.Target("llvm") + dev = tvm.cpu() + mod = tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%x: int32, %y: Tensor[(?), int32]) -> Tensor[(?), int32] { + %0 = copy(%x); + %1 = expand_dims(%0, axis=0); + %2 = (%y, %1); + concatenate(%2) + } + """ + ) + x_data = 3 + y_data = np.random.rand(7).astype("int32") + expected = np.concatenate((y_data, np.expand_dims(x_data, axis=0))) + check_on_vm(target, dev, [x_data, y_data], expected, mod) + + +def test_dyn_cast(): + target = tvm.target.Target("llvm") + dev = tvm.cpu() + mod = tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%x: Tensor[(?, 3), int64]) -> Tensor[(?, 3), int32] { + cast(%x, dtype="int32") + } + """ + ) + x_data = np.random.rand(15, 3).astype("int64") + expected = x_data.astype("int32") + check_on_vm(target, dev, [x_data], expected, mod) + + if __name__ == "__main__": - pytest.main([__file__]) + import sys + + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 1c60702982cc..7f0f8041b1a2 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -36,7 +36,7 @@ from tvm.relay.backend.vm import VMCompiler -def check_result(target, dev, args, expected_result, mod=None): +def check_result(target, dev, args, expected_result, mod): """ Check that evaluating `expr` applied to the arguments produces `result` on Relay VM. 
@@ -111,7 +111,7 @@ def test_id(target, dev): x_data = np.random.rand(10, 10).astype("float64") mod = tvm.IRModule() mod["main"] = f - check_result(target, dev, [x_data], x_data, mod=mod) + check_result(target, dev, [x_data], x_data, mod) def test_op(target, dev): @@ -120,7 +120,7 @@ def test_op(target, dev): x_data = np.random.rand(10, 10).astype("float32") mod = tvm.IRModule() mod["main"] = f - check_result(target, dev, [x_data], 2 * x_data, mod=mod) + check_result(target, dev, [x_data], 2 * x_data, mod) def any(x): @@ -140,10 +140,10 @@ def test_cond(target, dev): mod = tvm.IRModule() mod["main"] = f # same - check_result(target, dev, [x_data, x_data], True, mod=mod) + check_result(target, dev, [x_data, x_data], True, mod) # diff - check_result(target, dev, [x_data, y_data], False, mod=mod) + check_result(target, dev, [x_data, y_data], False, mod) @tvm.testing.known_failing_targets("vulkan") @@ -157,10 +157,10 @@ def test_simple_if(target, dev): mod = tvm.IRModule() mod["main"] = f # same - check_result(target, dev, [x_data, x_data], x_data, mod=mod) + check_result(target, dev, [x_data, x_data], x_data, mod) # diff - check_result(target, dev, [x_data, y_data], y_data, mod=mod) + check_result(target, dev, [x_data, y_data], y_data, mod) @tvm.testing.parametrize_targets("llvm") @@ -204,7 +204,7 @@ def test_unused_function(target, dev): x_data = np.random.rand(2, 2).astype("float32") y_data = x_data * 2 - check_result(target, dev, [x_data], y_data, mod=mod) + check_result(target, dev, [x_data], y_data, mod) def test_simple_call(target, dev): @@ -218,7 +218,7 @@ def test_simple_call(target, dev): i_data = np.array(0, dtype="int32") iarg = relay.var("iarg", shape=[], dtype="int32") mod["main"] = relay.Function([iarg], sum_up(iarg)) - check_result(target, dev, [i_data], i_data, mod=mod) + check_result(target, dev, [i_data], i_data, mod) def test_count_loop(target, dev): @@ -239,7 +239,7 @@ def test_count_loop(target, dev): mod["main"] = relay.Function([iarg], sum_up(iarg)) result = veval(mod, i_data, device=dev, target=target) tvm.testing.assert_allclose(result.numpy(), i_data) - check_result(target, dev, [i_data], i_data, mod=mod) + check_result(target, dev, [i_data], i_data, mod) def test_sum_loop(target, dev): @@ -263,7 +263,7 @@ def test_sum_loop(target, dev): iarg = relay.var("i", shape=[], dtype="int32") aarg = relay.var("accum", shape=[], dtype="int32") mod["main"] = relay.Function([iarg, aarg], sum_up(iarg, aarg)) - check_result(target, dev, [i_data, accum_data], sum(range(1, loop_bound + 1)), mod=mod) + check_result(target, dev, [i_data, accum_data], sum(range(1, loop_bound + 1)), mod) def test_tuple_fst(target, dev): @@ -274,7 +274,7 @@ def test_tuple_fst(target, dev): j_data = np.random.rand(10).astype("float32") mod = tvm.IRModule() mod["main"] = f - check_result(target, dev, [(i_data, j_data)], i_data, mod=mod) + check_result(target, dev, [(i_data, j_data)], i_data, mod) def test_tuple_second(target, dev): @@ -285,7 +285,7 @@ def test_tuple_second(target, dev): j_data = np.random.rand(10).astype("float32") mod = tvm.IRModule() mod["main"] = f - check_result(target, dev, [(i_data, j_data)], j_data, mod=mod) + check_result(target, dev, [(i_data, j_data)], j_data, mod) def test_list_constructor(target, dev): @@ -325,7 +325,7 @@ def test_let_tensor(target, dev): x_data = np.random.rand(*shape).astype("float32") mod = tvm.IRModule() mod["main"] = f - check_result(target, dev, [x_data], x_data + 42.0, mod=mod) + check_result(target, dev, [x_data], x_data + 42.0, mod) def 
test_let_scalar(target, dev): @@ -342,7 +342,7 @@ def test_let_scalar(target, dev): x_data = np.array(np.random.rand()).astype("float32") mod = tvm.IRModule() mod["main"] = f - check_result(target, dev, [x_data], x_data + 42.0, mod=mod) + check_result(target, dev, [x_data], x_data + 42.0, mod) def test_compose(target, dev): @@ -616,7 +616,7 @@ def test_add_op_scalar(target, dev): ] for (x_data, y_data) in x_y_data: mod["main"] = func - check_result(target, dev, [x_data, y_data], x_data + y_data, mod=mod) + check_result(target, dev, [x_data, y_data], x_data + y_data, mod) def test_add_op_scalar_int(target, dev): @@ -637,7 +637,7 @@ def test_add_op_scalar_int(target, dev): ] for (x_data, y_data) in x_y_data: mod["main"] = func - check_result(target, dev, [x_data, y_data], x_data + y_data, mod=mod) + check_result(target, dev, [x_data, y_data], x_data + y_data, mod) def test_add_op_tensor(target, dev): @@ -654,7 +654,7 @@ def test_add_op_tensor(target, dev): x_data = np.random.rand(10, 5).astype("float32") y_data = np.random.rand(10, 5).astype("float32") mod["main"] = func - check_result(target, dev, [x_data, y_data], x_data + y_data, mod=mod) + check_result(target, dev, [x_data, y_data], x_data + y_data, mod) def test_add_op_broadcast(target, dev): @@ -671,7 +671,7 @@ def test_add_op_broadcast(target, dev): x_data = np.random.rand(10, 5).astype("float32") y_data = np.random.rand(1, 5).astype("float32") mod["main"] = func - check_result(target, dev, [x_data, y_data], x_data + y_data, mod=mod) + check_result(target, dev, [x_data, y_data], x_data + y_data, mod) def test_vm_optimize_dynamic(): @@ -717,7 +717,7 @@ def body_with_free_var(i, acc): ret = relay.TupleGetItem(tup, 1) mod = tvm.IRModule() mod["main"] = relay.Function(relay.analysis.free_vars(ret), ret) - check_result(target, dev, args, expected, mod=mod) + check_result(target, dev, args, expected, mod) def test_vm_reshape_tensor(target, dev): @@ -1040,8 +1040,8 @@ def @main(%a: Tensor[(5, 7), float32], # - The offset of the tensor within the storage (second arg) to alloc_tensor # Both should be on the CPU assert "VirtualDevice[0]: device type 1" in exe.virtual_devices - assert "Const[0]: has shape int64[] on device index 0" in exe.constants - assert "Const[1]: has shape int64[] on device index 0" in exe.constants + assert "VM Const[0]: NDArray[(),int64,(1,0)]=[140] on device index 0" in exe.constants + assert "VM Const[1]: NDArray[(),int64,(1,0)]=[0] on device index 0" in exe.constants @tvm.testing.requires_cuda @@ -1073,7 +1073,7 @@ def @main(%x: Tensor[(2, 8), float32], # The newshape annotation should have been turned into a constant on the CPU. assert "VirtualDevice[0]: device type 1" in exe.virtual_devices - assert "Const[0]: has shape int64[3] on device index 0" in exe.constants + assert "VM Const[0]: NDArray[(3),int64,(1,0)]=[2,4,2] on device index 0" in exe.constants @tvm.testing.requires_cuda
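
For readers who want the gist of the fix without paging through the diff, the sketch below restates the python/tvm/relay/op/tensor.py change in isolation. It is a minimal sketch, not the file itself: it assumes `script` is TVM's hybrid-script decorator (imported here as `from tvm.te.hybrid import script`, the way other Relay op files pull it in), and that `output_tensor` and `const_range` are the usual hybrid-script intrinsics. The old one-line shape function returned its input placeholder, so the lowered TIR function never assigned to the output buffer allocated for it; the fixed version allocates the output explicitly and copies each dimension, with a separate rank-0 variant for scalar inputs (the scalar vs tensor switch mentioned in the commit message). The dispatch helper at the end mirrors the registered `copy_shape_func` without re-registering it.

# Minimal sketch restating the tensor.py change; assumes tvm.te.hybrid.script.
from tvm.te.hybrid import script


# Before (ill-formed): returns the input placeholder directly, so the lowered
# TIR shape function contains no store into its output and downstream code
# reads uninitialized dimensions (leading to OOMs or assertion failures).
@script
def _copy_shape_func(data_shape):
    return data_shape


# After: allocate the output shape tensor and copy each dimension explicitly.
@script
def _copy_shape_func_tensor(data_shape):
    ndim = data_shape.shape[0]
    out = output_tensor((ndim,), "int64")
    for i in const_range(ndim):
        out[i] = data_shape[i]
    return out


# After: rank-0 variant for scalar inputs; there are no dimensions to copy.
@script
def _copy_shape_func_scalar(data_shape):
    out = output_tensor((), "int64")
    return out


# The registered shape function then picks a variant based on input rank
# (in the patch this carries the @reg.register_shape_func("copy", False)
# decorator; it is omitted here to avoid re-registering the op).
def copy_shape_func(attrs, inputs, _):
    if len(inputs[0].shape) == 0:
        return [_copy_shape_func_scalar(inputs[0])]
    return [_copy_shape_func_tensor(inputs[0])]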