diff --git a/dali/benchmark/caffe2_alexnet_bench.cc b/dali/benchmark/caffe2_alexnet_bench.cc index 52e824f991d..cd43eb4b9c3 100755 --- a/dali/benchmark/caffe2_alexnet_bench.cc +++ b/dali/benchmark/caffe2_alexnet_bench.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -127,7 +127,7 @@ BENCHMARK_DEFINE_F(C2Alexnet, Caffe2Pipe)(benchmark::State& st) { // NOLINT } } - WriteCHWBatch(ws.Output(0), 128, 1, "img"); + WriteCHWBatch(ws.OutputRef(0), 128, 1, "img"); int num_batches = st.iterations() + static_cast(pipelined); st.counters["FPS"] = benchmark::Counter(batch_size*num_batches, benchmark::Counter::kIsRate); @@ -258,7 +258,7 @@ BENCHMARK_DEFINE_F(C2Alexnet, HybridPipe)(benchmark::State& st) { // NOLINT } } - // WriteCHWBatch(ws.Output(0), 128, 1, "img"); + // WriteCHWBatch(ws.OutputRef(0), 128, 1, "img"); int num_batches = st.iterations() + static_cast(pipelined); st.counters["FPS"] = benchmark::Counter(batch_size*num_batches, benchmark::Counter::kIsRate); diff --git a/dali/benchmark/caffe_alexnet_bench.cc b/dali/benchmark/caffe_alexnet_bench.cc index 7af2b9565db..d4369e71c26 100755 --- a/dali/benchmark/caffe_alexnet_bench.cc +++ b/dali/benchmark/caffe_alexnet_bench.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -128,7 +128,7 @@ BENCHMARK_DEFINE_F(Alexnet, CaffePipe)(benchmark::State& st) { // NOLINT } } - WriteCHWBatch(ws.Output(0), 128, 1, "img"); + WriteCHWBatch(ws.OutputRef(0), 128, 1, "img"); int num_batches = st.iterations() + static_cast(pipelined); st.counters["FPS"] = benchmark::Counter(batch_size*num_batches, benchmark::Counter::kIsRate); @@ -259,7 +259,7 @@ BENCHMARK_DEFINE_F(Alexnet, HybridPipe)(benchmark::State& st) { // NOLINT } } - // WriteCHWBatch(ws.Output(0), 128, 1, "img"); + // WriteCHWBatch(ws.OutputRef(0), 128, 1, "img"); int num_batches = st.iterations() + static_cast(pipelined); st.counters["FPS"] = benchmark::Counter(batch_size*num_batches, benchmark::Counter::kIsRate); diff --git a/dali/benchmark/decoder_bench.cc b/dali/benchmark/decoder_bench.cc index d44259f1fa7..c6e2294de6f 100644 --- a/dali/benchmark/decoder_bench.cc +++ b/dali/benchmark/decoder_bench.cc @@ -81,7 +81,7 @@ class DecoderBench : public DALIBenchmark { } } - // WriteCHWBatch(ws.Output(0), 128, 1, "img"); + // WriteCHWBatch(ws.OutputRef(0), 128, 1, "img"); int num_batches = st.iterations() + 1; st.counters["FPS"] = benchmark::Counter(batch_size*num_batches, benchmark::Counter::kIsRate); diff --git a/dali/benchmark/file_reader_alexnet_bench.cc b/dali/benchmark/file_reader_alexnet_bench.cc index a1cc6867a5c..e5989388b20 100755 --- a/dali/benchmark/file_reader_alexnet_bench.cc +++ b/dali/benchmark/file_reader_alexnet_bench.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2017, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -128,7 +128,7 @@ BENCHMARK_DEFINE_F(FileReaderAlexnet, CaffePipe)(benchmark::State& st) { // NOLI } } - WriteCHWBatch(ws.Output(0), 128, 1, "img"); + WriteCHWBatch(ws.OutputRef(0), 128, 1, "img"); int num_batches = st.iterations() + static_cast(pipelined); st.counters["FPS"] = benchmark::Counter(batch_size*num_batches, benchmark::Counter::kIsRate); diff --git a/dali/benchmark/resnet50_bench.cc b/dali/benchmark/resnet50_bench.cc index 12633d258d3..1bbf1c1ed07 100755 --- a/dali/benchmark/resnet50_bench.cc +++ b/dali/benchmark/resnet50_bench.cc @@ -131,7 +131,7 @@ BENCHMARK_DEFINE_F(RN50, C2Pipe)(benchmark::State& st) { // NOLINT } } - // WriteCHWBatch(ws.Output(0), 128, 1, "img"); + // WriteCHWBatch(ws.OutputRef(0), 128, 1, "img"); int num_batches = st.iterations() + static_cast(pipelined); st.counters["FPS"] = benchmark::Counter(batch_size*num_batches, benchmark::Counter::kIsRate); @@ -266,7 +266,7 @@ BENCHMARK_DEFINE_F(RN50, HybridPipe)(benchmark::State& st) { // NOLINT } } - // WriteCHWBatch(ws.Output(0), 128, 1, "img"); + // WriteCHWBatch(ws.OutputRef(0), 128, 1, "img"); int num_batches = st.iterations() + static_cast(pipelined); st.counters["FPS"] = benchmark::Counter(batch_size*num_batches, benchmark::Counter::kIsRate); @@ -379,7 +379,7 @@ BENCHMARK_DEFINE_F(RN50, nvJPEGPipe)(benchmark::State& st) { // NOLINT } } - // WriteCHWBatch(ws.Output(0), 128, 1, "img"); + // WriteCHWBatch(ws.OutputRef(0), 128, 1, "img"); int num_batches = st.iterations() + static_cast(pipelined); st.counters["FPS"] = benchmark::Counter(batch_size*num_batches, benchmark::Counter::kIsRate); diff --git a/dali/benchmark/resnet50_nvjpeg_bench.cc b/dali/benchmark/resnet50_nvjpeg_bench.cc index 893ce3eeb9f..b089be10e2f 100755 --- a/dali/benchmark/resnet50_nvjpeg_bench.cc +++ b/dali/benchmark/resnet50_nvjpeg_bench.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -112,7 +112,7 @@ BENCHMARK_DEFINE_F(RealRN50, nvjpegPipe)(benchmark::State& st) { // NOLINT } #if DALI_DEBUG - WriteHWCBatch(ws.Output(0), "img"); + WriteHWCBatch(ws.OutputRef(0), "img"); #endif int num_batches = st.iterations() + static_cast(pipelined); st.counters["FPS"] = benchmark::Counter(batch_size*num_batches, diff --git a/dali/c_api/c_api.cc b/dali/c_api/c_api.cc index 8902a3113f7..f1e0860ded7 100644 --- a/dali/c_api/c_api.cc +++ b/dali/c_api/c_api.cc @@ -318,9 +318,9 @@ void daliOutputRelease(daliPipelineHandle *pipe_handle) { int64_t daliOutputHasUniformShape(daliPipelineHandle* pipe_handle, int i) { dali::DeviceWorkspace* ws = reinterpret_cast(pipe_handle->ws); if (ws->OutputIsType(i)) { - return is_uniform(ws->Output(i).shape()); + return is_uniform(ws->OutputRef(i).shape()); } else { - return is_uniform(ws->Output(i).shape()); + return is_uniform(ws->OutputRef(i).shape()); } } @@ -328,7 +328,7 @@ template static int64_t *daliShapeAtHelper(dali::DeviceWorkspace *ws, int n, int k) { int64_t *c_shape = nullptr; std::vector shape; - const auto &out_tensor_list = ws->Output(n); + const auto &out_tensor_list = ws->OutputRef(n); if (k >= 0) { auto shape_span = out_tensor_list.tensor_shape_span(k); shape = std::vector(shape_span.begin(), shape_span.end()); @@ -366,7 +366,7 @@ int64_t* daliShapeAt(daliPipelineHandle* pipe_handle, int n) { template static dali_data_type_t daliTypeAtHelper(dali::DeviceWorkspace* ws, int n) { - const auto &out_tensor_list = ws->Output(n); + const auto &out_tensor_list = ws->OutputRef(n); auto type_id = out_tensor_list.type(); return static_cast(static_cast(type_id)); } @@ -383,7 +383,7 @@ dali_data_type_t daliTypeAt(daliPipelineHandle* pipe_handle, int n) { template static size_t daliNumTensorsHelper(dali::DeviceWorkspace* ws, int n) { - return ws->Output(n).num_samples(); + return ws->OutputRef(n).num_samples(); } size_t daliNumTensors(daliPipelineHandle* pipe_handle, int n) { @@ -397,7 +397,7 @@ size_t daliNumTensors(daliPipelineHandle* pipe_handle, int n) { template static size_t daliNumElementsHelper(dali::DeviceWorkspace* ws, int n) { - return ws->Output(n)._num_elements(); + return ws->OutputRef(n)._num_elements(); } size_t daliNumElements(daliPipelineHandle* pipe_handle, int n) { @@ -411,7 +411,7 @@ size_t daliNumElements(daliPipelineHandle* pipe_handle, int n) { template static size_t daliTensorSizeHelper(dali::DeviceWorkspace* ws, int n) { - return ws->Output(n).nbytes(); + return ws->OutputRef(n).nbytes(); } size_t daliTensorSize(daliPipelineHandle* pipe_handle, int n) { @@ -425,7 +425,7 @@ size_t daliTensorSize(daliPipelineHandle* pipe_handle, int n) { template static size_t daliMaxDimTensorsHelper(dali::DeviceWorkspace* ws, int n) { - const auto &out_tensor_list = ws->Output(n); + const auto &out_tensor_list = ws->OutputRef(n); size_t tensors_num = out_tensor_list.num_samples(); int max_num_dim = 0; for (size_t i = 0; i < tensors_num; ++i) { diff --git a/dali/c_api/c_api_test.cc b/dali/c_api/c_api_test.cc index 644e7974750..df83ef5c1f0 100644 --- a/dali/c_api/c_api_test.cc +++ b/dali/c_api/c_api_test.cc @@ -152,7 +152,7 @@ void ComparePipelinesOutputs(daliPipelineHandle &handle, Pipeline &baseline, EXPECT_EQ(daliNumTensors(&handle, output), batch_size); for (int elem = 0; elem < batch_size; elem++) { auto *shape = daliShapeAtSample(&handle, output, elem); - auto ref_shape = ws.Output(output).shape()[elem]; + auto ref_shape = ws.OutputRef(output).shape()[elem]; int D = ref_shape.size(); for (int d = 0; d < D; d++) EXPECT_EQ(shape[d], 
ref_shape[d]); @@ -162,7 +162,7 @@ void ComparePipelinesOutputs(daliPipelineHandle &handle, Pipeline &baseline, TensorList pipeline_output_cpu, c_api_output_cpu; // Unnecessary copy in case of CPUBackend, makes the code generic across Backends - pipeline_output_cpu.Copy(ws.Output(0), cuda_stream); + pipeline_output_cpu.Copy(ws.OutputRef(0), cuda_stream); auto num_elems = pipeline_output_cpu.shape().num_elements(); auto backend_buf = AllocBuffer(num_elems * sizeof(uint8_t), false); diff --git a/dali/operators/bbox/bbox_paste.cc b/dali/operators/bbox/bbox_paste.cc index 4b8588d8e21..82aee69da84 100644 --- a/dali/operators/bbox/bbox_paste.cc +++ b/dali/operators/bbox/bbox_paste.cc @@ -56,14 +56,14 @@ canvas and ``(1,1)`` aligns it to bottom-right. template<> void BBoxPaste::RunImpl(Workspace &ws) { - const auto &input = ws.Input(0); + const auto &input = ws.InputRef(0); const auto input_data = input.data(); DALI_ENFORCE(input.type() == DALI_FLOAT, "Bounding box in wrong format"); DALI_ENFORCE(input.size() % 4 == 0, "Bounding box tensor size must be a multiple of 4." "Got: " + std::to_string(input.size())); - auto &output = ws.Output(0); + auto &output = ws.OutputRef(0); output.Resize(input.shape(), DALI_FLOAT); auto *output_data = output.mutable_data(); diff --git a/dali/operators/debug/dump_image.cc b/dali/operators/debug/dump_image.cc index 266b320a242..29740871d0f 100644 --- a/dali/operators/debug/dump_image.cc +++ b/dali/operators/debug/dump_image.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -21,8 +21,8 @@ namespace dali { template<> void DumpImage::RunImpl(SampleWorkspace &ws) { - auto &input = ws.Input(0); - auto &output = ws.Output(0); + auto &input = ws.InputRef(0); + auto &output = ws.OutputRef(0); DALI_ENFORCE(input.ndim() == 3, make_string("Input images must have three dimensions, got input with `", diff --git a/dali/operators/debug/dump_image.cu b/dali/operators/debug/dump_image.cu index edcd5bd7e0a..913ce81da21 100644 --- a/dali/operators/debug/dump_image.cu +++ b/dali/operators/debug/dump_image.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2021, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -20,8 +20,8 @@ namespace dali { template<> void DumpImage::RunImpl(DeviceWorkspace &ws) { - auto &input = ws.Input(0); - auto &output = ws.Output(0); + auto &input = ws.InputRef(0); + auto &output = ws.OutputRef(0); DALI_ENFORCE(input.shape().sample_dim() == 3, diff --git a/dali/operators/decoder/host/host_decoder.cc b/dali/operators/decoder/host/host_decoder.cc index 55518397181..afc37b83896 100644 --- a/dali/operators/decoder/host/host_decoder.cc +++ b/dali/operators/decoder/host/host_decoder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -21,8 +21,8 @@ namespace dali { void HostDecoder::RunImpl(SampleWorkspace &ws) { - const auto &input = ws.Input(0); - auto &output = ws.Output(0); + const auto &input = ws.InputRef(0); + auto &output = ws.OutputRef(0); auto file_name = input.GetSourceInfo(); // Verify input diff --git a/dali/operators/decoder/nvjpeg/nvjpeg_decoder_decoupled_api.h b/dali/operators/decoder/nvjpeg/nvjpeg_decoder_decoupled_api.h index 29842aa08e2..267b27dd8d9 100644 --- a/dali/operators/decoder/nvjpeg/nvjpeg_decoder_decoupled_api.h +++ b/dali/operators/decoder/nvjpeg/nvjpeg_decoder_decoupled_api.h @@ -554,7 +554,7 @@ class nvJPEGDecoder : public Operator, CachedDecoderImpl { #endif // NVJPEG2K_ENABLED for (int i = 0; i < curr_batch_size; i++) { - const auto &in = ws.Input(0, i); + const auto &in = ws.InputRef(0)[i]; const auto in_size = in.size(); thread_pool_.AddWork([this, i, &in, in_size](int tid) { auto *input_data = in.data(); @@ -683,7 +683,7 @@ class nvJPEGDecoder : public Operator, CachedDecoderImpl { } void ProcessImagesCache(MixedWorkspace &ws) { - auto& output = ws.Output(0); + auto& output = ws.OutputRef(0); for (auto *sample : samples_cache_) { assert(sample); auto i = sample->sample_idx; @@ -694,12 +694,12 @@ class nvJPEGDecoder : public Operator, CachedDecoderImpl { } void ProcessImagesCuda(MixedWorkspace &ws) { - auto& output = ws.Output(0); + auto& output = ws.OutputRef(0); for (auto *sample : samples_single_) { assert(sample); auto i = sample->sample_idx; auto *output_data = output.mutable_tensor(i); - const auto &in = ws.Input(0, i); + const auto &in = ws.InputRef(0)[i]; thread_pool_.AddWork( [this, sample, &in, output_data](int tid) { SampleWorker(sample->sample_idx, sample->file_name, in.size(), tid, @@ -799,11 +799,11 @@ class nvJPEGDecoder : public Operator, CachedDecoderImpl { } void ProcessImagesHost(MixedWorkspace &ws) { - auto& output = ws.Output(0); + auto& output = ws.OutputRef(0); for (auto *sample : samples_host_) { auto i = sample->sample_idx; auto *output_data = output.mutable_tensor(i); - const auto &in = ws.Input(0, i); + const auto &in = ws.InputRef(0)[i]; ImageCache::ImageShape shape = output_shape_[i].to_static<3>(); thread_pool_.AddWork( [this, sample, &in, output_data, shape](int tid) { @@ -816,7 +816,7 @@ class nvJPEGDecoder : public Operator, CachedDecoderImpl { void ProcessImagesHw(MixedWorkspace &ws) { #if IS_HW_DECODER_COMPATIBLE - auto& output = ws.Output(0); + auto& output = ws.OutputRef(0); if (!samples_hw_batched_.empty()) { nvjpegJpegState_t &state = state_hw_batched_; assert(state != nullptr); @@ -839,7 +839,7 @@ class nvJPEGDecoder : public Operator, CachedDecoderImpl { for (auto *sample : samples_hw_batched_) { int i = sample->sample_idx; - const auto &in = ws.Input(0, i); + const auto &in = ws.InputRef(0)[i]; const auto &out_shape = output_shape_.tensor_shape(i); tv[j].ShareData(const_cast &>(in)); @@ -891,7 +891,7 @@ class nvJPEGDecoder : public Operator, CachedDecoderImpl { } void ProcessImages(MixedWorkspace &ws) { - auto &output = ws.Output(0); + auto &output = ws.OutputRef(0); assert(output_shape_.num_samples() == ws.GetInputBatchSize(0)); // If fails: Incorrect number of samples in shape output.Resize(output_shape_, DALI_UINT8); diff --git a/dali/operators/generic/flip.cc b/dali/operators/generic/flip.cc index 54e321101bf..11be631d296 100644 --- a/dali/operators/generic/flip.cc +++ b/dali/operators/generic/flip.cc @@ -59,8 +59,8 @@ void RunFlip(Tensor &output, const Tensor &input, template <> void Flip::RunImpl(Workspace &ws) { - const auto &input 
= ws.Input(0); - auto &output = ws.Output(0); + const auto &input = ws.InputRef(0); + auto &output = ws.OutputRef(0); auto layout = input.GetLayout(); output.SetLayout(layout); output.Resize(input.shape(), input.type()); diff --git a/dali/operators/generic/flip.cu b/dali/operators/generic/flip.cu index fb682f7b943..cc86291c4eb 100644 --- a/dali/operators/generic/flip.cu +++ b/dali/operators/generic/flip.cu @@ -43,8 +43,8 @@ void RunKernel(TensorList &output, const TensorList &inp template <> void Flip::RunImpl(Workspace &ws) { - const auto &input = ws.Input(0); - auto &output = ws.Output(0); + const auto &input = ws.InputRef(0); + auto &output = ws.OutputRef(0); output.SetLayout(input.GetLayout()); output.Resize(input.shape(), input.type()); auto curr_batch_size = ws.GetInputBatchSize(0); diff --git a/dali/operators/generic/pad.cu b/dali/operators/generic/pad.cu index 62ceb2c074e..11bef081727 100644 --- a/dali/operators/generic/pad.cu +++ b/dali/operators/generic/pad.cu @@ -25,7 +25,7 @@ template <> bool Pad::SetupImpl(std::vector &output_desc, const workspace_t &ws) { output_desc.resize(1); - const auto &input = ws.Input(0); + const auto &input = ws.InputRef(0); auto in_shape = input.shape(); auto in_layout = input.GetLayout(); int ndim = in_shape.sample_dim(); @@ -57,8 +57,8 @@ bool Pad::SetupImpl(std::vector &output_desc, template <> void Pad::RunImpl(workspace_t &ws) { - const auto &input = ws.Input(0); - auto &output = ws.Output(0); + const auto &input = ws.InputRef(0); + auto &output = ws.OutputRef(0); output.SetLayout(input.GetLayout()); int ndim = input.shape().sample_dim(); TYPE_SWITCH(input.type(), type2id, T, PAD_SUPPORTED_TYPES, ( diff --git a/dali/operators/image/color/brightness_contrast.cu b/dali/operators/image/color/brightness_contrast.cu index 501a4f53763..fb42505d024 100644 --- a/dali/operators/image/color/brightness_contrast.cu +++ b/dali/operators/image/color/brightness_contrast.cu @@ -54,8 +54,8 @@ bool BrightnessContrastGpu::SetupImpl(std::vector &output_desc, void BrightnessContrastGpu::RunImpl(workspace_t &ws) { - const auto &input = ws.template Input(0); - auto &output = ws.template Output(0); + const auto &input = ws.template InputRef(0); + auto &output = ws.template OutputRef(0); output.SetLayout(input.GetLayout()); TYPE_SWITCH(input.type(), type2id, InputType, (uint8_t, int16_t, int32_t, float), ( TYPE_SWITCH(output_type_, type2id, OutputType, (uint8_t, int16_t, int32_t, float), ( diff --git a/dali/operators/image/color/color_twist.cu b/dali/operators/image/color/color_twist.cu index 7917d1bc80b..e4b10ed26f8 100644 --- a/dali/operators/image/color/color_twist.cu +++ b/dali/operators/image/color/color_twist.cu @@ -49,8 +49,8 @@ bool ColorTwistGpu::SetupImpl(std::vector &output_desc, const Device void ColorTwistGpu::RunImpl(workspace_t &ws) { - const auto &input = ws.template Input(0); - auto &output = ws.template Output(0); + const auto &input = ws.template InputRef(0); + auto &output = ws.template OutputRef(0); output.SetLayout(input.GetLayout()); TYPE_SWITCH(input.type(), type2id, InputType, (uint8_t, int16_t, int32_t, float), ( TYPE_SWITCH(output_type_, type2id, OutputType, (uint8_t, int16_t, int32_t, float), ( diff --git a/dali/operators/image/color/old_color_twist.cc b/dali/operators/image/color/old_color_twist.cc index 0b52b9f25b7..986503497dd 100644 --- a/dali/operators/image/color/old_color_twist.cc +++ b/dali/operators/image/color/old_color_twist.cc @@ -213,9 +213,9 @@ typedef NppStatus (*colorTwistFunc)(const Npp8u *pSrc, int nSrcStep, Npp8u *pDst 
template<> void OldColorTwistBase::RunImpl(DeviceWorkspace &ws) { - const auto &input = ws.Input(0); + const auto &input = ws.InputRef(0); DALI_ENFORCE(IsType(input.type()), "Color augmentations accept only uint8 tensors"); - auto &output = ws.Output(0); + auto &output = ws.OutputRef(0); output.Resize(input.shape(), DALI_UINT8); output.SetLayout(input.GetLayout()); @@ -248,8 +248,8 @@ void OldColorTwistBase::RunImpl(DeviceWorkspace &ws) { template <> void OldColorTwistBase::RunImpl(SampleWorkspace &ws) { - const auto &input = ws.Input(0); - auto &output = ws.Output(0); + const auto &input = ws.InputRef(0); + auto &output = ws.OutputRef(0); const auto &input_shape = input.shape(); CheckParam(input, "Color augmentation"); diff --git a/dali/operators/image/paste/multipaste.cu b/dali/operators/image/paste/multipaste.cu index aea5171782d..64e2eeaa9e7 100644 --- a/dali/operators/image/paste/multipaste.cu +++ b/dali/operators/image/paste/multipaste.cu @@ -55,7 +55,7 @@ void MultiPasteGPU::InitSamples(const TensorListShape<> &out_shape) { template void MultiPasteGPU::SetupTyped(const workspace_t &ws, const TensorListShape<> &out_shape) { - const auto &images = ws.template Input(0); + const auto &images = ws.template InputRef(0); const auto &in = view(images); using Kernel = kernels::PasteGPU; kernels::KernelContext ctx; @@ -68,8 +68,8 @@ void MultiPasteGPU::SetupTyped(const workspace_t &ws, template void MultiPasteGPU::RunTyped(workspace_t &ws) { - const auto &images = ws.template Input(0); - auto &output = ws.template Output(0); + const auto &images = ws.template InputRef(0); + auto &output = ws.template OutputRef(0); output.SetLayout(images.GetLayout()); auto out_shape = output.shape(); diff --git a/dali/operators/image/paste/paste.cu b/dali/operators/image/paste/paste.cu index 0bd6e78ae30..e420d0eb386 100644 --- a/dali/operators/image/paste/paste.cu +++ b/dali/operators/image/paste/paste.cu @@ -124,8 +124,8 @@ void Paste::SetupSharedSampleParams(DeviceWorkspace &ws) { template<> void Paste::SetupSampleParams(DeviceWorkspace &ws) { - auto &input = ws.Input(0); - auto &output = ws.Output(0); + auto &input = ws.InputRef(0); + auto &output = ws.OutputRef(0); auto curr_batch_size = ws.GetInputBatchSize(0); std::vector> output_shape(curr_batch_size); diff --git a/dali/operators/image/resize/random_resized_crop.cu b/dali/operators/image/resize/random_resized_crop.cu index 58c6a0304a9..49049937417 100644 --- a/dali/operators/image/resize/random_resized_crop.cu +++ b/dali/operators/image/resize/random_resized_crop.cu @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2020, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -30,8 +30,8 @@ void RandomResizedCrop::BackendInit() { template<> void RandomResizedCrop::RunImpl(DeviceWorkspace &ws) { - auto &input = ws.Input(0); - auto &output = ws.Output(0); + auto &input = ws.InputRef(0); + auto &output = ws.OutputRef(0); RunResize(ws, output, input); output.SetLayout(input.GetLayout()); } diff --git a/dali/operators/image/resize/resize.cc b/dali/operators/image/resize/resize.cc index 086b4409ac1..95a506d44c5 100755 --- a/dali/operators/image/resize/resize.cc +++ b/dali/operators/image/resize/resize.cc @@ -82,14 +82,14 @@ void Resize::InitializeBackend() { template<> void Resize::RunImpl(DeviceWorkspace &ws) { - const auto &input = ws.Input(0); - auto &output = ws.Output(0); + const auto &input = ws.InputRef(0); + auto &output = ws.OutputRef(0); RunResize(ws, output, input); output.SetLayout(input.GetLayout()); if (save_attrs_) { - auto &attr_out = ws.Output(1); + auto &attr_out = ws.OutputRef(1); const auto &attr_shape = attr_out.shape(); assert(attr_shape.num_samples() == input.shape().num_samples() && attr_shape.sample_dim() == 1 && diff --git a/dali/operators/image/resize/resize_crop_mirror.h b/dali/operators/image/resize/resize_crop_mirror.h index c1621085518..04ff4aade6d 100755 --- a/dali/operators/image/resize/resize_crop_mirror.h +++ b/dali/operators/image/resize/resize_crop_mirror.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -164,10 +164,10 @@ class ResizeCropMirrorAttr : protected CropAttr { * @return const vector One matching shape for all inputs */ virtual const std::vector CheckShapes(const SampleWorkspace *ws) { - const auto &input = ws->Input(0); + const auto &input = ws->InputRef(0); // enforce that all shapes match for (int i = 1; i < ws->NumInput(); ++i) { - DALI_ENFORCE(input.SameShape(ws->Input(i))); + DALI_ENFORCE(input.SameShape(ws->InputRef(i))); } DALI_ENFORCE(input.ndim() == 3, "Operator expects 3-dimensional image input."); @@ -230,8 +230,8 @@ class ResizeCropMirror : public Operator, protected ResizeCropMirror } inline void RunResizeImpl(SampleWorkspace &ws, resizeCropMirroHost func) { - auto &input = ws.Input(0); - auto &output = ws.Output(0); + auto &input = ws.InputRef(0); + auto &output = ws.OutputRef(0); CheckParam(input, "ResizeCropMirror"); const TransformMeta &meta = per_thread_meta_[ws.thread_idx()]; diff --git a/dali/operators/python_function/dltensor_function.cc b/dali/operators/python_function/dltensor_function.cc index e5987d45da5..845ae62185c 100644 --- a/dali/operators/python_function/dltensor_function.cc +++ b/dali/operators/python_function/dltensor_function.cc @@ -78,7 +78,7 @@ py::list PrepareDLTensorInputs(HostWorkspace &ws) { for (Index idx = 0; idx < ws.NumInput(); ++idx) { py::list dl_tensor_list; for (Index i = 0; i < ws.GetInputBatchSize(idx); ++i) { - auto &t = ws.Input(idx, i); + auto &t = ws.InputRef(idx)[i]; auto dl_capsule = TensorToDLPackView(const_cast&>(t)); dl_tensor_list.append(dl_capsule); } @@ -106,7 +106,7 @@ py::list PrepareDLTensorInputsPerSample(HostWorkspace &ws) { for (Index s = 0; s < batch_size; ++s) { py::list tuple; for (Index idx = 0; idx < ws.NumInput(); ++idx) { - auto &t = ws.Input(idx, s); + auto &t = ws.InputRef(idx)[s]; auto dl_capsule = TensorToDLPackView(const_cast&>(t)); tuple.append(dl_capsule); } diff --git 
a/dali/operators/reader/coco_reader_op.cc b/dali/operators/reader/coco_reader_op.cc index aaafb98702c..62d46589acf 100755 --- a/dali/operators/reader/coco_reader_op.cc +++ b/dali/operators/reader/coco_reader_op.cc @@ -201,7 +201,7 @@ void COCOReader::RunImpl(SampleWorkspace &ws) { const ImageLabelWrapper& image_label = GetSample(ws.data_idx()); Index image_size = image_label.image.size(); - auto &image_output = ws.Output(0); + auto &image_output = ws.OutputRef(0); int image_idx = image_label.label; image_output.Resize({image_size}, DALI_UINT8); @@ -210,20 +210,20 @@ void COCOReader::RunImpl(SampleWorkspace &ws) { auto &loader_impl = LoaderImpl(); auto bboxes = loader_impl.bboxes(image_idx); - auto &boxes_output = ws.Output(1); + auto &boxes_output = ws.OutputRef(1); boxes_output.Resize({bboxes.size(), 4}, DALI_FLOAT); std::memcpy(boxes_output.mutable_data(), bboxes.data(), bboxes.size() * sizeof(vec<4>)); auto labels = loader_impl.labels(image_idx); - auto &labels_output = ws.Output(2); + auto &labels_output = ws.OutputRef(2); labels_output.Resize({labels.size()}, DALI_INT32); // 0.28dev: changed shape from {N, 1} to {N} std::memcpy(labels_output.mutable_data(), labels.data(), labels.size() * sizeof(int)); int curr_out_idx = 3; if (output_polygon_masks_) { - auto &polygons_output = ws.Output(curr_out_idx++); + auto &polygons_output = ws.OutputRef(curr_out_idx++); auto polygons = loader_impl.polygons(image_idx); polygons_output.Resize({polygons.size(), 3}, DALI_INT32); std::memcpy(polygons_output.mutable_data(), @@ -235,7 +235,7 @@ void COCOReader::RunImpl(SampleWorkspace &ws) { poly_data[i * 3 + 2] *= 2; } } - auto &vertices_output = ws.Output(curr_out_idx++); + auto &vertices_output = ws.OutputRef(curr_out_idx++); auto vertices = loader_impl.vertices(image_idx); vertices_output.Resize({vertices.size(), 2}, DALI_FLOAT); std::memcpy(vertices_output.mutable_data(), @@ -243,7 +243,7 @@ void COCOReader::RunImpl(SampleWorkspace &ws) { } if (output_pixelwise_masks_) { - auto &masks_output = ws.Output(curr_out_idx++); + auto &masks_output = ws.OutputRef(curr_out_idx++); auto masks_info = loader_impl.pixelwise_masks_info(image_idx); masks_output.Resize(masks_info.shape, DALI_INT32); masks_output.SetLayout("HWC"); @@ -251,7 +251,7 @@ void COCOReader::RunImpl(SampleWorkspace &ws) { } if (output_image_ids_) { - auto &id_output = ws.Output(curr_out_idx++); + auto &id_output = ws.OutputRef(curr_out_idx++); id_output.Resize({1}, DALI_INT32); *(id_output.mutable_data()) = loader_impl.image_id(image_idx); } diff --git a/dali/operators/reader/coco_reader_op_test.cc b/dali/operators/reader/coco_reader_op_test.cc index 6b86bfe0051..84dda50f870 100644 --- a/dali/operators/reader/coco_reader_op_test.cc +++ b/dali/operators/reader/coco_reader_op_test.cc @@ -107,7 +107,7 @@ class CocoReaderTest : public ::testing::Test { int ObjectCount(bool masks) { return masks ? 
7 : 194; } std::vector CopyIds(DeviceWorkspace &ws, int ids_out_idx = 3) { - auto &output = ws.Output(ids_out_idx); + auto &output = ws.OutputRef(ids_out_idx); const auto &shape = output.shape(); vector ids(shape.size()); @@ -182,8 +182,8 @@ class CocoReaderTest : public ::testing::Test { void CheckInstances(DeviceWorkspace &ws, bool ltrb, bool ratio, bool skip_empty, int expected_size, bool polygon_masks, bool polygon_masks_legacy) { - const auto &boxes_output = ws.Output(1); - const auto &labels_output = ws.Output(2); + const auto &boxes_output = ws.OutputRef(1); + const auto &labels_output = ws.OutputRef(2); const auto &boxes_shape = boxes_output.shape(); const auto &labels_shape = labels_output.shape(); @@ -254,8 +254,8 @@ class CocoReaderTest : public ::testing::Test { } if (polygon_masks || polygon_masks_legacy) { - const auto &polygons_output = ws.Output(3); - const auto &vertices_output = ws.Output(4); + const auto &polygons_output = ws.OutputRef(3); + const auto &vertices_output = ws.OutputRef(4); const auto &polygons_shape = polygons_output.shape(); const auto &vertices_shape = vertices_output.shape(); @@ -605,7 +605,7 @@ TEST_F(CocoReaderTest, PixelwiseMasks) { pipe2.Outputs(&ws2); for (auto *ws : {&ws1, &ws2}) { - const auto &masks_output = ws->Output(3); + const auto &masks_output = ws->OutputRef(3); const auto &masks_shape = masks_output.shape(); TensorListShape<3> pixelwise_masks_shape({ diff --git a/dali/operators/reader/file_reader_op.h b/dali/operators/reader/file_reader_op.h index 3f87eca9cbd..8cd41e72568 100644 --- a/dali/operators/reader/file_reader_op.h +++ b/dali/operators/reader/file_reader_op.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -37,8 +37,8 @@ class FileReader : public DataReader { const auto& image_label = GetSample(idx); // copy from raw_data -> outputs directly - auto &image_output = ws.Output(0); - auto &label_output = ws.Output(1); + auto &image_output = ws.OutputRef(0); + auto &label_output = ws.OutputRef(1); Index image_size = image_label.image.size(); diff --git a/dali/operators/reader/nemo_asr_reader_op.cc b/dali/operators/reader/nemo_asr_reader_op.cc index 02ad5280411..0d2bc7472ff 100755 --- a/dali/operators/reader/nemo_asr_reader_op.cc +++ b/dali/operators/reader/nemo_asr_reader_op.cc @@ -193,7 +193,7 @@ void NemoAsrReader::RunImpl(SampleWorkspace &ws) { const auto &sample = GetSample(ws.data_idx()); const auto &sample_audio = GetDecodedAudioSample(ws.data_idx()); - auto &audio = ws.Output(0); + auto &audio = ws.OutputRef(0); audio.Copy(sample_audio, 0); DALIMeta meta; @@ -203,14 +203,14 @@ void NemoAsrReader::RunImpl(SampleWorkspace &ws) { int next_out_idx = 1; if (read_sr_) { - auto &sample_rate = ws.Output(next_out_idx++); + auto &sample_rate = ws.OutputRef(next_out_idx++); sample_rate.Resize({}, DALI_FLOAT); sample_rate.mutable_data()[0] = sample.audio_meta().sample_rate; sample_rate.SetMeta(meta); } if (read_text_) { - auto &text_out = ws.Output(next_out_idx++); + auto &text_out = ws.OutputRef(next_out_idx++); const auto &text = sample.text(); int64_t text_sz = text.length(); text_out.Resize({text_sz}, DALI_UINT8); @@ -219,7 +219,7 @@ void NemoAsrReader::RunImpl(SampleWorkspace &ws) { } if (read_idxs_) { - auto &idxs_out = ws.Output(next_out_idx++); + auto &idxs_out = ws.OutputRef(next_out_idx++); idxs_out.Resize({1}, DALI_INT64); *idxs_out.mutable_data() = sample.index(); idxs_out.SetMeta(meta); diff --git a/dali/operators/reader/parser/caffe2_parser.h b/dali/operators/reader/parser/caffe2_parser.h index 15a336b6ab2..fa2317e3848 100644 --- a/dali/operators/reader/parser/caffe2_parser.h +++ b/dali/operators/reader/parser/caffe2_parser.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -121,19 +121,19 @@ void ParseLabels(const caffe2::TensorProtos& protos, const int num_labels, SampleWorkspace* ws, int consumed_inputs) { - auto& label_tensor = ws->Output(consumed_inputs); + auto& label_tensor = ws->OutputRef(consumed_inputs); switch (label_type) { case SINGLE_LABEL: { // single element, from protos(1) to Output(consumed_inputs) // ensure we only have a single label in the proto DALI_ENFORCE(proto_data_size(protos.protos(consumed_inputs)) == 1); - extract_data(protos.protos(consumed_inputs), ws->Output(consumed_inputs)); + extract_data(protos.protos(consumed_inputs), ws->OutputRef(consumed_inputs)); break; } case MULTI_LABEL_SPARSE: { // multiple labels, all 1. 
in elements defined in protos(consumed_inputs) - auto& label_tensor = ws->Output(consumed_inputs); + auto& label_tensor = ws->OutputRef(consumed_inputs); label_tensor.set_type(); label_tensor.Resize({num_labels}); @@ -150,7 +150,7 @@ void ParseLabels(const caffe2::TensorProtos& protos, } case MULTI_LABEL_DENSE: { // multiple elements, stored contiguously - extract_data(protos.protos(consumed_inputs), ws->Output(consumed_inputs)); + extract_data(protos.protos(consumed_inputs), ws->OutputRef(consumed_inputs)); break; } case MULTI_LABEL_WEIGHTED_SPARSE: { @@ -196,7 +196,7 @@ class Caffe2Parser : public Parser> { if (image_available_) { - auto& image = ws->Output(consumed_inputs); + auto& image = ws->OutputRef(consumed_inputs); const caffe2::TensorProto& image_proto = protos.protos(consumed_inputs); // copy image -- if type is string, image is encoded, if bytes, image isn't encoded @@ -243,7 +243,7 @@ class Caffe2Parser : public Parser> { for (int i = additional_proto_start; i < additional_proto_end; ++i) { auto& additional_proto = protos.protos(i); - auto& output_tensor = ws->Output(consumed_inputs); + auto& output_tensor = ws->OutputRef(consumed_inputs); switch (additional_proto.data_type()) { case caffe2::TensorProto::FLOAT: @@ -269,7 +269,7 @@ class Caffe2Parser : public Parser> { DALI_ENFORCE(bbox_proto.data_type() == caffe2::TensorProto::INT32); DALI_ENFORCE(bbox_proto.int32_data_size() == 4); - extract_data(bbox_proto, ws->Output(consumed_inputs)); + extract_data(bbox_proto, ws->OutputRef(consumed_inputs)); } } diff --git a/dali/operators/reader/parser/caffe_parser.h b/dali/operators/reader/parser/caffe_parser.h index e6a3399e33c..6a68cda7ac1 100644 --- a/dali/operators/reader/parser/caffe_parser.h +++ b/dali/operators/reader/parser/caffe_parser.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -36,7 +36,7 @@ class CaffeParser : public Parser> { if (image_available_ && datum.has_data()) { bool encoded_data = true; - auto& image = ws->Output(out_tensors); + auto& image = ws->OutputRef(out_tensors); if (datum.has_encoded() && !datum.encoded()) { encoded_data = false; } @@ -53,7 +53,7 @@ class CaffeParser : public Parser> { } if (label_available_ && datum.has_label()) { - auto& label = ws->Output(out_tensors); + auto& label = ws->OutputRef(out_tensors); // copy label label.Resize({1}, DALI_INT32); diff --git a/dali/operators/reader/parser/parser_test.cc b/dali/operators/reader/parser/parser_test.cc index 2d21e306b7e..fb4723ab1d7 100644 --- a/dali/operators/reader/parser/parser_test.cc +++ b/dali/operators/reader/parser/parser_test.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -45,7 +45,7 @@ class IntArrayParser : public Parser { printf("H: %d, W: %d, C: %d\n", H, W, C); - Tensor& output = ws->template Output(0); + Tensor& output = ws->template OutputRef(0); output.Resize({H, W, C}, DALI_INT32); int *output_data = output.template mutable_data(); @@ -75,10 +75,10 @@ TYPED_TEST(ParserTest, BasicTest) { HostWorkspace workspace; SampleWorkspace ws; - workspace.GetSample(&ws, 0, 0); + MakeSampleView(ws, workspace, 0, 0); shared_ptr> t(new Tensor()); - ws.AddOutput(t); + ws.AddOutput(t.get()); IntArrayParser parser(OpSpec("temp")); IntArrayWrapper ia_wrapper = {data.data(), data.size()}; diff --git a/dali/operators/reader/parser/recordio_parser.h b/dali/operators/reader/parser/recordio_parser.h index 98c182fd889..f6d9daeb329 100644 --- a/dali/operators/reader/parser/recordio_parser.h +++ b/dali/operators/reader/parser/recordio_parser.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -35,8 +35,8 @@ class RecordIOParser : public Parser> { } void Parse(const Tensor& data, SampleWorkspace* ws) override { - auto& image = ws->Output(0); - auto& label = ws->Output(1); + auto& image = ws->OutputRef(0); + auto& label = ws->OutputRef(1); ReadSingleImageRecordIO(image, label, data.data()); image.SetSourceInfo(data.GetSourceInfo()); } diff --git a/dali/operators/reader/parser/sequence_parser.cc b/dali/operators/reader/parser/sequence_parser.cc index 0f76cdaeef4..0f02c2f6f3c 100644 --- a/dali/operators/reader/parser/sequence_parser.cc +++ b/dali/operators/reader/parser/sequence_parser.cc @@ -21,7 +21,7 @@ namespace dali { void SequenceParser::Parse(const TensorSequence& data, SampleWorkspace* ws) { - auto& sequence = ws->Output(0); + auto& sequence = ws->OutputRef(0); sequence.SetLayout("FHWC"); Index seq_length = data.tensors.size(); diff --git a/dali/operators/reader/parser/tfrecord_parser.h b/dali/operators/reader/parser/tfrecord_parser.h index a9d6c5444df..c38450f5ca6 100644 --- a/dali/operators/reader/parser/tfrecord_parser.h +++ b/dali/operators/reader/parser/tfrecord_parser.h @@ -63,7 +63,7 @@ class TFRecordParser : public Parser> { " (raw data length: ", length, "bytes).")); for (size_t i = 0; i < features_.size(); ++i) { - auto& output = ws->Output(i); + auto& output = ws->OutputRef(i); Feature& f = features_[i]; std::string& name = feature_names_[i]; auto& feature = example.features().feature(); diff --git a/dali/operators/reader/reader_op.h b/dali/operators/reader/reader_op.h index 08ed601b963..accd945b681 100644 --- a/dali/operators/reader/reader_op.h +++ b/dali/operators/reader/reader_op.h @@ -237,7 +237,7 @@ class DataReader : public Operator { DALI_ENFORCE(cached_outputs.size() == num_outputs, "Unexpected number of outputs"); for (std::size_t i = 0; i < cached_outputs.size(); i++) { - auto& output = ws->Output(i); + auto& output = ws->OutputRef(i); output.Copy(cached_outputs[i], 0); } return; @@ -262,7 +262,7 @@ class DataReader : public Operator { first_output.Resize({1}, DALI_UINT8); for (std::size_t i = 1; i < cached_outputs.size(); i++) { - auto& output = ws->Output(i); + auto& output = ws->OutputRef(i); cached_outputs[i].set_pinned(false); cached_outputs[i].Copy(output, 0); } diff --git a/dali/operators/reader/reader_op_test.cc b/dali/operators/reader/reader_op_test.cc index fd25495d242..231a1d0ba44 
100644 --- a/dali/operators/reader/reader_op_test.cc +++ b/dali/operators/reader/reader_op_test.cc @@ -88,7 +88,7 @@ class DummyDataReader : public DataReader> { void RunImpl(SampleWorkspace &ws) override { std::this_thread::sleep_for(std::chrono::milliseconds(5)); - ws.Output(0).Copy(GetSample(ws.data_idx()), 0); + ws.OutputRef(0).Copy(GetSample(ws.data_idx()), 0); } private: @@ -220,7 +220,7 @@ TYPED_TEST(ReaderTest, SequenceTest) { pipe.RunCPU(); pipe.RunGPU(); pipe.Outputs(&ws); - auto shape = ws.Output(0).AsTensor()->shape(); + auto shape = ws.OutputRef(0).AsTensor()->shape(); // We have NFHWC format const auto batch_size = shape[0]; const auto frame_count = shape[1]; @@ -236,7 +236,7 @@ TYPED_TEST(ReaderTest, SequenceTest) { auto start_frame = (i * batch_size + sample) % (16 - 3 + 1); for (int frame = 0; frame < frame_count; frame++) { auto off = sample * seq_size + frame * frame_size; - auto val = ws.Output(0).AsTensor()->data()[off]; + auto val = ws.OutputRef(0).AsTensor()->data()[off]; decltype(val) expected = start_frame + frame; ASSERT_EQ(val, expected); } diff --git a/dali/operators/reader/video_reader_decoder_op.cc b/dali/operators/reader/video_reader_decoder_op.cc index 664e693937a..1548a08c04f 100644 --- a/dali/operators/reader/video_reader_decoder_op.cc +++ b/dali/operators/reader/video_reader_decoder_op.cc @@ -26,12 +26,12 @@ VideoReaderDecoder::VideoReaderDecoder(const OpSpec &spec) void VideoReaderDecoder::RunImpl(SampleWorkspace &ws) { const auto &sample = GetSample(ws.data_idx()); - auto &video_output = ws.Output(0); + auto &video_output = ws.OutputRef(0); video_output.Copy(sample.data_, 0); if (has_labels_) { - auto &label_output = ws.Output(1); + auto &label_output = ws.OutputRef(1); label_output.Resize({}, DALIDataType::DALI_INT32); label_output.mutable_data()[0] = sample.label_; } diff --git a/dali/operators/reader/video_reader_op.h b/dali/operators/reader/video_reader_op.h index 5e7d9a73697..a00032912c7 100644 --- a/dali/operators/reader/video_reader_op.h +++ b/dali/operators/reader/video_reader_op.h @@ -129,16 +129,16 @@ class VideoReader : public DataReader { void PrepareAdditionalOutputs(DeviceWorkspace &ws) { int output_index = 1; if (output_labels_) { - label_output_ = &ws.Output(output_index++); + label_output_ = &ws.OutputRef(output_index++); label_output_->Resize(label_shape_, DALI_INT32); if (can_use_frames_timestamps_) { if (enable_frame_num_) { - frame_num_output_ = &ws.Output(output_index++); + frame_num_output_ = &ws.OutputRef(output_index++); frame_num_output_->Resize(frame_num_shape_, DALI_INT32); } if (enable_timestamps_) { - timestamp_output_ = &ws.Output(output_index++); + timestamp_output_ = &ws.OutputRef(output_index++); timestamp_output_->Resize(timestamp_shape_, DALI_FLOAT64); } } @@ -181,7 +181,7 @@ class VideoReader : public DataReader { } void RunImpl(DeviceWorkspace &ws) override { - auto &video_output = ws.Output(0); + auto &video_output = ws.OutputRef(0); auto &curent_batch = prefetched_batch_tensors_[curr_batch_consumer_]; SetOutputShapeType(video_output, ws); diff --git a/dali/operators/reader/video_reader_op_test.cc b/dali/operators/reader/video_reader_op_test.cc index 11b7d3f5692..4a79b1273fc 100644 --- a/dali/operators/reader/video_reader_op_test.cc +++ b/dali/operators/reader/video_reader_op_test.cc @@ -87,7 +87,7 @@ TEST_F(VideoReaderTest, ConstantFrameRate) { pipe.RunGPU(); pipe.Outputs(&ws); - const auto &frames_output = ws.Output(0); + const auto &frames_output = ws.OutputRef(0); const auto &frames_shape = 
frames_output.shape(); ASSERT_EQ(frames_shape.size(), 1); @@ -139,8 +139,8 @@ TEST_F(VideoReaderTest, MultipleVideoResolution) { pipe.RunGPU(); pipe.Outputs(&ws); - const auto &frames_output = ws.Output(0); - const auto &labels_output = ws.Output(1); + const auto &frames_output = ws.OutputRef(0); + const auto &labels_output = ws.OutputRef(1); TensorList labels_cpu; labels_cpu.Copy(labels_output, 0); @@ -196,7 +196,7 @@ TEST_F(VideoReaderTest, PackedBFrames) { pipe.RunCPU(); pipe.RunGPU(); pipe.Outputs(&ws); - const auto &frames_output = ws.Output(0); + const auto &frames_output = ws.OutputRef(0); const auto &frames_shape = frames_output.shape(); ASSERT_EQ(frames_shape.size(), batch_size); @@ -243,7 +243,7 @@ TEST_F(VideoReaderTest, Vp9Profile0) { } } - const auto &frames_output = ws.Output(0); + const auto &frames_output = ws.OutputRef(0); const auto &frames_shape = frames_output.shape(); ASSERT_EQ(frames_shape.size(), 1); @@ -282,7 +282,7 @@ TEST_F(VideoReaderTest, Vp9Profile2) { } } - const auto &frames_output = ws.Output(0); + const auto &frames_output = ws.OutputRef(0); const auto &frames_shape = frames_output.shape(); ASSERT_EQ(frames_shape.size(), 1); @@ -319,7 +319,7 @@ TEST_F(VideoReaderTest, Vp8Profile0) { } } - const auto &frames_output = ws.Output(0); + const auto &frames_output = ws.OutputRef(0); const auto &frames_shape = frames_output.shape(); ASSERT_EQ(frames_shape.size(), 1); @@ -358,7 +358,7 @@ TEST_F(VideoReaderTest, MJpeg) { } } - const auto &frames_output = ws.Output(0); + const auto &frames_output = ws.OutputRef(0); const auto &frames_shape = frames_output.shape(); ASSERT_EQ(frames_shape.size(), 1); @@ -399,7 +399,7 @@ TEST_F(VideoReaderTest, HEVC) { } } - const auto &frames_output = ws.Output(0); + const auto &frames_output = ws.OutputRef(0); const auto &frames_shape = frames_output.shape(); ASSERT_EQ(frames_shape.size(), 16); @@ -430,9 +430,9 @@ TEST_F(VideoReaderTest, FrameLabels) { pipe.RunCPU(); pipe.RunGPU(); pipe.Outputs(&ws); - const auto &frames_gpu = ws.Output(0); - const auto &label_gpu = ws.Output(1); - const auto &frame_num_gpu = ws.Output(2); + const auto &frames_gpu = ws.OutputRef(0); + const auto &label_gpu = ws.OutputRef(1); + const auto &frame_num_gpu = ws.OutputRef(2); TensorList frames_cpu; frames_cpu.Copy(frames_gpu, 0); @@ -476,9 +476,9 @@ TEST_F(VideoReaderTest, FrameLabelsFilenames) { pipe.RunCPU(); pipe.RunGPU(); pipe.Outputs(&ws); - const auto &frames_gpu = ws.Output(0); - const auto &label_gpu = ws.Output(1); - const auto &frame_num_gpu = ws.Output(2); + const auto &frames_gpu = ws.OutputRef(0); + const auto &label_gpu = ws.OutputRef(1); + const auto &frame_num_gpu = ws.OutputRef(2); TensorList frames_cpu; frames_cpu.Copy(frames_gpu, 0); @@ -524,9 +524,9 @@ TEST_F(VideoReaderTest, LabelsFilenames) { pipe.RunCPU(); pipe.RunGPU(); pipe.Outputs(&ws); - const auto &frames_gpu = ws.Output(0); - const auto &label_gpu = ws.Output(1); - const auto &frame_num_gpu = ws.Output(2); + const auto &frames_gpu = ws.OutputRef(0); + const auto &label_gpu = ws.OutputRef(1); + const auto &frame_num_gpu = ws.OutputRef(2); TensorList frames_cpu; frames_cpu.Copy(frames_gpu, 0); @@ -572,10 +572,10 @@ TEST_F(VideoReaderTest, FrameLabelsWithFileListFrameNum) { pipe.RunCPU(); pipe.RunGPU(); pipe.Outputs(&ws); - const auto &frames_gpu = ws.Output(0); - const auto &label_gpu = ws.Output(1); - const auto &frame_num_gpu = ws.Output(2); - const auto ×tamp_gpu = ws.Output(3); + const auto &frames_gpu = ws.OutputRef(0); + const auto &label_gpu = ws.OutputRef(1); + const auto 
&frame_num_gpu = ws.OutputRef(2); + const auto ×tamp_gpu = ws.OutputRef(3); TensorList frames_cpu; frames_cpu.Copy(frames_gpu, 0); @@ -634,10 +634,10 @@ TEST_F(VideoReaderTest, TimestampLabels) { pipe.RunCPU(); pipe.RunGPU(); pipe.Outputs(&ws); - const auto &frames_gpu = ws.Output(0); - const auto &label_gpu = ws.Output(1); - const auto &frame_num_gpu = ws.Output(2); - const auto ×tamp_gpu = ws.Output(3); + const auto &frames_gpu = ws.OutputRef(0); + const auto &label_gpu = ws.OutputRef(1); + const auto &frame_num_gpu = ws.OutputRef(2); + const auto ×tamp_gpu = ws.OutputRef(3); TensorList frames_cpu; frames_cpu.Copy(frames_gpu, 0); @@ -680,9 +680,9 @@ TEST_F(VideoReaderTest, StartEndLabels) { pipe.RunCPU(); pipe.RunGPU(); pipe.Outputs(&ws); - const auto &frames_gpu = ws.Output(0); - const auto &label_gpu = ws.Output(1); - const auto &frame_num_gpu = ws.Output(2); + const auto &frames_gpu = ws.OutputRef(0); + const auto &label_gpu = ws.OutputRef(1); + const auto &frame_num_gpu = ws.OutputRef(2); TensorList frames_cpu; frames_cpu.Copy(frames_gpu, 0); @@ -720,7 +720,7 @@ TEST_F(VideoReaderTest, MultipleFrameRates) { pipe.RunCPU(); pipe.RunGPU(); pipe.Outputs(&ws); - const auto &frames_output = ws.Output(0); + const auto &frames_output = ws.OutputRef(0); const auto &frames_shape = frames_output.shape(); ASSERT_EQ(frames_shape.size(), batch_size); diff --git a/dali/operators/sequence/optical_flow/optical_flow.cc b/dali/operators/sequence/optical_flow/optical_flow.cc index d418e72f542..aeb88c73169 100644 --- a/dali/operators/sequence/optical_flow/optical_flow.cc +++ b/dali/operators/sequence/optical_flow/optical_flow.cc @@ -113,9 +113,9 @@ void OpticalFlow::RunImpl(Workspace &ws) { if (enable_external_hints_) { // Fetch data // Input is a TensorList, where every Tensor is a sequence - const auto &input = ws.Input(0); - const auto &hints = ws.Input(1); - auto &output = ws.Output(0); + const auto &input = ws.InputRef(0); + const auto &hints = ws.InputRef(1); + auto &output = ws.OutputRef(0); output.SetLayout("HWC"); // Channels represent the two flow vector components (x and y) // Extract calculation params ExtractParams(input, hints); @@ -153,8 +153,8 @@ void OpticalFlow::RunImpl(Workspace &ws) { } else { // Fetch data // Input is a TensorList, where every Tensor is a sequence - const auto &input = ws.Input(0); - auto &output = ws.Output(0); + const auto &input = ws.InputRef(0); + auto &output = ws.OutputRef(0); output.SetLayout(input.GetLayout()); // Extract calculation params diff --git a/dali/operators/signal/decibel/to_decibels_op_gpu.cu b/dali/operators/signal/decibel/to_decibels_op_gpu.cu index 1cfd388cc8f..5741041f1cc 100644 --- a/dali/operators/signal/decibel/to_decibels_op_gpu.cu +++ b/dali/operators/signal/decibel/to_decibels_op_gpu.cu @@ -103,7 +103,7 @@ template <> bool ToDecibels::SetupImpl(std::vector &output_desc, const workspace_t &ws) { output_desc.resize(kNumOutputs); - const auto &input = ws.Input(0); + const auto &input = ws.InputRef(0); auto type = input.type(); TYPE_SWITCH(type, type2id, T, (float), ( using Impl = ToDecibelsImpl; diff --git a/dali/operators/ssd/box_encoder.cc b/dali/operators/ssd/box_encoder.cc index 836abd20be4..cf18f70b776 100644 --- a/dali/operators/ssd/box_encoder.cc +++ b/dali/operators/ssd/box_encoder.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -156,8 +156,8 @@ void BoxEncoder::WriteMatchesToOutput( } void BoxEncoder::RunImpl(SampleWorkspace &ws) { - const auto &bboxes_input = ws.Input(kBoxesInId); - const auto &labels_input = ws.Input(kLabelsInId); + const auto &bboxes_input = ws.InputRef(kBoxesInId); + const auto &labels_input = ws.InputRef(kLabelsInId); const auto num_boxes = bboxes_input.dim(0); const auto labels = labels_input.data(); @@ -166,11 +166,11 @@ void BoxEncoder::RunImpl(SampleWorkspace &ws) { ReadBoxes(make_span(boxes), make_cspan(bboxes_input.data(), bboxes_input.size()), {}, {}); // Create output - auto &bboxes_output = ws.Output(kBoxesOutId); + auto &bboxes_output = ws.OutputRef(kBoxesOutId); bboxes_output.Resize({static_cast(anchors_.size()), BoundingBox::size}, bboxes_input.type()); auto out_boxes = bboxes_output.mutable_data(); - auto &labels_output = ws.Output(kLabelsOutId); + auto &labels_output = ws.OutputRef(kLabelsOutId); labels_output.Resize({static_cast(anchors_.size())}, labels_input.type()); auto out_labels = labels_output.mutable_data(); diff --git a/dali/operators/ssd/box_encoder.cu b/dali/operators/ssd/box_encoder.cu index cde92d853a8..594c058915a 100644 --- a/dali/operators/ssd/box_encoder.cu +++ b/dali/operators/ssd/box_encoder.cu @@ -260,8 +260,8 @@ BoxEncoder::CalculateDims( } void BoxEncoder::RunImpl(Workspace &ws) { - const auto &boxes_input = ws.Input(kBoxesInId); - const auto &labels_input = ws.Input(kLabelsInId); + const auto &boxes_input = ws.InputRef(kBoxesInId); + const auto &labels_input = ws.InputRef(kLabelsInId); assert(ws.GetInputBatchSize(kBoxesInId) == ws.GetInputBatchSize(kLabelsInId)); auto curr_batch_size = ws.GetInputBatchSize(kBoxesInId); @@ -273,10 +273,10 @@ void BoxEncoder::RunImpl(Workspace &ws) { auto dims = CalculateDims(boxes_input); - auto &boxes_output = ws.Output(kBoxesOutId); + auto &boxes_output = ws.OutputRef(kBoxesOutId); boxes_output.Resize(dims.first, boxes_input.type()); - auto &labels_output = ws.Output(kLabelsOutId); + auto &labels_output = ws.OutputRef(kLabelsOutId); labels_output.Resize(dims.second, labels_input.type()); samples.resize(curr_batch_size_); diff --git a/dali/operators/ssd/box_encoder_test.cc b/dali/operators/ssd/box_encoder_test.cc index ceecc477e4f..468da270afe 100644 --- a/dali/operators/ssd/box_encoder_test.cc +++ b/dali/operators/ssd/box_encoder_test.cc @@ -1085,14 +1085,14 @@ class BoxEncoderTest : public GenericBBoxesTest { } void CheckAnswersForCocoOnCpu(DeviceWorkspace *ws, bool offset = false) { - TensorList &boxes = ws->Output(0); - TensorList &labels = ws->Output(1); + TensorList &boxes = ws->OutputRef(0); + TensorList &labels = ws->OutputRef(1); CheckAnswersForCoco(&boxes, &labels, offset); } void CheckAnswersForCocoOnGpu(DeviceWorkspace *ws, bool offset = false) { - auto boxes = this->CopyTensorListToHost(ws->Output(0)); - auto labels = this->CopyTensorListToHost(ws->Output(1)); + auto boxes = this->CopyTensorListToHost(ws->OutputRef(0)); + auto labels = this->CopyTensorListToHost(ws->OutputRef(1)); CheckAnswersForCoco(boxes.get(), labels.get(), offset); } }; diff --git a/dali/operators/ssd/random_crop.cc b/dali/operators/ssd/random_crop.cc index 340cad42326..781313baeb8 100644 --- a/dali/operators/ssd/random_crop.cc +++ b/dali/operators/ssd/random_crop.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2019, NVIDIA CORPORATION. All rights reserved. 
+// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -179,10 +179,10 @@ void crop(const Tensor& img, vector bounds, Tensor&
 template <>
 void SSDRandomCrop::RunImpl(SampleWorkspace &ws) {
   // [H, W, C], dtype=uint8_t
-  const auto& img = ws.Input(0);
+  const auto& img = ws.InputRef(0);
   // [N] : [ltrb, ... ], dtype=float
-  const auto& bboxes = ws.Input(1);
-  const auto& labels = ws.Input(2);
+  const auto& bboxes = ws.InputRef(1);
+  const auto& labels = ws.InputRef(2);
   int sample = ws.data_idx();
   auto N = bboxes.dim(0);
@@ -202,9 +202,9 @@ void SSDRandomCrop::RunImpl(SampleWorkspace &ws) {
   if (option.no_crop()) {
     // copy directly to output without modification
-    ws.Output(0).Copy(img, 0);
-    ws.Output(1).Copy(bboxes, 0);
-    ws.Output(2).Copy(labels, 0);
+    ws.OutputRef(0).Copy(img, 0);
+    ws.OutputRef(1).Copy(bboxes, 0);
+    ws.OutputRef(2).Copy(labels, 0);
     return;
   }
@@ -273,10 +273,10 @@ void SSDRandomCrop::RunImpl(SampleWorkspace &ws) {
   // now we know how many output bboxes there will be, we can allocate
   // the output.
-  auto &img_out = ws.Output(0);
+  auto &img_out = ws.OutputRef(0);
   img_out.SetLayout(img.GetLayout());
-  auto &bbox_out = ws.Output(1);
-  auto &label_out = ws.Output(2);
+  auto &bbox_out = ws.OutputRef(1);
+  auto &label_out = ws.OutputRef(2);
   bbox_out.Resize({valid_bboxes, 4}, DALI_FLOAT);
   auto *bbox_out_data = bbox_out.mutable_data();
diff --git a/dali/pipeline/executor/executor_test.cc b/dali/pipeline/executor/executor_test.cc
index f51dbfe9cd9..2b724ba9e1e 100644
--- a/dali/pipeline/executor/executor_test.cc
+++ b/dali/pipeline/executor/executor_test.cc
@@ -599,7 +599,7 @@ TYPED_TEST(ExecutorSyncTest, TestPrefetchedExecution) {
   ASSERT_EQ(ws.NumOutput(), 1);
   ASSERT_EQ(ws.NumInput(), 0);
   ASSERT_TRUE(ws.OutputIsType(0));
-  TensorList &res1 = ws.Output(0);
+  TensorList &res1 = ws.OutputRef(0);
   for (int i = 0; i < batch_size; ++i) {
     this->VerifyDecode(
         res1.template tensor(i),
@@ -615,7 +615,7 @@ TYPED_TEST(ExecutorSyncTest, TestPrefetchedExecution) {
   auto status_2 = barrier_future_2.wait_for(std::chrono::seconds(5));
   ASSERT_EQ(status_2, std::future_status::ready);
   ASSERT_EQ(cb_counter, 2);
-  TensorList &res2 = ws.Output(0);
+  TensorList &res2 = ws.OutputRef(0);
   for (int i = 0; i < batch_size; ++i) {
     this->VerifyDecode(
         res2.template tensor(i),
diff --git a/dali/pipeline/operator/builtin/external_source.cc b/dali/pipeline/operator/builtin/external_source.cc
index 1a711cbd792..0e43e08c1bb 100644
--- a/dali/pipeline/operator/builtin/external_source.cc
+++ b/dali/pipeline/operator/builtin/external_source.cc
@@ -36,7 +36,7 @@ void ExternalSource::RunImpl(HostWorkspace &ws) {
   for (int sample_id = 0; sample_id < curr_batch_size; ++sample_id) {
     thread_pool.AddWork(
         [&ws, sample_id, &tensor_vector_elm](int tid) {
-          Tensor &output_tensor = ws.Output(0, sample_id);
+          Tensor &output_tensor = ws.OutputRef(0)[sample_id];
           // HostWorkspace doesn't have any stream
           cudaStream_t stream = 0;
           output_tensor.Copy((*tensor_vector_elm.front())[sample_id], stream);
diff --git a/dali/pipeline/operator/builtin/external_source.cu b/dali/pipeline/operator/builtin/external_source.cu
index ddcdae2c6d0..53dd0046fd6 100644
--- a/dali/pipeline/operator/builtin/external_source.cu
+++ b/dali/pipeline/operator/builtin/external_source.cu
@@ -37,7 +37,7 @@ void ExternalSource::RunImpl(DeviceWorkspace &ws) {
     }
   }
-  auto &output = ws.Output(0);
+  auto &output = ws.OutputRef(0);
   cudaStream_t stream_used = ws.has_stream() ? ws.stream() : 0;
   if (!state_info.no_copy || state_info.copied_shared_data) {
     CUDA_CALL(cudaStreamWaitEvent(stream_used, *internal_copy_to_storage.front(), 0));
diff --git a/dali/pipeline/operator/builtin/external_source_test.cc b/dali/pipeline/operator/builtin/external_source_test.cc
index d7b9ba55cf9..eb6d5c5aff0 100644
--- a/dali/pipeline/operator/builtin/external_source_test.cc
+++ b/dali/pipeline/operator/builtin/external_source_test.cc
@@ -192,7 +192,7 @@ class ExternalSourceTest : public::testing::WithParamInterface,
   bool RunOutputs() {
     DeviceWorkspace ws;
     exe_->Outputs(&ws);
-    auto &tensor_gpu_list = ws.Output(0);
+    auto &tensor_gpu_list = ws.OutputRef(0);
     TensorList tensor_cpu_list;
     tensor_cpu_list.Copy(tensor_gpu_list, (ws.has_stream() ? ws.stream() : 0));
     CUDA_CALL(cudaStreamSynchronize(ws.has_stream() ? ws.stream() : 0));
@@ -579,9 +579,9 @@ void TestRunExternalSource(Pipeline &pipe, const std::string &name,
   TensorList output_cpu;
   pipe.Outputs(&ws);
   if (dev == "cpu") {
-    output_cpu.Copy(ws.Output(0), 0);
+    output_cpu.Copy(ws.OutputRef(0), 0);
   } else {
-    output_cpu.Copy(ws.Output(0), 0);
+    output_cpu.Copy(ws.OutputRef(0), 0);
     cudaStreamSynchronize(0);
   }
   ASSERT_EQ(input_cpu.shape(), output_cpu.shape());
diff --git a/dali/pipeline/operator/builtin/make_contiguous.cu b/dali/pipeline/operator/builtin/make_contiguous.cu
index b977566fb19..27aa9b8fa92 100644
--- a/dali/pipeline/operator/builtin/make_contiguous.cu
+++ b/dali/pipeline/operator/builtin/make_contiguous.cu
@@ -24,7 +24,7 @@ void MakeContiguousMixed::Run(MixedWorkspace &ws) {
   DALIDataType type = input.type();
   for (size_t i = 0; i < input.num_samples(); ++i) {
-    auto &sample = ws.Input(0, i);
+    auto &sample = ws.InputRef(0)[i];
     size_t sample_bytes = sample.nbytes();
     if (coalesced && sample_bytes > COALESCE_THRESHOLD)
       coalesced = false;
@@ -34,7 +34,7 @@ void MakeContiguousMixed::Run(MixedWorkspace &ws) {
         "in input batch. Cannot copy to contiguous device buffer.");
   }
-  auto &output = ws.Output(0);
+  auto &output = ws.OutputRef(0);
   if (coalesced) {
     DomainTimeRange tr("[DALI][MakeContiguousMixed] coalesced", DomainTimeRange::kBlue);
     cpu_output_buff.Copy(input, 0);
diff --git a/dali/pipeline/operator/operator.cc b/dali/pipeline/operator/operator.cc
index 53f9a459c08..1f5be4259df 100644
--- a/dali/pipeline/operator/operator.cc
+++ b/dali/pipeline/operator/operator.cc
@@ -50,7 +50,7 @@ void OperatorBase::EnforceUniformOutputBatchSize(
   auto ref_batch_size = ws.NumInput() > 0 ? ws.GetInputBatchSize(0) : ws.GetRequestedBatchSize(0);
   for (int i = 0; i < ws.NumOutput(); i++) {
     auto output_batch_size = const_cast &>(ws)
-                                 .template Output(i)
+                                 .template OutputRef(i)
                                  .shape()
                                  .num_samples();
     DALI_ENFORCE(ref_batch_size == output_batch_size,
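The hunks above all apply the same mechanical rename: batch data is obtained by reference through the templated InputRef/OutputRef accessors instead of the per-backend Input/Output specializations that this patch removes. A minimal sketch of the resulting usage on a DeviceWorkspace is given below; the helper name CopyFirstOutput and the include set are illustrative assumptions, not part of the patch.

    // Minimal sketch (not part of this patch): whole-batch access on a
    // DeviceWorkspace via the InputRef/OutputRef accessors.
    #include <cuda_runtime_api.h>

    #include "dali/pipeline/workspace/device_workspace.h"

    namespace dali {

    // Hypothetical helper: copies input 0 to output 0 on the workspace stream.
    inline void CopyFirstOutput(DeviceWorkspace &ws) {
      const auto &input = ws.InputRef<GPUBackend>(0);   // TensorList<GPUBackend> &
      auto &output = ws.OutputRef<GPUBackend>(0);       // was: ws.Output<GPUBackend>(0)
      cudaStream_t stream = ws.has_stream() ? ws.stream() : 0;
      output.Copy(input, stream);                       // whole-batch copy, as in MakeContiguous
      output.SetLayout(input.GetLayout());
    }

    }  // namespace dali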
diff --git a/dali/pipeline/operator/operator.h b/dali/pipeline/operator/operator.h
index 6e4e0817055..f695e8f5a63 100644
--- a/dali/pipeline/operator/operator.h
+++ b/dali/pipeline/operator/operator.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -332,7 +332,7 @@ class Operator : public OperatorBase {
     for (int data_idx = 0; data_idx < curr_batch_size; ++data_idx) {
       thread_pool.AddWork([this, &ws, data_idx](int tid) {
         SampleWorkspace sample;
-        ws.GetSample(&sample, data_idx, tid);
+        MakeSampleView(sample, ws, data_idx, tid);
         this->SetupSharedSampleParams(sample);
         this->RunImpl(sample);
       }, -data_idx);  // -data_idx for FIFO order
diff --git a/dali/pipeline/pipeline_test.cc b/dali/pipeline/pipeline_test.cc
index d7d81b0359a..0f4261c4689 100644
--- a/dali/pipeline/pipeline_test.cc
+++ b/dali/pipeline/pipeline_test.cc
@@ -465,29 +465,29 @@ TEST_F(PipelineTestOnce, TestPresize) {
   pipe.Outputs(&ws);
   // we should not presize CPU buffers if they are not pinned
-  ASSERT_EQ(*(ws.Output(0).tensor(0)), 0);
+  ASSERT_EQ(*(ws.OutputRef(0).tensor(0)), 0);
   int ref_presize = RestrictPinnedMemUsage() ? 0 : presize_val_CPU;
-  ASSERT_EQ(*(ws.Output(1).tensor(0)), ref_presize);
+  ASSERT_EQ(*(ws.OutputRef(1).tensor(0)), ref_presize);
   size_t tmp[2];
   CUDA_CALL(cudaDeviceSynchronize());
-  CUDA_CALL(cudaMemcpy(&tmp, ws.Output(2).tensor(0),
+  CUDA_CALL(cudaMemcpy(&tmp, ws.OutputRef(2).tensor(0),
                        sizeof(size_t) * 2, cudaMemcpyDefault));
   ASSERT_EQ(tmp[0], presize_val_Mixed);
   ASSERT_EQ(tmp[1], 2 * sizeof(size_t));
-  CUDA_CALL(cudaMemcpy(&tmp, ws.Output(3).tensor(0),
+  CUDA_CALL(cudaMemcpy(&tmp, ws.OutputRef(3).tensor(0),
                        sizeof(size_t) * 2, cudaMemcpyDefault));
   ASSERT_EQ(tmp[0], presize_val_GPU);
   ASSERT_EQ(tmp[1], 2 * sizeof(size_t));
-  CUDA_CALL(cudaMemcpy(&tmp, ws.Output(4).tensor(0),
+  CUDA_CALL(cudaMemcpy(&tmp, ws.OutputRef(4).tensor(0),
                        sizeof(size_t) * 2, cudaMemcpyDefault));
   ASSERT_EQ(tmp[0], presize_val_GPU);
   ASSERT_EQ(tmp[1], 2 * sizeof(size_t));
-  CUDA_CALL(cudaMemcpy(&tmp, ws.Output(5).tensor(0),
+  CUDA_CALL(cudaMemcpy(&tmp, ws.OutputRef(5).tensor(0),
                        sizeof(size_t) * 2, cudaMemcpyDefault));
   ASSERT_EQ(tmp[0], presize_val_default);
   ASSERT_EQ(tmp[1], 2 * sizeof(size_t));
@@ -551,7 +551,7 @@ class PrefetchedPipelineTest : public GenericDecoderTest {
     ASSERT_EQ(ws.NumOutput(), 1);
     ASSERT_EQ(ws.NumInput(), 0);
     ASSERT_TRUE(ws.OutputIsType(0));
-    TensorList &res1 = ws.Output(0);
+    TensorList &res1 = ws.OutputRef(0);
     for (int j = 0; j < batch_size; ++j) {
       this->VerifyDecode(
           res1.template tensor(j),
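For test code, the change is equally mechanical: outputs are fetched from the workspace with OutputRef and, for GPU outputs, copied to the host before being inspected. The sketch below assumes a built and already-run pipeline; the helper name is illustrative.

    // Sketch only: fetch GPU output 0 of a pipeline into a host TensorList,
    // mirroring the pattern used in pipeline_test.cc / external_source_test.cc.
    #include <cuda_runtime_api.h>

    #include "dali/pipeline/pipeline.h"
    #include "dali/pipeline/workspace/device_workspace.h"

    namespace dali {

    inline void FetchFirstOutputToHost(Pipeline &pipe, TensorList<CPUBackend> &out_cpu) {
      DeviceWorkspace ws;
      pipe.Outputs(&ws);                               // the batch produced by the last run
      auto &out_gpu = ws.OutputRef<GPUBackend>(0);     // was: ws.Output<GPUBackend>(0)
      cudaStream_t stream = ws.has_stream() ? ws.stream() : 0;
      out_cpu.Copy(out_gpu, stream);
      cudaStreamSynchronize(stream);                   // make the copy visible to the test
    }

    }  // namespace dali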
- -#include "dali/pipeline/workspace/device_workspace.h" - -#include "dali/pipeline/workspace/sample_workspace.h" - -namespace dali { - -template <> -const TensorList& DeviceWorkspace::Input(int idx) const { - return *CPUInput(idx); -} - -template <> -const TensorList& DeviceWorkspace::Input(int idx) const { - return *GPUInput(idx); -} - -template <> -TensorList& DeviceWorkspace::MutableInput(int idx) { - return *CPUInput(idx); -} - -template <> -TensorList& DeviceWorkspace::MutableInput(int idx) { - return *GPUInput(idx); -} - -template <> -TensorList& DeviceWorkspace::Output(int idx) { - return *CPUOutput(idx); -} - -template <> -TensorList& DeviceWorkspace::Output(int idx) { - return *GPUOutput(idx); -} - -} // namespace dali diff --git a/dali/pipeline/workspace/device_workspace.h b/dali/pipeline/workspace/device_workspace.h index 3498290ba88..0c2d6e84b96 100644 --- a/dali/pipeline/workspace/device_workspace.h +++ b/dali/pipeline/workspace/device_workspace.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -41,8 +41,6 @@ using DeviceOutputType = shared_ptr>; */ class DLL_PUBLIC DeviceWorkspace : public WorkspaceBase { public: - using WorkspaceBase::input_t; - using WorkspaceBase::output_t; DLL_PUBLIC DeviceWorkspace() : stream_(0), event_(nullptr) {} DLL_PUBLIC ~DeviceWorkspace() override = default; @@ -58,33 +56,6 @@ class DLL_PUBLIC DeviceWorkspace : public WorkspaceBase - DLL_PUBLIC const TensorList& Input(int idx) const; - - /** - * @brief Returns the input non-const TensorList at index `idx`. - * - * @throws runtime_error If calling type does not match the type of - * the output at the given index. - */ - template - DLL_PUBLIC TensorList& MutableInput(int idx); - - /** - * @brief Returns the output TensorList at index `idx`. - * - * @throws runtime_error If calling type does not match the type of - * the output at the given index. - */ - template - DLL_PUBLIC TensorList& Output(int idx); - /** * @brief Sets the stream for this workspace. */ diff --git a/dali/pipeline/workspace/host_workspace.cc b/dali/pipeline/workspace/host_workspace.cc deleted file mode 100644 index 4b6cc0f35fa..00000000000 --- a/dali/pipeline/workspace/host_workspace.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "dali/pipeline/workspace/host_workspace.h" - -#include "dali/pipeline/workspace/sample_workspace.h" - -namespace dali { - -void HostWorkspace::GetSample(SampleWorkspace* ws, int data_idx, int thread_idx) { - DALI_ENFORCE(ws != nullptr, "Input workspace is nullptr."); - ws->Clear(); - ws->set_data_idx(data_idx); - ws->set_thread_idx(thread_idx); - for (const auto& input_meta : input_index_map_) { - if (input_meta.storage_device == StorageDevice::CPU) { - ws->AddInput(cpu_inputs_[input_meta.index]->tensor_handle(data_idx)); - } else { - ws->AddInput(gpu_inputs_[input_meta.index]->tensor_handle(data_idx)); - } - } - for (const auto& output_meta : output_index_map_) { - if (output_meta.storage_device == StorageDevice::CPU) { - ws->AddOutput(cpu_outputs_[output_meta.index]->tensor_handle(data_idx)); - } else { - ws->AddOutput(gpu_outputs_[output_meta.index]->tensor_handle(data_idx)); - } - } - for (auto& arg_pair : argument_inputs_) { - assert(!arg_pair.second.should_update); - ws->AddArgumentInput(arg_pair.first, arg_pair.second.tvec); - } -} - -template <> -const Tensor& HostWorkspace::Input(int idx, int data_idx) const { - return InputRef(idx)[data_idx]; -} - -template <> -const Tensor& HostWorkspace::Input(int idx, int data_idx) const { - return InputRef(idx)[data_idx]; -} - -template <> -Tensor& HostWorkspace::Output(int idx, int data_idx) { - return OutputRef(idx)[data_idx]; -} - -template <> -Tensor& HostWorkspace::Output(int idx, int data_idx) { - return OutputRef(idx)[data_idx]; -} - -} // namespace dali diff --git a/dali/pipeline/workspace/host_workspace.h b/dali/pipeline/workspace/host_workspace.h index 085370ad576..911ec1963c2 100644 --- a/dali/pipeline/workspace/host_workspace.h +++ b/dali/pipeline/workspace/host_workspace.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -42,39 +42,9 @@ class SampleWorkspace; */ class DLL_PUBLIC HostWorkspace : public WorkspaceBase { public: - using WorkspaceBase::input_t; - using WorkspaceBase::output_t; - DLL_PUBLIC inline HostWorkspace() {} DLL_PUBLIC inline ~HostWorkspace() override = default; - /** - * @brief Returns a sample workspace for the given sample - * index and thread index - */ - DLL_PUBLIC void GetSample(SampleWorkspace *ws, int data_idx, int thread_idx); - - /** - * @brief Returns the Tensor at index `data_idx` in the input - * Tensors at index `idx`. - * - * @throws runtime_error if the calling type does not match the - * type of the tensor at the given index - */ - template - DLL_PUBLIC const Tensor& Input(int idx, int data_idx) const; - - /** - * @brief Returns the Tensor at index `data_idx` in the output - * Tensors at index `idx`. - * - * @throws runtime_error if the calling type does not match the - * type of the tensor at the given index - */ - template - DLL_PUBLIC Tensor& Output(int idx, int data_idx); - - DLL_PUBLIC inline void SetThreadPool(ThreadPool *pool) { thread_pool_ = pool; } diff --git a/dali/pipeline/workspace/mixed_workspace.cc b/dali/pipeline/workspace/mixed_workspace.cc deleted file mode 100644 index a54a0e81632..00000000000 --- a/dali/pipeline/workspace/mixed_workspace.cc +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. 
diff --git a/dali/pipeline/workspace/mixed_workspace.cc b/dali/pipeline/workspace/mixed_workspace.cc
deleted file mode 100644
index a54a0e81632..00000000000
--- a/dali/pipeline/workspace/mixed_workspace.cc
+++ /dev/null
@@ -1,41 +0,0 @@
-// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "dali/pipeline/workspace/mixed_workspace.h"
-
-#include "dali/pipeline/workspace/sample_workspace.h"
-
-namespace dali {
-
-template <>
-const Tensor& MixedWorkspace::Input(int idx, int data_idx) const {
-  return InputRef(idx)[data_idx];
-}
-
-template <>
-const Tensor& MixedWorkspace::Input(int idx, int data_idx) const {
-  return InputRef(idx)[data_idx];
-}
-
-template <>
-TensorList& MixedWorkspace::Output(int idx) {
-  return OutputRef(idx);
-}
-
-template <>
-TensorList& MixedWorkspace::Output(int idx) {
-  return OutputRef(idx);
-}
-
-}  // namespace dali
diff --git a/dali/pipeline/workspace/mixed_workspace.h b/dali/pipeline/workspace/mixed_workspace.h
index 8bddaf89b90..c70e8875eba 100644
--- a/dali/pipeline/workspace/mixed_workspace.h
+++ b/dali/pipeline/workspace/mixed_workspace.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -41,31 +41,9 @@ using MixedOutputType = shared_ptr>;
  */
 class DLL_PUBLIC MixedWorkspace : public WorkspaceBase {
  public:
-  using WorkspaceBase::input_t;
-  using WorkspaceBase::output_t;
   DLL_PUBLIC inline MixedWorkspace() : stream_(0), event_(nullptr) {}
   DLL_PUBLIC inline ~MixedWorkspace() override = default;
-
-  /**
-   * @brief Returns the input Tensor at index `data_idx` in the input
-   * set of Tensors at index `idx`.
-   *
-   * @throws runtime_error If calling type does not match the type of
-   * the output at the given index.
-   */
-  template
-  DLL_PUBLIC const Tensor& Input(int idx, int data_idx) const;
-
-  /**
-   * @brief Returns the output TensorList at index `idx`.
-   *
-   * @throws runtime_error If calling type does not match the type of
-   * the output at the given index.
-   */
-  template
-  DLL_PUBLIC TensorList& Output(int idx);
-
   /**
    * @brief Sets the stream for this workspace.
   */
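MixedWorkspace follows the same pattern: per-sample CPU inputs are reached by indexing InputRef, and the GPU output batch comes from OutputRef, as in the MakeContiguousMixed hunk earlier in this patch. The sketch below assumes the batch-to-batch Copy overload used elsewhere in the patch; the helper name is illustrative.

    // Sketch only: a mixed-stage copy in the style of MakeContiguousMixed.
    #include "dali/pipeline/workspace/mixed_workspace.h"

    namespace dali {

    inline void CopyBatchToDevice(MixedWorkspace &ws) {
      auto &input = ws.InputRef<CPUBackend>(0);    // was: ws.Input<CPUBackend>(0, i), sample by sample
      auto &output = ws.OutputRef<GPUBackend>(0);  // was: ws.Output<GPUBackend>(0)
      size_t total_bytes = 0;
      for (size_t i = 0; i < input.num_samples(); i++)
        total_bytes += input[i].nbytes();          // per-sample view via operator[]
      if (total_bytes == 0)
        return;                                    // nothing to copy
      output.Copy(input, ws.has_stream() ? ws.stream() : 0);
    }

    }  // namespace dali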
+#include "dali/pipeline/workspace/host_workspace.h" #include "dali/pipeline/workspace/sample_workspace.h" namespace dali { -template <> -const Tensor& SampleWorkspace::Input(int idx) const { - return *CPUInput(idx); -} - -template <> -const Tensor& SampleWorkspace::Input(int idx) const { - return *GPUInput(idx); -} - -template <> -Tensor& SampleWorkspace::Output(int idx) { - return *CPUOutput(idx); -} +void MakeSampleView(SampleWorkspace& sample, HostWorkspace& batch, int data_idx, int thread_idx) { + sample.Clear(); + sample.set_data_idx(data_idx); + sample.set_thread_idx(thread_idx); + int num_inputs = batch.NumInput(); + for (int i = 0; i < num_inputs; i++) { + if (batch.InputIsType(i)) { + auto &input_ref = batch.InputRef(i); + sample.AddInput(&input_ref[data_idx]); + } else { + auto &input_ref = batch.InputRef(i); + sample.AddInput(&input_ref[data_idx]); + } + } -template <> -Tensor& SampleWorkspace::Output(int idx) { - return *GPUOutput(idx); + int num_outputs = batch.NumOutput(); + for (int i = 0; i < num_outputs; i++) { + if (batch.OutputIsType(i)) { + auto &output_ref = batch.OutputRef(i); + sample.AddOutput(&output_ref[data_idx]); + } else { + auto &output_ref = batch.OutputRef(i); + sample.AddOutput(&output_ref[data_idx]); + } + } + for (auto& arg_pair : batch) { + assert(!arg_pair.second.should_update); + sample.AddArgumentInput(arg_pair.first, arg_pair.second.tvec); + } } } // namespace dali diff --git a/dali/pipeline/workspace/sample_workspace.h b/dali/pipeline/workspace/sample_workspace.h index 88b62444575..9e281db5437 100644 --- a/dali/pipeline/workspace/sample_workspace.h +++ b/dali/pipeline/workspace/sample_workspace.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved. +// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -32,13 +32,14 @@ namespace dali { template -using SampleInputType = shared_ptr>; +using SampleInputType = Tensor *; template -using SampleOutputType = shared_ptr>; +using SampleOutputType = Tensor *; /** - * @brief SampleWorkspace stores all data required for an operator to - * perform its computation on a single sample. + * @brief SampleWorkspace is workspace used for the legacy, deprcated CPU Operator implementation. + * It has views of all data required for an operator to perform its computation on a single sample, + * the data is actually owned by a corresponding HostWorkspace */ class DLL_PUBLIC SampleWorkspace : public WorkspaceBase { public: @@ -58,20 +59,6 @@ class DLL_PUBLIC SampleWorkspace : public WorkspaceBase - DLL_PUBLIC const Tensor& Input(int idx) const; - - /** - * @brief Returns Tensor with index = data_idx() from the output - * TensorList at index = `idx`. 
diff --git a/dali/pipeline/workspace/sample_workspace.h b/dali/pipeline/workspace/sample_workspace.h
index 88b62444575..9e281db5437 100644
--- a/dali/pipeline/workspace/sample_workspace.h
+++ b/dali/pipeline/workspace/sample_workspace.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2017-2018, NVIDIA CORPORATION. All rights reserved.
+// Copyright (c) 2017-2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -32,13 +32,14 @@ namespace dali {
 template
-using SampleInputType = shared_ptr>;
+using SampleInputType = Tensor *;
 template
-using SampleOutputType = shared_ptr>;
+using SampleOutputType = Tensor *;
 
 /**
- * @brief SampleWorkspace stores all data required for an operator to
- * perform its computation on a single sample.
+ * @brief SampleWorkspace is a workspace used for the legacy, deprecated CPU Operator
+ * implementation. It holds views of all data required for an operator to perform its
+ * computation on a single sample; the data itself is owned by the corresponding HostWorkspace.
  */
 class DLL_PUBLIC SampleWorkspace : public WorkspaceBase {
  public:
@@ -58,20 +59,6 @@ class DLL_PUBLIC SampleWorkspace : public WorkspaceBase
-  DLL_PUBLIC const Tensor& Input(int idx) const;
-
-  /**
-   * @brief Returns Tensor with index = data_idx() from the output
-   * TensorList at index = `idx`.
-   */
-  template
-  DLL_PUBLIC Tensor& Output(int idx);
-
   int GetInputBatchSize(int) const {
     DALI_FAIL(
         "Impossible function: "
@@ -140,6 +127,13 @@ class DLL_PUBLIC SampleWorkspace : public WorkspaceBase
-  typename InputType::element_type& InputRef(int idx) const {
+  auto& InputRef(int idx) const {
     return *InputHandle(idx, Backend{});
   }
 
   template
-  typename OutputType::element_type& OutputRef(int idx) const {
+  auto& OutputRef(int idx) const {
     return *OutputHandle(idx, Backend{});
   }
diff --git a/dali/test/dali_operator_test.h b/dali/test/dali_operator_test.h
index 693eb21f51c..15a11cc9e36 100644
--- a/dali/test/dali_operator_test.h
+++ b/dali/test/dali_operator_test.h
@@ -142,9 +142,9 @@ GetOutputsFromPipeline(Pipeline &pipeline, const std::string &output_backend) {
   pipeline.Outputs(&workspace);
   for (int output_idx = 0; output_idx < workspace.NumOutput(); output_idx++) {
     if (workspace.OutputIsType(output_idx)) {
-      ret.emplace_back(&workspace.template Output(output_idx));
+      ret.emplace_back(&workspace.template OutputRef(output_idx));
     } else {
-      ret.emplace_back(&workspace.template Output(output_idx));
+      ret.emplace_back(&workspace.template OutputRef(output_idx));
     }
   }
   return ret;
diff --git a/dali/test/dali_test_bboxes.h b/dali/test/dali_test_bboxes.h
index 1ea10275d53..e7956471abe 100644
--- a/dali/test/dali_test_bboxes.h
+++ b/dali/test/dali_test_bboxes.h
@@ -95,11 +95,11 @@ class GenericBBoxesTest : public DALISingleOpTest {
     DeviceWorkspace ws;
     pipe->Outputs(&ws);
-    auto images_cpu = this->CopyToHost(ws.Output(0))[0];
-    images_cpu->SetLayout(ws.Output(0).GetLayout());
+    auto images_cpu = this->CopyToHost(ws.OutputRef(0))[0];
+    images_cpu->SetLayout(ws.OutputRef(0).GetLayout());
-    auto boxes_cpu = this->CopyToHost(ws.Output(1))[0];
-    boxes_cpu->SetLayout(ws.Output(1).GetLayout());
+    auto boxes_cpu = this->CopyToHost(ws.OutputRef(1))[0];
+    boxes_cpu->SetLayout(ws.OutputRef(1).GetLayout());
     return {images_cpu, boxes_cpu};
   }
@@ -145,15 +145,15 @@ class GenericBBoxesTest : public DALISingleOpTest {
     std::vector>> ret;
     ret.push_back(std::make_shared>());
    ret.push_back(std::make_shared>());
-    ret[0]->Copy(ws.Output(0), 0);
-    ret[1]->Copy(ws.Output(1), 0);
+    ret[0]->Copy(ws.OutputRef(0), 0);
+    ret[1]->Copy(ws.OutputRef(1), 0);
     return ret;
   }
   vector>> Reference(
       const vector *> &inputs, DeviceWorkspace *ws) override {
-    auto &from = ws->Output(1);
+    auto &from = ws->OutputRef(1);
     auto reference = this->CopyToHost(from);
     reference[0]->SetLayout(from.GetLayout());
     return reference;
diff --git a/dali/test/dali_test_matching.h b/dali/test/dali_test_matching.h
index a97ac7b647d..134ee8cf274 100644
--- a/dali/test/dali_test_matching.h
+++ b/dali/test/dali_test_matching.h
@@ -59,9 +59,9 @@ class GenericMatchingTest : public DALISingleOpTest {
   vector>> Reference(const vector*> &inputs,
                                                     DeviceWorkspace *ws) override {
     if (GetOpType() == OpType::GPU)
-      return this->CopyToHost(ws->Output(1));
+      return this->CopyToHost(ws->OutputRef(1));
     else
-      return this->CopyToHost(ws->Output(1));
+      return this->CopyToHost(ws->OutputRef(1));
   }
   uint32_t GetTestCheckType() const override {
diff --git a/dali/test/dali_test_resize.h b/dali/test/dali_test_resize.h
index 30fe20157b8..d31b944e398 100755
--- a/dali/test/dali_test_resize.h
+++ b/dali/test/dali_test_resize.h
@@ -76,7 +76,7 @@ class GenericResizeTest : public DALISingleOpTest {
     // determine resize parameters
     if (useExternSizes) {
-      const auto *t = ws->Output(1).tensor(i);
+      const auto *t = ws->OutputRef(1).tensor(i);
       rsz_h = t[0];
       rsz_w = t[1];
     } else {
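The workspace accessor hunk above also switches the InputRef/OutputRef return type from the explicit element_type spelling to auto&, so the deduced type simply follows the container each workspace kind stores. A sketch of what that deduction resolves to, assuming the storage types shown elsewhere in this patch (TensorList for device workspaces, an indexable per-sample container for host workspaces):

    // Sketch only: what auto& deduces to for the two workspace kinds.
    #include "dali/pipeline/workspace/device_workspace.h"
    #include "dali/pipeline/workspace/host_workspace.h"

    namespace dali {

    inline void DeducedAccessorTypes(DeviceWorkspace &dev_ws, HostWorkspace &host_ws) {
      auto &gpu_out = dev_ws.OutputRef<GPUBackend>(0);   // deduced: TensorList<GPUBackend> &
      auto &cpu_out = host_ws.OutputRef<CPUBackend>(0);  // deduced: the host batch container
      auto &sample0 = cpu_out[0];                        // indexing yields Tensor<CPUBackend> &
      (void)gpu_out;                                     // silence unused-variable warnings
      (void)sample0;
    }

    }  // namespace dali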
diff --git a/dali/test/dali_test_single_op.h b/dali/test/dali_test_single_op.h
index cc3d484d229..9e2c2e464ae 100644
--- a/dali/test/dali_test_single_op.h
+++ b/dali/test/dali_test_single_op.h
@@ -308,9 +308,9 @@ class DALISingleOpTest : public DALITest {
       auto idx = output_indices[i];
       if (output_device == "gpu") {
         // copy to host
-        calc_output->Copy(ws->Output(idx), nullptr);
+        calc_output->Copy(ws->OutputRef(idx), nullptr);
       } else {
-        calc_output->Copy(ws->Output(idx), nullptr);
+        calc_output->Copy(ws->OutputRef(idx), nullptr);
       }
       auto& ref_output = res[i];
diff --git a/dali/test/plugins/dummy/dummy.cu b/dali/test/plugins/dummy/dummy.cu
index 10409ead010..a11e6a51a58 100644
--- a/dali/test/plugins/dummy/dummy.cu
+++ b/dali/test/plugins/dummy/dummy.cu
@@ -19,9 +19,9 @@ namespace other_ns {
 template<>
 void Dummy<::dali::GPUBackend>::RunImpl(::dali::DeviceWorkspace &ws) {
-  const auto &input = ws.Input<::dali::GPUBackend>(0);
+  const auto &input = ws.InputRef<::dali::GPUBackend>(0);
   const auto &shape = input.shape();
-  auto &output = ws.Output<::dali::GPUBackend>(0);
+  auto &output = ws.OutputRef<::dali::GPUBackend>(0);
   for (int sample_idx = 0; sample_idx < shape.num_samples(); sample_idx++) {
     CUDA_CALL(cudaMemcpyAsync(
         output.raw_mutable_tensor(sample_idx),