diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
index f23064da26319..f25002799fde1 100644
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@@ -126,6 +126,7 @@ sycl/test-e2e/KernelFusion @intel/dpcpp-kernel-fusion-reviewers
 sycl/include/sycl/ext/oneapi/matrix/ @intel/sycl-matrix-reviewers
 sycl/test-e2e/Matrix @intel/sycl-matrix-reviewers
 sycl/test/matrix @intel/sycl-matrix-reviewers
+sycl/test/check_device_code/matrix @intel/sycl-matrix-reviewers
 
 # Native CPU
 llvm/**/*SYCLNativeCPU* @intel/dpcpp-nativecpu-pi-reviewers 
@@ -164,3 +165,15 @@ sycl/test-e2e/DeviceCodeSplit/ @intel/dpcpp-tools-reviewers
 sycl/test-e2e/SeparateCompile/ @intel/dpcpp-tools-reviewers
 sycl/test-e2e/Printf/ @intel/dpcpp-tools-reviewers @intel/llvm-reviewers-runtime
 sycl/test-e2e/SpecConstants/ @intel/dpcpp-tools-reviewers
+
+# Sanitizer
+clang/lib/Driver/SanitizerArgs.cpp @intel/dpcpp-sanitizers-review
+libdevice/sanitizer_utils.cpp @intel/dpcpp-sanitizers-review
+libdevice/include/asan_libdevice.hpp @intel/dpcpp-sanitizers-review
+libdevice/include/sanitizer_utils.hpp @intel/dpcpp-sanitizers-review
+llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @intel/dpcpp-sanitizers-review
+sycl/test-e2e/AddressSanitizer/ @intel/dpcpp-sanitizers-review
+llvm/test/Instrumentation/AddressSanitizer/ @intel/dpcpp-sanitizers-review
+llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerOptions.h @intel/dpcpp-sanitizers-review
+llvm/include/llvm/Transforms/Instrumentation/AddressSanitizerCommon.h @intel/dpcpp-sanitizers-review
+llvm/include/llvm/Transforms/Instrumentation/AddressSanitizer.h @intel/dpcpp-sanitizers-review
diff --git a/.github/workflows/sycl-linux-precommit-aws.yml b/.github/workflows/sycl-linux-precommit-aws.yml
index f7fe4cad3ea96..990fb89dcaca8 100644
--- a/.github/workflows/sycl-linux-precommit-aws.yml
+++ b/.github/workflows/sycl-linux-precommit-aws.yml
@@ -19,7 +19,7 @@ permissions:
 
 jobs:
   create-check:
-    runs-on: [Linux, build]
+    runs-on: [Linux, aux-tasks]
     permissions:
       checks: write
       statuses: write
@@ -64,7 +64,7 @@ jobs:
     with:
       name: CUDA E2E
       runner: '["aws_cuda-${{ github.event.workflow_run.id }}-${{ github.event.workflow_run.run_attempt }}"]'
-      image: ghcr.io/intel/llvm/ubuntu2204_build:latest
+      image: ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab
       image_options: -u 1001 --gpus all --cap-add SYS_ADMIN --env NVIDIA_DISABLE_REQUIRE=1
       target_devices: ext_oneapi_cuda:gpu
       # No idea why but that seems to work and be in sync with the main
@@ -79,7 +79,7 @@ jobs:
   update-check:
     needs: [create-check, e2e-cuda]
     if: always()
-    runs-on: [Linux, build]
+    runs-on: [Linux, aux-tasks]
     permissions:
       checks: write
       statuses: write
diff --git a/.github/workflows/sycl-linux-precommit.yml b/.github/workflows/sycl-linux-precommit.yml
index f6e31541b7188..19d106fa23675 100644
--- a/.github/workflows/sycl-linux-precommit.yml
+++ b/.github/workflows/sycl-linux-precommit.yml
@@ -46,6 +46,7 @@ jobs:
       build_artifact_suffix: "default"
       build_cache_suffix: "default"
       changes: ${{ needs.detect_changes.outputs.filters }}
+      build_image: "ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab"
 
   determine_arc_tests:
     name: Decide which Arc tests to run
@@ -77,7 +78,7 @@ jobs:
         include:
           - name: AMD/HIP
             runner: '["Linux", "amdgpu"]'
-            image: ghcr.io/intel/llvm/ubuntu2204_build:latest
+            image: ghcr.io/intel/llvm/ubuntu2204_build:latest-0300ac924620a51f76c4929794637b82790f12ab
             image_options: -u 1001 --device=/dev/dri --device=/dev/kfd
             target_devices: ext_oneapi_hip:gpu
           - name: Intel
diff --git a/.github/workflows/sycl-nightly.yml b/.github/workflows/sycl-nightly.yml
index f5b3453e6db98..fc0b90be7990a 100644
--- a/.github/workflows/sycl-nightly.yml
+++ b/.github/workflows/sycl-nightly.yml
@@ -74,13 +74,6 @@ jobs:
             target_devices: opencl:cpu
             tests_selector: e2e
 
-          - name: Self-hosted CUDA
-            runner: '["Linux", "cuda"]'
-            image: ghcr.io/intel/llvm/ubuntu2204_build:latest
-            image_options: -u 1001 --gpus all --cap-add SYS_ADMIN
-            target_devices: ext_oneapi_cuda:gpu
-            tests_selector: e2e
-
           - name: SYCL-CTS on OCL CPU
             runner: '["Linux", "gen12"]'
             image: ghcr.io/intel/llvm/ubuntu2204_intel_drivers:latest
diff --git a/buildbot/configure.py b/buildbot/configure.py
index 720ebb156eae9..f172be352ba7d 100644
--- a/buildbot/configure.py
+++ b/buildbot/configure.py
@@ -41,6 +41,7 @@ def do_configure(args):
     fusion_dir = os.path.join(abs_src_dir, "sycl-fusion")
     llvm_targets_to_build = args.host_target
     llvm_enable_projects = 'clang;' + llvm_external_projects
+    libclc_build_native = 'OFF'
     libclc_targets_to_build = ''
     libclc_gen_remangled_variants = 'OFF'
     sycl_build_pi_hip_platform = 'AMD'
@@ -88,8 +89,10 @@ def do_configure(args):
         sycl_enabled_plugins.append("hip")
 
     if args.native_cpu:
-        # Todo: we should set whatever targets we support for native cpu
-        libclc_targets_to_build += ";x86_64-unknown-linux-gnu"
+        if args.native_cpu_libclc_targets:
+            libclc_targets_to_build += ";" + args.native_cpu_libclc_targets
+        else:
+            libclc_build_native = "ON"
         libclc_gen_remangled_variants = "ON"
         sycl_enabled_plugins.append("native_cpu")
 
@@ -191,6 +194,7 @@ def do_configure(args):
                 "-DLIBCLC_GENERATE_REMANGLED_VARIANTS={}".format(
                     libclc_gen_remangled_variants
                 ),
+                "-DLIBCLC_NATIVECPU_HOST_TARGET={}".format(libclc_build_native),
             ]
         )
 
@@ -257,8 +261,8 @@ def main():
     parser.add_argument("--native_cpu", action='store_true', help="Enable SYCL Native CPU")
     parser.add_argument("--hip", action='store_true', help="switch from OpenCL to HIP")
     parser.add_argument("--hip-platform", type=str, choices=['AMD', 'NVIDIA'], default='AMD', help="choose hardware platform for HIP backend")
-    parser.add_argument("--host-target", default='X86',
-                        help="host LLVM target architecture, defaults to X86, multiple targets may be provided as a semi-colon separated string")
+    parser.add_argument("--host-target", default='host',
+                        help="host LLVM target architecture, defaults to \'host\', multiple targets may be provided as a semi-colon separated string")
     parser.add_argument("--enable-all-llvm-targets", action='store_true', help="build compiler with all supported targets, it doesn't change runtime build")
     parser.add_argument("--no-assertions", action='store_true', help="build without assertions")
     parser.add_argument("--docs", action='store_true', help="build Doxygen documentation")
@@ -276,6 +280,7 @@ def main():
     parser.add_argument("--disable-preview-lib", action='store_true', help="Disable building of the SYCL runtime major release preview library")
     parser.add_argument("--disable-fusion", action="store_true", help="Disable the kernel fusion JIT compiler")
     parser.add_argument("--add_security_flags", type=str, choices=['none', 'default', 'sanitize'], default=None, help="Enables security flags for compile & link. Two values are supported: 'default' and 'sanitize'. 'Sanitize' option is an extension of 'default' set.")
+    parser.add_argument('--native-cpu-libclc-targets', help='Target triples for libclc, used by the Native CPU backend')
     args = parser.parse_args()
 
     print("args:{}".format(args))
diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 1eb61d83ff2db..203394297a77a 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -395,6 +395,8 @@ def warn_drv_opt_requires_opt
   : Warning<"'%0' should be used only in conjunction with '%1'">, InGroup<UnusedCommandLineArgument>;
 def err_drv_sycl_missing_amdgpu_arch : Error<
   "missing AMDGPU architecture for SYCL offloading; specify it with '-Xsycl-target-backend%select{|=%1}0 --offload-arch=<arch-name>'">;
+def err_drv_sycl_thinlto_split_off: Error<
+  "'%0' is not supported when '%1' is set with '-fsycl'">;
 def warn_drv_sycl_offload_target_duplicate : Warning<
   "SYCL offloading target '%0' is similar to target '%1' already specified; "
   "will be ignored">, InGroup<SyclTarget>;
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 74c17c6646669..ebeb3d17ce205 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -12031,10 +12031,10 @@ bool ASTContext::DeclMustBeEmitted(const Decl *D) {
     // or `indirectly_callable' attribute must be emitted regardless of number
     // of actual uses
     if (LangOpts.SYCLIsDevice && isa<CXXMethodDecl>(D)) {
-      if (auto *A = D->getAttr<SYCLDeviceIndirectlyCallableAttr>())
-        return !A->isImplicit();
-      if (auto *A = D->getAttr<SYCLDeviceAttr>())
-        return !A->isImplicit();
+      if (D->hasAttr<SYCLDeviceIndirectlyCallableAttr>())
+        return true;
+      if (D->hasAttr<SYCLDeviceAttr>())
+        return true;
     }
 
     GVALinkage Linkage = GetGVALinkageForFunction(FD);
diff --git a/clang/lib/Basic/Targets/NVPTX.cpp b/clang/lib/Basic/Targets/NVPTX.cpp
index acdc5d9daadd5..759b751249595 100644
--- a/clang/lib/Basic/Targets/NVPTX.cpp
+++ b/clang/lib/Basic/Targets/NVPTX.cpp
@@ -286,9 +286,9 @@ void NVPTXTargetInfo::getTargetDefines(const LangOptions &Opts,
       Builder.defineMacro("__SYCL_CUDA_ARCH__", CUDAArchCode);
     } else {
       Builder.defineMacro("__CUDA_ARCH__", CUDAArchCode);
-      if (GPU == CudaArch::SM_90a)
-        Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
     }
+    if (GPU == CudaArch::SM_90a)
+      Builder.defineMacro("__CUDA_ARCH_FEAT_SM90_ALL", "1");
   }
 }
 
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index b2d0d5edc9232..759c4f2fe6286 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1881,9 +1881,9 @@ static llvm::fp::FPAccuracy convertFPAccuracy(StringRef FPAccuracyStr) {
 }
 
 static int32_t convertFPAccuracyToAspect(StringRef FPAccuracyStr) {
-  assert(FPAccuracyStr.equals("high") || FPAccuracyStr.equals("medium") ||
-         FPAccuracyStr.equals("low") || FPAccuracyStr.equals("sycl") ||
-         FPAccuracyStr.equals("cuda"));
+  assert(FPAccuracyStr == "high" || FPAccuracyStr == "medium" ||
+         FPAccuracyStr == "low" || FPAccuracyStr == "sycl" ||
+         FPAccuracyStr == "cuda");
   return llvm::StringSwitch<int32_t>(FPAccuracyStr)
       .Case("high", SYCLInternalAspect::fp_intrinsic_accuracy_high)
       .Case("medium", SYCLInternalAspect::fp_intrinsic_accuracy_medium)
diff --git a/clang/lib/Driver/Compilation.cpp b/clang/lib/Driver/Compilation.cpp
index 5bad7146895d7..2e85f85fbc1c1 100644
--- a/clang/lib/Driver/Compilation.cpp
+++ b/clang/lib/Driver/Compilation.cpp
@@ -198,7 +198,7 @@ bool Compilation::CleanupFile(const char *File, bool IssueErrors) const {
       // when the nvptx*-nvidia-cuda is passed to -fsycl-targets.
       if (DefaultToolChain.getTriple().isNVPTX())
         return false;
-      if (llvm::sys::path::extension(ActualFile).equals(".spv"))
+      if (llvm::sys::path::extension(ActualFile) == ".spv")
         return false;
     }
   }
diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index abea40cd8f0c5..7ad00c96c5662 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -829,8 +829,8 @@ static bool isValidSYCLTriple(llvm::Triple T) {
   // SPIR/SPIRV arch, but has invalid SubArch for AOT.
   StringRef A(T.getArchName());
   if (T.getSubArch() == llvm::Triple::NoSubArch &&
-      ((T.getArch() == llvm::Triple::spir && !A.equals("spir")) ||
-       (T.getArch() == llvm::Triple::spir64 && !A.equals("spir64"))))
+      ((T.getArch() == llvm::Triple::spir && A != "spir") ||
+       (T.getArch() == llvm::Triple::spir64 && A != "spir64")))
     return false;
   return true;
 }
@@ -1149,7 +1149,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
       return;
     const char *ArgValue = A->getValue();
     for (const StringRef AllowedValue : AllowedValues)
-      if (AllowedValue.equals(ArgValue))
+      if (AllowedValue == ArgValue)
         return;
     Diag(clang::diag::err_drv_invalid_argument_to_option)
         << ArgValue << A->getOption().getName();
@@ -1182,6 +1182,7 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
     // of -fsycl*target options passed
     Arg *SYCLTargetsValues = SYCLTargets;
     if (SYCLTargetsValues) {
+      llvm::StringSet<> SYCLTriples;
       if (SYCLTargetsValues->getNumValues()) {
 
         // Multiple targets are currently not supported when using
@@ -1220,15 +1221,40 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
             const ToolChain *HostTC =
                 C.getSingleOffloadToolChain<Action::OFK_Host>();
             llvm::Triple HostTriple = HostTC->getTriple();
-            UniqueSYCLTriplesVec.push_back(HostTriple);
+            SYCLTriples.insert(HostTriple.normalize());
             continue;
           }
 
-          if (!isValidSYCLTriple(MakeSYCLDeviceTriple(UserTargetName))) {
+          llvm::Triple DeviceTriple(MakeSYCLDeviceTriple(UserTargetName));
+          if (!isValidSYCLTriple(DeviceTriple)) {
             Diag(clang::diag::err_drv_invalid_sycl_target) << Val;
             continue;
           }
 
+          // For any -fsycl-targets=spir64_gen additions, we will scan the
+          // additional -X* options for potential -device settings.  These
+          // need to be added as a known Arch to the packager.
+          if (DeviceTriple.isSPIRAOT() && Arch.empty() &&
+              DeviceTriple.getSubArch() == llvm::Triple::SPIRSubArch_gen) {
+            const ToolChain *HostTC =
+                C.getSingleOffloadToolChain<Action::OFK_Host>();
+            auto DeviceTC = std::make_unique<toolchains::SYCLToolChain>(
+                *this, DeviceTriple, *HostTC, C.getInputArgs());
+            assert(DeviceTC && "Device toolchain not defined.");
+            ArgStringList TargetArgs;
+            DeviceTC->TranslateBackendTargetArgs(DeviceTC->getTriple(),
+                                                 C.getInputArgs(), TargetArgs);
+            // Look for -device <string> and use that as the known arch to
+            // be associated with the current spir64_gen entry.  Grab the
+            // rightmost entry.
+            for (int i = TargetArgs.size() - 2; i >= 0; --i) {
+              if (StringRef(TargetArgs[i]) == "-device") {
+                Arch = TargetArgs[i + 1];
+                break;
+              }
+            }
+          }
+
           // Make sure we don't have a duplicate triple.
           std::string NormalizedName = MakeSYCLDeviceTriple(Val).normalize();
           auto Duplicate = FoundNormalizedTriples.find(NormalizedName);
@@ -1241,11 +1267,16 @@ void Driver::CreateOffloadingDeviceToolChains(Compilation &C,
           // Store the current triple so that we can check for duplicates in
           // the following iterations.
           FoundNormalizedTriples[NormalizedName] = Val;
-          llvm::Triple DeviceTriple(MakeSYCLDeviceTriple(UserTargetName));
-          UniqueSYCLTriplesVec.push_back(DeviceTriple);
+          SYCLTriples.insert(DeviceTriple.normalize());
           if (!Arch.empty())
             DerivedArchs[DeviceTriple.getTriple()].insert(Arch);
         }
+        if (!SYCLTriples.empty()) {
+          for (const auto &SYCLTriple : SYCLTriples) {
+            llvm::Triple Triple(SYCLTriple.getKey());
+            UniqueSYCLTriplesVec.push_back(Triple);
+          }
+        }
         addSYCLDefaultTriple(C, UniqueSYCLTriplesVec);
       } else
         Diag(clang::diag::warn_drv_empty_joined_argument)
@@ -1891,7 +1922,7 @@ Compilation *Driver::BuildCompilation(ArrayRef<const char *> ArgList) {
       // an external option setting is required to target hardware.
       setOffloadCompileMode(FPGAEmulationMode);
       for (StringRef ArgString : TargetArgs) {
-        if (ArgString.equals("-hardware") || ArgString.equals("-simulation")) {
+        if (ArgString == "-hardware" || ArgString == "-simulation") {
           setOffloadCompileMode(FPGAHWMode);
           break;
         }
@@ -5022,17 +5053,16 @@ class OffloadingActionBuilder final {
       }
 
       // By default, we produce an action for each device arch.
-      auto TC = ToolChains.begin();
-      for (Action *&A : SYCLDeviceActions) {
-        if ((*TC)->getTriple().isNVPTX() && CurPhase >= phases::Backend) {
+      for (auto TargetActionInfo :
+           llvm::zip(SYCLDeviceActions, SYCLTargetInfoList)) {
+        auto &TargetInfo = std::get<1>(TargetActionInfo);
+        if (TargetInfo.TC->getTriple().isNVPTX() && CurPhase >= phases::Backend)
           // For CUDA, stop to emit LLVM IR so it can be linked later on.
-          ++TC;
           continue;
-        }
 
+        Action *&A = std::get<0>(TargetActionInfo);
         A = C.getDriver().ConstructPhaseAction(C, Args, CurPhase, A,
                                                AssociatedOffloadKind);
-        ++TC;
       }
 
       return ABRT_Success;
@@ -6241,12 +6271,12 @@ class OffloadingActionBuilder final {
             using namespace tools::SYCL;
             StringRef Device{Value.first};
             if (Device.consume_front(gen::AmdGPU))
-              return TargetArch.equals(Device) && TargetTriple.isAMDGCN();
+              return TargetArch == Device && TargetTriple.isAMDGCN();
             if (Device.consume_front(gen::NvidiaGPU))
-              return TargetArch.equals(Device) && TargetTriple.isNVPTX();
+              return TargetArch == Device && TargetTriple.isNVPTX();
             if (Device.consume_front(gen::IntelGPU))
-              return TargetArch.equals(Device) && TargetTriple.isSPIRAOT();
-            return TargetArch.equals(Device) && isValidSYCLTriple(TargetTriple);
+              return TargetArch == Device && TargetTriple.isSPIRAOT();
+            return TargetArch == Device && isValidSYCLTriple(TargetTriple);
           });
         } else {
           TargetIt = TargetTable.find(TargetTriple.str());
@@ -9604,7 +9634,8 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
       bool IsHIPNoRDC = JA.getOffloadingDeviceKind() == Action::OFK_HIP &&
                         !C.getArgs().hasFlag(options::OPT_fgpu_rdc,
                                              options::OPT_fno_gpu_rdc, false);
-      bool UseOutExtension = IsHIPNoRDC || isa<OffloadPackagerJobAction>(JA);
+      bool UseOutExtension = IsHIPNoRDC || isa<OffloadPackagerJobAction>(JA) ||
+                             isa<BackendCompileJobAction>(JA);
       if (UseOutExtension) {
         Output = BaseName;
         llvm::sys::path::replace_extension(Output, "");
@@ -9701,7 +9732,7 @@ const char *Driver::GetNamedOutputPath(Compilation &C, const JobAction &JA,
     const auto &ResultFiles = C.getResultFiles();
     const auto CollidingFilenameIt =
         llvm::find_if(ResultFiles, [NamedOutput](const auto &It) {
-          return StringRef(NamedOutput).equals(It.second);
+          return StringRef(NamedOutput) == It.second;
         });
     if (CollidingFilenameIt != ResultFiles.end()) {
       // Upon any collision, a unique hash will be appended to the filename,
diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp
index 76594ae861313..53f56a59664f6 100644
--- a/clang/lib/Driver/SanitizerArgs.cpp
+++ b/clang/lib/Driver/SanitizerArgs.cpp
@@ -1160,6 +1160,8 @@ void SanitizerArgs::addArgs(const ToolChain &TC, const llvm::opt::ArgList &Args,
       CmdArgs.push_back("-asan-stack=0");
       CmdArgs.push_back("-mllvm");
       CmdArgs.push_back("-asan-globals=0");
+      CmdArgs.push_back("-mllvm");
+      CmdArgs.push_back("-asan-mapping-scale=4");
     }
     return;
   }
diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 5677175b5f867..6e5645552d55e 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -4967,7 +4967,7 @@ void Clang::ConstructHostCompilerJob(Compilation &C, const JobAction &JA,
   if (isa<PreprocessJobAction>(JA)) {
     if (IsMSVCHostCompiler) {
       // Check the output file, if it is 'stdout' we want to use -E.
-      if (StringRef(Output.getFilename()).equals("-")) {
+      if (StringRef(Output.getFilename()) == "-") {
         HostCompileArgs.push_back("-E");
         OutputAdded = true;
       } else {
@@ -5858,10 +5858,19 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
       CmdArgs.push_back("-emit-llvm-uselists");
 
     if (IsUsingLTO) {
-      if (IsDeviceOffloadAction && !JA.isDeviceOffloading(Action::OFK_OpenMP) &&
-          !Args.hasFlag(options::OPT_offload_new_driver,
-                        options::OPT_no_offload_new_driver, false) &&
-          !Triple.isAMDGPU()) {
+      bool IsUsingOffloadNewDriver =
+          Args.hasFlag(options::OPT_offload_new_driver,
+                       options::OPT_no_offload_new_driver, false);
+      Arg *SYCLSplitMode =
+          Args.getLastArg(options::OPT_fsycl_device_code_split_EQ);
+      bool IsDeviceCodeSplitDisabled =
+          SYCLSplitMode && StringRef(SYCLSplitMode->getValue()) == "off";
+      bool IsSYCLLTOSupported =
+          JA.isDeviceOffloading(Action::OFK_SYCL) && IsUsingOffloadNewDriver;
+      if ((IsDeviceOffloadAction &&
+           !JA.isDeviceOffloading(Action::OFK_OpenMP) && !Triple.isAMDGPU() &&
+           !IsUsingOffloadNewDriver) ||
+          (JA.isDeviceOffloading(Action::OFK_SYCL) && !IsSYCLLTOSupported)) {
         D.Diag(diag::err_drv_unsupported_opt_for_target)
             << Args.getLastArg(options::OPT_foffload_lto,
                                options::OPT_foffload_lto_EQ)
@@ -5874,6 +5883,13 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
                                options::OPT_foffload_lto_EQ)
                    ->getAsString(Args)
             << "-fno-gpu-rdc";
+      } else if (JA.isDeviceOffloading(Action::OFK_SYCL) &&
+                 IsDeviceCodeSplitDisabled && LTOMode == LTOK_Thin) {
+        D.Diag(diag::err_drv_sycl_thinlto_split_off)
+            << SYCLSplitMode->getAsString(Args)
+            << Args.getLastArg(options::OPT_foffload_lto,
+                               options::OPT_foffload_lto_EQ)
+                   ->getAsString(Args);
       } else {
         assert(LTOMode == LTOK_Full || LTOMode == LTOK_Thin);
         CmdArgs.push_back(Args.MakeArgString(
@@ -10281,6 +10297,41 @@ void OffloadPackager::ConstructJob(Compilation &C, const JobAction &JA,
       for (StringRef Feature : FeatureArgs)
         Parts.emplace_back("feature=" + Feature.str());
 
+    // Now that the standard parts are added to the packager string, append
+    // the supplemental options used for SYCL based offloading: the compile
+    // and link options required by the backend compilers and by the
+    // clang-offload-wrapper.  They are emitted as the 'compile-opts=' and
+    // 'link-opts=' entries of the image string.
+    if (OffloadAction->getOffloadingDeviceKind() == Action::OFK_SYCL) {
+      ArgStringList BuildArgs;
+      auto createArgString = [&](const char *Opt) {
+        if (BuildArgs.empty())
+          return;
+        SmallString<128> AL;
+        for (const char *A : BuildArgs) {
+          if (AL.empty()) {
+            AL = A;
+            continue;
+          }
+          AL += " ";
+          AL += A;
+        }
+        Parts.emplace_back(C.getArgs().MakeArgString(Twine(Opt) + AL));
+      };
+      const ArgList &Args =
+          C.getArgsForToolChain(nullptr, StringRef(), Action::OFK_SYCL);
+      const ToolChain *HostTC = C.getSingleOffloadToolChain<Action::OFK_Host>();
+      const toolchains::SYCLToolChain &SYCLTC =
+          static_cast<const toolchains::SYCLToolChain &>(*TC);
+      SYCLTC.AddImpliedTargetArgs(TC->getTriple(), Args, BuildArgs, JA, *HostTC,
+                                  Arch);
+      SYCLTC.TranslateBackendTargetArgs(TC->getTriple(), Args, BuildArgs, Arch);
+      createArgString("compile-opts=");
+      BuildArgs.clear();
+      SYCLTC.TranslateLinkerTargetArgs(TC->getTriple(), Args, BuildArgs, Arch);
+      createArgString("link-opts=");
+    }
+
     CmdArgs.push_back(Args.MakeArgString("--image=" + llvm::join(Parts, ",")));
   }
 
@@ -10436,6 +10487,7 @@ static void getOtherSPIRVTransOpts(Compilation &C,
       ",+SPV_INTEL_fpga_invocation_pipelining_attributes"
       ",+SPV_INTEL_fpga_latency_control"
       ",+SPV_INTEL_task_sequence"
+      ",+SPV_KHR_shader_clock"
       ",+SPV_INTEL_bindless_images";
   ExtArg = ExtArg + DefaultExtArg + INTELExtArg;
   if (C.getDriver().IsFPGAHWMode())
@@ -10486,7 +10538,7 @@ void SPIRVTranslator::ConstructJob(Compilation &C, const JobAction &JA,
 
     // Handle -Xspirv-translator
     TC.TranslateTargetOpt(
-        TCArgs, TranslatorArgs, options::OPT_Xspirv_translator,
+        Triple, TCArgs, TranslatorArgs, options::OPT_Xspirv_translator,
         options::OPT_Xspirv_translator_EQ, JA.getOffloadingArch());
   }
   for (auto I : Inputs) {
@@ -10592,10 +10644,10 @@ static void addArgs(ArgStringList &DstArgs, const llvm::opt::ArgList &Alloc,
   }
 }
 
-static void getOtherSYCLPostLinkOpts(const ToolChain &TC, const JobAction &JA,
-                                     const llvm::opt::ArgList &TCArgs,
-                                     ArgStringList &PostLinkArgs,
-                                     bool SpecConsts, types::ID OutputType) {
+static void getNonTripleBasedSYCLPostLinkOpts(const ToolChain &TC,
+                                              const JobAction &JA,
+                                              const llvm::opt::ArgList &TCArgs,
+                                              ArgStringList &PostLinkArgs) {
   // See if device code splitting is requested
   if (Arg *A = TCArgs.getLastArg(options::OPT_fsycl_device_code_split_EQ)) {
     auto CodeSplitValue = StringRef(A->getValue());
@@ -10608,19 +10660,8 @@ static void getOtherSYCLPostLinkOpts(const ToolChain &TC, const JobAction &JA,
     else { // Device code split is off
     }
   }
-  if (OutputType == types::TY_LLVM_BC) {
-    // single file output requested - this means only perform necessary IR
-    // transformations (like specialization constant intrinsic lowering) and
-    // output LLVMIR
-    addArgs(PostLinkArgs, TCArgs, {"-ir-output-only"});
-  }
   addArgs(PostLinkArgs, TCArgs,
           {StringRef(getSYCLPostLinkOptimizationLevel(TCArgs))});
-  // specialization constants processing is mandatory
-  if (SpecConsts)
-    addArgs(PostLinkArgs, TCArgs, {"-spec-const=native"});
-  else
-    addArgs(PostLinkArgs, TCArgs, {"-spec-const=emulation"});
 
   // Process device-globals.
   addArgs(PostLinkArgs, TCArgs, {"-device-globals"});
@@ -10631,32 +10672,50 @@ static void getOtherSYCLPostLinkOpts(const ToolChain &TC, const JobAction &JA,
     addArgs(PostLinkArgs, TCArgs, {"-lower-esimd-force-stateless-mem=false"});
 }
 
-// Add any sycl-post-link options that rely on a specific Triple.
-static void
-getTripleBasedSYCLPostLinkOpts(const ToolChain &TC, const JobAction &JA,
-                               const llvm::opt::ArgList &TCArgs,
-                               llvm::Triple Triple, ArgStringList &PostLinkArgs,
-                               bool SpecConsts, types::ID OutputType) {
+// Add any sycl-post-link options that rely on a specific Triple in addition
+// to user supplied options. This function is invoked only for the old
+// offloading model. For the new offloading model, a slightly modified version
+// of this function is called inside clang-linker-wrapper.
+// NOTE: Any changes made here should be reflected in the similarly named
+// function in clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp.
+static void getTripleBasedSYCLPostLinkOpts(const ToolChain &TC,
+                                           const llvm::opt::ArgList &TCArgs,
+                                           ArgStringList &PostLinkArgs,
+                                           llvm::Triple Triple,
+                                           bool SpecConstsSupported,
+                                           types::ID OutputType) {
+  if (OutputType == types::TY_LLVM_BC) {
+    // single file output requested - this means only perform necessary IR
+    // transformations (like specialization constant intrinsic lowering) and
+    // output LLVMIR
+    addArgs(PostLinkArgs, TCArgs, {"-ir-output-only"});
+  }
+  if (SpecConstsSupported)
+    addArgs(PostLinkArgs, TCArgs, {"-spec-const=native"});
+  else
+    addArgs(PostLinkArgs, TCArgs, {"-spec-const=emulation"});
 
   // See if device code splitting is requested.  The logic here works along side
-  // the behavior in setOtherSYCLPostLinkOpts, where the option is added based
-  // on the user setting of-fsycl-device-code-split.
-  if (!(TCArgs.hasArg(options::OPT_fsycl_device_code_split_EQ) ||
-        Triple.getArchName() == "spir64_fpga"))
+  // the behavior in getNonTripleBasedSYCLPostLinkOpts, where the option is
+  // added based on the user setting of -fsycl-device-code-split.
+  if (!TCArgs.hasArg(options::OPT_fsycl_device_code_split_EQ) &&
+      (Triple.getArchName() != "spir64_fpga"))
     addArgs(PostLinkArgs, TCArgs, {"-split=auto"});
 
   // On Intel targets we don't need non-kernel functions as entry points,
   // because it only increases amount of code for device compiler to handle,
   // without any actual benefits.
   // TODO: Try to extend this feature for non-Intel GPUs.
-  if (!TCArgs.hasFlag(options::OPT_fno_sycl_remove_unused_external_funcs,
-                      options::OPT_fsycl_remove_unused_external_funcs, false) &&
-      !Triple.isNVPTX() && !Triple.isAMDGPU() && !isSYCLNativeCPU(TC))
+  if ((!TCArgs.hasFlag(options::OPT_fno_sycl_remove_unused_external_funcs,
+                       options::OPT_fsycl_remove_unused_external_funcs,
+                       false) &&
+       !isSYCLNativeCPU(TC)) &&
+      !Triple.isNVPTX() && !Triple.isAMDGPU())
     addArgs(PostLinkArgs, TCArgs, {"-emit-only-kernels-as-entry-points"});
 
-  if (!(Triple.isAMDGCN()))
+  if (!Triple.isAMDGCN())
     addArgs(PostLinkArgs, TCArgs, {"-emit-param-info"});
-  // Enable PI program metadata
+  // Enable program metadata
   if (Triple.isNVPTX() || Triple.isAMDGCN() || isSYCLNativeCPU(TC))
     addArgs(PostLinkArgs, TCArgs, {"-emit-program-metadata"});
   if (OutputType != types::TY_LLVM_BC) {
@@ -10669,18 +10728,19 @@ getTripleBasedSYCLPostLinkOpts(const ToolChain &TC, const JobAction &JA,
     // add options unconditionally
     addArgs(PostLinkArgs, TCArgs, {"-symbols"});
     addArgs(PostLinkArgs, TCArgs, {"-emit-exported-symbols"});
+    addArgs(PostLinkArgs, TCArgs, {"-emit-imported-symbols"});
     if (SplitEsimd)
       addArgs(PostLinkArgs, TCArgs, {"-split-esimd"});
     addArgs(PostLinkArgs, TCArgs, {"-lower-esimd"});
   }
-  bool isAOT = Triple.isNVPTX() || Triple.isAMDGCN() ||
+  bool IsAOT = Triple.isNVPTX() || Triple.isAMDGCN() ||
                Triple.getSubArch() == llvm::Triple::SPIRSubArch_fpga ||
                Triple.getSubArch() == llvm::Triple::SPIRSubArch_gen ||
                Triple.getSubArch() == llvm::Triple::SPIRSubArch_x86_64;
   if (TCArgs.hasFlag(options::OPT_fsycl_add_default_spec_consts_image,
                      options::OPT_fno_sycl_add_default_spec_consts_image,
                      false) &&
-      isAOT)
+      IsAOT)
     addArgs(PostLinkArgs, TCArgs,
             {"-generate-device-image-default-spec-consts"});
 }
@@ -10703,10 +10763,8 @@ void SYCLPostLink::ConstructJob(Compilation &C, const JobAction &JA,
   ArgStringList CmdArgs;
 
   llvm::Triple T = getToolChain().getTriple();
-  getOtherSYCLPostLinkOpts(getToolChain(), JA, TCArgs, CmdArgs,
-                           SYCLPostLink->getRTSetsSpecConstants(),
-                           SYCLPostLink->getTrueType());
-  getTripleBasedSYCLPostLinkOpts(getToolChain(), JA, TCArgs, T, CmdArgs,
+  getNonTripleBasedSYCLPostLinkOpts(getToolChain(), JA, TCArgs, CmdArgs);
+  getTripleBasedSYCLPostLinkOpts(getToolChain(), TCArgs, CmdArgs, T,
                                  SYCLPostLink->getRTSetsSpecConstants(),
                                  SYCLPostLink->getTrueType());
 
@@ -10717,16 +10775,16 @@ void SYCLPostLink::ConstructJob(Compilation &C, const JobAction &JA,
   if (T.getSubArch() == llvm::Triple::SPIRSubArch_gen && Device.data())
     OutputArg = ("intel_gpu_" + Device + "," + OutputArg).str();
 
-  addArgs(CmdArgs, TCArgs, {"-o", OutputArg});
-
   const toolchains::SYCLToolChain &TC =
       static_cast<const toolchains::SYCLToolChain &>(getToolChain());
 
   // Handle -Xdevice-post-link
-  TC.TranslateTargetOpt(TCArgs, CmdArgs, options::OPT_Xdevice_post_link,
+  TC.TranslateTargetOpt(T, TCArgs, CmdArgs, options::OPT_Xdevice_post_link,
                         options::OPT_Xdevice_post_link_EQ,
                         JA.getOffloadingArch());
 
+  addArgs(CmdArgs, TCArgs, {"-o", OutputArg});
+
   // Add input file
   assert(Inputs.size() == 1 && Inputs.front().isFilename() &&
          "single input file expected");
@@ -10908,7 +10966,8 @@ void SpirvToIrWrapper::ConstructJob(Compilation &C, const JobAction &JA,
       static_cast<const toolchains::SYCLToolChain &>(getToolChain());
 
   // Handle -Xspirv-to-ir-wrapper
-  TC.TranslateTargetOpt(TCArgs, CmdArgs, options::OPT_Xspirv_to_ir_wrapper,
+  TC.TranslateTargetOpt(getToolChain().getTriple(), TCArgs, CmdArgs,
+                        options::OPT_Xspirv_to_ir_wrapper,
                         options::OPT_Xspirv_to_ir_wrapper_EQ,
                         JA.getOffloadingArch());
 
@@ -10973,13 +11032,6 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
   if (Args.hasArg(options::OPT_v))
     CmdArgs.push_back("--wrapper-verbose");
 
-  // TODO(NOM2): Pass following options to clang-linker-wrapper.
-  // Please refer to sycl/doc/design/OffloadDesign.md for details.
-  // sycl-device-libraries
-  // sycl-device-library-location
-  // sycl-post-link-options
-  // llvm-spirv-options
-
   if (const Arg *A = Args.getLastArg(options::OPT_g_Group)) {
     if (!A->getOption().matches(options::OPT_g0))
       CmdArgs.push_back("--device-debug");
@@ -11012,12 +11064,14 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
   // Add any SYCL offloading specific options to the clang-linker-wrapper
   if (C.hasOffloadToolChain<Action::OFK_SYCL>()) {
     // -sycl-device-libraries=<comma separated list> contains all of the SYCL
-    // device specific libraries that are needed.  This provides the list of
-    // files file only.
-    // TODO: This generic list will be populated with only device binaries
-    // for spir/spirv. Other targets (AOT and others) can represent a different
-    // set of device libraries.  We will cross that bridge when we begin to
-    // enable the other possible targets.
+    // device specific libraries that are needed. This generic list will be
+    // populated with device binaries for all target triples in the current
+    // compilation flow.
+
+    // Create a comma separated list to pass along to the linker wrapper.
+    SmallString<256> LibList;
+    // TODO: TargetTriple should not be used here for creating linker wrapper
+    // options. It should also not be passed to the linker wrapper.
     llvm::Triple TargetTriple;
     auto ToolChainRange = C.getOffloadToolChains<Action::OFK_SYCL>();
     for (auto &I :
@@ -11026,38 +11080,24 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
       if (TC->getTriple().isSPIROrSPIRV() &&
           TC->getTriple().getSubArch() == llvm::Triple::NoSubArch) {
         TargetTriple = TC->getTriple();
-        break;
+        SmallVector<std::string, 8> SYCLDeviceLibs;
+        bool IsSPIR = TargetTriple.isSPIROrSPIRV();
+        bool IsSpirvAOT = TargetTriple.isSPIRAOT();
+        bool UseJitLink =
+            IsSPIR &&
+            Args.hasFlag(options::OPT_fsycl_device_lib_jit_link,
+                         options::OPT_fno_sycl_device_lib_jit_link, false);
+        bool UseAOTLink = IsSPIR && (IsSpirvAOT || !UseJitLink);
+        SYCLDeviceLibs = SYCL::getDeviceLibraries(C, TargetTriple, UseAOTLink);
+        for (const auto &AddLib : SYCLDeviceLibs) {
+          if (LibList.size() > 0)
+            LibList += ",";
+          LibList += AddLib;
+        }
       }
     }
-    // Pass the device triple to the linker wrapper tool for SYCL offload.
-    // Only spir64 or spirv64 is currently passed.
-    // TODO(NOM1): Support target triples in a more generic way.
-    // TODO(NOM3): Investigate why passing spirv64-unknown-unknown does not
-    // work.
-    if (TargetTriple.isSPIR())
-      CmdArgs.push_back("--triple=spir64");
-    else if (TargetTriple.isSPIRV())
-      CmdArgs.push_back("--triple=spirv64");
-
-    SmallVector<std::string, 8> SYCLDeviceLibs;
-    auto IsSPIR = TargetTriple.isSPIROrSPIRV();
-    bool IsSpirvAOT = TargetTriple.isSPIRAOT();
-    bool UseJitLink =
-        IsSPIR &&
-        Args.hasFlag(options::OPT_fsycl_device_lib_jit_link,
-                     options::OPT_fno_sycl_device_lib_jit_link, false);
-    bool UseAOTLink = IsSPIR && (IsSpirvAOT || !UseJitLink);
-    SYCLDeviceLibs = SYCL::getDeviceLibraries(C, TargetTriple, UseAOTLink);
-    // Create a comma separated list to pass along to the linker wrapper.
-    SmallString<256> LibList;
-    for (const auto &AddLib : SYCLDeviceLibs) {
-      if (LibList.size() > 0)
-        LibList += ",";
-      LibList += AddLib;
-    }
     // -sycl-device-libraries=<libs> provides a comma separate list of
     // libraries to add to the device linking step.
-    // SYCL device libraries can be found.
     if (LibList.size())
       CmdArgs.push_back(
           Args.MakeArgString(Twine("-sycl-device-libraries=") + LibList));
@@ -11097,26 +11137,14 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
     // --sycl-post-link-options="options" provides a string of options to be
     // passed along to the sycl-post-link tool during device link.
     SmallString<128> PostLinkOptString;
+    ArgStringList PostLinkArgs;
+    getNonTripleBasedSYCLPostLinkOpts(getToolChain(), JA, Args, PostLinkArgs);
+    for (const auto &A : PostLinkArgs)
+      appendOption(PostLinkOptString, A);
     if (Args.hasArg(options::OPT_Xdevice_post_link)) {
       for (const auto &A : Args.getAllArgValues(options::OPT_Xdevice_post_link))
         appendOption(PostLinkOptString, A);
     }
-    ArgStringList PostLinkArgs;
-    bool IsSYCLNativeCPU = driver::isSYCLNativeCPU(Args);
-    types::ID OutputType = TargetTriple.isSPIROrSPIRV() || IsSYCLNativeCPU
-                               ? types::TY_Tempfiletable
-                               : types::TY_LLVM_BC;
-    // TODO: Items like native_cpu and Specialization Constants behaviors are
-    // dependent on each toolchain.  Passing these along as 'general settings'
-    // for the clang-linker-wrapper causes for potential inconsistencies and
-    // would need to handled more at the device linking level.
-    bool SpecConsts = TargetTriple.isSPIROrSPIRV();
-    getOtherSYCLPostLinkOpts(getToolChain(), JA, Args, PostLinkArgs, SpecConsts,
-                             OutputType);
-    getTripleBasedSYCLPostLinkOpts(getToolChain(), JA, Args, TargetTriple,
-                                   PostLinkArgs, SpecConsts, OutputType);
-    for (const auto &A : PostLinkArgs)
-      appendOption(PostLinkOptString, A);
     if (!PostLinkOptString.empty())
       CmdArgs.push_back(
           Args.MakeArgString("--sycl-post-link-options=" + PostLinkOptString));
@@ -11140,25 +11168,50 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA,
     const toolchains::SYCLToolChain &SYCLTC =
         static_cast<const toolchains::SYCLToolChain &>(getToolChain());
     // Only store compile/link opts in the image descriptor for the SPIR-V
-    // target.
+    // target.  For AOT, pass along the additional options via GPU or CPU
+    // specific clang-linker-wrapper options.
     const ArgList &Args =
         C.getArgsForToolChain(nullptr, StringRef(), Action::OFK_SYCL);
-    ArgStringList BuildArgs;
-    OptString.clear();
-    SYCLTC.TranslateBackendTargetArgs(TargetTriple, Args, BuildArgs);
-    for (const auto &A : BuildArgs)
-      appendOption(OptString, A);
-    if (!OptString.empty())
-      CmdArgs.push_back(
-          Args.MakeArgString("--sycl-backend-compile-options=" + OptString));
-    BuildArgs.clear();
-    OptString.clear();
-    SYCLTC.TranslateLinkerTargetArgs(TargetTriple, Args, BuildArgs);
-    for (const auto &A : BuildArgs)
-      appendOption(OptString, A);
-    if (!OptString.empty())
-      CmdArgs.push_back(
-          Args.MakeArgString("--sycl-target-link-options=" + OptString));
+    for (auto &ToolChainMember :
+         llvm::make_range(ToolChainRange.first, ToolChainRange.second)) {
+      const ToolChain *TC = ToolChainMember.second;
+      bool IsJIT = false;
+      StringRef WrapperOption;
+      StringRef WrapperLinkOption;
+      if (TC->getTriple().isSPIROrSPIRV()) {
+        if (TC->getTriple().getSubArch() == llvm::Triple::NoSubArch) {
+          IsJIT = true;
+          WrapperOption = "--sycl-backend-compile-options=";
+        }
+        if (TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen)
+          WrapperOption = "--gen-tool-arg=";
+        if (TC->getTriple().getSubArch() == llvm::Triple::SPIRSubArch_x86_64)
+          WrapperOption = "--cpu-tool-arg=";
+      } else
+        continue;
+      ArgStringList BuildArgs;
+      SmallString<128> BackendOptString;
+      SmallString<128> LinkOptString;
+      SYCLTC.TranslateBackendTargetArgs(TC->getTriple(), Args, BuildArgs);
+      for (const auto &A : BuildArgs)
+        appendOption(BackendOptString, A);
+
+      BuildArgs.clear();
+      SYCLTC.TranslateLinkerTargetArgs(TC->getTriple(), Args, BuildArgs);
+      for (const auto &A : BuildArgs) {
+        if (IsJIT)
+          appendOption(LinkOptString, A);
+        else
+          // For AOT, combine the Backend and Linker strings into one.
+          appendOption(BackendOptString, A);
+      }
+      if (!BackendOptString.empty())
+        CmdArgs.push_back(
+            Args.MakeArgString(Twine(WrapperOption) + BackendOptString));
+      if (!LinkOptString.empty())
+        CmdArgs.push_back(
+            Args.MakeArgString("--sycl-target-link-options=" + LinkOptString));
+    }
   }
 
   // Construct the link job so we can wrap around it.
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 8f3b47937c512..ba0a8d928c8fa 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1568,9 +1568,8 @@ bool tools::isDependentLibAdded(const ArgList &Args, StringRef Lib) {
   // Check if given Lib is added via --dependent-lib
   SmallString<64> DepLib("--dependent-lib=");
   DepLib += Lib;
-  return llvm::any_of(
-      Args.getAllArgValues(options::OPT_Xclang),
-      [&DepLib](StringRef Option) { return Option.equals(DepLib); });
+  return llvm::any_of(Args.getAllArgValues(options::OPT_Xclang),
+                      [&DepLib](StringRef Option) { return Option == DepLib; });
 }
 
 const char *tools::SplitDebugName(const JobAction &JA, const ArgList &Args,
diff --git a/clang/lib/Driver/ToolChains/SYCL.cpp b/clang/lib/Driver/ToolChains/SYCL.cpp
index edd1e7fec46c9..1db4500bd3b51 100644
--- a/clang/lib/Driver/ToolChains/SYCL.cpp
+++ b/clang/lib/Driver/ToolChains/SYCL.cpp
@@ -114,7 +114,7 @@ void SYCL::constructLLVMForeachCommand(Compilation &C, const JobAction &JA,
   // If fsycl-dump-device-code is passed, put the PTX files
   // into the path provided in fsycl-dump-device-code.
   if (T->getToolChain().getTriple().isNVPTX() &&
-      C.getDriver().isDumpDeviceCodeEnabled() && Ext.equals("s")) {
+      C.getDriver().isDumpDeviceCodeEnabled() && Ext == "s") {
     SmallString<128> OutputDir;
 
     Arg *DumpDeviceCodeArg =
@@ -235,12 +235,11 @@ SYCL::getDeviceLibraries(const Compilation &C, const llvm::Triple &TargetTriple,
       for (StringRef Val : A->getValues()) {
         if (Val == "all") {
           for (const auto &K : DeviceLibLinkInfo.keys())
-            DeviceLibLinkInfo[K] =
-                true && (!NoDeviceLibs || K.equals("internal"));
+            DeviceLibLinkInfo[K] = true && (!NoDeviceLibs || K == "internal");
           break;
         }
         auto LinkInfoIter = DeviceLibLinkInfo.find(Val);
-        if (LinkInfoIter == DeviceLibLinkInfo.end() || Val.equals("internal")) {
+        if (LinkInfoIter == DeviceLibLinkInfo.end() || Val == "internal") {
           // TODO: Move the diagnostic to the SYCL section of
           // Driver::CreateOffloadingDeviceToolChains() to minimize code
           // duplication.
@@ -488,7 +487,7 @@ const char *SYCL::Linker::constructLLVMLinkCommand(
       for (const auto &L : SYCLDeviceLibList) {
         std::string DeviceLibName(L);
         DeviceLibName.append(LibPostfix);
-        if (StringRef(PureLibName).equals(DeviceLibName) ||
+        if (StringRef(PureLibName) == DeviceLibName ||
             (IsNVPTX && StringRef(PureLibName).starts_with(L)))
           return true;
       }
@@ -899,7 +898,7 @@ static bool hasPVCDevice(const ArgStringList &CmdArgs) {
         DeviceArg = SplitArg;
         break;
       }
-      if (SplitArg.equals("-device"))
+      if (SplitArg == "-device")
         DeviceSeen = true;
     }
     if (DeviceSeen)
@@ -982,7 +981,8 @@ void SYCL::gen::BackendCompiler::ConstructJob(Compilation &C,
                           *HostTC, Device);
   TC.TranslateBackendTargetArgs(getToolChain().getTriple(), Args, CmdArgs,
                                 Device);
-  TC.TranslateLinkerTargetArgs(getToolChain().getTriple(), Args, CmdArgs);
+  TC.TranslateLinkerTargetArgs(getToolChain().getTriple(), Args, CmdArgs,
+                               Device);
   SmallString<128> ExecPath(
       getToolChain().GetProgramPath(makeExeName(C, "ocloc")));
   const char *Exec = C.getArgs().MakeArgString(ExecPath);
@@ -1049,6 +1049,7 @@ StringRef SYCL::gen::resolveGenDevice(StringRef DeviceName) {
           .Case("nvidia_gpu_sm_87", "sm_87")
           .Case("nvidia_gpu_sm_89", "sm_89")
           .Case("nvidia_gpu_sm_90", "sm_90")
+          .Case("nvidia_gpu_sm_90a", "sm_90a")
           .Case("amd_gpu_gfx700", "gfx700")
           .Case("amd_gpu_gfx701", "gfx701")
           .Case("amd_gpu_gfx702", "gfx702")
@@ -1135,6 +1136,7 @@ SmallString<64> SYCL::gen::getGenDeviceMacro(StringRef DeviceName) {
                       .Case("sm_87", "NVIDIA_GPU_SM_87")
                       .Case("sm_89", "NVIDIA_GPU_SM_89")
                       .Case("sm_90", "NVIDIA_GPU_SM_90")
+                      .Case("sm_90a", "NVIDIA_GPU_SM_90A")
                       .Case("gfx700", "AMD_GPU_GFX700")
                       .Case("gfx701", "AMD_GPU_GFX701")
                       .Case("gfx702", "AMD_GPU_GFX702")
@@ -1383,7 +1385,8 @@ static void WarnForDeprecatedBackendOpts(const Driver &D,
 
 // Expects a specific type of option (e.g. -Xsycl-target-backend) and will
 // extract the arguments.
-void SYCLToolChain::TranslateTargetOpt(const llvm::opt::ArgList &Args,
+void SYCLToolChain::TranslateTargetOpt(const llvm::Triple &Triple,
+                                       const llvm::opt::ArgList &Args,
                                        llvm::opt::ArgStringList &CmdArgs,
                                        OptSpecifier Opt, OptSpecifier Opt_EQ,
                                        StringRef Device) const {
@@ -1393,15 +1396,21 @@ void SYCLToolChain::TranslateTargetOpt(const llvm::opt::ArgList &Args,
     if (A->getOption().matches(Opt_EQ)) {
       // Passing device args: -X<Opt>=<triple> -opt=val.
       StringRef GenDevice = SYCL::gen::resolveGenDevice(A->getValue());
-      bool IsGenTriple =
-          getTriple().isSPIR() &&
-          getTriple().getSubArch() == llvm::Triple::SPIRSubArch_gen;
-      if (Device != GenDevice)
-        continue;
-      if (getDriver().MakeSYCLDeviceTriple(A->getValue()) != getTriple() &&
-          (!IsGenTriple || (IsGenTriple && GenDevice.empty())))
-        // Triples do not match, but only skip when we know we are not comparing
-        // against intel_gpu_* and non-spir64_gen
+      bool IsGenTriple = Triple.isSPIR() &&
+                         Triple.getSubArch() == llvm::Triple::SPIRSubArch_gen;
+      if (IsGenTriple) {
+        if (Device != GenDevice && !Device.empty())
+          continue;
+        if (getDriver().MakeSYCLDeviceTriple(A->getValue()) != Triple &&
+            GenDevice.empty())
+          // Triples do not match, but only skip when we know we are not
+          // comparing against intel_gpu_*
+          continue;
+        if (getDriver().MakeSYCLDeviceTriple(A->getValue()) == Triple &&
+            !Device.empty())
+          // Triples match, but we are expecting a specific device to be set.
+          continue;
+      } else if (getDriver().MakeSYCLDeviceTriple(A->getValue()) != Triple)
         continue;
     } else if (!OptNoTriple)
       // Don't worry about any of the other args, we only want to pass what is
@@ -1424,8 +1433,7 @@ void SYCLToolChain::TranslateTargetOpt(const llvm::opt::ArgList &Args,
     } else
       // Triple found, add the next argument in line.
       ArgString = A->getValue(1);
-    WarnForDeprecatedBackendOpts(getDriver(), getTriple(), Device, ArgString,
-                                 A);
+    WarnForDeprecatedBackendOpts(getDriver(), Triple, Device, ArgString, A);
     parseTargetOpts(ArgString, Args, CmdArgs);
     A->claim();
   }
@@ -1468,8 +1476,8 @@ void SYCLToolChain::AddImpliedTargetArgs(const llvm::Triple &Triple,
     auto ProcessElement = [&](StringRef Ele) {
       auto [DeviceName, RegAllocMode] = Ele.split(':');
       StringRef BackendOptName = SYCL::gen::getGenGRFFlag(RegAllocMode);
-      bool IsDefault = RegAllocMode.equals("default");
-      if (RegAllocMode.empty() || !DeviceName.equals("pvc") ||
+      bool IsDefault = RegAllocMode == "default";
+      if (RegAllocMode.empty() || DeviceName != "pvc" ||
           (BackendOptName.empty() && !IsDefault)) {
         getDriver().Diag(diag::err_drv_unsupported_option_argument)
             << A->getSpelling() << Ele;
@@ -1517,7 +1525,7 @@ void SYCLToolChain::AddImpliedTargetArgs(const llvm::Triple &Triple,
   if (Args.hasArg(options::OPT_fintelfpga) && getDriver().IsFPGAHWMode() &&
       Triple.getSubArch() == llvm::Triple::SPIRSubArch_fpga) {
     if (Arg *A = Args.getLastArg(options::OPT_ffp_model_EQ)) {
-      if (StringRef(A->getValue()).equals("fast"))
+      if (StringRef(A->getValue()) == "fast")
         BeArgs.push_back("-vpfp-relaxed");
     }
   }
@@ -1628,21 +1636,22 @@ void SYCLToolChain::TranslateBackendTargetArgs(
       Triple.isSPIROrSPIRV() && getDriver().isSYCLDefaultTripleImplied())
     return;
   // Handle -Xsycl-target-backend.
-  TranslateTargetOpt(Args, CmdArgs, options::OPT_Xsycl_backend,
+  TranslateTargetOpt(Triple, Args, CmdArgs, options::OPT_Xsycl_backend,
                      options::OPT_Xsycl_backend_EQ, Device);
   TranslateGPUTargetOpt(Args, CmdArgs, options::OPT_fsycl_targets_EQ);
 }
 
-void SYCLToolChain::TranslateLinkerTargetArgs(
-    const llvm::Triple &Triple, const llvm::opt::ArgList &Args,
-    llvm::opt::ArgStringList &CmdArgs) const {
+void SYCLToolChain::TranslateLinkerTargetArgs(const llvm::Triple &Triple,
+                                              const llvm::opt::ArgList &Args,
+                                              llvm::opt::ArgStringList &CmdArgs,
+                                              StringRef Device) const {
   // Do not process -Xsycl-target-linker for implied spir64/spirv64
   if (Triple.getSubArch() == llvm::Triple::NoSubArch &&
       Triple.isSPIROrSPIRV() && getDriver().isSYCLDefaultTripleImplied())
     return;
   // Handle -Xsycl-target-linker.
-  TranslateTargetOpt(Args, CmdArgs, options::OPT_Xsycl_linker,
-                     options::OPT_Xsycl_linker_EQ, StringRef());
+  TranslateTargetOpt(Triple, Args, CmdArgs, options::OPT_Xsycl_linker,
+                     options::OPT_Xsycl_linker_EQ, Device);
 }
 
 Tool *SYCLToolChain::buildBackendCompiler() const {
diff --git a/clang/lib/Driver/ToolChains/SYCL.h b/clang/lib/Driver/ToolChains/SYCL.h
index 4fe8dee807f49..3a62de18cd07d 100644
--- a/clang/lib/Driver/ToolChains/SYCL.h
+++ b/clang/lib/Driver/ToolChains/SYCL.h
@@ -181,8 +181,10 @@ class LLVM_LIBRARY_VISIBILITY SYCLToolChain : public ToolChain {
                                   StringRef Device = "") const;
   void TranslateLinkerTargetArgs(const llvm::Triple &Triple,
                                  const llvm::opt::ArgList &Args,
-                                 llvm::opt::ArgStringList &CmdArgs) const;
-  void TranslateTargetOpt(const llvm::opt::ArgList &Args,
+                                 llvm::opt::ArgStringList &CmdArgs,
+                                 StringRef Device = "") const;
+  void TranslateTargetOpt(const llvm::Triple &Triple,
+                          const llvm::opt::ArgList &Args,
                           llvm::opt::ArgStringList &CmdArgs,
                           llvm::opt::OptSpecifier Opt,
                           llvm::opt::OptSpecifier Opt_EQ,
diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp
index 6bdd566bb63ed..cd0c41031b551 100644
--- a/clang/lib/Frontend/CompilerInvocation.cpp
+++ b/clang/lib/Frontend/CompilerInvocation.cpp
@@ -3858,7 +3858,7 @@ void CompilerInvocation::ParseFpAccuracyArgs(LangOptions &Opts, ArgList &Args,
             checkFPAccuracyIsValid(ValElement[0], Diags);
             // No need to fill the map if the FPaccuracy is 'default'.
             // The default builtin will be generated.
-            if (!ValElement[0].equals("default")) {
+            if (ValElement[0] != "default") {
               // if FPAccuracyFuncMap of this function has been previously set
               // update its value; the last fp-accuracy option in the command
               // line wins.
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 9f6c4b9a71f89..03e60c787f3f1 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -3865,10 +3865,9 @@ bool Sema::CheckIntelSYCLAllocaBuiltinFunctionCall(unsigned BuiltinID,
   }
 
   // Check size is passed as a specialization constant
-  const auto CheckSize = [this, IsAlignedAlloca, ElementTypeIndex,
-                          SpecNameIndex](const ASTContext &Ctx,
-                                         SourceLocation Loc,
-                                         const TemplateArgumentList *CST) {
+  const auto CheckSize = [this, IsAlignedAlloca, SpecNameIndex](
+                             const ASTContext &Ctx, SourceLocation Loc,
+                             const TemplateArgumentList *CST) {
     TemplateArgument TA = CST->get(SpecNameIndex);
     QualType Ty = TA.getNonTypeTemplateArgumentType();
     if (Ty.isNull() || !Ty->isReferenceType())
diff --git a/clang/test/CodeGenSYCL/force-emit-device-virtual-funcs.cpp b/clang/test/CodeGenSYCL/force-emit-device-virtual-funcs.cpp
new file mode 100644
index 0000000000000..fb34cae42d9ae
--- /dev/null
+++ b/clang/test/CodeGenSYCL/force-emit-device-virtual-funcs.cpp
@@ -0,0 +1,47 @@
+// RUN: %clang_cc1 -internal-isystem %S/Inputs -triple spir64-unknown-unknown -fsycl-is-device \
+// RUN:     -fsycl-allow-virtual-functions -emit-llvm %s -o %t.ll
+// RUN: FileCheck %s --input-file=%t.ll --implicit-check-not _ZN7Derived3baz \
+// RUN:     --implicit-check-not _ZN4Base4baz --implicit-check-not _ZN4Base3foo
+//
+// Some SYCL properties may be turned into 'sycl_device' attribute implicitly
+// and we would like to ensure that functions like this (at the moment those
+// would be virtual member functions only) are forcefully emitted into device
+// code.
+
+class Base {
+  virtual void foo() {}
+
+  virtual void baz();
+
+  [[__sycl_detail__::add_ir_attributes_function("indirectly-callable", "a")]]
+  virtual void bar();
+};
+
+void Base::bar() {}
+
+void Base::baz() {}
+
+class Derived : public Base {
+public:
+  [[__sycl_detail__::add_ir_attributes_function("indirectly-callable", "b")]]
+  void foo() override;
+
+  [[__sycl_detail__::add_ir_attributes_function("indirectly-callable", "c")]]
+  void bar() override final;
+
+  [[__sycl_detail__::add_ir_attributes_function("not-indirectly-callable", "c")]]
+  void baz() override final;
+};
+
+void Derived::foo() {}
+
+void Derived::bar() {}
+
+void Derived::baz() {}
+
+// CHECK: define {{.*}}spir_func void @_ZN4Base3bar{{.*}} #[[#AttrA:]]
+// CHECK: define {{.*}}spir_func void @_ZN7Derived3foo{{.*}} #[[#AttrB:]]
+// CHECK: define {{.*}}spir_func void @_ZN7Derived3bar{{.*}} #[[#AttrC:]]
+// CHECK: attributes #[[#AttrA]] = {{.*}} "indirectly-callable"="a"
+// CHECK: attributes #[[#AttrB]] = {{.*}} "indirectly-callable"="b"
+// CHECK: attributes #[[#AttrC]] = {{.*}} "indirectly-callable"="c"
diff --git a/clang/test/Driver/linker-wrapper-sycl-win.cpp b/clang/test/Driver/linker-wrapper-sycl-win.cpp
index 1854dee476641..2ef253a019f34 100644
--- a/clang/test/Driver/linker-wrapper-sycl-win.cpp
+++ b/clang/test/Driver/linker-wrapper-sycl-win.cpp
@@ -1,11 +1,11 @@
 // REQUIRES: system-windows
 
 /// Check for list of commands for standalone clang-linker-wrapper run for sycl
-// RUN: clang-linker-wrapper -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.obj,libsycl-complex.new.obj -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-pc-windows-msvc" "--triple=spir64" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %S/Inputs/test-sycl.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS %s
+// RUN: clang-linker-wrapper -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.obj,libsycl-complex.new.obj -sycl-post-link-options="SYCL_POST_LINK_OPTIONS" -llvm-spirv-options="LLVM_SPIRV_OPTIONS" "--host-triple=x86_64-pc-windows-msvc" "--linker-path=/usr/bin/ld" "--" HOST_LINKER_FLAGS "-dynamic-linker" HOST_DYN_LIB "-o" "a.out" HOST_LIB_PATH HOST_STAT_LIB %S/Inputs/test-sycl.o --dry-run 2>&1 | FileCheck -check-prefix=CHK-CMDS %s
 // CHK-CMDS: "{{.*}}spirv-to-ir-wrapper.exe" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global
 // CHK-CMDS-NEXT: "{{.*}}llvm-link.exe" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings
 // CHK-CMDS-NEXT: "{{.*}}llvm-link.exe" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings
-// CHK-CMDS-NEXT: "{{.*}}sycl-post-link.exe" SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc
+// CHK-CMDS-NEXT: "{{.*}}sycl-post-link.exe"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc
 // LLVM-SPIRV is not called in dry-run
 // CHK-CMDS-NEXT: offload-wrapper: input: [[LLVMSPIRVOUT:.*]].table, output: [[WRAPPEROUT:.*]].bc
 // CHK-CMDS-NEXT: "{{.*}}llc.exe" -filetype=obj -o [[LLCOUT:.*]].o [[WRAPPEROUT]].bc
diff --git a/clang/test/Driver/linker-wrapper-sycl.cpp b/clang/test/Driver/linker-wrapper-sycl.cpp
index 0ba8b7414d69c..19bde37eb8be6 100644
--- a/clang/test/Driver/linker-wrapper-sycl.cpp
+++ b/clang/test/Driver/linker-wrapper-sycl.cpp
@@ -5,7 +5,7 @@
 // CHK-CMDS: "{{.*}}spirv-to-ir-wrapper" {{.*}} -o [[FIRSTLLVMLINKIN:.*]].bc --llvm-spirv-opts=--spirv-preserve-auxdata --llvm-spirv-opts=--spirv-target-env=SPV-IR --llvm-spirv-opts=--spirv-builtin-format=global
 // CHK-CMDS-NEXT: "{{.*}}llvm-link" [[FIRSTLLVMLINKIN:.*]].bc -o [[FIRSTLLVMLINKOUT:.*]].bc --suppress-warnings
 // CHK-CMDS-NEXT: "{{.*}}llvm-link" -only-needed [[FIRSTLLVMLINKOUT]].bc {{.*}}.bc {{.*}}.bc -o [[SECONDLLVMLINKOUT:.*]].bc --suppress-warnings
-// CHK-CMDS-NEXT: "{{.*}}sycl-post-link" SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc
+// CHK-CMDS-NEXT: "{{.*}}sycl-post-link"{{.*}} SYCL_POST_LINK_OPTIONS -o [[SYCLPOSTLINKOUT:.*]].table [[SECONDLLVMLINKOUT]].bc
 // LLVM-SPIRV is not called in dry-run
 // CHK-CMDS-NEXT: offload-wrapper: input: [[LLVMSPIRVOUT:.*]].table, output: [[WRAPPEROUT:.*]].bc
 // CHK-CMDS-NEXT: "{{.*}}llc" -filetype=obj -o [[LLCOUT:.*]].o [[WRAPPEROUT]].bc
diff --git a/clang/test/Driver/sycl-device-lib.cpp b/clang/test/Driver/sycl-device-lib.cpp
index d478c022a7e5d..df90b29872208 100644
--- a/clang/test/Driver/sycl-device-lib.cpp
+++ b/clang/test/Driver/sycl-device-lib.cpp
@@ -185,7 +185,7 @@
 // RUN:   | FileCheck %s -check-prefix=SYCL_LLVM_LINK_NO_DEVICE_LIB
 // SYCL_LLVM_LINK_NO_DEVICE_LIB: clang{{.*}} "-cc1" {{.*}} "-fsycl-is-device"
 // SYCL_LLVM_LINK_NO_DEVICE_LIB-NOT: llvm-link{{.*}}  "-only-needed"
-// SYCL_LLVM_LINK_NO_DEVICE_LIB: sycl-post-link{{.*}}  "-symbols" "-emit-exported-symbols"{{.*}} "-o" "{{.*}}.table" "{{.*}}.bc"
+// SYCL_LLVM_LINK_NO_DEVICE_LIB: sycl-post-link{{.*}}  "-symbols" "-emit-exported-symbols" "-emit-imported-symbols"{{.*}} "-o" "{{.*}}.table" "{{.*}}.bc"
 
 /// ###########################################################################
 /// test llvm-link behavior for special user input whose filename resembles SYCL device library
diff --git a/clang/test/Driver/sycl-device-sanitizer.cpp b/clang/test/Driver/sycl-device-sanitizer.cpp
index cba93ba3a9f68..5ab3c4265b21b 100644
--- a/clang/test/Driver/sycl-device-sanitizer.cpp
+++ b/clang/test/Driver/sycl-device-sanitizer.cpp
@@ -8,6 +8,7 @@
 // SYCL-ASAN-SAME: "-mllvm" "-asan-constructor-kind=none"
 // SYCL-ASAN-SAME: "-mllvm" "-asan-stack=0"
 // SYCL-ASAN-SAME: "-mllvm" "-asan-globals=0"
+// SYCL-ASAN-SAME: "-mllvm" "-asan-mapping-scale=4"
 
 // RUN: %clangxx -fsycl -fsanitize=address -mllvm -asan-stack=1 -c %s -### 2>&1 \
 // RUN:   | FileCheck --check-prefix=SYCL-ASAN-FILTER %s
diff --git a/clang/test/Driver/sycl-device-traits-macros-nvptx.cpp b/clang/test/Driver/sycl-device-traits-macros-nvptx.cpp
index f3fb0c536ccbd..75dd5b99c7aef 100644
--- a/clang/test/Driver/sycl-device-traits-macros-nvptx.cpp
+++ b/clang/test/Driver/sycl-device-traits-macros-nvptx.cpp
@@ -37,6 +37,8 @@
 // RUN:   FileCheck %s --check-prefix=CHECK-SYCL-NVPTX-NVIDIA-CUDA-DEVICE-TRIPLE
 // RUN: %clangxx -fsycl -nocudalib -fsycl-targets=nvidia_gpu_sm_90 -### %s 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CHECK-SYCL-NVPTX-NVIDIA-CUDA-DEVICE-TRIPLE
+// RUN: %clangxx -fsycl -nocudalib -fsycl-targets=nvidia_gpu_sm_90a -### %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CHECK-SYCL-NVPTX-NVIDIA-CUDA-DEVICE-TRIPLE
 
 // Compiling for a CUDA target passing the device arch to '--offload-arch' (using the '--cuda-gpu-arch' alias).
 //
@@ -68,6 +70,8 @@
 // RUN:   FileCheck %s --check-prefix=CHECK-SYCL-NVPTX-NVIDIA-CUDA-OFFLOAD-ARCH
 // RUN: %clangxx -fsycl -nocudalib -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_90 -### %s 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=CHECK-SYCL-NVPTX-NVIDIA-CUDA-OFFLOAD-ARCH
+// RUN: %clangxx -fsycl -nocudalib -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend --cuda-gpu-arch=sm_90a -### %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefix=CHECK-SYCL-NVPTX-NVIDIA-CUDA-OFFLOAD-ARCH
 
 // Check device traits macros are defined if sycl is enabled:
 
@@ -92,4 +96,4 @@
 // CHECK-SM60: "-D__SYCL_TARGET_NVIDIA_GPU_SM_60__"{{.*}} "-D__SYCL_ALL_DEVICES_HAVE_[[ASPECT]]__=1"
 // CHECK-SM70: "-D__SYCL_TARGET_NVIDIA_GPU_SM_70__"{{.*}} "-D__SYCL_ALL_DEVICES_HAVE_[[ASPECT]]__=1"
 // CHECK-SM80: "-D__SYCL_TARGET_NVIDIA_GPU_SM_80__"{{.*}} "-D__SYCL_ALL_DEVICES_HAVE_[[ASPECT]]__=1"
-// CHECK-SM90: "-D__SYCL_TARGET_NVIDIA_GPU_SM_80__"{{.*}} "-D__SYCL_ALL_DEVICES_HAVE_[[ASPECT]]__=1"
+// CHECK-SM90: "-D__SYCL_TARGET_NVIDIA_GPU_SM_90__"{{.*}} "-D__SYCL_ALL_DEVICES_HAVE_[[ASPECT]]__=1"
diff --git a/clang/test/Driver/sycl-fno-libspirv-warn.cpp b/clang/test/Driver/sycl-fno-libspirv-warn.cpp
index 842d97153e549..902576f596f3b 100644
--- a/clang/test/Driver/sycl-fno-libspirv-warn.cpp
+++ b/clang/test/Driver/sycl-fno-libspirv-warn.cpp
@@ -1,7 +1,7 @@
 /// Test that appropriate warnings are output when -fno-sycl-libspirv is used.
 
 // RUN: not %clangxx -fsycl -fsycl-targets=nvptx64-nvidia-cuda,amdgcn-amd-amdhsa -fno-sycl-libspirv %s -### 2>&1 | FileCheck %s
-// CHECK: warning: '-fno-sycl-libspirv' should not be used with target 'nvptx64-nvidia-cuda'; libspirv is required for correct behavior [-Wno-libspirv-hip-cuda]
-// CHECK: warning: '-fno-sycl-libspirv' should not be used with target 'amdgcn-amd-amdhsa'; libspirv is required for correct behavior [-Wno-libspirv-hip-cuda]
+// CHECK-DAG: warning: '-fno-sycl-libspirv' should not be used with target 'nvptx64-nvidia-cuda'; libspirv is required for correct behavior [-Wno-libspirv-hip-cuda]
+// CHECK-DAG: warning: '-fno-sycl-libspirv' should not be used with target 'amdgcn-amd-amdhsa'; libspirv is required for correct behavior [-Wno-libspirv-hip-cuda]
 // RUN: %clangxx -fsycl -fsycl-targets=spir64-unknown-unknown -fno-sycl-libspirv %s -### 2>&1 | FileCheck --check-prefix=CHECK-SPIR64 %s
 // CHECK-SPIR64: ignoring '-fno-sycl-libspirv' option as it is not currently supported for target 'spir64-unknown-unknown' [-Woption-ignored]
diff --git a/clang/test/Driver/sycl-intelfpga-aoco-win.cpp b/clang/test/Driver/sycl-intelfpga-aoco-win.cpp
index bebbda92ac0f3..5cba6ff20a2ca 100755
--- a/clang/test/Driver/sycl-intelfpga-aoco-win.cpp
+++ b/clang/test/Driver/sycl-intelfpga-aoco-win.cpp
@@ -50,7 +50,7 @@
 // CHK-FPGA-AOCO: spirv-to-ir-wrapper{{.*}} "[[LIBLIST]]" "-o" "[[LIBLIST2:.+\.txt]]"
 // CHK-FPGA-AOCO: llvm-link{{.*}} "-o" "[[LINKEDBC:.+\.bc]]"
 // CHK-FPGA-AOCO: llvm-link{{.*}} "--only-needed" "[[LINKEDBC]]" "@[[LIBLIST2]]" "-o" "[[LINKEDBC2:.+\.bc]]"
-// CHK-FPGA-AOCO: sycl-post-link{{.*}} "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[SPLTABLE:.+\.table]]" "[[LINKEDBC2]]"
+// CHK-FPGA-AOCO: sycl-post-link{{.*}} "-device-globals"{{.*}} "-spec-const=emulation"{{.*}} "-o" "[[SPLTABLE:.+\.table]]" "[[LINKEDBC2]]"
 // CHK-FPGA-AOCO: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[SPLTABLE]]"
 // CHK-FPGA-AOCO: llvm-spirv{{.*}} "-o" "[[TARGSPV:.+\.txt]]" {{.*}} "[[TABLEOUT]]"
 // CHK-FPGA-AOCO: clang-offload-bundler{{.*}} "-type=aoo" "-targets=sycl-fpga_aoco-intel-unknown" "-input=[[INPUTLIB]]" "-output=[[AOCOLIST:.+\.txt]]" "-unbundle"
diff --git a/clang/test/Driver/sycl-intelfpga-aoco.cpp b/clang/test/Driver/sycl-intelfpga-aoco.cpp
index 8220580455b87..20839e0c08370 100755
--- a/clang/test/Driver/sycl-intelfpga-aoco.cpp
+++ b/clang/test/Driver/sycl-intelfpga-aoco.cpp
@@ -54,7 +54,7 @@
 // CHK-FPGA-AOCO: spirv-to-ir-wrapper{{.*}} "[[LIBLIST]]" "-o" "[[LIBLIST2:.+\.txt]]"
 // CHK-FPGA-AOCO: llvm-link{{.*}} "-o" "[[LINKEDBC:.+\.bc]]"
 // CHK-FPGA-AOCO: llvm-link{{.*}} "--only-needed" "[[LINKEDBC]]" "@[[LIBLIST2]]" "-o" "[[LINKEDBC2:.+\.bc]]"
-// CHK-FPGA-AOCO: sycl-post-link{{.*}} "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[SPLTABLE:.+\.table]]" "[[LINKEDBC2]]"
+// CHK-FPGA-AOCO: sycl-post-link{{.*}} "-device-globals"{{.*}} "-spec-const=emulation"{{.*}} "-o" "[[SPLTABLE:.+\.table]]" "[[LINKEDBC2]]"
 // CHK-FPGA-AOCO: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[SPLTABLE]]"
 // CHK-FPGA-AOCO: llvm-spirv{{.*}} "-o" "[[TARGSPV:.+\.txt]]" {{.*}} "[[TABLEOUT]]"
 // CHK-FPGA-AOCO: clang-offload-bundler{{.*}} "-type=aoo" "-targets=sycl-fpga_aoco-intel-unknown" "-input=[[INPUTLIB]]" "-output=[[AOCOLIST:.+\.txt]]" "-unbundle"
@@ -105,7 +105,7 @@
 // CHK-FPGA-AOCO-EMU: clang-offload-bundler{{.*}} "-type=aoo" "-targets=sycl-spir64_fpga-unknown-unknown" "-input=[[INPUTLIB:.+\.a]]" "-output=[[OUTLIB:.+\.txt]]" "-unbundle"
 // CHK-FPGA-AOCO-EMU: llvm-foreach{{.*}} "--out-ext=txt" "--in-file-list=[[OUTLIB]]" "--in-replace=[[OUTLIB]]" "--out-file-list=[[DEVICELIST:.+\.txt]]" "--out-replace=[[DEVICELIST]]" "--" {{.*}}spirv-to-ir-wrapper{{.*}} "[[OUTLIB]]" "-o" "[[DEVICELIST]]"
 // CHK-FPGA-AOCO-EMU: llvm-link{{.*}} "@[[DEVICELIST]]" "-o" "[[LINKEDBC:.+\.bc]]"
-// CHK-FPGA-AOCO-EMU: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[SPLTABLE:.+\.table]]" "[[LINKEDBC]]"
+// CHK-FPGA-AOCO-EMU: sycl-post-link{{.*}} "-O2" "-device-globals"{{.*}} "-spec-const=emulation"{{.*}} "-o" "[[SPLTABLE:.+\.table]]" "[[LINKEDBC]]"
 // CHK-FPGA-AOCO-EMU: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[SPLTABLE]]"
 // CHK-FPGA-AOCO-EMU: llvm-spirv{{.*}} "-o" "[[TARGSPV:.+\.txt]]" {{.*}} "[[TABLEOUT]]"
 // CHK-FPGA-AOCO-EMU: opencl-aot{{.*}} "-device=fpga_fast_emu" "-spv=[[TARGSPV]]" "-ir=[[AOCXOUT:.+\.aocx]]"
diff --git a/clang/test/Driver/sycl-linker-wrapper-image.cpp b/clang/test/Driver/sycl-linker-wrapper-image.cpp
index 37e976a6e39c5..6c9c0329e438b 100644
--- a/clang/test/Driver/sycl-linker-wrapper-image.cpp
+++ b/clang/test/Driver/sycl-linker-wrapper-image.cpp
@@ -4,7 +4,7 @@
 // RUN: %clang -cc1 -fsycl-is-device -disable-llvm-passes -triple=spir64-unknown-unknown %s -emit-llvm-bc -o %t.device.bc
 // RUN: clang-offload-packager -o %t.fat --image=file=%t.device.bc,kind=sycl,triple=spir64-unknown-unknown
 // RUN: %clang -cc1 %s -triple=x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.fat
-// RUN: clang-linker-wrapper --print-wrapped-module --host-triple=x86_64-unknown-linux-gnu --triple=spir64 \
+// RUN: clang-linker-wrapper --print-wrapped-module --host-triple=x86_64-unknown-linux-gnu \
 // RUN:                      -sycl-device-library-location=%S/Inputs -sycl-post-link-options="-split=auto -symbols" \
 // RUN:                      %t.o -o %t.out 2>&1 --linker-path="/usr/bin/ld" | FileCheck %s
 
@@ -41,13 +41,14 @@ int main() {
 // CHECK-DAG: @prop_val = internal unnamed_addr constant [8 x i8] zeroinitializer
 // CHECK-DAG: @__sycl_offload_prop_sets_arr.2 = internal constant [1 x %_pi_device_binary_property_struct] [%_pi_device_binary_property_struct { ptr @prop.1, ptr @prop_val, i32 2, i64 8 }]
 // CHECK-DAG: @SYCL_PropSetName.3 = internal unnamed_addr constant [25 x i8] c"SYCL/device requirements\00"
-// CHECK-DAG: @__sycl_offload_prop_sets_arr.4 = internal constant [2 x %_pi_device_binary_property_set_struct] [%_pi_device_binary_property_set_struct { ptr @SYCL_PropSetName, ptr @__sycl_offload_prop_sets_arr, ptr getelementptr inbounds ([1 x %_pi_device_binary_property_struct], ptr @__sycl_offload_prop_sets_arr, i64 1, i64 0) }, %_pi_device_binary_property_set_struct { ptr @SYCL_PropSetName.3, ptr @__sycl_offload_prop_sets_arr.2, ptr getelementptr inbounds ([1 x %_pi_device_binary_property_struct], ptr @__sycl_offload_prop_sets_arr.2, i64 1, i64 0) }]
+// CHECK-DAG: @SYCL_PropSetName.4 = internal unnamed_addr constant [22 x i8] c"SYCL/kernel param opt\00"
+// CHECK-DAG: @__sycl_offload_prop_sets_arr.5 = internal constant [3 x %_pi_device_binary_property_set_struct] [%_pi_device_binary_property_set_struct { ptr @SYCL_PropSetName, ptr @__sycl_offload_prop_sets_arr, ptr getelementptr inbounds ([1 x %_pi_device_binary_property_struct], ptr @__sycl_offload_prop_sets_arr, i64 1, i64 0) }, %_pi_device_binary_property_set_struct { ptr @SYCL_PropSetName.3, ptr @__sycl_offload_prop_sets_arr.2, ptr getelementptr inbounds ([1 x %_pi_device_binary_property_struct], ptr @__sycl_offload_prop_sets_arr.2, i64 1, i64 0) }, %_pi_device_binary_property_set_struct { ptr @SYCL_PropSetName.4, ptr null, ptr null }]
 // CHECK-DAG: @.sycl_offloading.0.data = internal unnamed_addr constant [740 x i8] 
 // CHECK-DAG: @__sycl_offload_entry_name = internal unnamed_addr constant [25 x i8] c"_ZTSZ4mainE11fake_kernel\00"
 // CHECK-DAG: @__sycl_offload_entries_arr = internal constant [1 x %struct.__tgt_offload_entry] [%struct.__tgt_offload_entry { ptr null, ptr @__sycl_offload_entry_name, i64 0, i32 0, i32 0 }]
 // CHECK-DAG: @.sycl_offloading.0.info = internal local_unnamed_addr constant [2 x i64] [i64 ptrtoint (ptr @.sycl_offloading.0.data to i64), i64 740], section ".tgtimg", align 16
 // CHECK-DAG: @llvm.used = appending global [1 x ptr] [ptr @.sycl_offloading.0.info], section "llvm.metadata"
-// CHECK-DAG: @.sycl_offloading.device_images = internal unnamed_addr constant [1 x %__sycl.tgt_device_image] [%__sycl.tgt_device_image { i16 2, i8 4, i8 0, ptr @.sycl_offloading.target.0, ptr @.sycl_offloading.opts.compile.0, ptr @.sycl_offloading.opts.link.0, ptr null, ptr null, ptr @.sycl_offloading.0.data, ptr getelementptr inbounds ([740 x i8], ptr @.sycl_offloading.0.data, i64 1, i64 0), ptr @__sycl_offload_entries_arr, ptr getelementptr inbounds ([1 x %struct.__tgt_offload_entry], ptr @__sycl_offload_entries_arr, i64 1, i64 0), ptr @__sycl_offload_prop_sets_arr.4, ptr getelementptr inbounds ([2 x %_pi_device_binary_property_set_struct], ptr @__sycl_offload_prop_sets_arr.4, i64 1, i64 0) }]
+// CHECK-DAG: @.sycl_offloading.device_images = internal unnamed_addr constant [1 x %__sycl.tgt_device_image] [%__sycl.tgt_device_image { i16 2, i8 4, i8 0, ptr @.sycl_offloading.target.0, ptr @.sycl_offloading.opts.compile.0, ptr @.sycl_offloading.opts.link.0, ptr null, ptr null, ptr @.sycl_offloading.0.data, ptr getelementptr inbounds ([740 x i8], ptr @.sycl_offloading.0.data, i64 1, i64 0), ptr @__sycl_offload_entries_arr, ptr getelementptr inbounds ([1 x %struct.__tgt_offload_entry], ptr @__sycl_offload_entries_arr, i64 1, i64 0), ptr @__sycl_offload_prop_sets_arr.5, ptr getelementptr inbounds ([3 x %_pi_device_binary_property_set_struct], ptr @__sycl_offload_prop_sets_arr.5, i64 1, i64 0) }]
 // CHECK-DAG: @.sycl_offloading.descriptor = internal constant %__sycl.tgt_bin_desc { i16 1, i16 1, ptr @.sycl_offloading.device_images, ptr null, ptr null }
 // CHECK-DAG: @llvm.global_ctors = {{.*}} { i32 1, ptr @sycl.descriptor_reg, ptr null }]
 // CHECK-DAG: @llvm.global_dtors = {{.*}} { i32 1, ptr @sycl.descriptor_unreg, ptr null }]
diff --git a/clang/test/Driver/sycl-lto.cpp b/clang/test/Driver/sycl-lto.cpp
new file mode 100644
index 0000000000000..b2b68fa5a3583
--- /dev/null
+++ b/clang/test/Driver/sycl-lto.cpp
@@ -0,0 +1,13 @@
+// Verify the usage of -foffload-lto with SYCL.
+
+// Verify we error when using the old offload driver.
+// RUN: not %clangxx -fsycl -foffload-lto=thin %s -### 2>&1 | FileCheck -check-prefix=CHECK_ERROR %s
+// CHECK_ERROR: unsupported option '-foffload-lto=thin' for target 'spir64-unknown-unknown'
+
+// Verify we error when using the new offload driver but with device code split set to off.
+// RUN: not %clangxx -fsycl --offload-new-driver -foffload-lto=thin -fsycl-device-code-split=off %s -### 2>&1 | FileCheck -check-prefix=CHECK_SPLIT_ERROR %s
+// CHECK_SPLIT_ERROR: '-fsycl-device-code-split=off' is not supported when '-foffload-lto=thin' is set with '-fsycl'
+
+// Verify there's no error and we see the expected cc1 flags with the new offload driver.
+// RUN: %clangxx -fsycl --offload-new-driver -foffload-lto=thin %s -### 2>&1 | FileCheck -check-prefix=CHECK_SUPPORTED %s
+// CHECK_SUPPORTED: clang{{.*}} "-cc1" "-triple" "spir64-unknown-unknown" {{.*}} "-flto=thin" "-flto-unit"
diff --git a/clang/test/Driver/sycl-offload-aot.cpp b/clang/test/Driver/sycl-offload-aot.cpp
index 4e9c615423ecb..295c4dd7eb137 100644
--- a/clang/test/Driver/sycl-offload-aot.cpp
+++ b/clang/test/Driver/sycl-offload-aot.cpp
@@ -298,3 +298,12 @@
 // RUN: %clang -fsycl -### -fsycl-targets=spir64_fpga -Xshardware -Xsycl-target-backend "-DBLAH" %s 2>&1 \
 // RUN:  | FileCheck -check-prefix=DUP-OPT %s
 // DUP-OPT-NOT: aoc{{.*}} "-DBLAH" {{.*}} "-DBLAH"
+
+/// Output files from ocloc should have an extension.
+// RUN:  %clangxx --target=x86_64-unknown-linux-gnu -fsycl \
+// RUN:           -fsycl-targets=intel_gpu_skl %s -### 2>&1 \
+// RUN:    | FileCheck -check-prefix=OCLOC_OUTPUT %s
+// RUN:  %clangxx --target=x86_64-unknown-linux-gnu -fsycl -save-temps \
+// RUN:           -fsycl-targets=intel_gpu_skl %s -### 2>&1 \
+// RUN:    | FileCheck -check-prefix=OCLOC_OUTPUT %s
+// OCLOC_OUTPUT: ocloc{{.*}} "-output" "{{.*}}.out"
diff --git a/clang/test/Driver/sycl-offload-intelfpga-emu.cpp b/clang/test/Driver/sycl-offload-intelfpga-emu.cpp
index 4a84d6437ae53..2a01cf719801e 100644
--- a/clang/test/Driver/sycl-offload-intelfpga-emu.cpp
+++ b/clang/test/Driver/sycl-offload-intelfpga-emu.cpp
@@ -16,7 +16,7 @@
 // CHK-FPGA-LINK: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_fpga-unknown-unknown" "-input=[[INPUT:.+\.o]]" "-output=[[OUTPUT1:.+\.o]]" "-unbundle"
 // CHK-FPGA-LINK: spirv-to-ir-wrapper{{.*}} "[[OUTPUT1]]" "-o" "[[IROUTPUT1:.+\.bc]]"
 // CHK-FPGA-LINK: llvm-link{{.*}} "[[IROUTPUT1]]" "-o" "[[OUTPUT2_1:.+\.bc]]"
-// CHK-FPGA-LINK: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" {{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]"
+// CHK-FPGA-LINK: sycl-post-link{{.*}} "-O2"{{.*}} "-spec-const=emulation" {{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]"
 // CHK-FPGA-LINK: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[OUTPUT2]]"
 // CHK-FPGA-LINK: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.txt]]" "-spirv-max-version={{.*}}"{{.*}} "[[TABLEOUT]]"
 // CHK-FPGA-EARLY: opencl-aot{{.*}} "-device=fpga_fast_emu" "-spv=[[OUTPUT3]]" "-ir=[[OUTPUT4:.+\.aocr]]" "--bo=-g"
@@ -41,7 +41,7 @@
 // CHK-FPGA-LINK-WIN: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-spir64_fpga-unknown-unknown{{.*}}" "-input=[[INPUT:.+\.obj]]" "-output=[[OUTPUT1:.+\.obj]]" "-unbundle"
 // CHK-FPGA-LINK-WIN: spirv-to-ir-wrapper{{.*}} "[[OUTPUT1]]" "-o" "[[IROUTPUT1:.+\.bc]]"
 // CHK-FPGA-LINK-WIN: llvm-link{{.*}} "[[IROUTPUT1]]" "-o" "[[OUTPUT2_1:.+\.bc]]"
-// CHK-FPGA-LINK-WIN: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" {{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]"
+// CHK-FPGA-LINK-WIN: sycl-post-link{{.*}} "-O2"{{.*}} "-spec-const=emulation" {{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]"
 // CHK-FPGA-LINK-WIN: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[OUTPUT2]]"
 // CHK-FPGA-LINK-WIN: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.txt]]" "-spirv-max-version={{.*}}"{{.*}} "[[TABLEOUT]]"
 // CHK-FPGA-LINK-WIN: opencl-aot{{.*}} "-device=fpga_fast_emu" "-spv=[[OUTPUT3]]" "-ir=[[OUTPUT4:.+\.aocr]]" "--bo=-g"
@@ -115,7 +115,7 @@
 // CHK-FPGA: clang-offload-bundler{{.*}} "-type=o" "-targets=host-x86_64-unknown-linux-gnu,sycl-spir64_fpga-unknown-unknown" {{.*}} "-output=[[FINALLINKx:.+\.o]]" "-output=[[OUTPUT1:.+\.o]]" "-unbundle"
 // CHK-FPGA: spirv-to-ir-wrapper{{.*}} "[[OUTPUT1]]" "-o" "[[IROUTPUT1:.+\.bc]]"
 // CHK-FPGA: llvm-link{{.*}} "[[IROUTPUT1]]"{{.*}} "-o" "[[OUTPUT2_BC:.+\.bc]]"
-// CHK-FPGA: sycl-post-link{{.*}} "-O2" "-spec-const=emulation"{{.*}} "-o" "[[OUTPUT3_TABLE:.+\.table]]" "[[OUTPUT2_BC]]"
+// CHK-FPGA: sycl-post-link{{.*}} "-O2"{{.*}} "-spec-const=emulation"{{.*}} "-o" "[[OUTPUT3_TABLE:.+\.table]]" "[[OUTPUT2_BC]]"
 // CHK-FPGA: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[OUTPUT3_TABLE]]"
 // CHK-FPGA: llvm-spirv{{.*}} "-o" "[[OUTPUT5:.+\.txt]]" "-spirv-max-version={{.*}}"{{.*}} "[[TABLEOUT]]"
 // CHK-FPGA: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-fpga_dep" {{.*}} "-output=[[DEPFILE:.+\.d]]" "-unbundle"
@@ -178,7 +178,7 @@
 // CHK-FPGA-AOCX-SRC: clang-offload-wrapper{{.*}} "-o=[[WRAPOUT:.+\.bc]]" {{.*}} "-target=spir64_fpga" "-kind=sycl" "--sym-prop-bc-files=[[SYM_AND_PROP]]" "-batch" "[[TABLEOUT]]"
 // CHK-FPGA-AOCX-SRC: llc{{.*}} "-filetype=obj" "-o" "[[LLCOUT:.+\.(o|obj)]]" "[[WRAPOUT]]"
 // CHK-FPGA-AOCX-SRC: llvm-link{{.*}} "[[DEVICEBC]]" "-o" "[[LLVMLINKOUT:.+\.bc]]" "--suppress-warnings"
-// CHK-FPGA-AOCX-SRC: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]]
+// CHK-FPGA-AOCX-SRC: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]]"
 // CHK-FPGA-AOCX-SRC: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[POSTLINKOUT]]"
 // CHK-FPGA-AOCX-SRC: llvm-spirv{{.*}} "-o" "[[LLVMSPVOUT:.+\.txt]]" {{.*}} "[[TABLEOUT]]"
 // CHK-FPGA-AOCX-SRC: opencl-aot{{.*}} "-device=fpga_fast_emu" "-spv=[[LLVMSPVOUT]]" "-ir=[[OUTPUT4:.+\.aocx]]" "--bo=-g"
@@ -204,7 +204,7 @@
 // CHK-FPGA-AOCX-OBJ: clang-offload-bundler{{.*}} "-type=o" {{.*}} "-output=[[HOSTOBJx:.+\.(o|obj)]]" "-output=[[DEVICEOBJ:.+\.(o|obj)]]" "-unbundle"
 // CHK-FPGA-AOCX-OBJ: spirv-to-ir-wrapper{{.*}} "[[DEVICEOBJ]]" "-o" "[[IROUTPUT:.+\.bc]]"
 // CHK-FPGA-AOCX-OBJ: llvm-link{{.*}} "[[IROUTPUT]]" "-o" "[[LLVMLINKOUT:.+\.bc]]" "--suppress-warnings"
-// CHK-FPGA-AOCX-OBJ: sycl-post-link{{.*}} "-O2" "-spec-const=emulation"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]]
+// CHK-FPGA-AOCX-OBJ: sycl-post-link{{.*}} "-O2"{{.*}} "-spec-const=emulation"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]]"
 // CHK-FPGA-AOCX-OBJ: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[POSTLINKOUT]]"
 // CHK-FPGA-AOCX-OBJ: llvm-spirv{{.*}} "-o" "[[LLVMSPVOUT:.+\.txt]]" {{.*}} "[[TABLEOUT]]"
 // CHK-FPGA-AOCX-OBJ: opencl-aot{{.*}} "-device=fpga_fast_emu" "-spv=[[LLVMSPVOUT]]" "-ir=[[OUTPUT4:.+\.aocx]]" "--bo=-g"
diff --git a/clang/test/Driver/sycl-offload-intelfpga-link.cpp b/clang/test/Driver/sycl-offload-intelfpga-link.cpp
index 5d3292be10917..d816f01b4a6e7 100644
--- a/clang/test/Driver/sycl-offload-intelfpga-link.cpp
+++ b/clang/test/Driver/sycl-offload-intelfpga-link.cpp
@@ -13,7 +13,7 @@
 // CHK-FPGA-LINK-NOT: clang-offload-bundler{{.*}}
 // CHK-FPGA-LINK: spirv-to-ir-wrapper{{.*}} "[[OUTPUT1]]" "-o" "[[IROUTPUT1:.+\.bc]]"
 // CHK-FPGA-LINK: llvm-link{{.*}} "[[IROUTPUT1]]" "-o" "[[OUTPUT2_1:.+\.bc]]"
-// CHK-FPGA-LINK: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]"
+// CHK-FPGA-LINK: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]"
 // CHK-FPGA-LINK: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[OUTPUT2]]"
 // CHK-FPGA-LINK: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.txt]]" "-spirv-max-version={{.*}}"{{.*}} "[[TABLEOUT]]"
 // CHK-FPGA-EARLY: aoc{{.*}} "-o" "[[OUTPUT4:.+\.aocr]]" "[[OUTPUT3]]" "-sycl" "-rtl"
@@ -46,7 +46,7 @@
 // CHK-FPGA-LINK-WIN-NOT: clang-offload-bundler{{.*}}
 // CHK-FPGA-LINK-WIN: spirv-to-ir-wrapper{{.*}} "[[OUTPUT1]]" "-o" "[[IROUTPUT1:.+\.bc]]"
 // CHK-FPGA-LINK-WIN: llvm-link{{.*}} "[[IROUTPUT1]]" "-o" "[[OUTPUT2_1:.+\.bc]]"
-// CHK-FPGA-LINK-WIN: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]"
+// CHK-FPGA-LINK-WIN: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[OUTPUT2:.+\.table]]" "[[OUTPUT2_1]]"
 // CHK-FPGA-LINK-WIN: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[OUTPUT2]]"
 // CHK-FPGA-LINK-WIN: llvm-spirv{{.*}} "-o" "[[OUTPUT3:.+\.txt]]" "-spirv-max-version={{.*}}"{{.*}} "[[TABLEOUT]]"
 // CHK-FPGA-LINK-WIN: aoc{{.*}} "-o" "[[OUTPUT5:.+\.aocr]]" "[[OUTPUT3]]" "-sycl" "-rtl"
@@ -175,7 +175,7 @@
 // CHK-FPGA: clang-offload-bundler{{.*}} "-type=o" "-targets=host-x86_64-unknown-linux-gnu,sycl-spir64_fpga-unknown-unknown" {{.*}} "-output=[[FINALLINK2x:.+\.o]]" "-output=[[OUTPUT1:.+\.o]]" "-unbundle"
 // CHK-FPGA: spirv-to-ir-wrapper{{.*}} "[[OUTPUT1]]" "-o" "[[IROUTPUT1:.+\.bc]]"
 // CHK-FPGA: llvm-link{{.*}} "[[IROUTPUT1]]" "-o" "[[OUTPUT2_BC:.+\.bc]]"
-// CHK-FPGA: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[OUTPUT3_TABLE:.+\.table]]" "[[OUTPUT2_BC]]"
+// CHK-FPGA: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[OUTPUT3_TABLE:.+\.table]]" "[[OUTPUT2_BC]]"
 // CHK-FPGA: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[OUTPUT3_TABLE]]"
 // CHK-FPGA: llvm-spirv{{.*}} "-o" "[[OUTPUT5:.+\.txt]]" "-spirv-max-version={{.*}}"{{.*}} "[[TABLEOUT]]"
 // CHK-FPGA: clang-offload-bundler{{.*}} "-type=o" "-targets=sycl-fpga_dep" {{.*}} "-output=[[DEPFILE:.+\.d]]" "-unbundle"
@@ -237,7 +237,7 @@
 // CHK-FPGA-AOCX-SRC: clang-offload-wrapper{{.*}} "-o=[[WRAPOUT:.+\.bc]]" {{.*}} "-target=spir64_fpga" "-kind=sycl" "--sym-prop-bc-files=[[SYM_AND_PROP]]" "-batch" "[[TABLEOUT]]"
 // CHK-FPGA-AOCX-SRC: llc{{.*}} "-filetype=obj" "-o" "[[LLCOUT:.+\.(o|obj)]]" "[[WRAPOUT]]"
 // CHK-FPGA-AOCX-SRC: llvm-link{{.*}} "[[DEVICEBC]]" "-o" "[[LLVMLINKOUT:.+\.bc]]" "--suppress-warnings"
-// CHK-FPGA-AOCX-SRC: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]]
+// CHK-FPGA-AOCX-SRC: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]]"
 // CHK-FPGA-AOCX-SRC: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[POSTLINKOUT]]"
 // CHK-FPGA-AOCX-SRC: llvm-spirv{{.*}} "-o" "[[LLVMSPVOUT:.+\.txt]]" {{.*}} "[[TABLEOUT]]"
 // CHK-FPGA-AOCX-SRC: aoc{{.*}} "-o" "[[AOCOUT:.+\.aocx]]" "[[LLVMSPVOUT]]" "-sycl"
@@ -263,7 +263,7 @@
 // CHK-FPGA-AOCX-OBJ: clang-offload-bundler{{.*}} "-type=o" {{.*}} "-output=[[HOSTOBJx:.+\.(o|obj)]]" "-output=[[DEVICEOBJ:.+\.(o|obj)]]" "-unbundle"
 // CHK-FPGA-AOCX-OBJ: spirv-to-ir-wrapper{{.*}} "[[DEVICEOBJ]]" "-o" "[[IROUTPUT:.+\.bc]]"
 // CHK-FPGA-AOCX-OBJ: llvm-link{{.*}} "[[IROUTPUT]]" "-o" "[[LLVMLINKOUT:.+\.bc]]" "--suppress-warnings"
-// CHK-FPGA-AOCX-OBJ: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]]
+// CHK-FPGA-AOCX-OBJ: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]]"
 // CHK-FPGA-AOCX-OBJ: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[POSTLINKOUT]]"
 // CHK-FPGA-AOCX-OBJ: llvm-spirv{{.*}} "-o" "[[LLVMSPVOUT:.+\.txt]]" {{.*}} "[[TABLEOUT]]"
 // CHK-FPGA-AOCX-OBJ: aoc{{.*}} "-o" "[[AOCOUT:.+\.aocx]]" "[[LLVMSPVOUT]]" "-sycl"
@@ -283,7 +283,7 @@
 // CHK-FPGA-AOCX-OBJ2: clang-offload-bundler{{.*}} "-type=o" {{.*}} "-output=[[HOSTOBJx:.+\.(o|obj)]]" "-output=[[DEVICEOBJ:.+\.(o|obj)]]" "-output=[[DEVICEOBJ2:.+\.(o|obj)]]" "-unbundle"
 // CHK-FPGA-AOCX-OBJ2: spirv-to-ir-wrapper{{.*}} "[[DEVICEOBJ]]" "-o" "[[IROUTPUT:.+\.bc]]"
 // CHK-FPGA-AOCX-OBJ2: llvm-link{{.*}} "[[IROUTPUT]]" "-o" "[[LLVMLINKOUT:.+\.bc]]" "--suppress-warnings"
-// CHK-FPGA-AOCX-OBJ2: sycl-post-link{{.*}} "-O2" "-spec-const=native" "-device-globals"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]]"
+// CHK-FPGA-AOCX-OBJ2: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=native"{{.*}} "-o" "[[POSTLINKOUT:.+\.table]]" "[[LLVMLINKOUT]]"
 // CHK-FPGA-AOCX-OBJ2: file-table-tform{{.*}} "-o" "[[TABLEOUT:.+\.txt]]" "[[POSTLINKOUT]]"
 // CHK-FPGA-AOCX-OBJ2: llvm-spirv{{.*}} "-o" "[[LLVMSPVOUT:.+\.txt]]" {{.*}} "[[TABLEOUT]]"
 // CHK-FPGA-AOCX-OBJ2: file-table-tform{{.*}} "-replace=Code,Code" "-o" "[[TFORM_OUT:.+\.table]]" "[[POSTLINKOUT]]" "[[LLVMSPVOUT]]"
@@ -291,7 +291,7 @@
 // CHK-FPGA-AOCX-OBJ2: llc{{.*}} "-filetype=obj" "-o" "[[LLCOUT:.+\.(o|obj)]]" "[[WRAPOUT]]"
 // CHK-FPGA-AOCX-OBJ2: spirv-to-ir-wrapper{{.*}} "[[DEVICEOBJ2]]" "-o" "[[IROUTPUT2:.+\.bc]]"
 // CHK-FPGA-AOCX-OBJ2: llvm-link{{.*}} "[[IROUTPUT2]]" "-o" "[[LLVMLINKOUT2:.+\.bc]]" "--suppress-warnings"
-// CHK-FPGA-AOCX-OBJ2: sycl-post-link{{.*}} "-O2" "-spec-const=emulation" "-device-globals"{{.*}} "-o" "[[POSTLINKOUT2:.+\.table]]" "[[LLVMLINKOUT2]]"
+// CHK-FPGA-AOCX-OBJ2: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=emulation"{{.*}} "-o" "[[POSTLINKOUT2:.+\.table]]" "[[LLVMLINKOUT2]]"
 // CHK-FPGA-AOCX-OBJ2: file-table-tform{{.*}} "-o" "[[TABLEOUT2:.+\.txt]]" "[[POSTLINKOUT2]]"
 // CHK-FPGA-AOCX-OBJ2: llvm-spirv{{.*}} "-o" "[[LLVMSPVOUT2:.+\.txt]]" {{.*}} "[[TABLEOUT2]]"
 // CHK-FPGA-AOCX-OBJ2: aoc{{.*}} "-o" "[[AOCOUT:.+\.aocx]]" "[[LLVMSPVOUT2]]" "-sycl"
diff --git a/clang/test/Driver/sycl-offload-new-driver.c b/clang/test/Driver/sycl-offload-new-driver.c
index a4b91621bfb94..0a4a5067457d6 100644
--- a/clang/test/Driver/sycl-offload-new-driver.c
+++ b/clang/test/Driver/sycl-offload-new-driver.c
@@ -25,17 +25,16 @@
 /// Check the toolflow for SYCL compilation using new offload model
 // RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl -fsycl-targets=spir64 --offload-new-driver %s 2>&1 | FileCheck -check-prefix=CHK-FLOW %s
 // CHK-FLOW: clang{{.*}} "-cc1" "-triple" "spir64-unknown-unknown" "-aux-triple" "x86_64-unknown-linux-gnu" "-fsycl-is-device" {{.*}} "-fsycl-int-header=[[HEADER:.*]].h" "-fsycl-int-footer=[[FOOTER:.*]].h" {{.*}} "--offload-new-driver" {{.*}} "-o" "[[CC1DEVOUT:.*]]" "-x" "c++" "[[INPUT:.*]]"
-// CHK-FLOW-NEXT: clang-offload-packager{{.*}} "-o" "[[PACKOUT:.*]]" "--image=file=[[CC1DEVOUT]],triple=spir64-unknown-unknown,arch=,kind=sycl"
+// CHK-FLOW-NEXT: clang-offload-packager{{.*}} "-o" "[[PACKOUT:.*]]" "--image=file=[[CC1DEVOUT]],triple=spir64-unknown-unknown,arch=,kind=sycl{{.*}}"
 // CHK-FLOW-NEXT: append-file{{.*}} "[[INPUT]]" "--append=[[FOOTER]].h" "--orig-filename=[[INPUT]]" "--output=[[APPENDOUT:.*]]" "--use-include"
 // CHK-FLOW-NEXT: clang{{.*}} "-cc1" "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-include" "[[HEADER]].h" "-dependency-filter" "[[HEADER]].h" {{.*}} "-fsycl-is-host"{{.*}} "-full-main-file-name" "[[INPUT]]" {{.*}} "--offload-new-driver" {{.*}} "-fembed-offload-object=[[PACKOUT]]" {{.*}} "-o" "[[CC1FINALOUT:.*]]" "-x" "c++" "[[APPENDOUT]]"
-// CHK-FLOW-NEXT: clang-linker-wrapper{{.*}} "--host-triple=x86_64-unknown-linux-gnu" "--triple=spir64"{{.*}} "--linker-path={{.*}}/ld" {{.*}} "[[CC1FINALOUT]]"
+// CHK-FLOW-NEXT: clang-linker-wrapper{{.*}} "--host-triple=x86_64-unknown-linux-gnu"{{.*}} "--linker-path={{.*}}/ld" {{.*}} "[[CC1FINALOUT]]"
 
 /// Verify options passed to clang-linker-wrapper
 // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \
 // RUN:          --sysroot=%S/Inputs/SYCL -### %s 2>&1 \
 // RUN:   | FileCheck -check-prefix WRAPPER_OPTIONS %s
-// WRAPPER_OPTIONS: clang-linker-wrapper{{.*}} "--triple=spir64"
-// WRAPPER_OPTIONS-SAME: "-sycl-device-libraries=libsycl-crt.new.o,libsycl-complex.new.o,libsycl-complex-fp64.new.o,libsycl-cmath.new.o,libsycl-cmath-fp64.new.o,libsycl-imf.new.o,libsycl-imf-fp64.new.o,libsycl-imf-bf16.new.o,libsycl-fallback-cassert.new.o,libsycl-fallback-cstring.new.o,libsycl-fallback-complex.new.o,libsycl-fallback-complex-fp64.new.o,libsycl-fallback-cmath.new.o,libsycl-fallback-cmath-fp64.new.o,libsycl-fallback-imf.new.o,libsycl-fallback-imf-fp64.new.o,libsycl-fallback-imf-bf16.new.o,libsycl-itt-user-wrappers.new.o,libsycl-itt-compiler-wrappers.new.o,libsycl-itt-stubs.new.o"
+// WRAPPER_OPTIONS: clang-linker-wrapper{{.*}} "-sycl-device-libraries=libsycl-crt.new.o,libsycl-complex.new.o,libsycl-complex-fp64.new.o,libsycl-cmath.new.o,libsycl-cmath-fp64.new.o,libsycl-imf.new.o,libsycl-imf-fp64.new.o,libsycl-imf-bf16.new.o,libsycl-fallback-cassert.new.o,libsycl-fallback-cstring.new.o,libsycl-fallback-complex.new.o,libsycl-fallback-complex-fp64.new.o,libsycl-fallback-cmath.new.o,libsycl-fallback-cmath-fp64.new.o,libsycl-fallback-imf.new.o,libsycl-fallback-imf-fp64.new.o,libsycl-fallback-imf-bf16.new.o,libsycl-itt-user-wrappers.new.o,libsycl-itt-compiler-wrappers.new.o,libsycl-itt-stubs.new.o"
 // WRAPPER_OPTIONS-SAME: "-sycl-device-library-location={{.*}}/lib"
 
 /// Verify phases used to generate SPIR-V instead of LLVM-IR
@@ -56,14 +55,12 @@
 // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \
 // RUN:          -Xspirv-translator -translator-opt -### %s 2>&1 \
 // RUN:   | FileCheck -check-prefix WRAPPER_OPTIONS_TRANSLATOR %s
-// WRAPPER_OPTIONS_TRANSLATOR: clang-linker-wrapper{{.*}} "--triple=spir64"
-// WRAPPER_OPTIONS_TRANSLATOR-SAME: "--llvm-spirv-options={{.*}}-translator-opt{{.*}}"
+// WRAPPER_OPTIONS_TRANSLATOR: clang-linker-wrapper{{.*}} "--llvm-spirv-options={{.*}}-translator-opt{{.*}}"
 
 // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \
 // RUN:          -Xdevice-post-link -post-link-opt -### %s 2>&1 \
 // RUN:   | FileCheck -check-prefix WRAPPER_OPTIONS_POSTLINK %s
-// WRAPPER_OPTIONS_POSTLINK: clang-linker-wrapper{{.*}} "--triple=spir64"
-// WRAPPER_OPTIONS_POSTLINK-SAME: "--sycl-post-link-options=-post-link-opt -O2 -spec-const=native -device-globals -split=auto -emit-only-kernels-as-entry-points -emit-param-info -symbols -emit-exported-symbols -split-esimd -lower-esimd"
+// WRAPPER_OPTIONS_POSTLINK: clang-linker-wrapper{{.*}} "--sycl-post-link-options=-O2 -device-globals -post-link-opt"
 
 // -fsycl-device-only behavior
 // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \
@@ -94,6 +91,17 @@
 // RUN:  | FileCheck -check-prefix=CHK_ARCH \
 // RUN:              -DTRIPLE=spir64_gen-unknown-unknown -DARCH=pvc %s
 // RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl \
+// RUN:          -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen \
+// RUN:          "-device pvc" --offload-new-driver %s 2>&1 \
+// RUN:  | FileCheck -check-prefix=CHK_ARCH \
+// RUN:              -DTRIPLE=spir64_gen-unknown-unknown -DARCH=pvc %s
+// RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl \
+// RUN:          -fsycl-targets=spir64_gen -Xsycl-target-backend=spir64_gen \
+// RUN:          "-device pvc" -Xsycl-target-backend=spir64_gen "-device dg1" \
+// RUN:          --offload-new-driver %s 2>&1 \
+// RUN:  | FileCheck -check-prefix=CHK_ARCH \
+// RUN:              -DTRIPLE=spir64_gen-unknown-unknown -DARCH=dg1 %s
+// RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl \
 // RUN:          -fno-sycl-libspirv -fsycl-targets=amd_gpu_gfx900 \
 // RUN:          -nogpulib --offload-new-driver %s 2>&1 \
 // RUN:  | FileCheck -check-prefix=CHK_ARCH \
@@ -105,17 +113,63 @@
 // RUN:              -DTRIPLE=nvptx64-nvidia-cuda -DARCH=sm_50 %s
 // CHK_ARCH: clang{{.*}} "-triple" "[[TRIPLE]]"
 // CHK_ARCH-SAME: "-fsycl-is-device" {{.*}} "--offload-new-driver"{{.*}} "-o" "[[CC1DEVOUT:.+\.bc]]"
-// CHK_ARCH-NEXT: clang-offload-packager{{.*}} "--image=file=[[CC1DEVOUT]],triple=[[TRIPLE]],arch=[[ARCH]],kind=sycl"
+// CHK_ARCH-NEXT: clang-offload-packager{{.*}} "--image=file=[[CC1DEVOUT]],triple=[[TRIPLE]],arch=[[ARCH]],kind=sycl{{.*}}"
+
+// Verify offload-packager option values
+// RUN: %clangxx -### --target=x86_64-unknown-linux-gnu -fsycl \
+// RUN:          -fsycl-targets=spir64,intel_gpu_pvc \
+// RUN:          -Xsycl-target-backend=spir64 -spir64-opt \
+// RUN:          -Xsycl-target-backend=intel_gpu_pvc -spir64_gen-opt \
+// RUN:          -Xsycl-target-linker=spir64 -spir64-link-opt \
+// RUN:          -Xsycl-target-linker=intel_gpu_pvc -spir64_gen-link-opt \
+// RUN:          --offload-new-driver %s 2>&1 \
+// RUN:  | FileCheck -check-prefix=CHK_PACKAGER_OPTS %s
+// CHK_PACKAGER_OPTS: clang-offload-packager{{.*}} "-o"
+// CHK_PACKAGER_OPTS-SAME: {{.*}}triple=spir64_gen-unknown-unknown,arch=pvc,kind=sycl,compile-opts={{.*}}-spir64_gen-opt,link-opts=-spir64_gen-link-opt
+// CHK_PACKAGER_OPTS-SAME: {{.*}}triple=spir64-unknown-unknown,arch=,kind=sycl,compile-opts={{.*}}-spir64-opt,link-opts=-spir64-link-opt
+
+/// Check phases with multiple intel_gpu settings
+// RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl \
+// RUN:          -fsycl-targets=intel_gpu_dg1,intel_gpu_pvc \
+// RUN:          --offload-new-driver -ccc-print-phases %s 2>&1 \
+// RUN:  | FileCheck -check-prefix=MULT_TARG_PHASES %s
+// MULT_TARG_PHASES: 0: input, "[[INPUT:.+\.c]]", c++, (host-sycl)
+// MULT_TARG_PHASES: 1: append-footer, {0}, c++, (host-sycl)
+// MULT_TARG_PHASES: 2: preprocessor, {1}, c++-cpp-output, (host-sycl)
+// MULT_TARG_PHASES: 3: compiler, {2}, ir, (host-sycl)
+// MULT_TARG_PHASES: 4: input, "[[INPUT]]", c++, (device-sycl, dg1)
+// MULT_TARG_PHASES: 5: preprocessor, {4}, c++-cpp-output, (device-sycl, dg1)
+// MULT_TARG_PHASES: 6: compiler, {5}, ir, (device-sycl, dg1)
+// MULT_TARG_PHASES: 7: backend, {6}, ir, (device-sycl, dg1)
+// MULT_TARG_PHASES: 8: offload, "device-sycl (spir64_gen-unknown-unknown:dg1)" {7}, ir
+// MULT_TARG_PHASES: 9: input, "[[INPUT]]", c++, (device-sycl, pvc)
+// MULT_TARG_PHASES: 10: preprocessor, {9}, c++-cpp-output, (device-sycl, pvc)
+// MULT_TARG_PHASES: 11: compiler, {10}, ir, (device-sycl, pvc)
+// MULT_TARG_PHASES: 12: backend, {11}, ir, (device-sycl, pvc)
+// MULT_TARG_PHASES: 13: offload, "device-sycl (spir64_gen-unknown-unknown:pvc)" {12}, ir
+// MULT_TARG_PHASES: 14: clang-offload-packager, {8, 13}, image, (device-sycl)
+// MULT_TARG_PHASES: 15: offload, "host-sycl (x86_64-unknown-linux-gnu)" {3}, "device-sycl (x86_64-unknown-linux-gnu)" {14}, ir
+// MULT_TARG_PHASES: 16: backend, {15}, assembler, (host-sycl)
+// MULT_TARG_PHASES: 17: assembler, {16}, object, (host-sycl)
 
 /// Test option passing behavior for clang-offload-wrapper options.
 // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \
 // RUN:          -Xsycl-target-backend -backend-opt -### %s 2>&1 \
 // RUN:   | FileCheck -check-prefix WRAPPER_OPTIONS_BACKEND %s
-// WRAPPER_OPTIONS_BACKEND: clang-linker-wrapper{{.*}} "--triple=spir64"
-// WRAPPER_OPTIONS_BACKEND-SAME: "--sycl-backend-compile-options={{.*}}-backend-opt{{.*}}"
+// WRAPPER_OPTIONS_BACKEND: clang-linker-wrapper{{.*}} "--sycl-backend-compile-options={{.*}}-backend-opt{{.*}}"
 
 // RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \
 // RUN:          -Xsycl-target-linker -link-opt -### %s 2>&1 \
 // RUN:   | FileCheck -check-prefix WRAPPER_OPTIONS_LINK %s
-// WRAPPER_OPTIONS_LINK: clang-linker-wrapper{{.*}} "--triple=spir64"
-// WRAPPER_OPTIONS_LINK-SAME: "--sycl-target-link-options={{.*}}-link-opt{{.*}}"
+// WRAPPER_OPTIONS_LINK: clang-linker-wrapper{{.*}} "--sycl-target-link-options={{.*}}-link-opt{{.*}}"
+
+/// Test option passing behavior for clang-linker-wrapper options for AOT.
+// RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl --offload-new-driver \
+// RUN:          -fsycl-targets=spir64_gen,spir64_x86_64 \
+// RUN:          -Xsycl-target-backend=spir64_gen -backend-gen-opt \
+// RUN:          -Xsycl-target-backend=spir64_x86_64 -backend-cpu-opt \
+// RUN:          -### %s 2>&1 \
+// RUN:   | FileCheck -check-prefix WRAPPER_OPTIONS_BACKEND_AOT %s
+// WRAPPER_OPTIONS_BACKEND_AOT: clang-linker-wrapper{{.*}}  "--host-triple=x86_64-unknown-linux-gnu"
+// WRAPPER_OPTIONS_BACKEND_AOT-SAME: "--gen-tool-arg={{.*}}-backend-gen-opt"
+// WRAPPER_OPTIONS_BACKEND_AOT-SAME: "--cpu-tool-arg={{.*}}-backend-cpu-opt"
diff --git a/clang/test/Driver/sycl-offload-with-split.c b/clang/test/Driver/sycl-offload-with-split.c
index d081d083c3a91..3e304b78e2e52 100644
--- a/clang/test/Driver/sycl-offload-with-split.c
+++ b/clang/test/Driver/sycl-offload-with-split.c
@@ -205,7 +205,7 @@
 // CHK-TOOLS-AOT: clang{{.*}} "-fsycl-is-device"{{.*}} "-fsycl-int-header=[[INPUT1:.+\-header.+\.h]]" "-fsycl-int-footer={{.*}}"{{.*}} "-o" "[[OUTPUT1:.+\.bc]]"
 // CHK-TOOLS-AOT: clang{{.*}} "-triple" "x86_64-unknown-linux-gnu" {{.*}} "-o" "[[OUTPUT10:.+\.o]]"
 // CHK-TOOLS-AOT: llvm-link{{.*}} "[[OUTPUT1]]" "-o" "[[OUTPUT2:.+\.bc]]"
-// CHK-TOOLS-AOT: sycl-post-link{{.*}} "-split=auto" {{.*}} "-spec-const=emulation" {{.*}} "-o" "[[OUTPUT3:.+\.table]]" "[[OUTPUT2]]"
+// CHK-TOOLS-AOT: sycl-post-link{{.*}} "-split=auto"{{.*}} "-spec-const=emulation"{{.*}} "-o" "[[OUTPUT3:.+\.table]]" "[[OUTPUT2]]"
 // CHK-TOOLS-AOT: file-table-tform{{.*}} "-o" "[[OUTPUT4:.+\.txt]]" "[[OUTPUT3]]"
 // CHK-TOOLS-AOT: llvm-foreach{{.*}} "--in-file-list=[[OUTPUT4]]" "--in-replace=[[OUTPUT4]]" "--out-ext=spv" "--out-file-list=[[OUTPUT5:.+\.txt]]" "--out-replace=[[OUTPUT5]]" "--" "{{.*}}llvm-spirv{{.*}}" "-o" "[[OUTPUT5]]" {{.*}} "[[OUTPUT4]]"
 // CHK-TOOLS-FPGA: llvm-foreach{{.*}} "--out-file-list=[[OUTPUT6:.+\.txt]]{{.*}} "--" "{{.*}}aoc{{.*}} "-o" "[[OUTPUT6]]" "[[OUTPUT5]]"
diff --git a/clang/test/Driver/sycl-oneapi-gpu-nvidia.cpp b/clang/test/Driver/sycl-oneapi-gpu-nvidia.cpp
index cc5795bad3aca..097571e21edf5 100644
--- a/clang/test/Driver/sycl-oneapi-gpu-nvidia.cpp
+++ b/clang/test/Driver/sycl-oneapi-gpu-nvidia.cpp
@@ -28,11 +28,14 @@
 // RUN:   FileCheck %s --check-prefixes=DEVICE_NVIDIA,MACRO_NVIDIA -DDEV_STR=sm_89 -DMAC_STR=SM_89
 // RUN: %clangxx -fsycl -nocudalib -fsycl-targets=nvidia_gpu_sm_90 -### %s 2>&1 | \
 // RUN:   FileCheck %s --check-prefixes=DEVICE_NVIDIA,MACRO_NVIDIA -DDEV_STR=sm_90 -DMAC_STR=SM_90
+// RUN: %clangxx -fsycl -nocudalib -fsycl-targets=nvidia_gpu_sm_90a -### %s 2>&1 | \
+// RUN:   FileCheck %s --check-prefixes=DEVICE_NVIDIA,MACRO_NVIDIA -DDEV_STR=sm_90a -DMAC_STR=SM_90A
 // MACRO_NVIDIA: clang{{.*}}  "-fsycl-is-host"
 // MACRO_NVIDIA: "-D__SYCL_TARGET_NVIDIA_GPU_[[MAC_STR]]__"
 // MACRO_NVIDIA: clang{{.*}} "-triple" "nvptx64-nvidia-cuda"
 // DEVICE_NVIDIA: llvm-foreach{{.*}} "--gpu-name" "[[DEV_STR]]"
 
+
 /// test for invalid nvidia arch
 // RUN: not %clangxx -c -fsycl -fsycl-targets=nvidia_gpu_bad -### %s 2>&1 | \
 // RUN:   FileCheck %s --check-prefix=BAD_NVIDIA_INPUT
diff --git a/clang/test/Driver/sycl-post-link-options-win.cpp b/clang/test/Driver/sycl-post-link-options-win.cpp
new file mode 100644
index 0000000000000..65a802d1f0210
--- /dev/null
+++ b/clang/test/Driver/sycl-post-link-options-win.cpp
@@ -0,0 +1,16 @@
+// REQUIRES: system-windows
+/// Verify that the same set of sycl-post-link options is generated for the old and new offloading models.
+// RUN: %clangxx -### --target=x86_64-pc-windows-msvc -fsycl \
+// RUN:          -Xdevice-post-link -O0 %s 2>&1 \
+// RUN:   | FileCheck -check-prefix OPTIONS_POSTLINK_JIT_OLD %s
+// OPTIONS_POSTLINK_JIT_OLD: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=native" "-split=auto" "-emit-only-kernels-as-entry-points" "-emit-param-info" "-symbols" "-emit-exported-symbols" "-emit-imported-symbols" "-split-esimd" "-lower-esimd" "-O0"
+
+// RUN: %clang -cc1 %s -triple x86_64-pc-windows-msvc -emit-obj -o %t.elf.o
+// RUN: clang-offload-packager -o %t.out --image=file=%t.elf.o,kind=sycl,triple=spir64
+// RUN: %clang -cc1 %s -triple x86_64-pc-windows-msvc -emit-obj -o %t.o \
+// RUN:   -fembed-offload-object=%t.out
+// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-pc-windows-msvc \
+// RUN:   -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.obj \
+// RUN:   --sycl-post-link-options="-O2 -device-globals -O0" \
+// RUN:   --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck --check-prefix OPTIONS_POSTLINK_JIT_NEW %s
+// OPTIONS_POSTLINK_JIT_NEW: sycl-post-link{{.*}} -spec-const=native -split=auto -emit-only-kernels-as-entry-points -emit-param-info -symbols -emit-exported-symbols -emit-imported-symbols -split-esimd -lower-esimd -O2 -device-globals -O0
diff --git a/clang/test/Driver/sycl-post-link-options.cpp b/clang/test/Driver/sycl-post-link-options.cpp
new file mode 100644
index 0000000000000..4f81fb424ec7c
--- /dev/null
+++ b/clang/test/Driver/sycl-post-link-options.cpp
@@ -0,0 +1,16 @@
+// REQUIRES: system-linux
+/// Verify that the same set of sycl-post-link options is generated for the old and new offloading models.
+// RUN: %clangxx --target=x86_64-unknown-linux-gnu -fsycl -### \
+// RUN:          -Xdevice-post-link -O0 %s 2>&1 \
+// RUN:   | FileCheck -check-prefix OPTIONS_POSTLINK_JIT_OLD %s
+// OPTIONS_POSTLINK_JIT_OLD: sycl-post-link{{.*}} "-O2" "-device-globals" "-spec-const=native" "-split=auto" "-emit-only-kernels-as-entry-points" "-emit-param-info" "-symbols" "-emit-exported-symbols" "-emit-imported-symbols" "-split-esimd" "-lower-esimd" "-O0"
+
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.elf.o
+// RUN: clang-offload-packager -o %t.out --image=file=%t.elf.o,kind=sycl,triple=spir64
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
+// RUN:   -fembed-offload-object=%t.out
+// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \
+// RUN:   -sycl-device-library-location=%S/Inputs -sycl-device-libraries=libsycl-crt.new.o \
+// RUN:   --sycl-post-link-options="-O2 -device-globals -O0" \
+// RUN:   --linker-path=/usr/bin/ld %t.o -o a.out 2>&1 | FileCheck --check-prefix OPTIONS_POSTLINK_JIT_NEW %s
+// OPTIONS_POSTLINK_JIT_NEW: sycl-post-link{{.*}} -spec-const=native -split=auto -emit-only-kernels-as-entry-points -emit-param-info -symbols -emit-exported-symbols -emit-imported-symbols -split-esimd -lower-esimd -O2 -device-globals -O0
diff --git a/clang/test/Driver/sycl-spirv-ext.c b/clang/test/Driver/sycl-spirv-ext.c
index a306b9eb1ea4d..eb4d24197b1af 100644
--- a/clang/test/Driver/sycl-spirv-ext.c
+++ b/clang/test/Driver/sycl-spirv-ext.c
@@ -48,6 +48,7 @@
 // CHECK-DEFAULT-SAME:,+SPV_INTEL_fpga_invocation_pipelining_attributes
 // CHECK-DEFAULT-SAME:,+SPV_INTEL_fpga_latency_control
 // CHECK-DEFAULT-SAME:,+SPV_INTEL_task_sequence
+// CHECK-DEFAULT-SAME:,+SPV_KHR_shader_clock
 // CHECK-DEFAULT-SAME:,+SPV_INTEL_bindless_images
 // CHECK-DEFAULT-SAME:,+SPV_INTEL_token_type
 // CHECK-DEFAULT-SAME:,+SPV_INTEL_bfloat16_conversion
@@ -125,4 +126,3 @@
 // CHECK-CPU-SAME:,+SPV_KHR_non_semantic_info
 // CHECK-CPU-SAME:,+SPV_KHR_cooperative_matrix
 // CHECK-CPU-SAME:,+SPV_INTEL_fp_max_error"
-
diff --git a/clang/test/Preprocessor/sycl-macro.cpp b/clang/test/Preprocessor/sycl-macro.cpp
index cab506771e74c..81645d3ac3736 100644
--- a/clang/test/Preprocessor/sycl-macro.cpp
+++ b/clang/test/Preprocessor/sycl-macro.cpp
@@ -12,6 +12,8 @@
 // RUN: %clang_cc1 %s  -triple nvptx64-nvidia-cuda -target-cpu sm_80 -fsycl-is-device -E -dM | FileCheck --check-prefix=CHECK-CUDA %s
 // RUN: %clang_cc1 %s  -triple amdgcn-amd-amdhsa -target-cpu gfx906 -fsycl-is-device -E -dM | FileCheck --check-prefix=CHECK-HIP %s
 
+// RUN: %clang_cc1 %s  -triple nvptx64-nvidia-cuda -target-cpu sm_90a -fsycl-is-device -E -dM | FileCheck --check-prefix=CHECK-CUDA-FEATURE %s
+
 // CHECK-NOT:#define __SYCL_DEVICE_ONLY__ 1
 // CHECK-NOT:#define SYCL_EXTERNAL
 // CHECK-NOT:#define CL_SYCL_LANGUAGE_VERSION 121
@@ -37,3 +39,5 @@
 // CHECK-CUDA-NOT:#define __CUDA_ARCH__ 800
 
 // CHECK-HIP:#define __CUDA_ARCH__ 0
+
+// CHECK-CUDA-FEATURE:#define __CUDA_ARCH_FEAT_SM90_ALL 1
diff --git a/clang/test/SemaSYCL/accessor-type-diagnostics.cpp b/clang/test/SemaSYCL/accessor-type-diagnostics.cpp
index e18a1880fcd9b..d7c8e15cde20c 100644
--- a/clang/test/SemaSYCL/accessor-type-diagnostics.cpp
+++ b/clang/test/SemaSYCL/accessor-type-diagnostics.cpp
@@ -1,9 +1,7 @@
 // RUN: %clang_cc1 -triple spir64 -fsycl-is-device -verify \
-// RUN:  -aux-triple x86_64-unknown-linux-gnu -fsyntax-only       \
-// RUN:  -Wno-sycl-2017-compat  %s
+// RUN:  -aux-triple x86_64-unknown-linux-gnu -fsyntax-only %s
 // RUN: %clang_cc1 -triple spir64 -fsycl-is-device -verify \
-// RUN:  -aux-triple x86_64-pc-windows-msvc -fsyntax-only         \
-// RUN:  -Wno-sycl-2017-compat  %s
+// RUN:  -aux-triple x86_64-pc-windows-msvc -fsyntax-only %s
 //
 // Ensure SYCL type restrictions are applied to accessors as well.
 
diff --git a/clang/test/SemaSYCL/buffer_location.cpp b/clang/test/SemaSYCL/buffer_location.cpp
index c733d8e7c8315..95fb14eb207b1 100644
--- a/clang/test/SemaSYCL/buffer_location.cpp
+++ b/clang/test/SemaSYCL/buffer_location.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 -fsycl-is-device -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s
-// RUN: %clang_cc1 -fsycl-is-device -Wno-sycl-2017-compat -verify -pedantic -DTRIGGER_ERROR %s
+// RUN: %clang_cc1 -fsycl-is-device -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -fsycl-is-device -verify -pedantic -DTRIGGER_ERROR %s
 
 #include "Inputs/sycl.hpp"
 
diff --git a/clang/test/SemaSYCL/deferred-diagnostics-aux-builtin.cpp b/clang/test/SemaSYCL/deferred-diagnostics-aux-builtin.cpp
index cde5eb40559a1..65141587bc93c 100644
--- a/clang/test/SemaSYCL/deferred-diagnostics-aux-builtin.cpp
+++ b/clang/test/SemaSYCL/deferred-diagnostics-aux-builtin.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -aux-triple x86_64-unknown-linux-gnu -Wno-sycl-2017-compat -verify -fsyntax-only  %s
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -triple spir64-unknown-unknown -aux-triple x86_64-unknown-linux-gnu -verify -fsyntax-only  %s
 
 #include "sycl.hpp"
 
diff --git a/clang/test/SemaSYCL/implicit-sycl-device-attr.cpp b/clang/test/SemaSYCL/implicit-sycl-device-attr.cpp
index 8b2c0618795c3..4b30e9ef31f5a 100644
--- a/clang/test/SemaSYCL/implicit-sycl-device-attr.cpp
+++ b/clang/test/SemaSYCL/implicit-sycl-device-attr.cpp
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -triple spir64 \
 // RUN:  -aux-triple x86_64-unknown-linux-gnu -Wno-return-type -verify     \
-// RUN:  -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s
+// RUN:  -fsyntax-only -std=c++17 %s
 
 // add_ir_attributes_function attribute used to represent compile-time SYCL
 // properties and some of those properties are intended to be turned into
diff --git a/clang/test/SemaSYCL/inline-asm.cpp b/clang/test/SemaSYCL/inline-asm.cpp
index a4a308fe0875a..85d5fbeed2b1d 100644
--- a/clang/test/SemaSYCL/inline-asm.cpp
+++ b/clang/test/SemaSYCL/inline-asm.cpp
@@ -1,6 +1,6 @@
-// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify %s -DLINUX_ASM
-// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify %s -DLINUX_ASM -DSPIR_CHECK -triple spir64-unknown-unknown
-// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify -triple x86_64-windows -fasm-blocks %s
+// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify %s -DLINUX_ASM
+// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify %s -DLINUX_ASM -DSPIR_CHECK -triple spir64-unknown-unknown
+// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify -triple x86_64-windows -fasm-blocks %s
 
 #ifndef SPIR_CHECK
 //expected-no-diagnostics
diff --git a/clang/test/SemaSYCL/intel-fpga-loop-ast.cpp b/clang/test/SemaSYCL/intel-fpga-loop-ast.cpp
index be824787f07d6..8e133b853456f 100644
--- a/clang/test/SemaSYCL/intel-fpga-loop-ast.cpp
+++ b/clang/test/SemaSYCL/intel-fpga-loop-ast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -ast-dump %s | FileCheck %s
 
 // Add AST tests for Loop attributes: [[intel::enable_loop_pipelining]],
 // [[intel::max_interleaving()]], [[intel::loop_coalesce]],
diff --git a/clang/test/SemaSYCL/intel-fpga-no-global-work-offset-ast.cpp b/clang/test/SemaSYCL/intel-fpga-no-global-work-offset-ast.cpp
index 67343e725d8d8..5256194c2becc 100644
--- a/clang/test/SemaSYCL/intel-fpga-no-global-work-offset-ast.cpp
+++ b/clang/test/SemaSYCL/intel-fpga-no-global-work-offset-ast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -ast-dump %s | FileCheck %s
 
 // Tests for AST of Intel FPGA no_global_work_offset function attribute.
 
diff --git a/clang/test/SemaSYCL/intel-fpga-nofusion.cpp b/clang/test/SemaSYCL/intel-fpga-nofusion.cpp
index fd64b2ac7fcac..e8a931b124f5d 100644
--- a/clang/test/SemaSYCL/intel-fpga-nofusion.cpp
+++ b/clang/test/SemaSYCL/intel-fpga-nofusion.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -fsyntax-only -ast-dump -Wno-sycl-2017-compat -verify %s | FileCheck %s
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -fsyntax-only -ast-dump -verify %s | FileCheck %s
 // expected-no-diagnostics
 
 #include "sycl.hpp"
diff --git a/clang/test/SemaSYCL/intel-max-global-work-dim-device-ast.cpp b/clang/test/SemaSYCL/intel-max-global-work-dim-device-ast.cpp
index 381433e7c6087..7ec419972dea2 100644
--- a/clang/test/SemaSYCL/intel-max-global-work-dim-device-ast.cpp
+++ b/clang/test/SemaSYCL/intel-max-global-work-dim-device-ast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -fsyntax-only -ast-dump -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -Wno-sycl-2017-compat -triple spir64 | FileCheck %s
+// RUN: %clang_cc1 %s -fsyntax-only -ast-dump -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -triple spir64 | FileCheck %s
 
 // The test checks AST of [[intel::max_global_work_dim()]] attribute.
 
diff --git a/clang/test/SemaSYCL/intel-max-global-work-dim-device.cpp b/clang/test/SemaSYCL/intel-max-global-work-dim-device.cpp
index b453985c2ae79..a100c9728615f 100644
--- a/clang/test/SemaSYCL/intel-max-global-work-dim-device.cpp
+++ b/clang/test/SemaSYCL/intel-max-global-work-dim-device.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -fsyntax-only -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2020 -Wno-sycl-2017-compat -triple spir64 -verify
+// RUN: %clang_cc1 %s -fsyntax-only -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2020 -triple spir64 -verify
 
 // The test checks support and functionality of [[intel::max_global_work_dim()]] attribute.
 
diff --git a/clang/test/SemaSYCL/intel-max-global-work-dim-host.cpp b/clang/test/SemaSYCL/intel-max-global-work-dim-host.cpp
index 2519c18672778..0359de526a33e 100644
--- a/clang/test/SemaSYCL/intel-max-global-work-dim-host.cpp
+++ b/clang/test/SemaSYCL/intel-max-global-work-dim-host.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-host -Wno-sycl-2017-compat -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsycl-is-host -fsyntax-only -verify %s
 // expected-no-diagnostics
 
 [[intel::max_global_work_dim(2)]] void func_do_not_ignore() {}
diff --git a/clang/test/SemaSYCL/intel-max-work-group-size-host.cpp b/clang/test/SemaSYCL/intel-max-work-group-size-host.cpp
index cce684221dfcb..5faa2ff420269 100644
--- a/clang/test/SemaSYCL/intel-max-work-group-size-host.cpp
+++ b/clang/test/SemaSYCL/intel-max-work-group-size-host.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-host -fsyntax-only -Wno-sycl-2017-compat -verify %s
+// RUN: %clang_cc1 -fsycl-is-host -fsyntax-only -verify %s
 // expected-no-diagnostics
 
 [[intel::max_work_group_size(2, 2, 2)]] void func_do_not_ignore() {}
diff --git a/clang/test/SemaSYCL/intel-reqd-work-group-size-ast-device.cpp b/clang/test/SemaSYCL/intel-reqd-work-group-size-ast-device.cpp
index 412a8a88b8b1e..5ce303af1cb58 100644
--- a/clang/test/SemaSYCL/intel-reqd-work-group-size-ast-device.cpp
+++ b/clang/test/SemaSYCL/intel-reqd-work-group-size-ast-device.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -ast-dump %s | FileCheck %s
 
 // Test for AST of reqd_work_group_size kernel attribute in SYCL 1.2.1.
 
diff --git a/clang/test/SemaSYCL/intel-reqd-work-group-size-host.cpp b/clang/test/SemaSYCL/intel-reqd-work-group-size-host.cpp
index f9d98ef6be877..445f0815670e3 100644
--- a/clang/test/SemaSYCL/intel-reqd-work-group-size-host.cpp
+++ b/clang/test/SemaSYCL/intel-reqd-work-group-size-host.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-host -Wno-sycl-2017-compat -fsyntax-only -verify %s
+// RUN: %clang_cc1 -fsycl-is-host -fsyntax-only -verify %s
 // expected-no-diagnostics
 
 [[sycl::reqd_work_group_size(4)]] void f4x1x1() {}
diff --git a/clang/test/SemaSYCL/intel-work-group-size-hint-ast-device.cpp b/clang/test/SemaSYCL/intel-work-group-size-hint-ast-device.cpp
index aefe2202ce43f..caf429e5a7bc4 100644
--- a/clang/test/SemaSYCL/intel-work-group-size-hint-ast-device.cpp
+++ b/clang/test/SemaSYCL/intel-work-group-size-hint-ast-device.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -ast-dump %s | FileCheck %s
 
 // Test for AST of work_group_size_hint kernel attribute in SYCL 1.2.1.
 
diff --git a/clang/test/SemaSYCL/invalid-kernel-arguments.cpp b/clang/test/SemaSYCL/invalid-kernel-arguments.cpp
index cf62e6117e47b..10b60eacc9e4c 100644
--- a/clang/test/SemaSYCL/invalid-kernel-arguments.cpp
+++ b/clang/test/SemaSYCL/invalid-kernel-arguments.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify %s
+// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify %s
 
 // This test checks that compiler doesn't crash if type of kernel argument is
 // invalid.
diff --git a/clang/test/SemaSYCL/kernel-arg-opt-report.cpp b/clang/test/SemaSYCL/kernel-arg-opt-report.cpp
index a9cca45099b62..95f2106d72655 100644
--- a/clang/test/SemaSYCL/kernel-arg-opt-report.cpp
+++ b/clang/test/SemaSYCL/kernel-arg-opt-report.cpp
@@ -1,9 +1,9 @@
 // RUN: %clang_cc1 -triple spir64-unknown-unknown -fsycl-is-device \
-// RUN: -Wno-sycl-2017-compat -emit-llvm-bc %s -o %t-host.bc -opt-record-file %t-host.yaml
+// RUN: -emit-llvm-bc %s -o %t-host.bc -opt-record-file %t-host.yaml
 // RUN: FileCheck -check-prefix=SPIR --input-file %t-host.yaml %s
 
 // RUN: %clang_cc1 -triple nvptx64-unknown-unknown -fsycl-is-device \
-// RUN: -Wno-sycl-2017-compat -emit-llvm-bc %s -o %t-host.bc -opt-record-file %t-host.yaml
+// RUN: -emit-llvm-bc %s -o %t-host.bc -opt-record-file %t-host.yaml
 // RUN: FileCheck -check-prefix=NVPTX --input-file %t-host.yaml %s
 // The test generates remarks about the kernel argument, their location and type
 // in the resulting yaml file.
diff --git a/clang/test/SemaSYCL/kernel-function-type.cpp b/clang/test/SemaSYCL/kernel-function-type.cpp
index 9a2036d7d5eee..bef5a5d7ab0b9 100644
--- a/clang/test/SemaSYCL/kernel-function-type.cpp
+++ b/clang/test/SemaSYCL/kernel-function-type.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify %s
+// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify %s
 // expected-no-diagnostics
 
 // The kernel_single_task call is emitted as an OpenCL kernel function. The call
diff --git a/clang/test/SemaSYCL/lambda_implicit_capture_this.cpp b/clang/test/SemaSYCL/lambda_implicit_capture_this.cpp
index 4e7d41b142815..9bd49d777977c 100644
--- a/clang/test/SemaSYCL/lambda_implicit_capture_this.cpp
+++ b/clang/test/SemaSYCL/lambda_implicit_capture_this.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify %s
+// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify %s
 //
 // This test checks that the compiler issues an error on attempt to capture
 // "this" pointer by lambdas passed to the device code (directly and indirectly)
diff --git a/clang/test/SemaSYCL/loop_fusion_ast.cpp b/clang/test/SemaSYCL/loop_fusion_ast.cpp
index 52829b05d669c..d3f05382e103b 100644
--- a/clang/test/SemaSYCL/loop_fusion_ast.cpp
+++ b/clang/test/SemaSYCL/loop_fusion_ast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -ast-dump %s | FileCheck %s
 
 // Tests for AST of Intel FPGA loop fusion function attributes
 #include "sycl.hpp"
diff --git a/clang/test/SemaSYCL/loop_unroll.cpp b/clang/test/SemaSYCL/loop_unroll.cpp
index 2fcf31480b183..61796397ece05 100644
--- a/clang/test/SemaSYCL/loop_unroll.cpp
+++ b/clang/test/SemaSYCL/loop_unroll.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify -pedantic %s
+// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify -pedantic %s
 
 template <int A>
 void bar() {
diff --git a/clang/test/SemaSYCL/markfunction-astconsumer.cpp b/clang/test/SemaSYCL/markfunction-astconsumer.cpp
index 207d2b9e5d47b..0b690c004b68f 100644
--- a/clang/test/SemaSYCL/markfunction-astconsumer.cpp
+++ b/clang/test/SemaSYCL/markfunction-astconsumer.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -Wno-return-type -verify -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s
+// RUN: %clang_cc1 -fsycl-is-device -Wno-return-type -verify -fsyntax-only -std=c++17 %s
 void bar();
 
 template <typename T>
diff --git a/clang/test/SemaSYCL/max-concurrency-ast.cpp b/clang/test/SemaSYCL/max-concurrency-ast.cpp
index 654ce182fed67..f61332135a0a5 100644
--- a/clang/test/SemaSYCL/max-concurrency-ast.cpp
+++ b/clang/test/SemaSYCL/max-concurrency-ast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -ast-dump %s | FileCheck %s
 
 // Tests for AST of Intel FPGA max concurrency function attribute.
 #include "sycl.hpp"
diff --git a/clang/test/SemaSYCL/no-vtables.cpp b/clang/test/SemaSYCL/no-vtables.cpp
index 28400fe2834c3..905a40da9dc5b 100644
--- a/clang/test/SemaSYCL/no-vtables.cpp
+++ b/clang/test/SemaSYCL/no-vtables.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -verify -Wno-sycl-2017-compat -emit-llvm-only %s
+// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -verify -emit-llvm-only %s
 // expected-no-diagnostics
 // Should never fail, since the type is never used in kernel code.
 
diff --git a/clang/test/SemaSYCL/no-vtables2.cpp b/clang/test/SemaSYCL/no-vtables2.cpp
index 721786c96c1e9..fd0313574f491 100644
--- a/clang/test/SemaSYCL/no-vtables2.cpp
+++ b/clang/test/SemaSYCL/no-vtables2.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -Wno-return-type -verify -Wno-sycl-2017-compat -fsyntax-only %s
+// RUN: %clang_cc1 -fsycl-is-device -Wno-return-type -verify -fsyntax-only %s
 
 struct Base {
   virtual void f() const {}
diff --git a/clang/test/SemaSYCL/num_simd_work_items.cpp b/clang/test/SemaSYCL/num_simd_work_items.cpp
index 71bd2f38bc21a..8f47fdb2d2434 100644
--- a/clang/test/SemaSYCL/num_simd_work_items.cpp
+++ b/clang/test/SemaSYCL/num_simd_work_items.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -fsycl-is-device -internal-isystem %S/Inputs -triple spir64 -fsyntax-only -sycl-std=2020 -Wno-sycl-2017-compat -verify
+// RUN: %clang_cc1 %s -fsycl-is-device -internal-isystem %S/Inputs -triple spir64 -fsyntax-only -sycl-std=2020 -verify
 
 // The test checks support and functionality of [[intel::num_simd_work_items()]] attribute.
 
diff --git a/clang/test/SemaSYCL/num_simd_work_items_ast.cpp b/clang/test/SemaSYCL/num_simd_work_items_ast.cpp
index 5da7471260989..c2dbc246c6511 100644
--- a/clang/test/SemaSYCL/num_simd_work_items_ast.cpp
+++ b/clang/test/SemaSYCL/num_simd_work_items_ast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 %s -fsycl-is-device -internal-isystem %S/Inputs -triple spir64 -fsyntax-only -sycl-std=2017 -Wno-sycl-2017-compat -ast-dump | FileCheck %s
+// RUN: %clang_cc1 %s -fsycl-is-device -internal-isystem %S/Inputs -triple spir64 -fsyntax-only -sycl-std=2017 -ast-dump | FileCheck %s
 
 // The test checks AST of [[intel::num_simd_work_items()]] attribute.
 
diff --git a/clang/test/SemaSYCL/num_simd_work_items_host.cpp b/clang/test/SemaSYCL/num_simd_work_items_host.cpp
index 8558a574572d9..d82cdd0e1ec38 100644
--- a/clang/test/SemaSYCL/num_simd_work_items_host.cpp
+++ b/clang/test/SemaSYCL/num_simd_work_items_host.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-host -fsyntax-only -Wno-sycl-2017-compat -verify %s
+// RUN: %clang_cc1 -fsycl-is-host -fsyntax-only -verify %s
 // expected-no-diagnostics
 
 [[intel::num_simd_work_items(2)]] void func_do_not_ignore() {}
diff --git a/clang/test/SemaSYCL/pointer-to-vla.cpp b/clang/test/SemaSYCL/pointer-to-vla.cpp
index cd2d925f92dad..16f89caa07b5c 100644
--- a/clang/test/SemaSYCL/pointer-to-vla.cpp
+++ b/clang/test/SemaSYCL/pointer-to-vla.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -fsyntax-only -Wno-sycl-2017-compat -verify %s
+// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -fsyntax-only -verify %s
 //
 // This test checks if compiler reports compilation error on an attempt to pass
 // a pointer to VLA as kernel argument
diff --git a/clang/test/SemaSYCL/prohibit-thread-local.cpp b/clang/test/SemaSYCL/prohibit-thread-local.cpp
index c87c2439ad02b..c7b00d04befdf 100644
--- a/clang/test/SemaSYCL/prohibit-thread-local.cpp
+++ b/clang/test/SemaSYCL/prohibit-thread-local.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -verify -Wno-sycl-2017-compat -fsyntax-only %s
+// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -verify -fsyntax-only %s
 
 thread_local const int prohobit_ns_scope = 0;
 thread_local int prohobit_ns_scope2 = 0;
diff --git a/clang/test/SemaSYCL/reference-kernel-param.cpp b/clang/test/SemaSYCL/reference-kernel-param.cpp
index 81350f48d552f..080ecaef42684 100644
--- a/clang/test/SemaSYCL/reference-kernel-param.cpp
+++ b/clang/test/SemaSYCL/reference-kernel-param.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -verify -Wno-sycl-2017-compat -fsyntax-only %s
+// RUN: %clang_cc1 -fsycl-is-device -verify -fsyntax-only %s
 
 // This test checks if compiler reports compilation error on an attempt to pass
 // a reference as SYCL kernel parameter.
diff --git a/clang/test/SemaSYCL/reqd-sub-group-size-ast.cpp b/clang/test/SemaSYCL/reqd-sub-group-size-ast.cpp
index dde7140595dd4..512e8d5f326ea 100644
--- a/clang/test/SemaSYCL/reqd-sub-group-size-ast.cpp
+++ b/clang/test/SemaSYCL/reqd-sub-group-size-ast.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -Wno-sycl-2017-compat -ast-dump %s | FileCheck %s
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -sycl-std=2017 -ast-dump %s | FileCheck %s
 
 // The test checks AST of [[intel::reqd_sub_group_size()]] attribute.
 
diff --git a/clang/test/SemaSYCL/restrict-recursion.cpp b/clang/test/SemaSYCL/restrict-recursion.cpp
index fdfd4ba8fb4e8..a2ede51efe027 100644
--- a/clang/test/SemaSYCL/restrict-recursion.cpp
+++ b/clang/test/SemaSYCL/restrict-recursion.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -verify -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s
+// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -verify -fsyntax-only -std=c++17 %s
 
 // This recursive function is not called from sycl kernel,
 // so it should not be diagnosed.
diff --git a/clang/test/SemaSYCL/restrict-recursion2.cpp b/clang/test/SemaSYCL/restrict-recursion2.cpp
index a92705aa81bbd..8e9e1b32f3c53 100644
--- a/clang/test/SemaSYCL/restrict-recursion2.cpp
+++ b/clang/test/SemaSYCL/restrict-recursion2.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -verify -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s
+// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -verify -fsyntax-only -std=c++17 %s
 
 // This recursive function is not called from sycl kernel,
 // so it should not be diagnosed.
diff --git a/clang/test/SemaSYCL/restrict-recursion3.cpp b/clang/test/SemaSYCL/restrict-recursion3.cpp
index b66e3cd580cc3..5e8b3fae83c0b 100644
--- a/clang/test/SemaSYCL/restrict-recursion3.cpp
+++ b/clang/test/SemaSYCL/restrict-recursion3.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -Wno-sycl-2017-compat -Wno-error=sycl-strict -verify -fsyntax-only -std=c++17 %s
+// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -Wno-error=sycl-strict -verify -fsyntax-only -std=c++17 %s
 
 // This recursive function is not called from sycl kernel,
 // so it should not be diagnosed.
diff --git a/clang/test/SemaSYCL/restrict-recursion4.cpp b/clang/test/SemaSYCL/restrict-recursion4.cpp
index ee0fdb20ce4c0..30a5c5e2b6296 100644
--- a/clang/test/SemaSYCL/restrict-recursion4.cpp
+++ b/clang/test/SemaSYCL/restrict-recursion4.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -Wno-sycl-2017-compat -Wno-error=sycl-strict -verify -fsyntax-only -std=c++17 %s
+// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -Wno-return-type -Wno-error=sycl-strict -verify -fsyntax-only -std=c++17 %s
 
 // This recursive function is not called from sycl kernel,
 // so it should not be diagnosed.
diff --git a/clang/test/SemaSYCL/stall_enable_device.cpp b/clang/test/SemaSYCL/stall_enable_device.cpp
index b8682461fc9ba..ceba235e5e820 100644
--- a/clang/test/SemaSYCL/stall_enable_device.cpp
+++ b/clang/test/SemaSYCL/stall_enable_device.cpp
@@ -1,5 +1,5 @@
-// RUN: %clang_cc1 %s -fsyntax-only -internal-isystem %S/Inputs -fsycl-is-device -Wno-sycl-2017-compat -DTRIGGER_ERROR -verify
-// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -fsyntax-only -ast-dump -Wno-sycl-2017-compat %s | FileCheck %s
+// RUN: %clang_cc1 %s -fsyntax-only -internal-isystem %S/Inputs -fsycl-is-device -DTRIGGER_ERROR -verify
+// RUN: %clang_cc1 -fsycl-is-device -internal-isystem %S/Inputs -fsyntax-only -ast-dump %s | FileCheck %s
 
 // Test that checks [[intel::use_stall_enable_clusters]] attribute support on function.
 
diff --git a/clang/test/SemaSYCL/sycl-callstack.cpp b/clang/test/SemaSYCL/sycl-callstack.cpp
index e6eb7a34a3c3c..28f36d741be7b 100644
--- a/clang/test/SemaSYCL/sycl-callstack.cpp
+++ b/clang/test/SemaSYCL/sycl-callstack.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -verify -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s
+// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -verify -fsyntax-only -std=c++17 %s
 
 template <typename name, typename Func>
 __attribute__((sycl_kernel)) void kernel_single_task(const Func &kernelFunc) {
diff --git a/clang/test/SemaSYCL/sycl-cconv.cpp b/clang/test/SemaSYCL/sycl-cconv.cpp
index 2f30d8474423c..143e5bb9a750a 100644
--- a/clang/test/SemaSYCL/sycl-cconv.cpp
+++ b/clang/test/SemaSYCL/sycl-cconv.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-windows -aux-triple x86_64-pc-windows-msvc -fsyntax-only -Wno-sycl-2017-compat -verify %s
+// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-windows -aux-triple x86_64-pc-windows-msvc -fsyntax-only -verify %s
 
 // expected-no-warning@+1
 __inline __cdecl int printf(char const* const _Format, ...) { return 0; }
diff --git a/clang/test/SemaSYCL/sycl-device-const-static.cpp b/clang/test/SemaSYCL/sycl-device-const-static.cpp
index 6a785f17725cb..69b75e936abc0 100644
--- a/clang/test/SemaSYCL/sycl-device-const-static.cpp
+++ b/clang/test/SemaSYCL/sycl-device-const-static.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -verify -Wno-sycl-2017-compat -fsyntax-only %s
+// RUN: %clang_cc1 -fsycl-is-device -verify -fsyntax-only %s
 
 struct Base {};
 struct S {
diff --git a/clang/test/SemaSYCL/sycl-device-static-restrict.cpp b/clang/test/SemaSYCL/sycl-device-static-restrict.cpp
index 6a9d5092689e2..733285a1325a4 100644
--- a/clang/test/SemaSYCL/sycl-device-static-restrict.cpp
+++ b/clang/test/SemaSYCL/sycl-device-static-restrict.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -verify -Wno-sycl-2017-compat -fsyntax-only %s
+// RUN: %clang_cc1 -fsycl-is-device -verify -fsyntax-only %s
 const int glob1 = 1;
 int glob2 = 2;
 template <typename name, typename Func>
diff --git a/clang/test/SemaSYCL/sycl-device-template-diag.cpp b/clang/test/SemaSYCL/sycl-device-template-diag.cpp
index f1a8942aed6d6..7ae5f7b57fca5 100644
--- a/clang/test/SemaSYCL/sycl-device-template-diag.cpp
+++ b/clang/test/SemaSYCL/sycl-device-template-diag.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -verify -Wno-sycl-2017-compat -fsyntax-only %s -internal-isystem %S/Inputs
+// RUN: %clang_cc1 -fsycl-is-device -verify -fsyntax-only %s -internal-isystem %S/Inputs
 
 // This test verifies that we generate deferred diagnostics when
 // such diagnostics are in a function template.
diff --git a/clang/test/SemaSYCL/sycl-dllimport-dllexport.cpp b/clang/test/SemaSYCL/sycl-dllimport-dllexport.cpp
index e703b6fcb117b..ea67ee0d2ebed 100644
--- a/clang/test/SemaSYCL/sycl-dllimport-dllexport.cpp
+++ b/clang/test/SemaSYCL/sycl-dllimport-dllexport.cpp
@@ -1,19 +1,19 @@
 // RUN: %clang_cc1 -triple spir64-unknown-unknown -fms-extensions \
 // RUN: -aux-triple x86_64-unknown-linux-gnu -fsycl-is-device \
-// RUN: -fsyntax-only -Wno-sycl-2017-compat -DWARNCHECK %s -o /dev/null 2>&1 | FileCheck %s
+// RUN: -fsyntax-only -DWARNCHECK %s -o /dev/null 2>&1 | FileCheck %s
 // check random triple aux-triple with sycl-device
 
-// RUN: %clang_cc1 -triple spir64-unknown-windows -Wno-sycl-2017-compat -fsyntax-only \
+// RUN: %clang_cc1 -triple spir64-unknown-windows -fsyntax-only \
 // RUN: -fms-extensions -DWARNCHECK %s -o /dev/null 2>&1 | FileCheck --check-prefixes CHECKALL %s
 // check without -aux-triple but sycl-device
 
 // RUN: %clang_cc1 -triple spir64-unknown-windows \
 // RUN: -fsycl-is-device -aux-triple x86_64-pc-windows-msvc -fms-extensions \
-// RUN: -fsyntax-only -Wno-sycl-2017-compat -DWARNCHECK %s -o /dev/null 2>&1 | \
+// RUN: -fsyntax-only -DWARNCHECK %s -o /dev/null 2>&1 | \
 // RUN: FileCheck %s --check-prefixes CHECKALL
 // check -aux-tripe without sycl-device
 
-// RUN: %clang_cc1 -triple spir64-unknown-windows -Wno-sycl-2017-compat -fsyntax-only \
+// RUN: %clang_cc1 -triple spir64-unknown-windows -fsyntax-only \
 // RUN: -aux-triple x86_64-pc-windows-msvc -fsycl-is-device \
 // RUN: -fms-extensions -verify  %s
 // check error message when dllimport function gets called in sycl-kernel code
diff --git a/clang/test/SemaSYCL/sycl-fptr-lambda.cpp b/clang/test/SemaSYCL/sycl-fptr-lambda.cpp
index eac66a9106ab8..480c4fbb6e6c7 100644
--- a/clang/test/SemaSYCL/sycl-fptr-lambda.cpp
+++ b/clang/test/SemaSYCL/sycl-fptr-lambda.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -std=c++14 -verify -Wno-sycl-2017-compat -fsyntax-only %s
+// RUN: %clang_cc1 -fsycl-is-device -std=c++14 -verify -fsyntax-only %s
 // expected-no-diagnostics
 
 template <typename name, typename Func>
diff --git a/clang/test/SemaSYCL/sycl-pseudo-dtor.cpp b/clang/test/SemaSYCL/sycl-pseudo-dtor.cpp
index 3635e22ffac43..3e90e58a0ec94 100644
--- a/clang/test/SemaSYCL/sycl-pseudo-dtor.cpp
+++ b/clang/test/SemaSYCL/sycl-pseudo-dtor.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -verify -Wno-sycl-2017-compat -fsyntax-only %s
+// RUN: %clang_cc1 -fsycl-is-device -verify -fsyntax-only %s
 
 template <typename functor_t>
 struct functor_wrapper{
diff --git a/clang/test/SemaSYCL/sycl-restrict.cpp b/clang/test/SemaSYCL/sycl-restrict.cpp
index f52f6964712ff..c7187fd737dea 100644
--- a/clang/test/SemaSYCL/sycl-restrict.cpp
+++ b/clang/test/SemaSYCL/sycl-restrict.cpp
@@ -1,14 +1,14 @@
 // RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -triple spir64 \
 // RUN:  -aux-triple x86_64-unknown-linux-gnu -Wno-return-type -verify     \
-// RUN:  -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s
+// RUN:  -fsyntax-only -std=c++17 %s
 // RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -triple spir64 \
 // RUN:  -aux-triple x86_64-unknown-linux-gnu -fno-sycl-allow-func-ptr     \
-// RUN:  -Wno-return-type -verify -Wno-sycl-2017-compat -fsyntax-only      \
+// RUN:  -Wno-return-type -verify -fsyntax-only      \
 // RUN:  -std=c++17 %s
 // RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -triple spir64 \
 // RUN:  -aux-triple x86_64-unknown-linux-gnu -DALLOW_FP=1                 \
 // RUN:  -fsycl-allow-func-ptr -Wno-return-type -verify                    \
-// RUN:  -Wno-sycl-2017-compat -fsyntax-only -std=c++17 %s
+// RUN:  -fsyntax-only -std=c++17 %s
 
 namespace std {
 class type_info;
diff --git a/clang/test/SemaSYCL/tls_error.cpp b/clang/test/SemaSYCL/tls_error.cpp
index a43a5ee9b6bbc..80b081107b3e5 100644
--- a/clang/test/SemaSYCL/tls_error.cpp
+++ b/clang/test/SemaSYCL/tls_error.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -verify -Wno-sycl-2017-compat -fsyntax-only %s
+// RUN: %clang_cc1 -fsycl-is-device -triple spir64 -verify -fsyntax-only %s
 
 extern __thread void* __once_callable;  // expected-no-error
 extern __thread void (*__once_call)();  // expected-no-error
diff --git a/clang/test/SemaSYCL/unevaluated-function.cpp b/clang/test/SemaSYCL/unevaluated-function.cpp
index 2d0059eaef06d..5b07ed1764f65 100644
--- a/clang/test/SemaSYCL/unevaluated-function.cpp
+++ b/clang/test/SemaSYCL/unevaluated-function.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -verify -Wno-sycl-2017-compat -fsyntax-only %s
+// RUN: %clang_cc1 -fsycl-is-device -fcxx-exceptions -verify -fsyntax-only %s
 
 // Check that a function used in an unevaluated context is not subject
 // to delayed device diagnostics.
diff --git a/clang/test/SemaSYCL/unsupported_math.cpp b/clang/test/SemaSYCL/unsupported_math.cpp
index c1ed10ccf496f..ca65f234d3f03 100644
--- a/clang/test/SemaSYCL/unsupported_math.cpp
+++ b/clang/test/SemaSYCL/unsupported_math.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -Wno-sycl-2017-compat -verify %s
+// RUN: %clang_cc1 -fsycl-is-device -fsyntax-only -verify %s
 template <typename name, typename Func>
 __attribute__((sycl_kernel)) void kernel(const Func &kernelFunc) {
   kernelFunc();
diff --git a/clang/test/SemaSYCL/variadic-func-call.cpp b/clang/test/SemaSYCL/variadic-func-call.cpp
index 96b19f1d7d905..4da94418fafe6 100644
--- a/clang/test/SemaSYCL/variadic-func-call.cpp
+++ b/clang/test/SemaSYCL/variadic-func-call.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsyntax-only -Wno-sycl-2017-compat -verify %s
+// RUN: %clang_cc1 -fsycl-is-device -triple spir64-unknown-unknown -fsyntax-only -verify %s
 
 void variadic(int, ...) {}
 namespace NS {
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 48cd2c9a4d77b..2f958f31a208f 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -522,6 +522,71 @@ static Expected<StringRef> convertSPIRVToIR(StringRef Filename,
   return *TempFileOrErr;
 }
 
+// Add any sycl-post-link options that rely on a specific Triple in addition
+// to user supplied options.
+// NOTE: Any changes made here should be reflected in the similarly named
+// function in clang/lib/Driver/ToolChains/Clang.cpp.
+static void
+getTripleBasedSYCLPostLinkOpts(const ArgList &Args,
+                               SmallVector<StringRef, 8> &PostLinkArgs,
+                               const llvm::Triple Triple) {
+  const llvm::Triple HostTriple(Args.getLastArgValue(OPT_host_triple_EQ));
+  bool SYCLNativeCPU = (HostTriple == Triple);
+  // Native spec constants are used only when the target is none of NVPTX,
+  // AMDGCN, SPIR AOT or SYCL Native CPU; any of those selects emulation.
+  bool SpecConstsSupported = (!Triple.isNVPTX() && !Triple.isAMDGCN() &&
+                              !Triple.isSPIRAOT() && !SYCLNativeCPU);
+  PostLinkArgs.push_back(SpecConstsSupported ? "-spec-const=native"
+                                             : "-spec-const=emulation");
+
+  // See if device code splitting is already requested. If not requested, then
+  // set -split=auto for non-FPGA targets.
+  bool NoSplit = true;
+  for (auto Arg : PostLinkArgs)
+    if (Arg.contains("-split=")) {
+      NoSplit = false;
+      break;
+    }
+  if (NoSplit && (Triple.getSubArch() != llvm::Triple::SPIRSubArch_fpga))
+    PostLinkArgs.push_back("-split=auto");
+
+  // On Intel targets we don't need non-kernel functions as entry points,
+  // because it only increases amount of code for device compiler to handle,
+  // without any actual benefits.
+  // TODO: Try to extend this feature for non-Intel GPUs.
+  if ((!Args.hasFlag(OPT_no_sycl_remove_unused_external_funcs,
+                     OPT_sycl_remove_unused_external_funcs, false) &&
+       !SYCLNativeCPU) &&
+      !Triple.isNVPTX() && !Triple.isAMDGPU())
+    PostLinkArgs.push_back("-emit-only-kernels-as-entry-points");
+
+  if (!Triple.isAMDGCN())
+    PostLinkArgs.push_back("-emit-param-info");
+  // Enable program metadata
+  if (Triple.isNVPTX() || Triple.isAMDGCN() || SYCLNativeCPU)
+    PostLinkArgs.push_back("-emit-program-metadata");
+
+  bool SplitEsimdByDefault = Triple.isSPIROrSPIRV();
+  bool SplitEsimd =
+      Args.hasFlag(OPT_sycl_device_code_split_esimd,
+                   OPT_no_sycl_device_code_split_esimd, SplitEsimdByDefault);
+
+  // Symbol file and specialization constant info generation is mandatory -
+  // add options unconditionally
+  PostLinkArgs.push_back("-symbols");
+  PostLinkArgs.push_back("-emit-exported-symbols");
+  PostLinkArgs.push_back("-emit-imported-symbols");
+  if (SplitEsimd)
+    PostLinkArgs.push_back("-split-esimd");
+  PostLinkArgs.push_back("-lower-esimd");
+
+  bool IsAOT = Triple.isNVPTX() || Triple.isAMDGCN() || Triple.isSPIRAOT();
+  if (Args.hasFlag(OPT_sycl_add_default_spec_consts_image,
+                   OPT_no_sycl_add_default_spec_consts_image, false) &&
+      IsAOT)
+    PostLinkArgs.push_back("-generate-device-image-default-spec-consts");
+}
+
 // Run sycl-post-link tool
 static Expected<StringRef> runSYCLPostLink(ArrayRef<StringRef> InputFiles,
                                            const ArgList &Args) {
@@ -536,12 +601,13 @@ static Expected<StringRef> runSYCLPostLink(ArrayRef<StringRef> InputFiles,
   if (!TempFileOrErr)
     return TempFileOrErr.takeError();
 
+  SmallVector<StringRef, 8> CmdArgs;
+  CmdArgs.push_back(*SYCLPostLinkPath);
+  const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
+  getTripleBasedSYCLPostLinkOpts(Args, CmdArgs, Triple);
   StringRef SYCLPostLinkOptions;
   if (Arg *A = Args.getLastArg(OPT_sycl_post_link_options_EQ))
     SYCLPostLinkOptions = A->getValue();
-
-  SmallVector<StringRef, 8> CmdArgs;
-  CmdArgs.push_back(*SYCLPostLinkPath);
   SYCLPostLinkOptions.split(CmdArgs, " ", /* MaxSplit = */ -1,
                             /* KeepEmpty = */ false);
   CmdArgs.push_back("-o");
diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
index fb9189d99ff87..1c93fa2e0cf1e 100644
--- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
+++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
@@ -170,3 +170,18 @@ def sycl_post_link_options_EQ : Joined<["--", "-"], "sycl-post-link-options=">,
 def llvm_spirv_options_EQ : Joined<["--", "-"], "llvm-spirv-options=">,
   Flags<[WrapperOnlyOption]>,
   HelpText<"Options that will control llvm-spirv step">;
+
+// Extra SYCL options to help generate sycl-post-link options that also depend
+// on the target triple.
+def sycl_remove_unused_external_funcs : Flag<["--", "-"], "sycl-remove-unused-external-funcs">,
+  Flags<[WrapperOnlyOption, HelpHidden]>;
+def no_sycl_remove_unused_external_funcs : Flag<["--", "-"], "no-sycl-remove-unused-external-funcs">,
+  Flags<[WrapperOnlyOption, HelpHidden]>;
+def sycl_device_code_split_esimd : Flag<["--", "-"], "sycl-device-code-split-esimd">,
+  Flags<[WrapperOnlyOption, HelpHidden]>;
+def no_sycl_device_code_split_esimd : Flag<["--", "-"], "no-sycl-device-code-split-esimd">,
+  Flags<[WrapperOnlyOption, HelpHidden]>;
+def sycl_add_default_spec_consts_image : Flag<["--", "-"], "sycl-add-default-spec-consts-image">,
+  Flags<[WrapperOnlyOption, HelpHidden]>;
+def no_sycl_add_default_spec_consts_image : Flag<["--", "-"], "no-sycl-add-default-spec-consts-image">,
+  Flags<[WrapperOnlyOption, HelpHidden]>;
diff --git a/devops/cts_exclude_filter b/devops/cts_exclude_filter
index 318c23eea2168..cebc93ef30fd0 100644
--- a/devops/cts_exclude_filter
+++ b/devops/cts_exclude_filter
@@ -5,4 +5,3 @@ marray
 math_builtin_api
 # https://github.com/intel/llvm/issues/13574
 hierarchical
-accessor
diff --git a/devops/dependencies-igc-dev.json b/devops/dependencies-igc-dev.json
index 39729a23fec3a..e36ef6f0c970d 100644
--- a/devops/dependencies-igc-dev.json
+++ b/devops/dependencies-igc-dev.json
@@ -1,10 +1,10 @@
 {
   "linux": {
     "igc_dev": {
-      "github_tag": "igc-dev-4627f1f",
-      "version": "4627f1f",
-      "updated_at": "2024-05-26T23:48:05Z",
-      "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/1539236241/zip",
+      "github_tag": "igc-dev-6fe460a",
+      "version": "6fe460a",
+      "updated_at": "2024-06-24T01:03:13Z",
+      "url": "https://api.github.com/repos/intel/intel-graphics-compiler/actions/artifacts/1629761341/zip",
       "root": "{DEPS_ROOT}/opencl/runtime/linux/oclgpu"
     }
   }
diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 0f515afdc9875..e8655b0c2e839 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -32,7 +32,7 @@ set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS
   r600/libspirv/SOURCES;
   spirv/lib/SOURCES;
   spirv64/lib/SOURCES
-  x86_64-unknown-linux/libspirv/SOURCES
+  native_cpu-unknown-linux/libspirv/SOURCES
 )
 
 set( LIBCLC_MIN_LLVM 3.9.0 )
@@ -42,6 +42,9 @@ set( LIBCLC_TARGETS_TO_BUILD "all"
 
 option( ENABLE_RUNTIME_SUBNORMAL "Enable runtime linking of subnormal support." OFF )
 
+set( LIBCLC_NATIVECPU_FLAGS_X86_64 ""
+  CACHE STRING "Semicolon-separated list of compiler flags for x86_64 libclc target.")
+
 if( LIBCLC_STANDALONE_BUILD OR CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR )
   # Out-of-tree configuration
   set( LIBCLC_STANDALONE_BUILD TRUE )
@@ -156,6 +159,12 @@ if( LIBCLC_TARGETS_TO_BUILD STREQUAL "all" )
   set( LIBCLC_TARGETS_TO_BUILD ${LIBCLC_TARGETS_ALL} )
 endif()
 
+option( LIBCLC_NATIVECPU_HOST_TARGET "Build libclc for Native CPU using the host triple." Off)
+
+if( LIBCLC_NATIVECPU_HOST_TARGET )
+  list(APPEND LIBCLC_TARGETS_TO_BUILD ${LLVM_TARGET_TRIPLE})
+endif()
+
 list( SORT LIBCLC_TARGETS_TO_BUILD )
 
 # Verify that the user hasn't requested mesa3d targets without an available
@@ -195,6 +204,7 @@ set( spirv-mesa3d-_devices none )
 set( spirv64-mesa3d-_devices none )
 # TODO: Does this need to be set for each possible triple?
 set( x86_64-unknown-linux-gnu_devices none )
+set( aarch64-unknown-linux-gnu_devices none )
 
 # Setup aliases
 set( cedar_aliases palm sumo sumo2 redwood juniper )
@@ -272,6 +282,8 @@ else(LIBCLC_STANDALONE_BUILD)
 endif(LIBCLC_STANDALONE_BUILD)
 file( TO_CMAKE_PATH ${LIBCLC_LIBRARY_OUTPUT_INTDIR}/clc LIBCLC_LIBRARY_OUTPUT_INTDIR )
 
+set(NATIVECPU_SUPPORTED_ARCH "x86_64;aarch64")
+
 foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
   message( STATUS "libclc target '${t}' is enabled" )
   string( REPLACE "-" ";" TRIPLE  ${t} )
@@ -297,6 +309,18 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
     set( DARCH ${ARCH} )
   endif()
 
+  set(IS_NATIVE_CPU_ARCH FALSE)
+  if( ARCH IN_LIST NATIVECPU_SUPPORTED_ARCH )
+    set(IS_NATIVE_CPU_ARCH TRUE)
+  endif()
+
+  if( IS_NATIVE_CPU_ARCH AND OS STREQUAL linux)
+    LIST( APPEND dirs native_cpu-unknown-linux )
+  elseif( IS_NATIVE_CPU_ARCH AND NOT OS STREQUAL linux )
+    message(WARNING "libclc is being built for an unsupported ARCH/OS"
+      " configuration, some SYCL programs may fail to build.")
+  endif()
+
   set( lib_files )
   set( lib_gen_files )
   libclc_configure_lib_source(lib_files lib_gen_files
@@ -332,11 +356,11 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
         # AMDGCN needs libclc to be compiled to high bc version since all atomic
         # clang builtins need to be accessible
         list( APPEND flags -mcpu=gfx940 -mllvm --amdgpu-oclc-reflect-enable=false )
-      elseif( ARCH  STREQUAL x86_64)
-        # TODO: This is used by SYCL Native Cpu, we should define an option to set this flags
-        list( APPEND flags -Xclang -target-feature -Xclang +avx
-            -Xclang -fsycl-is-native-cpu
-            -Xclang -target-feature -Xclang +avx512f)
+      elseif( IS_NATIVE_CPU_ARCH )
+        list( APPEND flags -Xclang -fsycl-is-native-cpu )
+        if( ARCH  STREQUAL x86_64 )
+          list( APPEND flags ${LIBCLC_NATIVECPU_FLAGS_X86_64})
+        endif()
       endif()
     endif()
 
@@ -369,8 +393,8 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
       set( has_distinct_generic_addrspace FALSE )
     elseif( ARCH STREQUAL amdgcn )
       set( opt_flags -O3 --amdgpu-oclc-reflect-enable=false )
-    elseif( ARCH STREQUAL x86_64)
-      set( opt_flags )
+    elseif( IS_NATIVE_CPU_ARCH )
+      set( opt_flags -O3 )
       set( has_distinct_generic_addrspace FALSE )
     else()
       set( opt_flags -O3 )
@@ -391,6 +415,9 @@ foreach( t ${LIBCLC_TARGETS_TO_BUILD} )
       "+__opencl_c_3d_image_writes,"
       "+__opencl_c_images,"
       "+cl_khr_3d_image_writes")
+    if( ARCH STREQUAL "aarch64")
+      string( APPEND CL_3_0_EXTENSIONS ",+cl_clang_storage_class_specifiers,+__opencl_c_fp64,+cl_khr_int64_base_atomics" )
+    endif()
     if( supports_generic_addrspace )
       string( APPEND CL_3_0_EXTENSIONS ",+__opencl_c_generic_address_space" )
       if( has_distinct_generic_addrspace )
diff --git a/libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl b/libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl
index 16d5a06acc530..0d09c0f49e313 100644
--- a/libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl
+++ b/libclc/amdgcn-amdhsa/libspirv/synchronization/barrier.cl
@@ -10,42 +10,56 @@
 #include <spirv/spirv.h>
 #include <spirv/spirv_types.h>
 
-#define BUILTIN_FENCE(semantics, scope_memory)                                 \
-  if (semantics & Acquire)                                                     \
-    return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, scope_memory);             \
-  else if (semantics & Release)                                                \
-    return __builtin_amdgcn_fence(__ATOMIC_RELEASE, scope_memory);             \
-  else if (semantics & AcquireRelease)                                         \
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, scope_memory);             \
-  else if (semantics & SequentiallyConsistent)                                 \
-    return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, scope_memory);             \
-  else                                                                         \
-    return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, scope_memory);
 
-_CLC_DEF _CLC_OVERLOAD void __mem_fence(unsigned int scope_memory,
-                                        unsigned int semantics) {
+#define BUILTIN_FENCE(order, scope_memory)                                     \
+  /* None implies Monotonic (for llvm/AMDGPU), or relaxed in C++.              \
+   * This does not make sense as ordering argument for a fence instruction     \
+   * and is not part of the supported orderings for a fence in AMDGPU. */      \
+  if (order != None) {                                                         \
+    switch (order) {                                                           \
+    case Acquire:                                                              \
+      return __builtin_amdgcn_fence(__ATOMIC_ACQUIRE, scope_memory);           \
+    case Release:                                                              \
+      return __builtin_amdgcn_fence(__ATOMIC_RELEASE, scope_memory);           \
+    case AcquireRelease:                                                       \
+      return __builtin_amdgcn_fence(__ATOMIC_ACQ_REL, scope_memory);           \
+    case SequentiallyConsistent:                                               \
+      return __builtin_amdgcn_fence(__ATOMIC_SEQ_CST, scope_memory);           \
+    default:                                                                   \
+      __builtin_trap();                                                        \
+      __builtin_unreachable();                                                 \
+    }                                                                          \
+  }
+
+_CLC_INLINE void builtin_fence_order(unsigned int scope_memory,
+                                     unsigned int order) {
   switch ((enum Scope)scope_memory) {
   case CrossDevice:
-    BUILTIN_FENCE(semantics, "")
+    BUILTIN_FENCE(order, "")
   case Device:
-    BUILTIN_FENCE(semantics, "agent")
+    BUILTIN_FENCE(order, "agent")
   case Workgroup:
-    BUILTIN_FENCE(semantics, "workgroup")
+    BUILTIN_FENCE(order, "workgroup")
   case Subgroup:
-    BUILTIN_FENCE(semantics, "wavefront")
+    BUILTIN_FENCE(order, "wavefront")
   case Invocation:
-    BUILTIN_FENCE(semantics, "singlethread")
+    BUILTIN_FENCE(order, "singlethread")
   }
 }
 #undef BUILTIN_FENCE
 
+_CLC_DEF _CLC_OVERLOAD void __mem_fence(unsigned int scope_memory,
+                                        unsigned int semantics) {
+  builtin_fence_order(scope_memory, semantics & 0x1F);
+}
+
 _CLC_OVERLOAD _CLC_DEF void __spirv_MemoryBarrier(unsigned int scope_memory,
                                                   unsigned int semantics) {
   __mem_fence(scope_memory, semantics);
 }
 
 _CLC_OVERLOAD _CLC_DEF _CLC_CONVERGENT void
-__spirv_ControlBarrier(unsigned int scope_execution, unsigned scope_memory,
+__spirv_ControlBarrier(unsigned int scope_execution, unsigned int scope_memory,
                        unsigned int semantics) {
   if (semantics) {
     __mem_fence(scope_memory, semantics);
diff --git a/libclc/generic/libspirv/math/acos.inc b/libclc/generic/libspirv/math/acos.inc
index 4b437283f60ec..947730bead1f8 100644
--- a/libclc/generic/libspirv/math/acos.inc
+++ b/libclc/generic/libspirv/math/acos.inc
@@ -19,9 +19,6 @@
  * precision of #4 may be better.
  */
 
-// TODO: Enable half precision when atan2 is implemented
-#if __CLC_FPSIZE > 16
-
 #if __CLC_FPSIZE == 64
 #define __CLC_CONST(x) x
 #elif __CLC_FPSIZE == 32
@@ -38,5 +35,3 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __spirv_ocl_acos(__CLC_GENTYPE x) {
 }
 
 #undef __CLC_CONST
-
-#endif
diff --git a/libclc/generic/libspirv/math/acosh.cl b/libclc/generic/libspirv/math/acosh.cl
index 6945d3f6e2c8d..cb7931795466f 100644
--- a/libclc/generic/libspirv/math/acosh.cl
+++ b/libclc/generic/libspirv/math/acosh.cl
@@ -113,3 +113,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_acosh(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_acosh, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_acosh, __builtin_acosh, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/asin.inc b/libclc/generic/libspirv/math/asin.inc
index ebacd008f0352..f32aca0fb7c5c 100644
--- a/libclc/generic/libspirv/math/asin.inc
+++ b/libclc/generic/libspirv/math/asin.inc
@@ -5,8 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// TODO: Enable half precision when atan2 is implemented
-#if __CLC_FPSIZE > 16
 
 #if __CLC_FPSIZE == 64
 #define __CLC_CONST(x) x
@@ -22,5 +20,3 @@ _CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __spirv_ocl_asin(__CLC_GENTYPE x) {
 }
 
 #undef __CLC_CONST
-
-#endif
diff --git a/libclc/generic/libspirv/math/asinh.cl b/libclc/generic/libspirv/math/asinh.cl
index 10d206846fd45..76a32eb4ed1b5 100644
--- a/libclc/generic/libspirv/math/asinh.cl
+++ b/libclc/generic/libspirv/math/asinh.cl
@@ -361,3 +361,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_asinh(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_asinh, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_asinh, __builtin_asinh, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/atan.cl b/libclc/generic/libspirv/math/atan.cl
index 4dadde766f286..f8f2fb90d40c9 100644
--- a/libclc/generic/libspirv/math/atan.cl
+++ b/libclc/generic/libspirv/math/atan.cl
@@ -173,3 +173,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_atan(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_atan, double);
 
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_atan, __builtin_atan, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/atan2.cl b/libclc/generic/libspirv/math/atan2.cl
index e6cce7868ff3b..f71c0188314a6 100644
--- a/libclc/generic/libspirv/math/atan2.cl
+++ b/libclc/generic/libspirv/math/atan2.cl
@@ -245,3 +245,11 @@ _CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_atan2, double,
                       double);
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN(half, __spirv_ocl_atan2, __builtin_atan2, half, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/cbrt.cl b/libclc/generic/libspirv/math/cbrt.cl
index 98bff27b9c979..c34b91901a11f 100644
--- a/libclc/generic/libspirv/math/cbrt.cl
+++ b/libclc/generic/libspirv/math/cbrt.cl
@@ -144,3 +144,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_cbrt(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_cbrt, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_cbrt, __builtin_cbrt, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/clc_exp10.cl b/libclc/generic/libspirv/math/clc_exp10.cl
index d5b9621aa9888..154d4f457b27b 100644
--- a/libclc/generic/libspirv/math/clc_exp10.cl
+++ b/libclc/generic/libspirv/math/clc_exp10.cl
@@ -150,3 +150,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_exp10(double x)
 }
 _CLC_UNARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_exp10, double)
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __clc_exp10, __builtin_exp10, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/clc_fmod.cl b/libclc/generic/libspirv/math/clc_fmod.cl
index f84c1155b49c3..6a773d8ab082c 100644
--- a/libclc/generic/libspirv/math/clc_fmod.cl
+++ b/libclc/generic/libspirv/math/clc_fmod.cl
@@ -166,3 +166,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y) {
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_fmod, double,
                       double);
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN(half, __clc_fmod, __builtin_fmod, half, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/clc_hypot.cl b/libclc/generic/libspirv/math/clc_hypot.cl
index d99fceccf77c7..b34a5e5107b4a 100644
--- a/libclc/generic/libspirv/math/clc_hypot.cl
+++ b/libclc/generic/libspirv/math/clc_hypot.cl
@@ -85,3 +85,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_hypot(double x, double y)
 
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_hypot, double, double)
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN(half, __clc_hypot, __builtin_hypot, half, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/clc_ldexp.cl b/libclc/generic/libspirv/math/clc_ldexp.cl
index be582b88445cb..6183638f388b9 100644
--- a/libclc/generic/libspirv/math/clc_ldexp.cl
+++ b/libclc/generic/libspirv/math/clc_ldexp.cl
@@ -130,3 +130,11 @@ _CLC_DEF _CLC_OVERLOAD double __clc_ldexp(double x, int n) {
 }
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN(half, __clc_ldexp, __builtin_ldexp, half, int)
+
+#endif
diff --git a/libclc/generic/libspirv/math/clc_remainder.cl b/libclc/generic/libspirv/math/clc_remainder.cl
index ccef76690571a..16e75a0a1319d 100644
--- a/libclc/generic/libspirv/math/clc_remainder.cl
+++ b/libclc/generic/libspirv/math/clc_remainder.cl
@@ -202,3 +202,12 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y) {
 _CLC_BINARY_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, double, __clc_remainder, double,
                       double);
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN(half, __clc_remainder, __builtin_remainder, half,
+                           half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/copysign.cl b/libclc/generic/libspirv/math/copysign.cl
index d839f9f7e88b3..6c241dd8170b0 100644
--- a/libclc/generic/libspirv/math/copysign.cl
+++ b/libclc/generic/libspirv/math/copysign.cl
@@ -25,13 +25,7 @@ _CLC_DEFINE_BINARY_BUILTIN(double, __spirv_ocl_copysign, __builtin_copysign,
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-_CLC_DEF _CLC_OVERLOAD half __spirv_ocl_copysign(half x, half y) {
-  ushort sign_x = as_ushort(x) & 0x8000u;
-  ushort unsigned_y = as_ushort(y) & 0x7ffffu;
-
-  return as_half((ushort)(sign_x | unsigned_y));
-}
-_CLC_BINARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_copysign, half,
-                      half)
+_CLC_DEFINE_BINARY_BUILTIN(half, __spirv_ocl_copysign, __builtin_copysign, half,
+                           half)
 
 #endif
diff --git a/libclc/generic/libspirv/math/cos.cl b/libclc/generic/libspirv/math/cos.cl
index 0a47bf9956af3..b53551d0d2c90 100644
--- a/libclc/generic/libspirv/math/cos.cl
+++ b/libclc/generic/libspirv/math/cos.cl
@@ -62,3 +62,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_cos(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_cos, double);
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_cos, __builtin_cos, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/cosh.cl b/libclc/generic/libspirv/math/cosh.cl
index 0c737d091a0cc..ff1da9632b2e5 100644
--- a/libclc/generic/libspirv/math/cosh.cl
+++ b/libclc/generic/libspirv/math/cosh.cl
@@ -209,3 +209,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_cosh(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_cosh, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_cosh, __builtin_cosh, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/cospi.cl b/libclc/generic/libspirv/math/cospi.cl
index ec02fee7daae7..fcf7b8d9e4b16 100644
--- a/libclc/generic/libspirv/math/cospi.cl
+++ b/libclc/generic/libspirv/math/cospi.cl
@@ -120,3 +120,16 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_cospi(double x) {
 }
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_cospi, double);
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEF _CLC_OVERLOAD half __spirv_ocl_cospi(half x) {
+  // Widen to float, reuse the float overload, narrow the result on return.
+  return __spirv_ocl_cospi((float)x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_cospi, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/erf.cl b/libclc/generic/libspirv/math/erf.cl
index f50358917caa3..510ee76820f30 100644
--- a/libclc/generic/libspirv/math/erf.cl
+++ b/libclc/generic/libspirv/math/erf.cl
@@ -540,3 +540,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_erf(double y) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_erf, double);
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_erf, __builtin_erf, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/erfc.cl b/libclc/generic/libspirv/math/erfc.cl
index 9f5db45f5aa50..fb1d88d4f13ae 100644
--- a/libclc/generic/libspirv/math/erfc.cl
+++ b/libclc/generic/libspirv/math/erfc.cl
@@ -549,3 +549,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_erfc(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_erfc, double);
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_erfc, __builtin_erfc, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/exp.cl b/libclc/generic/libspirv/math/exp.cl
index 3fdc69a44fa13..2d6421f2d6f25 100644
--- a/libclc/generic/libspirv/math/exp.cl
+++ b/libclc/generic/libspirv/math/exp.cl
@@ -75,3 +75,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_exp(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_exp, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_exp, __builtin_exp, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/exp2.cl b/libclc/generic/libspirv/math/exp2.cl
index 6e6a736722379..7720e78be7754 100644
--- a/libclc/generic/libspirv/math/exp2.cl
+++ b/libclc/generic/libspirv/math/exp2.cl
@@ -70,3 +70,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_exp2(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_exp2, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_exp2, __builtin_exp2, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/expm1.cl b/libclc/generic/libspirv/math/expm1.cl
index 31407f8a689bd..710a67e2fe25c 100644
--- a/libclc/generic/libspirv/math/expm1.cl
+++ b/libclc/generic/libspirv/math/expm1.cl
@@ -149,3 +149,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_expm1(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_expm1, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_expm1, __builtin_expm1, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/fdim.cl b/libclc/generic/libspirv/math/fdim.cl
index a4818b9ecf812..6385d8036c5cd 100644
--- a/libclc/generic/libspirv/math/fdim.cl
+++ b/libclc/generic/libspirv/math/fdim.cl
@@ -12,3 +12,13 @@
 
 #define __CLC_BODY <fdim.inc>
 #include <clc/math/gentype.inc>
+
+#include <clcmacro.h>
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_BINARY_BUILTIN(half, __spirv_ocl_fdim, __builtin_fdim, half, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/frexp.cl b/libclc/generic/libspirv/math/frexp.cl
index 6b05fe88832b4..b893bb63ea2be 100644
--- a/libclc/generic/libspirv/math/frexp.cl
+++ b/libclc/generic/libspirv/math/frexp.cl
@@ -8,6 +8,7 @@
 
 #include <spirv/spirv.h>
 #include <utils.h>
+#include <clcmacro.h>
 
 #define __CLC_BODY <frexp.inc>
 #define __CLC_ADDRESS_SPACE private
@@ -30,3 +31,22 @@
 #include <clc/math/gentype.inc>
 #undef __CLC_ADDRESS_SPACE
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+#define _CLC_DEFINE_NO_VEC(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE,             \
+                           ARG2_TYPE)                                          \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE val, ARG2_TYPE out) {     \
+    return BUILTIN(val, out);                                                  \
+  }
+
+_CLC_DEFINE_NO_VEC(half, __spirv_ocl_frexp, __builtin_frexp, half, global int *)
+_CLC_V_V_VP_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, half, __spirv_ocl_frexp, half, global, int)
+_CLC_DEFINE_NO_VEC(half, __spirv_ocl_frexp, __builtin_frexp, half, local int *)
+_CLC_V_V_VP_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, half, __spirv_ocl_frexp, half, local, int)
+_CLC_DEFINE_NO_VEC(half, __spirv_ocl_frexp, __builtin_frexp, half, int *)
+_CLC_V_V_VP_VECTORIZE(_CLC_DEF _CLC_OVERLOAD, half, __spirv_ocl_frexp, half, , int)
+
+#endif
diff --git a/libclc/generic/libspirv/math/ilogb.cl b/libclc/generic/libspirv/math/ilogb.cl
index 2e991afa2f50a..0387cbdc109ab 100644
--- a/libclc/generic/libspirv/math/ilogb.cl
+++ b/libclc/generic/libspirv/math/ilogb.cl
@@ -41,3 +41,17 @@ _CLC_OVERLOAD _CLC_DEF int __spirv_ocl_ilogb(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, __spirv_ocl_ilogb, double);
 
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEF _CLC_OVERLOAD int __spirv_ocl_ilogb(half x) {
+  // Widen to float and defer to the float overload.
+  return __spirv_ocl_ilogb((float)x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, __spirv_ocl_ilogb, half)
+
+
+#endif
diff --git a/libclc/generic/libspirv/math/lgamma.cl b/libclc/generic/libspirv/math/lgamma.cl
index 701d898bc0538..63003e9b58ef5 100644
--- a/libclc/generic/libspirv/math/lgamma.cl
+++ b/libclc/generic/libspirv/math/lgamma.cl
@@ -27,3 +27,16 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_lgamma(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_lgamma, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_OVERLOAD _CLC_DEF half __spirv_ocl_lgamma(half x) {
+  int ignored_sign; // out-parameter for lgamma_r; never read here
+  return __spirv_ocl_lgamma_r(x, &ignored_sign);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_lgamma, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/log.cl b/libclc/generic/libspirv/math/log.cl
index dab1368109a1c..b9e986260c2b7 100644
--- a/libclc/generic/libspirv/math/log.cl
+++ b/libclc/generic/libspirv/math/log.cl
@@ -32,3 +32,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_log(double x)
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_log, double);
 
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_log, __builtin_log, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/log10.cl b/libclc/generic/libspirv/math/log10.cl
index 9a6bcc996759d..74fbd1ec112ea 100644
--- a/libclc/generic/libspirv/math/log10.cl
+++ b/libclc/generic/libspirv/math/log10.cl
@@ -24,3 +24,11 @@ _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_log10, float);
 #ifdef cl_khr_fp64
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_log10, double);
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_log10, __builtin_log10, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/log1p.cl b/libclc/generic/libspirv/math/log1p.cl
index b05873155f73e..ad6f94d2ecf25 100644
--- a/libclc/generic/libspirv/math/log1p.cl
+++ b/libclc/generic/libspirv/math/log1p.cl
@@ -166,3 +166,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_log1p(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_log1p, double);
 
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_log1p, __builtin_log1p, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/log2.cl b/libclc/generic/libspirv/math/log2.cl
index 46cc5e545aa27..d8be06b9c5b4a 100644
--- a/libclc/generic/libspirv/math/log2.cl
+++ b/libclc/generic/libspirv/math/log2.cl
@@ -24,3 +24,11 @@ _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __spirv_ocl_log2, float);
 #ifdef cl_khr_fp64
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_log2, double);
 #endif // cl_khr_fp64
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_log2, __builtin_log2, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/logb.cl b/libclc/generic/libspirv/math/logb.cl
index 224b7e3042618..bd62b84fe7965 100644
--- a/libclc/generic/libspirv/math/logb.cl
+++ b/libclc/generic/libspirv/math/logb.cl
@@ -38,3 +38,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_logb(double x) {
 
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_logb, double)
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_logb, __builtin_logb, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/sin.cl b/libclc/generic/libspirv/math/sin.cl
index 8e4f7c06577be..679aa304dcd38 100644
--- a/libclc/generic/libspirv/math/sin.cl
+++ b/libclc/generic/libspirv/math/sin.cl
@@ -64,3 +64,11 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_sin(double x) {
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_sin, double);
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEFINE_UNARY_BUILTIN(half, __spirv_ocl_sin, __builtin_sin, half)
+
+#endif
diff --git a/libclc/generic/libspirv/math/sinpi.cl b/libclc/generic/libspirv/math/sinpi.cl
index c2b273e1fcdd8..d0e7c1a830030 100644
--- a/libclc/generic/libspirv/math/sinpi.cl
+++ b/libclc/generic/libspirv/math/sinpi.cl
@@ -115,3 +115,16 @@ _CLC_OVERLOAD _CLC_DEF double __spirv_ocl_sinpi(double x)
 _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, double, __spirv_ocl_sinpi, double)
 
 #endif
+
+#ifdef cl_khr_fp16
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+_CLC_DEF _CLC_OVERLOAD half __spirv_ocl_sinpi(half x) {
+  // Widen to float, reuse the float overload, narrow the result on return.
+  return __spirv_ocl_sinpi((float)x);
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, half, __spirv_ocl_sinpi, half)
+
+#endif
diff --git a/libclc/x86_64-unknown-linux/libspirv/SOURCES b/libclc/native_cpu-unknown-linux/libspirv/SOURCES
similarity index 95%
rename from libclc/x86_64-unknown-linux/libspirv/SOURCES
rename to libclc/native_cpu-unknown-linux/libspirv/SOURCES
index b5ebcbf787bf6..3bd1204f6a449 100644
--- a/libclc/x86_64-unknown-linux/libspirv/SOURCES
+++ b/libclc/native_cpu-unknown-linux/libspirv/SOURCES
@@ -16,5 +16,4 @@ math/native_sqrt.cl
 math/rint.cl
 math/round.cl
 math/trunc.cl
-shared/helpers.ll
 cl_khr_int64_extended_atomics/minmax_helpers.ll
diff --git a/libclc/x86_64-unknown-linux/libspirv/cl_khr_int64_extended_atomics/minmax_helpers.ll b/libclc/native_cpu-unknown-linux/libspirv/cl_khr_int64_extended_atomics/minmax_helpers.ll
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/cl_khr_int64_extended_atomics/minmax_helpers.ll
rename to libclc/native_cpu-unknown-linux/libspirv/cl_khr_int64_extended_atomics/minmax_helpers.ll
diff --git a/libclc/native_cpu-unknown-linux/libspirv/integer/popcount.cl b/libclc/native_cpu-unknown-linux/libspirv/integer/popcount.cl
new file mode 100644
index 0000000000000..448b7ed50a98d
--- /dev/null
+++ b/libclc/native_cpu-unknown-linux/libspirv/integer/popcount.cl
@@ -0,0 +1,27 @@
+#include <func.h>
+#include <clcmacro.h>
+#include <spirv/spirv.h>
+
+// __builtin_popcount operates on a 32-bit unsigned int. Handing it a narrower
+// signed value sign-extends the argument first (popcount((char)-1) would
+// report 32 instead of 8), and handing it a 64-bit value drops the upper
+// half. Each overload therefore converts its argument to the unsigned type of
+// the same width before counting, and the 64-bit overloads count with
+// __builtin_popcountl (long is 64 bits wide in OpenCL C).
+#define _CLC_DEFINE_POPCOUNT(TYPE, UTYPE, BUILTIN)                             \
+  _CLC_DEF _CLC_OVERLOAD TYPE __spirv_ocl_popcount(TYPE x) {                   \
+    return (TYPE)BUILTIN((UTYPE)x);                                            \
+  }                                                                            \
+  _CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, TYPE, __spirv_ocl_popcount, TYPE)
+
+_CLC_DEFINE_POPCOUNT(char, uchar, __builtin_popcount)
+_CLC_DEFINE_POPCOUNT(schar, uchar, __builtin_popcount)
+_CLC_DEFINE_POPCOUNT(uchar, uchar, __builtin_popcount)
+_CLC_DEFINE_POPCOUNT(short, ushort, __builtin_popcount)
+_CLC_DEFINE_POPCOUNT(ushort, ushort, __builtin_popcount)
+_CLC_DEFINE_POPCOUNT(int, uint, __builtin_popcount)
+_CLC_DEFINE_POPCOUNT(uint, uint, __builtin_popcount)
+_CLC_DEFINE_POPCOUNT(long, ulong, __builtin_popcountl)
+_CLC_DEFINE_POPCOUNT(ulong, ulong, __builtin_popcountl)
+
+#undef _CLC_DEFINE_POPCOUNT
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/ceil.cl b/libclc/native_cpu-unknown-linux/libspirv/math/ceil.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/ceil.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/ceil.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/clc_sqrt.cl b/libclc/native_cpu-unknown-linux/libspirv/math/clc_sqrt.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/clc_sqrt.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/clc_sqrt.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/fabs.cl b/libclc/native_cpu-unknown-linux/libspirv/math/fabs.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/fabs.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/fabs.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/floor.cl b/libclc/native_cpu-unknown-linux/libspirv/math/floor.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/floor.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/floor.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/fma.cl b/libclc/native_cpu-unknown-linux/libspirv/math/fma.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/fma.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/fma.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/helpers.h b/libclc/native_cpu-unknown-linux/libspirv/math/helpers.h
similarity index 76%
rename from libclc/x86_64-unknown-linux/libspirv/math/helpers.h
rename to libclc/native_cpu-unknown-linux/libspirv/math/helpers.h
index 0178a74ad6c96..1dec19d63414c 100644
--- a/libclc/x86_64-unknown-linux/libspirv/math/helpers.h
+++ b/libclc/native_cpu-unknown-linux/libspirv/math/helpers.h
@@ -1,26 +1,7 @@
 #include "func.h"
 #include "types.h"
 
-#ifdef NO_CLANG_BUILTINS
-
-#define GEN_UNARY_BUILTIN_T(NAME, TYPE)                                        \
-  _CLC_OVERLOAD TYPE __##NAME##_helper(TYPE);                                  \
-  _CLC_OVERLOAD TYPE __spirv_ocl_##NAME(TYPE n) { return __##NAME##_helper(n); }
-
-#define GEN_TERNARY_BUILTIN_T(NAME, TYPE)                                      \
-  _CLC_OVERLOAD TYPE __##NAME##_helper(TYPE, TYPE, TYPE);                      \
-  _CLC_OVERLOAD TYPE __spirv_ocl_##NAME(TYPE a, TYPE b, TYPE c) {              \
-    return __##NAME##_helper(a, b, c);                                         \
-  }
-#define GEN_UNARY_BUILTIN(NAME)                                                \
-  GEN_UNARY_BUILTIN_T(NAME, float)                                             \
-  GEN_UNARY_BUILTIN_T(NAME, double)
-
-#define GEN_TERNARY_BUILTIN(NAME)                                              \
-  GEN_TERNARY_BUILTIN_T(NAME, float)                                           \
-  GEN_TERNARY_BUILTIN_T(NAME, double)
-
-#else
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 #ifndef IS_NATIVE
 #define GETNAME(ID) __spirv_ocl_##ID
@@ -54,8 +35,10 @@
     return __builtin_##NAME##f(n);                                             \
   }                                                                            \
   _CLC_OVERLOAD double GETNAME(NAME)(double n) { return __builtin_##NAME(n); } \
+  _CLC_OVERLOAD half GETNAME(NAME)(half n) { return __builtin_##NAME(n); }     \
   GEN_UNARY_VECTOR_BUILTIN_T(NAME, float)                                      \
-  GEN_UNARY_VECTOR_BUILTIN_T(NAME, double)
+  GEN_UNARY_VECTOR_BUILTIN_T(NAME, double)                                     \
+  GEN_UNARY_VECTOR_BUILTIN_T(NAME, half)
 
 #define GEN_TERNARY_VECTOR_BUILTIN(NAME, TYPE, NUM)                            \
   _CLC_OVERLOAD TYPE##NUM GETNAME(NAME)(TYPE##NUM n1, TYPE##NUM n2,            \
@@ -77,6 +60,9 @@
   _CLC_OVERLOAD double GETNAME(NAME)(double n1, double n2, double n3) {        \
     return __builtin_##NAME(n1, n2, n3);                                       \
   }                                                                            \
+  _CLC_OVERLOAD half GETNAME(NAME)(half n1, half n2, half n3) {                \
+    return __builtin_##NAME(n1, n2, n3);                                       \
+  }                                                                            \
   GEN_TERNARY_VECTOR_BUILTIN_T(NAME, float)                                    \
-  GEN_TERNARY_VECTOR_BUILTIN_T(NAME, double)
-#endif
+  GEN_TERNARY_VECTOR_BUILTIN_T(NAME, double)                                   \
+  GEN_TERNARY_VECTOR_BUILTIN_T(NAME, half)
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_cos.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_cos.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/native_cos.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/native_cos.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_exp.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_exp.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/native_exp.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/native_exp.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_exp2.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_exp2.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/native_exp2.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/native_exp2.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_log.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_log.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/native_log.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/native_log.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_log10.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_log10.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/native_log10.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/native_log10.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_log2.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_log2.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/native_log2.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/native_log2.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_sin.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_sin.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/native_sin.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/native_sin.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/native_sqrt.cl b/libclc/native_cpu-unknown-linux/libspirv/math/native_sqrt.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/native_sqrt.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/native_sqrt.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/rint.cl b/libclc/native_cpu-unknown-linux/libspirv/math/rint.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/rint.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/rint.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/round.cl b/libclc/native_cpu-unknown-linux/libspirv/math/round.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/round.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/round.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/sqrt.cl b/libclc/native_cpu-unknown-linux/libspirv/math/sqrt.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/sqrt.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/sqrt.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/math/trunc.cl b/libclc/native_cpu-unknown-linux/libspirv/math/trunc.cl
similarity index 100%
rename from libclc/x86_64-unknown-linux/libspirv/math/trunc.cl
rename to libclc/native_cpu-unknown-linux/libspirv/math/trunc.cl
diff --git a/libclc/x86_64-unknown-linux/libspirv/integer/helpers.h b/libclc/x86_64-unknown-linux/libspirv/integer/helpers.h
deleted file mode 100644
index 50e7c39cb3d23..0000000000000
--- a/libclc/x86_64-unknown-linux/libspirv/integer/helpers.h
+++ /dev/null
@@ -1,9 +0,0 @@
-#include "func.h"
-
-#define GEN_UNARY_BUILTIN_T(NAME, TYPE)                                        \
-  _CLC_OVERLOAD TYPE __##NAME##_helper(TYPE);                                  \
-  _CLC_OVERLOAD TYPE __spirv_ocl_##NAME(TYPE n) { return __##NAME##_helper(n); }
-
-#define GEN_UNARY_BUILTIN(NAME)                                                \
-  GEN_UNARY_BUILTIN_T(NAME, int)                                               \
-  GEN_UNARY_BUILTIN_T(NAME, signed char)
diff --git a/libclc/x86_64-unknown-linux/libspirv/integer/popcount.cl b/libclc/x86_64-unknown-linux/libspirv/integer/popcount.cl
deleted file mode 100644
index fae953de0c340..0000000000000
--- a/libclc/x86_64-unknown-linux/libspirv/integer/popcount.cl
+++ /dev/null
@@ -1,3 +0,0 @@
-#include "helpers.h"
-
-GEN_UNARY_BUILTIN(popcount)
diff --git a/libclc/x86_64-unknown-linux/libspirv/shared/helpers.ll b/libclc/x86_64-unknown-linux/libspirv/shared/helpers.ll
deleted file mode 100644
index b3d7d5e2daa9f..0000000000000
--- a/libclc/x86_64-unknown-linux/libspirv/shared/helpers.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-declare i32 @llvm.ctpop.i32(i32 %n)
-declare i8 @llvm.ctpop.i8(i8 %n)
-
-
-define dso_local i32 @_Z17__popcount_helperi(i32 %x) {
-entry:
-  %call = call i32 @llvm.ctpop.i32(i32 %x) 
-  ret i32 %call
-}
-
-
-define dso_local i8 @_Z17__popcount_helpera(i8 %x) {
-entry:
-  %call = call i8 @llvm.ctpop.i8(i8 %x) 
-  ret i8 %call
-}
-
diff --git a/libdevice/bfloat16_wrapper.cpp b/libdevice/bfloat16_wrapper.cpp
index a0b6b96d4a293..4d2902420f1b0 100644
--- a/libdevice/bfloat16_wrapper.cpp
+++ b/libdevice/bfloat16_wrapper.cpp
@@ -11,6 +11,8 @@
 #if defined(__SPIR__) || defined(__SPIRV__)
 
 #include <CL/__spirv/spirv_ops.hpp>
+#include <CL/__spirv/spirv_types.hpp>
+#include <cassert>
 #include <cstdint>
 
 DEVICE_EXTERN_C_INLINE
@@ -23,4 +25,42 @@ float __devicelib_ConvertBF16ToFINTEL(const uint16_t &x) {
   return __spirv_ConvertBF16ToFINTEL(x);
 }
 
+// Scalar (vector of size 1) conversions.
+DEVICE_EXTERN_C_INLINE
+void __devicelib_ConvertFToBF16INTELVec1(const float *src, uint16_t *dst) {
+  *dst = __spirv_ConvertFToBF16INTEL(*src);
+}
+DEVICE_EXTERN_C_INLINE
+void __devicelib_ConvertBF16ToFINTELVec1(const uint16_t *src, float *dst) {
+  *dst = __spirv_ConvertBF16ToFINTEL(*src);
+}
+
+// Defines both conversion directions for vectors of size 2, 3, 4, 8 and 16.
+#define GenerateConvertFunctionForVec(size)                                    \
+  DEVICE_EXTERN_C_INLINE                                                       \
+  void __devicelib_ConvertFToBF16INTELVec##size(const float *src,              \
+                                                uint16_t *dst) {               \
+    __ocl_vec_t<float, size> x =                                               \
+        *__builtin_bit_cast(const __ocl_vec_t<float, size> *, src);            \
+    __ocl_vec_t<uint16_t, size> y = __spirv_ConvertFToBF16INTEL(x);            \
+    *__builtin_bit_cast(__ocl_vec_t<uint16_t, size> *, dst) = y;               \
+  }                                                                            \
+  DEVICE_EXTERN_C_INLINE                                                       \
+  void __devicelib_ConvertBF16ToFINTELVec##size(const uint16_t *src,           \
+                                                float *dst) {                  \
+    __ocl_vec_t<uint16_t, size> x =                                            \
+        *__builtin_bit_cast(const __ocl_vec_t<uint16_t, size> *, src);         \
+    __ocl_vec_t<float, size> y = __spirv_ConvertBF16ToFINTEL(x);               \
+    *__builtin_bit_cast(__ocl_vec_t<float, size> *, dst) = y;                  \
+  }
+
+// clang-format off
+GenerateConvertFunctionForVec(2)
+GenerateConvertFunctionForVec(3)
+GenerateConvertFunctionForVec(4)
+GenerateConvertFunctionForVec(8)
+GenerateConvertFunctionForVec(16)
+// clang-format on
+#undef GenerateConvertFunctionForVec
+
 #endif // __SPIR__ || __SPIRV__
diff --git a/libdevice/cmake/modules/SYCLLibdevice.cmake b/libdevice/cmake/modules/SYCLLibdevice.cmake
index e8c96d0099823..095b7a0cd1583 100644
--- a/libdevice/cmake/modules/SYCLLibdevice.cmake
+++ b/libdevice/cmake/modules/SYCLLibdevice.cmake
@@ -91,7 +91,7 @@ function(add_devicelib_obj obj_filename)
 
   set(devicelib-obj-file-new-offload ${obj_new_offload_binary_dir}/${obj_filename}.${new-offload-lib-suffix})
   add_custom_command(OUTPUT ${devicelib-obj-file-new-offload}
-                     COMMAND ${clang} -fsycl -c --offload-new-driver
+                     COMMAND ${clang} -fsycl -c --offload-new-driver -foffload-lto=thin
                              ${compile_opts} ${sycl_targets_opt} ${OBJ_EXTRA_ARGS}
                              ${CMAKE_CURRENT_SOURCE_DIR}/${OBJ_SRC}
                              -o ${devicelib-obj-file-new-offload}
@@ -159,7 +159,12 @@ set(imf_obj_deps device_imf.hpp imf_half.hpp imf_bf16.hpp imf_rounding_op.hpp im
 set(itt_obj_deps device_itt.h spirv_vars.h device.h sycl-compiler)
 set(bfloat16_obj_deps sycl-headers sycl-compiler)
 if (NOT MSVC)
-  set(sanitizer_obj_deps device.h atomic.hpp spirv_vars.h include/sanitizer_utils.hpp include/spir_global_var.hpp sycl-compiler)
+  set(sanitizer_obj_deps
+    device.h atomic.hpp spirv_vars.h
+    include/asan_libdevice.hpp
+    include/sanitizer_utils.hpp
+    include/spir_global_var.hpp
+    sycl-compiler)
 endif()
 
 add_devicelib(libsycl-itt-stubs SRC itt_stubs.cpp DEP ${itt_obj_deps})
@@ -219,7 +224,8 @@ set(imf_host_cxx_flags -c
 )
 
 if (NOT WIN32)
   list(APPEND imf_host_cxx_flags -fPIC)
+  list(APPEND imf_host_cxx_flags -fcf-protection=full)
 endif()
 
 add_custom_command(OUTPUT ${imf_fp32_fallback_src}
@@ -270,7 +276,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf.${lib-suffix}
                    VERBATIM)
 
 add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf.${new-offload-lib-suffix}
-                   COMMAND ${clang} -fsycl -c --offload-new-driver
+                   COMMAND ${clang} -fsycl -c --offload-new-driver -foffload-lto=thin
                            ${compile_opts} ${sycl_targets_opt}
                            ${imf_fp32_fallback_src} -I ${CMAKE_CURRENT_SOURCE_DIR}/imf
                            -o ${obj_binary_dir}/libsycl-fallback-imf.${new-offload-lib-suffix}
@@ -286,7 +292,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp32-host.${lib-suffix}
                    VERBATIM)
 
 add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp32-host.${new-offload-lib-suffix}
-                   COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver
+                   COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin
                            -I ${CMAKE_CURRENT_SOURCE_DIR}/imf
                            ${imf_fp32_fallback_src}
                            -o ${obj_binary_dir}/fallback-imf-fp32-host.${new-offload-lib-suffix}
@@ -321,7 +327,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-fp64.${lib-suff
 
 add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-fp64.${new-offload-lib-suffix}
                    COMMAND ${clang} -fsycl -c -I ${CMAKE_CURRENT_SOURCE_DIR}/imf
-                           --offload-new-driver
+                           --offload-new-driver -foffload-lto=thin
                            ${compile_opts} ${sycl_targets_opt}
                            ${imf_fp64_fallback_src}
                            -o ${obj_binary_dir}/libsycl-fallback-imf-fp64.${new-offload-lib-suffix}
@@ -337,7 +343,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp64-host.${lib-suffix}
                    VERBATIM)
 
 add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-fp64-host.${new-offload-lib-suffix}
-                   COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver
+                   COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin
                            -I ${CMAKE_CURRENT_SOURCE_DIR}/imf
                            ${imf_fp64_fallback_src}
                            -o ${obj_binary_dir}/fallback-imf-fp64-host.${new-offload-lib-suffix}
@@ -372,7 +378,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-bf16.${lib-suff
 
 add_custom_command(OUTPUT ${obj_binary_dir}/libsycl-fallback-imf-bf16.${new-offload-lib-suffix}
                    COMMAND ${clang} -fsycl -c -I ${CMAKE_CURRENT_SOURCE_DIR}/imf
-                           --offload-new-driver
+                           --offload-new-driver -foffload-lto=thin
                            ${compile_opts} ${sycl_targets_opt}
                            ${imf_bf16_fallback_src}
                            -o ${obj_binary_dir}/libsycl-fallback-imf-bf16.${new-offload-lib-suffix}
@@ -388,7 +394,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-bf16-host.${lib-suffix}
                    VERBATIM)
 
 add_custom_command(OUTPUT ${obj_binary_dir}/fallback-imf-bf16-host.${new-offload-lib-suffix}
-                   COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver
+                   COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin
                            -I ${CMAKE_CURRENT_SOURCE_DIR}/imf
                            ${imf_bf16_fallback_src}
                            -o ${obj_binary_dir}/fallback-imf-bf16-host.${new-offload-lib-suffix}
@@ -437,7 +443,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp32-host.${lib-suffix}
                    VERBATIM)
 
 add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp32-host.${new-offload-lib-suffix}
-                   COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver
+                   COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin
                            ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper.cpp
                            -o ${obj_binary_dir}/imf-fp32-host.${new-offload-lib-suffix}
                    MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper.cpp
@@ -453,7 +459,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp64-host.${lib-suffix}
                    VERBATIM)
 
 add_custom_command(OUTPUT ${obj_binary_dir}/imf-fp64-host.${new-offload-lib-suffix}
-                   COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver
+                   COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin
                            ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_fp64.cpp
                            -o ${obj_binary_dir}/imf-fp64-host.${new-offload-lib-suffix}
                    MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_fp64.cpp
@@ -469,7 +475,7 @@ add_custom_command(OUTPUT ${obj_binary_dir}/imf-bf16-host.${lib-suffix}
                    VERBATIM)
 
 add_custom_command(OUTPUT ${obj_binary_dir}/imf-bf16-host.${new-offload-lib-suffix}
-                   COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver
+                   COMMAND ${clang} ${imf_host_cxx_flags} --offload-new-driver -foffload-lto=thin
                            ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_bf16.cpp
                            -o ${obj_binary_dir}/imf-bf16-host.${new-offload-lib-suffix}
                    MAIN_DEPENDENCY ${CMAKE_CURRENT_SOURCE_DIR}/imf_wrapper_bf16.cpp
diff --git a/libdevice/cmath_wrapper.cpp b/libdevice/cmath_wrapper.cpp
index e5e36045a7b8b..cf40373a90efb 100644
--- a/libdevice/cmath_wrapper.cpp
+++ b/libdevice/cmath_wrapper.cpp
@@ -39,6 +39,18 @@ extern "C" SYCL_EXTERNAL float __devicelib_fminf(float, float);
 DEVICE_EXTERN_C_INLINE
 float fminf(float x, float y) { return __devicelib_fminf(x, y); }
 
+DEVICE_EXTERN_C_INLINE // fallback impl calls __spirv_ocl_trunc
+float truncf(float x) { return __devicelib_truncf(x); }
+
+DEVICE_EXTERN_C_INLINE // fallback impl calls __spirv_ocl_sinpi
+float sinpif(float x) { return __devicelib_sinpif(x); }
+
+DEVICE_EXTERN_C_INLINE // fallback impl calls __spirv_ocl_rsqrt
+float rsqrtf(float x) { return __devicelib_rsqrtf(x); }
+
+DEVICE_EXTERN_C_INLINE // fallback impl calls __spirv_ocl_exp10
+float exp10f(float x) { return __devicelib_exp10f(x); }
+
 DEVICE_EXTERN_C_INLINE
 div_t div(int x, int y) { return __devicelib_div(x, y); }
 
diff --git a/libdevice/cmath_wrapper_fp64.cpp b/libdevice/cmath_wrapper_fp64.cpp
index 5624ef2ad9b51..bfc1a122f0f18 100644
--- a/libdevice/cmath_wrapper_fp64.cpp
+++ b/libdevice/cmath_wrapper_fp64.cpp
@@ -36,6 +36,18 @@ extern "C" SYCL_EXTERNAL double __devicelib_fmin(double, double);
 DEVICE_EXTERN_C_INLINE
 double fmin(double x, double y) { return __devicelib_fmin(x, y); }
 
+DEVICE_EXTERN_C_INLINE // fallback impl calls __spirv_ocl_trunc
+double trunc(double x) { return __devicelib_trunc(x); }
+
+DEVICE_EXTERN_C_INLINE // fallback impl calls __spirv_ocl_sinpi
+double sinpi(double x) { return __devicelib_sinpi(x); }
+
+DEVICE_EXTERN_C_INLINE // fallback impl calls __spirv_ocl_rsqrt
+double rsqrt(double x) { return __devicelib_rsqrt(x); }
+
+DEVICE_EXTERN_C_INLINE // fallback impl calls __spirv_ocl_exp10
+double exp10(double x) { return __devicelib_exp10(x); }
+
 DEVICE_EXTERN_C_INLINE
 double log(double x) { return __devicelib_log(x); }
 
diff --git a/libdevice/device_math.h b/libdevice/device_math.h
index f62c4c632f4d0..01085013dae57 100644
--- a/libdevice/device_math.h
+++ b/libdevice/device_math.h
@@ -76,6 +76,30 @@ float __devicelib_fminf(float x, float y);
 DEVICE_EXTERN_C
 double __devicelib_fmin(double x, double y);
 
+DEVICE_EXTERN_C // defined in fallback-cmath.cpp
+float __devicelib_truncf(float x);
+
+DEVICE_EXTERN_C // defined in fallback-cmath-fp64.cpp
+double __devicelib_trunc(double x);
+
+DEVICE_EXTERN_C // defined in fallback-cmath-fp64.cpp
+double __devicelib_sinpi(double x);
+
+DEVICE_EXTERN_C // defined in fallback-cmath.cpp
+float __devicelib_sinpif(float x);
+
+DEVICE_EXTERN_C // defined in fallback-cmath-fp64.cpp
+double __devicelib_rsqrt(double x);
+
+DEVICE_EXTERN_C // defined in fallback-cmath.cpp
+float __devicelib_rsqrtf(float x);
+
+DEVICE_EXTERN_C // defined in fallback-cmath-fp64.cpp
+double __devicelib_exp10(double x);
+
+DEVICE_EXTERN_C // defined in fallback-cmath.cpp
+float __devicelib_exp10f(float x);
+
 DEVICE_EXTERN_C
 div_t __devicelib_div(int x, int y);
 
diff --git a/libdevice/fallback-bfloat16.cpp b/libdevice/fallback-bfloat16.cpp
index 84015d03b35b0..4f7e35b6f2718 100644
--- a/libdevice/fallback-bfloat16.cpp
+++ b/libdevice/fallback-bfloat16.cpp
@@ -43,4 +43,31 @@ __devicelib_ConvertBF16ToFINTEL(const uint16_t &a) {
   return floatValue;
 }
 
+// Element-wise float <-> bfloat16 converters for vectors of 1/2/3/4/8/16 items.
+#define GenerateConvertFunctionForVec(size)                                    \
+  DEVICE_EXTERN_C_INLINE                                                       \
+  void __devicelib_ConvertFToBF16INTELVec##size(const float *src,              \
+                                                uint16_t *dst) {               \
+    for (int i = 0; i < size; ++i) {                                           \
+      dst[i] = __devicelib_ConvertFToBF16INTEL(src[i]);                        \
+    }                                                                          \
+  }                                                                            \
+  DEVICE_EXTERN_C_INLINE                                                       \
+  void __devicelib_ConvertBF16ToFINTELVec##size(const uint16_t *src,           \
+                                                float *dst) {                  \
+    for (int i = 0; i < size; ++i) {                                           \
+      dst[i] = __devicelib_ConvertBF16ToFINTEL(src[i]);                        \
+    }                                                                          \
+  }
+
+// clang-format off
+GenerateConvertFunctionForVec(1)
+GenerateConvertFunctionForVec(2)
+GenerateConvertFunctionForVec(3)
+GenerateConvertFunctionForVec(4)
+GenerateConvertFunctionForVec(8)
+GenerateConvertFunctionForVec(16)
+// clang-format on
+#undef GenerateConvertFunctionForVec
+
 #endif // __SPIR__ || __SPIRV__
diff --git a/libdevice/fallback-cmath-fp64.cpp b/libdevice/fallback-cmath-fp64.cpp
index e3db88d7db7b6..49832ef966b5f 100644
--- a/libdevice/fallback-cmath-fp64.cpp
+++ b/libdevice/fallback-cmath-fp64.cpp
@@ -35,6 +35,18 @@ double __devicelib_fmax(double x, double y) { return __spirv_ocl_fmax(x, y); }
 DEVICE_EXTERN_C_INLINE
 double __devicelib_fmin(double x, double y) { return __spirv_ocl_fmin(x, y); }
 
+DEVICE_EXTERN_C_INLINE // backs trunc() in cmath_wrapper_fp64.cpp
+double __devicelib_trunc(double x) { return __spirv_ocl_trunc(x); }
+
+DEVICE_EXTERN_C_INLINE // backs sinpi() in cmath_wrapper_fp64.cpp
+double __devicelib_sinpi(double x) { return __spirv_ocl_sinpi(x); }
+
+DEVICE_EXTERN_C_INLINE // backs rsqrt() in cmath_wrapper_fp64.cpp
+double __devicelib_rsqrt(double x) { return __spirv_ocl_rsqrt(x); }
+
+DEVICE_EXTERN_C_INLINE // backs exp10() in cmath_wrapper_fp64.cpp
+double __devicelib_exp10(double x) { return __spirv_ocl_exp10(x); }
+
 DEVICE_EXTERN_C_INLINE
 double __devicelib_log(double x) { return __spirv_ocl_log(x); }
 
diff --git a/libdevice/fallback-cmath.cpp b/libdevice/fallback-cmath.cpp
index 1e1d0f59a9ba6..6289126272da4 100644
--- a/libdevice/fallback-cmath.cpp
+++ b/libdevice/fallback-cmath.cpp
@@ -45,6 +45,18 @@ float __devicelib_fmaxf(float x, float y) { return __spirv_ocl_fmax(x, y); }
 DEVICE_EXTERN_C_INLINE
 float __devicelib_fminf(float x, float y) { return __spirv_ocl_fmin(x, y); }
 
+DEVICE_EXTERN_C_INLINE // backs truncf() in cmath_wrapper.cpp
+float __devicelib_truncf(float x) { return __spirv_ocl_trunc(x); }
+
+DEVICE_EXTERN_C_INLINE // backs sinpif() in cmath_wrapper.cpp
+float __devicelib_sinpif(float x) { return __spirv_ocl_sinpi(x); }
+
+DEVICE_EXTERN_C_INLINE // backs rsqrtf() in cmath_wrapper.cpp
+float __devicelib_rsqrtf(float x) { return __spirv_ocl_rsqrt(x); }
+
+DEVICE_EXTERN_C_INLINE // backs exp10f() in cmath_wrapper.cpp
+float __devicelib_exp10f(float x) { return __spirv_ocl_exp10(x); }
+
 DEVICE_EXTERN_C_INLINE
 div_t __devicelib_div(int x, int y) { return {x / y, x % y}; }
 
diff --git a/libdevice/include/asan_libdevice.hpp b/libdevice/include/asan_libdevice.hpp
index 9a1e20368fd77..3107c428df426 100644
--- a/libdevice/include/asan_libdevice.hpp
+++ b/libdevice/include/asan_libdevice.hpp
@@ -75,7 +75,7 @@ struct LaunchInfo {
   LocalArgsInfo *LocalArgs = nullptr; // ordered by ArgIndex
 };
 
-constexpr unsigned ASAN_SHADOW_SCALE = 3;
+constexpr unsigned ASAN_SHADOW_SCALE = 4; // one shadow byte per 16 bytes
 constexpr unsigned ASAN_SHADOW_GRANULARITY = 1ULL << ASAN_SHADOW_SCALE;
 
 // Based on the observation, only the last 24 bits of the address of the private
diff --git a/libdevice/sanitizer_utils.cpp b/libdevice/sanitizer_utils.cpp
index 0ea65215a3012..959776009c9aa 100644
--- a/libdevice/sanitizer_utils.cpp
+++ b/libdevice/sanitizer_utils.cpp
@@ -109,17 +109,17 @@ __SYCL_PRIVATE__ void *ToPrivate(void *ptr) {
 }
 
 inline uptr MemToShadow_CPU(uptr addr) {
-  return __AsanShadowMemoryGlobalStart + (addr >> 3);
+  return (addr >> ASAN_SHADOW_SCALE) + __AsanShadowMemoryGlobalStart;
 }
 
 inline uptr MemToShadow_DG2(uptr addr, uint32_t as) {
   uptr shadow_ptr = 0;
   if (addr & (~0xffffffffffff)) {
-    shadow_ptr =
-        (((addr & 0xffffffffffff) >> 3) + __AsanShadowMemoryGlobalStart) |
-        (~0xffffffffffff);
+    shadow_ptr = (((addr & 0xffffffffffff) >> ASAN_SHADOW_SCALE) +
+                  __AsanShadowMemoryGlobalStart) |
+                 (~0xffffffffffff);
   } else {
-    shadow_ptr = (addr >> 3) + __AsanShadowMemoryGlobalStart;
+    shadow_ptr = (addr >> ASAN_SHADOW_SCALE) + __AsanShadowMemoryGlobalStart;
   }
 
   if (shadow_ptr > __AsanShadowMemoryGlobalEnd) {
@@ -163,8 +163,8 @@ inline uptr MemToShadow_PVC(uptr addr, uint32_t as) {
   if (as == ADDRESS_SPACE_GLOBAL) { // global
     uptr shadow_ptr;
     if (addr & 0xFF00000000000000) { // Device USM
-      shadow_ptr = __AsanShadowMemoryGlobalStart + 0x200000000000 +
-                   ((addr & 0xFFFFFFFFFFFF) >> 3);
+      shadow_ptr = __AsanShadowMemoryGlobalStart + 0x80000000000 +
+                   ((addr & 0xFFFFFFFFFFFF) >> ASAN_SHADOW_SCALE);
     } else { // Only consider 47bit VA
       shadow_ptr = __AsanShadowMemoryGlobalStart +
                    ((addr & 0x7FFFFFFFFFFF) >> ASAN_SHADOW_SCALE);
@@ -204,7 +204,7 @@ inline uptr MemToShadow_PVC(uptr addr, uint32_t as) {
 
     uptr shadow_ptr = shadow_offset +
                       ((wg_lid * SLM_SIZE) >> ASAN_SHADOW_SCALE) +
-                      ((addr & (SLM_SIZE - 1)) >> 3);
+                      ((addr & (SLM_SIZE - 1)) >> ASAN_SHADOW_SCALE);
 
     if (shadow_ptr > shadow_offset_end) {
       if (__asan_report_out_of_shadow_bounds() && __AsanDebug) {
@@ -483,12 +483,14 @@ bool __asan_region_is_value(uptr addr, uint32_t as, std::size_t size,
   return true;
 }
 
-// NOTE: size < 8
-inline int __asan_address_is_poisoned(uptr a, uint32_t as, size_t size) {
+// NOTE: size <= 16. Only the shadow byte that covers `a` is consulted.
+inline int __asan_address_is_poisoned(uptr a, uint32_t as, size_t size = 1) {
   auto *shadow_address = (__SYCL_GLOBAL__ s8 *)MemToShadow(a, as);
   if (shadow_address) {
     auto shadow_value = *shadow_address;
     if (shadow_value) {
+      if (size == ASAN_SHADOW_GRANULARITY) // granule-sized access
+        return true;                       // any nonzero shadow poisons it
       s8 last_accessed_byte = (a & (ASAN_SHADOW_GRANULARITY - 1)) + size - 1;
       return (last_accessed_byte >= shadow_value);
     }
@@ -496,11 +498,6 @@ inline int __asan_address_is_poisoned(uptr a, uint32_t as, size_t size) {
   return false;
 }
 
-// NOTE: size = 1
-inline int __asan_address_is_poisoned(uptr a, uint32_t as) {
-  return __asan_address_is_poisoned(a, as, 1);
-}
-
 inline uptr __asan_region_is_poisoned(uptr beg, uint32_t as, size_t size) {
   if (!size)
     return 0;
@@ -564,34 +561,13 @@ inline uptr __asan_region_is_poisoned(uptr beg, uint32_t as, size_t size) {
 ASAN_REPORT_ERROR(load, false, 1)
 ASAN_REPORT_ERROR(load, false, 2)
 ASAN_REPORT_ERROR(load, false, 4)
+ASAN_REPORT_ERROR(load, false, 8)
+ASAN_REPORT_ERROR(load, false, 16)
 ASAN_REPORT_ERROR(store, true, 1)
 ASAN_REPORT_ERROR(store, true, 2)
 ASAN_REPORT_ERROR(store, true, 4)
-
-#define ASAN_REPORT_ERROR_BYTE(type, is_write, size)                           \
-  DEVICE_EXTERN_C_NOINLINE void __asan_##type##size(                           \
-      uptr addr, uint32_t as, const char __SYCL_CONSTANT__ *file,              \
-      uint32_t line, const char __SYCL_CONSTANT__ *func) {                     \
-    auto *shadow_address = (__SYCL_GLOBAL__ u##size *)MemToShadow(addr, as);   \
-    if (shadow_address && *shadow_address) {                                   \
-      __asan_report_access_error(addr, as, size, is_write, addr, file, line,   \
-                                 func);                                        \
-    }                                                                          \
-  }                                                                            \
-  DEVICE_EXTERN_C_NOINLINE void __asan_##type##size##_noabort(                 \
-      uptr addr, uint32_t as, const char __SYCL_CONSTANT__ *file,              \
-      uint32_t line, const char __SYCL_CONSTANT__ *func) {                     \
-    auto *shadow_address = (__SYCL_GLOBAL__ u##size *)MemToShadow(addr, as);   \
-    if (shadow_address && *shadow_address) {                                   \
-      __asan_report_access_error(addr, as, size, is_write, addr, file, line,   \
-                                 func, true);                                  \
-    }                                                                          \
-  }
-
-ASAN_REPORT_ERROR_BYTE(load, false, 8)
-ASAN_REPORT_ERROR_BYTE(load, false, 16)
-ASAN_REPORT_ERROR_BYTE(store, true, 8)
-ASAN_REPORT_ERROR_BYTE(store, true, 16)
+ASAN_REPORT_ERROR(store, true, 8)
+ASAN_REPORT_ERROR(store, true, 16)
 
 #define ASAN_REPORT_ERROR_N(type, is_write)                                    \
   DEVICE_EXTERN_C_NOINLINE void __asan_##type##N(                              \
diff --git a/llvm/docs/requirements-hashed.txt b/llvm/docs/requirements-hashed.txt
index fdf7682926b2b..07e3ed9d19030 100644
--- a/llvm/docs/requirements-hashed.txt
+++ b/llvm/docs/requirements-hashed.txt
@@ -360,7 +360,7 @@ sphinxcontrib-serializinghtml==1.1.10 \
     --hash=sha256:326369b8df80a7d2d8d7f99aa5ac577f51ea51556ed974e7716cfd4fca3f6cb7 \
     --hash=sha256:93f3f5dc458b91b192fe10c397e324f262cf163d79f3282c158e8436a2c4511f
     # via sphinx
-urllib3==2.2.1 \
-    --hash=sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d \
-    --hash=sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19
+urllib3==2.2.2 \
+    --hash=sha256:a448b2f64d686155468037e1ace9f2d2199776e17f0a46610480d311f73e3472 \
+    --hash=sha256:dd505485549a7a552833da5e6063639d0d177c04f23bc3864e41e5dc5f612168
     # via requests
diff --git a/llvm/include/llvm/SYCLLowerIR/CompileTimePropertiesPass.h b/llvm/include/llvm/SYCLLowerIR/CompileTimePropertiesPass.h
index f2c1f96b65d35..6cc29369d0c02 100644
--- a/llvm/include/llvm/SYCLLowerIR/CompileTimePropertiesPass.h
+++ b/llvm/include/llvm/SYCLLowerIR/CompileTimePropertiesPass.h
@@ -57,7 +57,7 @@ namespace detail {
 ///
 /// @returns \c false if the value of \c Value equals to "false", \c true
 /// otherwise.
-inline bool toBool(StringRef Value) { return !Value.equals("false"); }
+inline bool toBool(StringRef Value) { return Value != "false"; }
 
 } // namespace detail
 
diff --git a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td
index f74db3c8726ba..38d5f2512a1c4 100644
--- a/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td
+++ b/llvm/include/llvm/SYCLLowerIR/DeviceConfigFile.td
@@ -206,6 +206,8 @@ def : CudaTargetInfo<"nvidia_gpu_sm_89", !listconcat(CudaMinAspects, CudaBindles
     [AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
 def : CudaTargetInfo<"nvidia_gpu_sm_90", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
     [AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
+def : CudaTargetInfo<"nvidia_gpu_sm_90a", !listconcat(CudaMinAspects, CudaBindlessImagesAspects,
+    [AspectFp16, AspectAtomic64, AspectExt_oneapi_bfloat16_math_functions, AspectExt_oneapi_cuda_async_barrier])>;
 
 //
 // HIP / AMDGPU device aspects
diff --git a/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h
index 9ae433cedc668..085e424249d5c 100644
--- a/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h
+++ b/llvm/include/llvm/SYCLLowerIR/ModuleSplitter.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/PropertySetIO.h"
 
 #include <memory>
 #include <string>
@@ -196,6 +197,8 @@ class ModuleDesc {
 
   ModuleDesc clone() const;
 
+  std::string makeSymbolTable() const;
+
   const SYCLDeviceRequirements &getOrComputeDeviceRequirements() const {
     if (!Reqs.has_value())
       Reqs = computeDeviceRequirements(*this);
@@ -270,6 +273,33 @@ void dumpEntryPoints(const Module &M, bool OnlyKernelsAreEntryPoints = false,
                      const char *msg = "", int Tab = 0);
 #endif // NDEBUG
 
+struct SplitModule {
+  std::string ModuleFilePath; // Path of the file the split's IR was saved to.
+  util::PropertySetRegistry Properties;
+  std::string Symbols; // Entry point names, each terminated by '\n'.
+
+  SplitModule() = default;
+  SplitModule(const SplitModule &) = default;
+  SplitModule &operator=(const SplitModule &) = default;
+  SplitModule(SplitModule &&) = default;
+  SplitModule &operator=(SplitModule &&) = default;
+
+  SplitModule(std::string_view File, util::PropertySetRegistry Properties,
+              std::string Symbols)
+      : ModuleFilePath(File), Properties(std::move(Properties)),
+        Symbols(std::move(Symbols)) {}
+};
+
+struct ModuleSplitterSettings {
+  IRSplitMode Mode; // No default: callers must set it.
+  bool OutputAssembly = false; // false: bitcode (.bc); true: LLVM IR (.ll).
+  StringRef OutputPrefix; // Outputs are named "<OutputPrefix>_<N>.{bc,ll}".
+};
+
+/// Splits \p M per \p Settings; returns one SplitModule per saved part.
+Expected<std::vector<SplitModule>>
+splitSYCLModule(std::unique_ptr<Module> M, ModuleSplitterSettings Settings);
+
 } // namespace module_split
 
 } // namespace llvm
diff --git a/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h b/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h
index abb78b51af154..8891f7f550c5f 100644
--- a/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h
+++ b/llvm/include/llvm/SYCLLowerIR/SYCLDeviceRequirements.h
@@ -30,7 +30,19 @@ class PropertyValue;
 }
 
 struct SYCLDeviceRequirements {
-  std::set<uint32_t> Aspects;
+  struct AspectNameValuePair {
+    llvm::SmallString<64> Name; // May be empty; ignored by the comparisons.
+    uint32_t Value;
+    AspectNameValuePair(StringRef Name, uint32_t Value)
+        : Name(Name), Value(Value) {}
+    bool operator<(const AspectNameValuePair &rhs) const {
+      return Value < rhs.Value; // Ordering key is Value only.
+    }
+    bool operator==(const AspectNameValuePair &rhs) const {
+      return Value == rhs.Value; // Same Value compares equal, whatever Name.
+    }
+  };
+  std::set<AspectNameValuePair> Aspects;
   std::set<uint32_t> FixedTarget;
   std::optional<llvm::SmallVector<uint64_t, 3>> ReqdWorkGroupSize;
   std::optional<uint32_t> WorkGroupNumDim;
diff --git a/llvm/include/llvm/Support/PropertySetIO.h b/llvm/include/llvm/Support/PropertySetIO.h
index 93e045256ed93..bbda6c548825f 100644
--- a/llvm/include/llvm/Support/PropertySetIO.h
+++ b/llvm/include/llvm/Support/PropertySetIO.h
@@ -205,6 +205,7 @@ class PropertySetRegistry {
   static constexpr char SYCL_MISC_PROP[] = "SYCL/misc properties";
   static constexpr char SYCL_ASSERT_USED[] = "SYCL/assert used";
   static constexpr char SYCL_EXPORTED_SYMBOLS[] = "SYCL/exported symbols";
+  static constexpr char SYCL_IMPORTED_SYMBOLS[] = "SYCL/imported symbols";
   static constexpr char SYCL_DEVICE_GLOBALS[] = "SYCL/device globals";
   static constexpr char SYCL_DEVICE_REQUIREMENTS[] = "SYCL/device requirements";
   static constexpr char SYCL_HOST_PIPES[] = "SYCL/host pipes";
diff --git a/llvm/lib/SYCLLowerIR/CMakeLists.txt b/llvm/lib/SYCLLowerIR/CMakeLists.txt
index 7f2edaae323a9..22ad42e836135 100644
--- a/llvm/lib/SYCLLowerIR/CMakeLists.txt
+++ b/llvm/lib/SYCLLowerIR/CMakeLists.txt
@@ -88,10 +88,12 @@ add_llvm_component_library(LLVMSYCLLowerIR
   LLVMDemangle
   LLVMTargetParser
   LLVMTransformUtils
-  
+
   LINK_COMPONENTS
   Analysis
+  BitWriter
   Core
+  IRPrinter
   Support
   ipo
   )
diff --git a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMDSlmReservation.cpp b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMDSlmReservation.cpp
index c43b5895b94fc..7dbab9e127778 100644
--- a/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMDSlmReservation.cpp
+++ b/llvm/lib/SYCLLowerIR/ESIMD/LowerESIMDSlmReservation.cpp
@@ -360,6 +360,7 @@ class ScopedCallGraph {
             continue;
           }
           if (CallInst *ScopeStartCI = IsScopeEnd(&I)) {
+            (void)ScopeStartCI;
             ScopeMet = true;
             // Scope end marker encountered - verify all enclosed scopes have
             // ended and truncate current scope path to the enclosing node.
diff --git a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
index cf41aee46df28..900e1578c7adf 100644
--- a/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
+++ b/llvm/lib/SYCLLowerIR/ModuleSplitter.cpp
@@ -12,16 +12,19 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Bitcode/BitcodeWriterPass.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IRPrinter/IRPrintingPasses.h"
 #include "llvm/SYCLLowerIR/DeviceGlobals.h"
 #include "llvm/SYCLLowerIR/LowerInvokeSimd.h"
 #include "llvm/SYCLLowerIR/SYCLUtils.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/GlobalDCE.h"
 #include "llvm/Transforms/IPO/StripDeadPrototypes.h"
@@ -733,6 +736,14 @@ void EntryPointGroup::rebuild(const Module &M) {
       Functions.insert(const_cast<Function *>(&F));
 }
 
+std::string ModuleDesc::makeSymbolTable() const {
+  std::string SymbolTable; // One entry point name per line.
+  for (const Function *EntryPoint : EntryPoints.Functions)
+    SymbolTable.append(EntryPoint->getName().str()).push_back('\n');
+
+  return SymbolTable;
+}
+
 namespace {
 // This is a helper class, which allows to group/categorize function based on
 // provided rules. It is intended to be used in device code split
@@ -1143,5 +1154,62 @@ SmallVector<ModuleDesc, 2> splitByESIMD(ModuleDesc &&MD,
   return Result;
 }
 
+static Error saveModuleIRInFile(Module &M, StringRef FilePath,
+                                bool OutputAssembly) { // Writes M to FilePath.
+  int FD = -1;
+  if (std::error_code EC = sys::fs::openFileForWrite(FilePath, FD))
+    return errorCodeToError(EC); // Could not open FilePath for writing.
+
+  raw_fd_ostream OS(FD, /*shouldClose=*/true);
+  ModulePassManager MPM;
+  ModuleAnalysisManager MAM;
+  MAM.registerPass([&] { return PassInstrumentationAnalysis(); });
+  if (OutputAssembly)
+    MPM.addPass(PrintModulePass(OS)); // Textual LLVM IR.
+  else
+    MPM.addPass(BitcodeWriterPass(OS)); // Bitcode.
+
+  MPM.run(M, MAM); // Running the single writer pass emits M into OS.
+  return Error::success();
+}
+
+// Saves MD's module as "<Prefix>.ll" or "<Prefix>.bc" and describes the file.
+static Expected<SplitModule> saveModuleDesc(ModuleDesc &MD, std::string Prefix,
+                                            bool OutputAssembly) {
+  Prefix.append(OutputAssembly ? ".ll" : ".bc"); // Now the output path.
+  if (Error Err = saveModuleIRInFile(MD.getModule(), Prefix, OutputAssembly))
+    return Err;
+
+  SplitModule Image;
+  Image.ModuleFilePath = Prefix;
+  Image.Symbols = MD.makeSymbolTable();
+  return Image;
+}
+
+/// Splits \p M with the mode in \p Settings and saves every part to its own
+/// file; stops and returns the error of the first part that fails to save.
+Expected<std::vector<SplitModule>>
+splitSYCLModule(std::unique_ptr<Module> M, ModuleSplitterSettings Settings) {
+  ModuleDesc MD = std::move(M); // makeModuleDesc() ?
+  // FIXME: false arguments are temporary for now.
+  auto Splitter =
+      getDeviceCodeSplitter(std::move(MD), Settings.Mode, false, false);
+  std::vector<SplitModule> OutputImages;
+  for (size_t ID = 0; Splitter->hasMoreSplits(); ++ID) {
+    ModuleDesc Part = Splitter->nextSplit();
+    Part.fixupLinkageOfDirectInvokeSimdTargets();
+
+    // Each part is written to "<OutputPrefix>_<ID>" plus the IR extension.
+    std::string PartPrefix = (Settings.OutputPrefix + "_" + Twine(ID)).str();
+    auto ImageOrErr = saveModuleDesc(Part, PartPrefix, Settings.OutputAssembly);
+    if (!ImageOrErr)
+      return ImageOrErr.takeError();
+
+    OutputImages.emplace_back(std::move(*ImageOrErr));
+  }
+
+  return OutputImages;
+}
+
 } // namespace module_split
 } // namespace llvm
diff --git a/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp b/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp
index 8ebec7f54013d..60424c04027fa 100644
--- a/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp
+++ b/llvm/lib/SYCLLowerIR/SYCLDeviceRequirements.cpp
@@ -43,19 +43,20 @@ llvm::computeDeviceRequirements(const module_split::ModuleDesc &MD) {
   // Process all functions in the module
   for (const Function &F : MD.getModule()) {
     if (auto *MDN = F.getMetadata("sycl_used_aspects")) {
-      for (auto &MDOp : MDN->operands()) {
-        int64_t Val;
-        if (auto Pair = dyn_cast<MDNode>(MDOp)) {
+      for (size_t I = 0, E = MDN->getNumOperands(); I < E; ++I) {
+        StringRef AspectName = "";
+        int64_t AspectValue;
+        if (auto Pair = dyn_cast<MDNode>(MDN->getOperand(I))) {
           assert(Pair->getNumOperands() == 2);
-          Val = mdconst::extract<ConstantInt>(Pair->getOperand(1))
-                    ->getZExtValue();
+          AspectName = ExtractStringFromMDNodeOperand(Pair, 0);
+          AspectValue = ExtractSignedIntegerFromMDNodeOperand(Pair, 1);
         } else {
-          Val = mdconst::extract<ConstantInt>(MDOp)->getZExtValue();
+          AspectValue = ExtractSignedIntegerFromMDNodeOperand(MDN, I);
         }
         // Don't put internal aspects (with negative integer value) into the
         // requirements, they are used only for device image splitting.
-        if (Val >= 0)
-          Reqs.Aspects.insert(Val);
+        if (AspectValue >= 0)
+          Reqs.Aspects.insert({AspectName, uint32_t(AspectValue)});
       }
     }
 
@@ -133,8 +134,11 @@ std::map<StringRef, util::PropertyValue> SYCLDeviceRequirements::asMap() const {
   // For all properties except for "aspects", we'll only add the
   // value to the map if the corresponding value from
   // SYCLDeviceRequirements has a value/is non-empty.
-  Requirements["aspects"] =
-      std::vector<uint32_t>(Aspects.begin(), Aspects.end());
+  std::vector<uint32_t> AspectValues;
+  AspectValues.reserve(Aspects.size());
+  for (const auto &Aspect : Aspects) // Bind by reference: no Name copies.
+    AspectValues.push_back(Aspect.Value);
+  Requirements["aspects"] = std::move(AspectValues);
 
   if (!FixedTarget.empty())
     Requirements["fixed_target"] =
diff --git a/llvm/lib/Support/PropertySetIO.cpp b/llvm/lib/Support/PropertySetIO.cpp
index 96593d4aa26be..f14f8cd5b16cb 100644
--- a/llvm/lib/Support/PropertySetIO.cpp
+++ b/llvm/lib/Support/PropertySetIO.cpp
@@ -202,6 +202,7 @@ constexpr char PropertySetRegistry::SYCL_PROGRAM_METADATA[];
 constexpr char PropertySetRegistry::SYCL_MISC_PROP[];
 constexpr char PropertySetRegistry::SYCL_ASSERT_USED[];
 constexpr char PropertySetRegistry::SYCL_EXPORTED_SYMBOLS[];
+constexpr char PropertySetRegistry::SYCL_IMPORTED_SYMBOLS[];
 constexpr char PropertySetRegistry::SYCL_DEVICE_GLOBALS[];
 constexpr char PropertySetRegistry::SYCL_DEVICE_REQUIREMENTS[];
 constexpr char PropertySetRegistry::SYCL_HOST_PIPES[];
diff --git a/llvm/test/CMakeLists.txt b/llvm/test/CMakeLists.txt
index 7a965dbed6c96..6e2c59109e10c 100644
--- a/llvm/test/CMakeLists.txt
+++ b/llvm/test/CMakeLists.txt
@@ -147,6 +147,7 @@ set(LLVM_TEST_DEPENDS
           sanstats
           spirv-to-ir-wrapper
           sycl-post-link
+          sycl-module-split
           split-file
           verify-uselistorder
           yaml-bench
diff --git a/llvm/test/lit.cfg.py b/llvm/test/lit.cfg.py
index c26ee26caa8e2..1d35fdaa55bbe 100644
--- a/llvm/test/lit.cfg.py
+++ b/llvm/test/lit.cfg.py
@@ -244,6 +244,7 @@ def get_asan_rtlib():
         "sanstats",
         "llvm-remarkutil",
         "spirv-to-ir-wrapper",
+        "sycl-module-split",
     ]
 )
 
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-1.ll
index 09261f7f61088..0583cfde3af23 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-1.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-1.ll
@@ -5,6 +5,13 @@
 ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT
 
+; RUN: sycl-module-split -split=auto -S < %s -o %t2
+; By default auto mode is equal to source mode
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-TU0,CHECK
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-TU1,CHECK
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-TU0-TXT
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-TU1-TXT
+
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
 target triple = "spir64-unknown-linux"
 
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll
index e911800bf429a..4ff2095f42bbb 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-2.ll
@@ -10,6 +10,12 @@
 ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT
 
+; RUN: sycl-module-split -split=auto -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-TU0,CHECK
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-TU1,CHECK
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-TU0-TXT
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-TU1-TXT
+
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
 target triple = "spir64-unknown-linux"
 
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll
index f5915c7ac57b6..a5c62a5912338 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-3.ll
@@ -14,6 +14,18 @@
 ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-SYM
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-SYM
 ;
+;
+; RUN: sycl-module-split -split=auto -S < %s -o %t2
+;
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-TU0-IR \
+; RUN:     --implicit-check-not TU0_kernel --implicit-check-not _Z3foov \
+; RUN:     --implicit-check-not _Z4foo3v
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-TU1-IR \
+; RUN:     --implicit-check-not TU1_kernel --implicit-check-not _Z4foo2v \
+; RUN:     --implicit-check-not _Z4foo1v
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-TU0-SYM
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-TU1-SYM
+
 ; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel0
 ; CHECK-TU0-SYM: _ZTSZ4mainE11TU1_kernel1
 ;
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll
index 458485bf53aa6..730d9a5cd8efc 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/auto-module-split-func-ptr.ll
@@ -4,6 +4,12 @@
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix=CHECK-IR0
 ; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix=CHECK-IR1
 
+; RUN: sycl-module-split -split=auto -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix=CHECK-SYM0
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix=CHECK-SYM1
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefix=CHECK-IR0
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix=CHECK-IR1
+
 ; This test checkes that we can properly perform device code split by tracking
 ; all uses of functions (not only direct calls)
 
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/basic-module-split.ll b/llvm/test/tools/sycl-post-link/device-code-split/basic-module-split.ll
index 2a86625eeb27e..48d58248d0095 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/basic-module-split.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/basic-module-split.ll
@@ -3,6 +3,12 @@
 ; RUN: FileCheck %s -input-file=%t_1.ll --check-prefixes CHECK-TU1,CHECK
 ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefixes CHECK-TU0-TXT
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-TU1-TXT
+
+; RUN: sycl-module-split -split=source -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-TU0,CHECK
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-TU1,CHECK
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-TU0-TXT
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-TU1-TXT
 ; ModuleID = 'basic-module-split.ll'
 source_filename = "basic-module-split.ll"
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll b/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll
index d26f97f9d70a0..064471405a58d 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/complex-indirect-call-chain.ll
@@ -12,6 +12,17 @@
 ; RUN:     --implicit-check-not @BAZ --implicit-check-not @kernel_B \
 ; RUN:     --implicit-check-not @kernel_C
 ;
+; RUN: sycl-module-split -split=auto -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefix CHECK0 \
+; RUN:     --implicit-check-not @foo --implicit-check-not @kernel_A \
+; RUN:     --implicit-check-not @kernel_B --implicit-check-not @baz
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix CHECK1 \
+; RUN:     --implicit-check-not @kernel_A --implicit-check-not @kernel_C
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefix CHECK2 \
+; RUN:     --implicit-check-not @foo --implicit-check-not @bar \
+; RUN:     --implicit-check-not @BAZ --implicit-check-not @kernel_B \
+; RUN:     --implicit-check-not @kernel_C
+;
 ; RUN: sycl-post-link -split=source -S < %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \
 ; RUN:     --implicit-check-not @foo --implicit-check-not @kernel_A \
@@ -23,6 +34,17 @@
 ; RUN:     --implicit-check-not @BAZ --implicit-check-not @kernel_B \
 ; RUN:     --implicit-check-not @kernel_C
 ;
+; RUN: sycl-module-split -split=source -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefix CHECK0 \
+; RUN:     --implicit-check-not @foo --implicit-check-not @kernel_A \
+; RUN:     --implicit-check-not @kernel_B --implicit-check-not @baz
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix CHECK1 \
+; RUN:     --implicit-check-not @kernel_A --implicit-check-not @kernel_C
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefix CHECK2 \
+; RUN:     --implicit-check-not @foo --implicit-check-not @bar \
+; RUN:     --implicit-check-not @BAZ --implicit-check-not @kernel_B \
+; RUN:     --implicit-check-not @kernel_C
+;
 ; RUN: sycl-post-link -split=kernel -S < %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefix CHECK0 \
 ; RUN:     --implicit-check-not @foo --implicit-check-not @kernel_A \
@@ -33,6 +55,17 @@
 ; RUN:     --implicit-check-not @foo --implicit-check-not @bar \
 ; RUN:     --implicit-check-not @BAZ --implicit-check-not @kernel_B \
 ; RUN:     --implicit-check-not @kernel_C
+;
+; RUN: sycl-module-split -split=kernel -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefix CHECK0 \
+; RUN:     --implicit-check-not @foo --implicit-check-not @kernel_A \
+; RUN:     --implicit-check-not @kernel_B --implicit-check-not @baz
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix CHECK1 \
+; RUN:     --implicit-check-not @kernel_A --implicit-check-not @kernel_C
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefix CHECK2 \
+; RUN:     --implicit-check-not @foo --implicit-check-not @bar \
+; RUN:     --implicit-check-not @BAZ --implicit-check-not @kernel_B \
+; RUN:     --implicit-check-not @kernel_C
 
 ; CHECK0-DAG: define spir_kernel void @kernel_C
 ; CHECK0-DAG: define spir_func i32 @bar
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/one-kernel-per-module.ll b/llvm/test/tools/sycl-post-link/device-code-split/one-kernel-per-module.ll
index 715929861b356..0197a2edd4a1b 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/one-kernel-per-module.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/one-kernel-per-module.ll
@@ -5,6 +5,15 @@
 ; RUN: FileCheck %s -input-file=%t.files_1.sym --check-prefixes CHECK-MODULE1-TXT
 ; RUN: FileCheck %s -input-file=%t.files_2.ll --check-prefixes CHECK-MODULE2,CHECK
 ; RUN: FileCheck %s -input-file=%t.files_2.sym --check-prefixes CHECK-MODULE2-TXT
+;
+; RUN: sycl-module-split -split=kernel -S < %s -o %t2.files
+; RUN: FileCheck %s -input-file=%t2.files_0.ll --check-prefixes CHECK-MODULE0,CHECK
+; RUN: FileCheck %s -input-file=%t2.files_0.sym --check-prefixes CHECK-MODULE0-TXT
+; RUN: FileCheck %s -input-file=%t2.files_1.ll --check-prefixes CHECK-MODULE1,CHECK
+; RUN: FileCheck %s -input-file=%t2.files_1.sym --check-prefixes CHECK-MODULE1-TXT
+; RUN: FileCheck %s -input-file=%t2.files_2.ll --check-prefixes CHECK-MODULE2,CHECK
+; RUN: FileCheck %s -input-file=%t2.files_2.sym --check-prefixes CHECK-MODULE2-TXT
+
 ; ModuleID = 'one-kernel-per-module.ll'
 source_filename = "one-kernel-per-module.ll"
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024"
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll
index faec71a602ffd..51a2895f4d326 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-1.ll
@@ -21,6 +21,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
 ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
 
+; RUN: sycl-module-split -split=auto -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+
 ; RUN: sycl-post-link -split=source -symbols -S < %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \
 ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
@@ -35,6 +49,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
 ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
 
+; RUN: sycl-module-split -split=source -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+
 ; RUN: sycl-post-link -split=kernel -symbols -S < %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \
 ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
@@ -49,6 +77,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
 ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
 
+; RUN: sycl-module-split -split=kernel -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+
 ; Regardless of device code split mode, each kernel should go into a separate
 ; device image
 
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll
index 773424fa91fcb..f4d66822b261c 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-2.ll
@@ -15,6 +15,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefix CHECK-M2-SYMS \
 ; RUN:     --implicit-check-not kernel0 --implicit-check-not kernel3
 
+; RUN: sycl-module-split -split=auto -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-TABLE
+;
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-M0-SYMS \
+; RUN:     --implicit-check-not kernel3 --implicit-check-not kernel1 \
+; RUN:     --implicit-check-not kernel2
+;
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-M1-SYMS \
+; RUN:     --implicit-check-not kernel0 --implicit-check-not kernel1 \
+; RUN:     --implicit-check-not kernel2
+;
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-M2-SYMS \
+; RUN:     --implicit-check-not kernel0 --implicit-check-not kernel3
+
 ; CHECK-TABLE: Code
 ; CHECK-TABLE-NEXT: _0.sym
 ; CHECK-TABLE-NEXT: _1.sym
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll
index 5c1f743997816..523477a07573b 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-aspect-split-3.ll
@@ -17,6 +17,22 @@
 ; RUN: FileCheck %s -input-file=%t_1.ll --check-prefix CHECK-M1-IR \
 ; RUN:     --implicit-check-not kernel0 --implicit-check-not bar
 
+; RUN: sycl-module-split -split=auto -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-TABLE
+;
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-M0-SYMS \
+; RUN:     --implicit-check-not foo --implicit-check-not kernel1
+;
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-M1-SYMS \
+; RUN:     --implicit-check-not foo --implicit-check-not kernel0
+;
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-M2-SYMS \
+; RUN:     --implicit-check-not kernel0 --implicit-check-not foo \
+; RUN:     --implicit-check-not bar
+;
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefix CHECK-M1-IR \
+; RUN:     --implicit-check-not kernel0 --implicit-check-not bar
+
 ; We expect to see 3 modules generated:
 ;
 ; CHECK-TABLE: Code
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-1.ll
index 282a0dd0dc79e..543a892415fa4 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-1.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-1.ll
@@ -15,6 +15,16 @@
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
 ; RUN: --implicit-check-not Kernel3
 
+; RUN: sycl-module-split -split=auto -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel3
+
 ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
@@ -25,6 +35,16 @@
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
 ; RUN: --implicit-check-not Kernel3
 
+; RUN: sycl-module-split -split=source -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel3
+
 ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
@@ -39,6 +59,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \
 ; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2
 
+; RUN: sycl-module-split -split=kernel -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2
+
 ; CHECK-IR-K1: define {{.*}} @Kernel1
 ; CHECK-IR-K2: define {{.*}} @Kernel2
 ; CHECK-IR-K3: define {{.*}} @Kernel3
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-2.ll
index 5472093bda677..6c054fc579659 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-2.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-2.ll
@@ -16,6 +16,16 @@
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \
 ; RUN: --implicit-check-not Kernel2
 
+; RUN: sycl-module-split -split=auto -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel2
+
 ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K2 \
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
@@ -26,6 +36,16 @@
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \
 ; RUN: --implicit-check-not Kernel2
 
+; RUN: sycl-module-split -split=source -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel2
+
 ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
@@ -40,6 +60,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \
 ; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3
 
+; RUN: sycl-module-split -split=kernel -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \
+; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \
+; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3
+
 ; CHECK-IR-K1: define {{.*}} @Kernel1
 ; CHECK-IR-K2: define {{.*}} @Kernel2
 ; CHECK-IR-K3: define {{.*}} @Kernel3
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-1.ll
index c85f636459fa2..fd64b234b2c6f 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-1.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-1.ll
@@ -15,6 +15,16 @@
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
 ; RUN: --implicit-check-not Kernel3
 
+; RUN: sycl-module-split -split=auto -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel3
+
 ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
@@ -25,6 +35,16 @@
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
 ; RUN: --implicit-check-not Kernel3
 
+; RUN: sycl-module-split -split=source -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel3
+
 ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
@@ -39,6 +59,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \
 ; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2
 
+; RUN: sycl-module-split -split=kernel -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2
+
 ; CHECK-IR-K1: define {{.*}} @Kernel1
 ; CHECK-IR-K2: define {{.*}} @Kernel2
 ; CHECK-IR-K3: define {{.*}} @Kernel3
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-2.ll
index f13f9caf01ed7..4c4a4bc8a1a6e 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-2.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-2.ll
@@ -16,6 +16,16 @@
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \
 ; RUN: --implicit-check-not Kernel2
 
+; RUN: sycl-module-split -split=auto -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel2
+
 ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K2 \
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
@@ -26,6 +36,16 @@
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \
 ; RUN: --implicit-check-not Kernel2
 
+; RUN: sycl-module-split -split=source -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel2
+
 ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
@@ -40,6 +60,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \
 ; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3
 
+; RUN: sycl-module-split -split=kernel -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \
+; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \
+; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3
+
 ; CHECK-IR-K1: define {{.*}} @Kernel1
 ; CHECK-IR-K2: define {{.*}} @Kernel2
 ; CHECK-IR-K3: define {{.*}} @Kernel3
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-4.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-4.ll
index f4a312ded5c7e..fe995542deba1 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-4.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-4.ll
@@ -16,6 +16,16 @@
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
 ; RUN: --implicit-check-not Kernel3
 
+; RUN: sycl-module-split -split=auto -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel3
+
 ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
@@ -26,6 +36,16 @@
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
 ; RUN: --implicit-check-not Kernel3
 
+; RUN: sycl-module-split -split=source -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel3
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel3
+
 ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3 \
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
@@ -40,6 +60,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-SYMS-K1 \
 ; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2
 
+; RUN: sycl-module-split -split=kernel -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel1
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel2
+
 ; CHECK-IR-K1: define {{.*}} @Kernel1
 ; CHECK-IR-K2: define {{.*}} @Kernel2
 ; CHECK-IR-K3: define {{.*}} @Kernel3
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-5.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-5.ll
index b33aba9a2ad06..25fd2e26f3ca4 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-5.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-joint-matrix-mad-5.ll
@@ -33,6 +33,32 @@
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
 ; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6
 
+; RUN: sycl-module-split -split=auto -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3,CHECK-IR-K5 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K6 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 \
+; RUN:  --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_3.ll --check-prefixes CHECK-IR-K4 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3,CHECK-SYMS-K5 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K6 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 \
+; RUN:  --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_3.sym --check-prefixes CHECK-SYMS-K4 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6
+
 ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K3,CHECK-IR-K5 \
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
@@ -59,6 +85,32 @@
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
 ; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6
 
+; RUN: sycl-module-split -split=source -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K3,CHECK-IR-K5 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K6 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K1,CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 \
+; RUN:  --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_3.ll --check-prefixes CHECK-IR-K4 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K3,CHECK-SYMS-K5 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN: --implicit-check-not Kernel4 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K6 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K1,CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel3 --implicit-check-not Kernel4 \
+; RUN:  --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_3.sym --check-prefixes CHECK-SYMS-K4 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6
+
 ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-IR-K6 \
 ; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
@@ -97,6 +149,44 @@
 ; RUN: --implicit-check-not Kernel12 --implicit-check-not Kernel3 \
 ; RUN:  --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6
 
+; RUN: sycl-module-split -split=kernel -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-IR-K6 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-IR-K5 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-IR-K4 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_3.ll --check-prefixes CHECK-IR-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_4.ll --check-prefixes CHECK-IR-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 \
+; RUN:  --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_5.ll --check-prefixes CHECK-IR-K1 \
+; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 \
+; RUN:  --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-SYMS-K6 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel5
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-SYMS-K5 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel4 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-SYMS-K4 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel3 --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_3.sym --check-prefixes CHECK-SYMS-K3 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel2 \
+; RUN:  --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_4.sym --check-prefixes CHECK-SYMS-K2 \
+; RUN: --implicit-check-not Kernel1 --implicit-check-not Kernel3 \
+; RUN:  --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6
+; RUN: FileCheck %s -input-file=%t2_5.sym --check-prefixes CHECK-SYMS-K1 \
+; RUN: --implicit-check-not Kernel2 --implicit-check-not Kernel3 \
+; RUN:  --implicit-check-not Kernel4 --implicit-check-not Kernel5 --implicit-check-not Kernel6
+
 ; CHECK-IR-K1: define {{.*}} @Kernel1
 ; CHECK-IR-K2: define {{.*}} @Kernel2
 ; CHECK-IR-K3: define {{.*}} @Kernel3
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll
index cd890e158c734..393943b63db43 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-1.ll
@@ -21,6 +21,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
 ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
 
+; RUN: sycl-module-split -split=auto -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \
+; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \
+; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
+
 ; RUN: sycl-post-link -split=kernel -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \
 ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
@@ -35,6 +49,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
 ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
 
+; RUN: sycl-module-split -split=kernel -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \
+; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \
+; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
+
 ; RUN: sycl-post-link -split=source -symbols -S %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \
 ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
@@ -49,6 +77,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
 ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
 
+; RUN: sycl-module-split -split=source -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \
+; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \
+; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
+
 ; Regardless of device code split mode, each kernel should go into a separate
 ; device image
 
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll
index 155b843c390a5..1efeb364cb2e3 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-sub-group-size-split-2.ll
@@ -16,6 +16,21 @@
 ; RUN:     --implicit-check-not kernel0 --implicit-check-not kernel1 \
 ; RUN:     --implicit-check-not kernel2
 
+; RUN: sycl-module-split -split=auto -S %s -o %t2
+; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-TABLE
+;
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-M0-SYMS \
+; RUN:     --implicit-check-not kernel0 --implicit-check-not kernel3
+;
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-M1-SYMS \
+; RUN:     --implicit-check-not kernel1 --implicit-check-not kernel2 \
+; RUN:     --implicit-check-not kernel3
+
+;
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-M2-SYMS \
+; RUN:     --implicit-check-not kernel0 --implicit-check-not kernel1 \
+; RUN:     --implicit-check-not kernel2
+
 ; CHECK-TABLE: Code
 ; CHECK-TABLE-NEXT: _0.sym
 ; CHECK-TABLE-NEXT: _1.sym
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-1.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-1.ll
index fa5ffe782a7db..b156d71b1e3f6 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-1.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-1.ll
@@ -21,6 +21,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
 ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
 
+; RUN: sycl-module-split -split=auto -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \
+; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \
+; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
+
 ; RUN: sycl-post-link -split=source -symbols -S < %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \
 ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
@@ -35,6 +49,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
 ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
 
+; RUN: sycl-module-split -split=source -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \
+; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \
+; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
+
 ; RUN: sycl-post-link -split=kernel -symbols -S < %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll --check-prefixes CHECK-M0-IR \
 ; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
@@ -49,6 +77,20 @@
 ; RUN: FileCheck %s -input-file=%t_2.sym --check-prefixes CHECK-M2-SYMS \
 ; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
 
+; RUN: sycl-module-split -split=kernel -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll --check-prefixes CHECK-M0-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.ll --check-prefixes CHECK-M1-IR \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_2.ll --check-prefixes CHECK-M2-IR \
+; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefixes CHECK-M0-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel1
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefixes CHECK-M1-SYMS \
+; RUN: --implicit-check-not kernel0 --implicit-check-not kernel2
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefixes CHECK-M2-SYMS \
+; RUN: --implicit-check-not kernel1 --implicit-check-not kernel2
+
 ; Regardless of device code split mode, each kernel should go into a separate
 ; device image
 
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll
index cb38a596a7ba9..c92ae8dbc9c03 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/per-reqd-wg-size-split-2.ll
@@ -15,6 +15,20 @@
 ; RUN:     --implicit-check-not kernel1 --implicit-check-not kernel2 \
 ; RUN:     --implicit-check-not kernel0
 
+; RUN: sycl-module-split -split=auto -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-TABLE
+;
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-M0-SYMS \
+; RUN:     --implicit-check-not kernel0 --implicit-check-not kernel2
+;
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-M1-SYMS \
+; RUN:     --implicit-check-not kernel1 --implicit-check-not kernel3 \
+; RUN:     --implicit-check-not kernel2
+;
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-M2-SYMS \
+; RUN:     --implicit-check-not kernel1 --implicit-check-not kernel2 \
+; RUN:     --implicit-check-not kernel0
+
 ; CHECK-TABLE: Code
 ; CHECK-TABLE-NEXT: _0.sym
 ; CHECK-TABLE-NEXT: _1.sym
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/split-with-kernel-declarations.ll b/llvm/test/tools/sycl-post-link/device-code-split/split-with-kernel-declarations.ll
index 595427a786e7b..82213e4b3beeb 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/split-with-kernel-declarations.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/split-with-kernel-declarations.ll
@@ -6,11 +6,22 @@
 ; RUN: FileCheck %s -input-file=%t_0.sym --check-prefix CHECK-PER-SOURCE-SYM0
 ; RUN: FileCheck %s -input-file=%t_1.sym --check-prefix CHECK-PER-SOURCE-SYM1
 ;
-; RUN: sycl-post-link -split=kernel -symbols -S < %s -o %t1.table
-; RUN: FileCheck %s -input-file=%t1.table --check-prefix CHECK-PER-KERNEL-TABLE
-; RUN: FileCheck %s -input-file=%t1_0.sym --check-prefix CHECK-PER-KERNEL-SYM1
-; RUN: FileCheck %s -input-file=%t1_1.sym --check-prefix CHECK-PER-KERNEL-SYM2
-; RUN: FileCheck %s -input-file=%t1_2.sym --check-prefix CHECK-PER-KERNEL-SYM0
+; RUN: sycl-module-split -split=source -S < %s -o %t1
+; RUN: FileCheck %s -input-file=%t1.table --check-prefix CHECK-PER-SOURCE-TABLE
+; RUN: FileCheck %s -input-file=%t1_0.sym --check-prefix CHECK-PER-SOURCE-SYM0
+; RUN: FileCheck %s -input-file=%t1_1.sym --check-prefix CHECK-PER-SOURCE-SYM1
+;
+; RUN: sycl-post-link -split=kernel -symbols -S < %s -o %t2.table
+; RUN: FileCheck %s -input-file=%t2.table --check-prefix CHECK-PER-KERNEL-TABLE
+; RUN: FileCheck %s -input-file=%t2_0.sym --check-prefix CHECK-PER-KERNEL-SYM1
+; RUN: FileCheck %s -input-file=%t2_1.sym --check-prefix CHECK-PER-KERNEL-SYM2
+; RUN: FileCheck %s -input-file=%t2_2.sym --check-prefix CHECK-PER-KERNEL-SYM0
+;
+; RUN: sycl-module-split -split=kernel -S < %s -o %t3
+; RUN: FileCheck %s -input-file=%t3.table --check-prefix CHECK-PER-KERNEL-TABLE
+; RUN: FileCheck %s -input-file=%t3_0.sym --check-prefix CHECK-PER-KERNEL-SYM1
+; RUN: FileCheck %s -input-file=%t3_1.sym --check-prefix CHECK-PER-KERNEL-SYM2
+; RUN: FileCheck %s -input-file=%t3_2.sym --check-prefix CHECK-PER-KERNEL-SYM0
 
 ; With per-source split, there should be two device images
 ; CHECK-PER-SOURCE-TABLE: [Code|Properties|Symbols]
diff --git a/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll b/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll
index 02d289fa772e0..cb9fd1f77cf78 100644
--- a/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll
+++ b/llvm/test/tools/sycl-post-link/device-code-split/vtable.ll
@@ -42,6 +42,9 @@
 ; RUN: sycl-post-link -split=auto -S < %s -o %t.table
 ; RUN: FileCheck %s -input-file=%t_0.ll
 ;
+; RUN: sycl-module-split -split=auto -S < %s -o %t2
+; RUN: FileCheck %s -input-file=%t2_0.ll
+;
 ; CHECK-DAG: @_ZTV8Derived1 = {{.*}} @_ZN8Derived17displayEv
 ; CHECK-DAG: @_ZTV8Derived2 = {{.*}} @_ZN8Derived27displayEv
 ;
diff --git a/llvm/test/tools/sycl-post-link/emit_imported_symbols.ll b/llvm/test/tools/sycl-post-link/emit_imported_symbols.ll
new file mode 100644
index 0000000000000..ae824d293b9ea
--- /dev/null
+++ b/llvm/test/tools/sycl-post-link/emit_imported_symbols.ll
@@ -0,0 +1,113 @@
+; This test checks that the -emit-imported-symbols option generates a list of imported symbols
+; Function names were chosen so that no function with 'inside' in its name is imported.
+;
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Test with -split=kernel
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; RUN: sycl-post-link -symbols -emit-imported-symbols -split=kernel -S < %s -o %t_kernel.table
+
+; RUN: FileCheck %s -input-file=%t_kernel_0.sym --check-prefixes CHECK-KERNEL-SYM-0
+; RUN: FileCheck %s -input-file=%t_kernel_1.sym --check-prefixes CHECK-KERNEL-SYM-1
+; RUN: FileCheck %s -input-file=%t_kernel_2.sym --check-prefixes CHECK-KERNEL-SYM-2
+
+; RUN: FileCheck %s -input-file=%t_kernel_0.prop --check-prefixes CHECK-KERNEL-IMPORTED-SYM-0
+; RUN: FileCheck %s -input-file=%t_kernel_1.prop --check-prefixes CHECK-KERNEL-IMPORTED-SYM-1
+; RUN: FileCheck %s -input-file=%t_kernel_2.prop --check-prefixes CHECK-KERNEL-IMPORTED-SYM-2
+
+; CHECK-KERNEL-SYM-0: middle
+; CHECK-KERNEL-IMPORTED-SYM-0: [SYCL/imported symbols]
+; CHECK-KERNEL-IMPORTED-SYM-0-NEXT: childD
+; CHECK-KERNEL-IMPORTED-SYM-0-EMPTY:
+
+; CHECK-KERNEL-SYM-1: foo
+; CHECK-KERNEL-IMPORTED-SYM-1: [SYCL/imported symbols]
+; CHECK-KERNEL-IMPORTED-SYM-1-NEXT: childA
+; CHECK-KERNEL-IMPORTED-SYM-1-NEXT: childC
+; CHECK-KERNEL-IMPORTED-SYM-1-NEXT: childD
+; CHECK-KERNEL-IMPORTED-SYM-1-EMPTY:
+
+
+; CHECK-KERNEL-SYM-2: bar
+; CHECK-KERNEL-IMPORTED-SYM-2: [SYCL/imported symbols]
+; CHECK-KERNEL-IMPORTED-SYM-2-NEXT: childB
+; CHECK-KERNEL-IMPORTED-SYM-2-NEXT: childC
+; CHECK-KERNEL-IMPORTED-SYM-2-NEXT: childD
+; CHECK-KERNEL-IMPORTED-SYM-2-NEXT: _Z7outsidev
+; CHECK-KERNEL-IMPORTED-SYM-2-EMPTY:
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+; Test with -split=source
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+; RUN: sycl-post-link -symbols -emit-imported-symbols -split=source -S < %s -o %t_source.table
+; RUN: FileCheck %s -input-file=%t_source_0.sym --check-prefixes CHECK-SOURCE-SYM-0
+; RUN: FileCheck %s -input-file=%t_source_0.prop --check-prefixes CHECK-SOURCE-IMPORTED-SYM-0
+
+; RUN: sycl-post-link -symbols -emit-imported-symbols -split=source -S < %s -o %t_source.table -O0
+; RUN: FileCheck %s -input-file=%t_source_0.sym --check-prefixes CHECK-SOURCE-SYM-0
+; RUN: FileCheck %s -input-file=%t_source_0.prop --check-prefixes CHECK-SOURCE-IMPORTED-SYM-0
+
+; CHECK-SOURCE-SYM-0-DAG: foo
+; CHECK-SOURCE-SYM-0-DAG: bar
+; CHECK-SOURCE-SYM-0-DAG: middle
+
+; CHECK-SOURCE-IMPORTED-SYM-0: [SYCL/imported symbols]
+; CHECK-SOURCE-IMPORTED-SYM-0-NEXT: childA
+; CHECK-SOURCE-IMPORTED-SYM-0-NEXT: childB
+; CHECK-SOURCE-IMPORTED-SYM-0-NEXT: childC
+; CHECK-SOURCE-IMPORTED-SYM-0-NEXT: childD
+; CHECK-SOURCE-IMPORTED-SYM-0-NEXT: _Z7outsidev
+; CHECK-SOURCE-IMPORTED-SYM-0-EMPTY:
+
+target triple = "spir64-unknown-unknown"
+
+@llvm.used = appending global [2 x ptr] [ptr @foo, ptr @bar], section "llvm.metadata"
+
+define weak_odr spir_kernel void @foo() #0 {
+  call void @childA()
+  call void @childC()
+  call void @middle() 
+  ret void
+}
+
+define weak_odr spir_kernel void @bar() #0 {
+  ;; Functions that are not SYCL External (i.e. they have no sycl-module-id) cannot be imported
+  call spir_func void @__itt_offload_wi_start_wrapper()
+
+  call void @childB()
+  call void @childC()
+  call void @middle()
+  ;; LLVM intrinsics cannot be imported
+  %dummy = call i8 @llvm.bitreverse.i8(i8 0)
+  ;; Functions with a demangled name prefixed with a '__' are not imported
+  call void @_Z8__insidev()
+  call void @_Z7outsidev()
+
+  ;; Functions that are not SYCL External (i.e. they have no sycl-module-id) cannot be imported
+  call spir_func void @__itt_offload_wi_finish_wrapper()
+  ret void
+}
+
+define void @middle() #0 {
+  call void @childD()
+  ret void
+}
+
+declare void @childA() #1
+declare void @childB() #1
+declare void @childC() #1
+declare void @childD() #1
+
+declare void @_Z7outsidev() #1
+;; Verify unused functions are not imported
+declare void @insideUnusedFunction() #1
+declare void @_Z8__insidev() #1
+declare i8 @llvm.bitreverse.i8(i8)
+
+declare spir_func void @__itt_offload_wi_start_wrapper()
+declare spir_func void @__itt_offload_wi_finish_wrapper()
+
+attributes #0 = { "sycl-module-id"="a.cpp" }
+attributes #1 = { "sycl-module-id"="external.cpp" }
diff --git a/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll b/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll
index 1f014410d0a1c..7c2ab6e91b925 100644
--- a/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll
+++ b/llvm/test/tools/sycl-post-link/multiple-filtered-outputs.ll
@@ -65,136 +65,56 @@
 target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64"
 target triple = "spir64-unknown-unknown"
 
-; Function Attrs: mustprogress norecurse nounwind
-define weak_odr dso_local spir_kernel void @double_kernel(ptr addrspace(1) noundef align 8 %_arg_out) local_unnamed_addr #0 !srcloc !65 !kernel_arg_buffer_location !66 !sycl_used_aspects !67 !sycl_fixed_targets !68 !sycl_kernel_omit_args !69 {
+define spir_kernel void @double_kernel(ptr addrspace(1) noundef align 8 %_arg_out) #0 !sycl_used_aspects !67 {
 entry:
-  %0 = load double, ptr addrspace(1) %_arg_out, align 8, !tbaa !70
+  %0 = load double, ptr addrspace(1) %_arg_out, align 8
   %mul.i = fmul double %0, 2.000000e-01
-  store double %mul.i, ptr addrspace(1) %_arg_out, align 8, !tbaa !70
+  store double %mul.i, ptr addrspace(1) %_arg_out, align 8
   ret void
 }
 
-; Function Attrs: mustprogress norecurse nounwind
-define weak_odr dso_local spir_kernel void @float_kernel(ptr addrspace(1) noundef align 4 %_arg_out) local_unnamed_addr #0 !srcloc !74 !kernel_arg_buffer_location !66 !sycl_fixed_targets !68 !sycl_kernel_omit_args !69 {
+define spir_kernel void @float_kernel(ptr addrspace(1) noundef align 4 %_arg_out) #0 {
 entry:
-  %0 = load float, ptr addrspace(1) %_arg_out, align 4, !tbaa !75
+  %0 = load float, ptr addrspace(1) %_arg_out, align 4
   %mul.i = fmul float %0, 0x3FC99999A0000000
-  store float %mul.i, ptr addrspace(1) %_arg_out, align 4, !tbaa !75
+  store float %mul.i, ptr addrspace(1) %_arg_out, align 4
   ret void
 }
 
-; Function Attrs: mustprogress norecurse nounwind
-define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_8() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !78 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 {
+define spir_kernel void @reqd_sub_group_size_kernel_8() #0 !intel_reqd_sub_group_size !78 {
 entry:
   ret void
 }
 
-; Function Attrs: mustprogress norecurse nounwind
-define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_16() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !79 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 {
+define spir_kernel void @reqd_sub_group_size_kernel_16() #0 !intel_reqd_sub_group_size !79 {
 entry:
   ret void
 }
 
-; Function Attrs: mustprogress norecurse nounwind
-define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_32() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !80 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 {
+define spir_kernel void @reqd_sub_group_size_kernel_32() #0 !intel_reqd_sub_group_size !80 {
 entry:
   ret void
 }
 
-; Function Attrs: mustprogress norecurse nounwind
-define weak_odr dso_local spir_kernel void @reqd_sub_group_size_kernel_64() local_unnamed_addr #0 !srcloc !77 !kernel_arg_buffer_location !68 !intel_reqd_sub_group_size !81 !sycl_fixed_targets !68 !sycl_kernel_omit_args !68 {
+define spir_kernel void @reqd_sub_group_size_kernel_64() #0 !intel_reqd_sub_group_size !81 {
 entry:
   ret void
 }
 
-declare dso_local spir_func i32 @_Z18__spirv_ocl_printfPU3AS2Kcz(ptr addrspace(2), ...)
-
 attributes #0 = { mustprogress norecurse nounwind "frame-pointer"="all" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "sycl-module-id"="double.cpp" "sycl-optlevel"="3" "uniform-work-group-size"="true" }
 
 !llvm.module.flags = !{!0, !1}
 !opencl.spir.version = !{!2}
 !spirv.Source = !{!3}
-!sycl_aspects = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37, !38, !39, !40, !41, !42, !43, !44, !45, !46, !47, !48, !49, !50, !51, !52, !53, !54, !55, !56, !57, !58, !59, !60, !61, !62, !63}
 !llvm.ident = !{!64}
 
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 7, !"frame-pointer", i32 2}
 !2 = !{i32 1, i32 2}
 !3 = !{i32 4, i32 100000}
-!4 = !{!"cpu", i32 1}
-!5 = !{!"gpu", i32 2}
-!6 = !{!"accelerator", i32 3}
-!7 = !{!"custom", i32 4}
-!8 = !{!"fp16", i32 5}
 !9 = !{!"fp64", i32 6}
-!10 = !{!"image", i32 9}
-!11 = !{!"online_compiler", i32 10}
-!12 = !{!"online_linker", i32 11}
-!13 = !{!"queue_profiling", i32 12}
-!14 = !{!"usm_device_allocations", i32 13}
-!15 = !{!"usm_host_allocations", i32 14}
-!16 = !{!"usm_shared_allocations", i32 15}
-!17 = !{!"usm_system_allocations", i32 17}
-!18 = !{!"ext_intel_pci_address", i32 18}
-!19 = !{!"ext_intel_gpu_eu_count", i32 19}
-!20 = !{!"ext_intel_gpu_eu_simd_width", i32 20}
-!21 = !{!"ext_intel_gpu_slices", i32 21}
-!22 = !{!"ext_intel_gpu_subslices_per_slice", i32 22}
-!23 = !{!"ext_intel_gpu_eu_count_per_subslice", i32 23}
-!24 = !{!"ext_intel_max_mem_bandwidth", i32 24}
-!25 = !{!"ext_intel_mem_channel", i32 25}
-!26 = !{!"usm_atomic_host_allocations", i32 26}
-!27 = !{!"usm_atomic_shared_allocations", i32 27}
-!28 = !{!"atomic64", i32 28}
-!29 = !{!"ext_intel_device_info_uuid", i32 29}
-!30 = !{!"ext_oneapi_srgb", i32 30}
-!31 = !{!"ext_oneapi_native_assert", i32 31}
-!32 = !{!"host_debuggable", i32 32}
-!33 = !{!"ext_intel_gpu_hw_threads_per_eu", i32 33}
-!34 = !{!"ext_oneapi_cuda_async_barrier", i32 34}
-!35 = !{!"ext_oneapi_bfloat16_math_functions", i32 35}
-!36 = !{!"ext_intel_free_memory", i32 36}
-!37 = !{!"ext_intel_device_id", i32 37}
-!38 = !{!"ext_intel_memory_clock_rate", i32 38}
-!39 = !{!"ext_intel_memory_bus_width", i32 39}
-!40 = !{!"emulated", i32 40}
-!41 = !{!"ext_intel_legacy_image", i32 41}
-!42 = !{!"ext_oneapi_bindless_images", i32 42}
-!43 = !{!"ext_oneapi_bindless_images_shared_usm", i32 43}
-!44 = !{!"ext_oneapi_bindless_images_1d_usm", i32 44}
-!45 = !{!"ext_oneapi_bindless_images_2d_usm", i32 45}
-!46 = !{!"ext_oneapi_interop_memory_import", i32 46}
-!47 = !{!"ext_oneapi_interop_memory_export", i32 47}
-!48 = !{!"ext_oneapi_interop_semaphore_import", i32 48}
-!49 = !{!"ext_oneapi_interop_semaphore_export", i32 49}
-!50 = !{!"ext_oneapi_mipmap", i32 50}
-!51 = !{!"ext_oneapi_mipmap_anisotropy", i32 51}
-!52 = !{!"ext_oneapi_mipmap_level_reference", i32 52}
-!53 = !{!"ext_intel_esimd", i32 53}
-!54 = !{!"ext_oneapi_ballot_group", i32 54}
-!55 = !{!"ext_oneapi_fixed_size_group", i32 55}
-!56 = !{!"ext_oneapi_opportunistic_group", i32 56}
-!57 = !{!"ext_oneapi_tangle_group", i32 57}
-!58 = !{!"ext_intel_matrix", i32 58}
-!59 = !{!"int64_base_atomics", i32 7}
-!60 = !{!"int64_extended_atomics", i32 8}
-!61 = !{!"usm_system_allocator", i32 17}
-!62 = !{!"usm_restricted_shared_allocations", i32 16}
-!63 = !{!"host", i32 0}
 !64 = !{!"clang version 19.0.0git (/ws/llvm/clang a7f3a637bdd6299831f903bbed9e8d069fea5c86)"}
-!65 = !{i32 233}
-!66 = !{i32 -1}
-!67 = !{i32 6}
-!68 = !{}
-!69 = !{i1 false}
-!70 = !{!71, !71, i64 0}
-!71 = !{!"double", !72, i64 0}
-!72 = !{!"omnipotent char", !73, i64 0}
-!73 = !{!"Simple C++ TBAA"}
-!74 = !{i32 364}
-!75 = !{!76, !76, i64 0}
-!76 = !{!"float", !72, i64 0}
-!77 = !{i32 529}
+!67 = !{!9}
 !78 = !{i32 8}
 !79 = !{i32 16}
 !80 = !{i32 32}
diff --git a/llvm/tools/sycl-module-split/CMakeLists.txt b/llvm/tools/sycl-module-split/CMakeLists.txt
new file mode 100644
index 0000000000000..0c29be481e538
--- /dev/null
+++ b/llvm/tools/sycl-module-split/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(LLVM_LINK_COMPONENTS
+  Core
+  IRReader
+  Support
+  SYCLLowerIR
+  )
+# Command line tool used to test SYCL module splitting.
+add_llvm_tool(sycl-module-split
+  sycl-module-split.cpp
+  )
diff --git a/llvm/tools/sycl-module-split/sycl-module-split.cpp b/llvm/tools/sycl-module-split/sycl-module-split.cpp
new file mode 100644
index 0000000000000..89d8b9e10b2b7
--- /dev/null
+++ b/llvm/tools/sycl-module-split/sycl-module-split.cpp
@@ -0,0 +1,130 @@
+//==-- sycl-module-split: command line tool for testing SYCL Module Splitting //
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This program can be used only to test the SYCL Module Splitting.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/SYCLLowerIR/ModuleSplitter.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/PropertySetIO.h"
+#include "llvm/Support/SimpleTable.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace llvm::util;
+using namespace module_split;
+
+static cl::OptionCategory SplitCategory("Split options");
+// Positional input file name; defaults to "-".
+static cl::opt<std::string> InputFilename(cl::Positional, cl::desc(""),
+                                          cl::init("-"),
+                                          cl::value_desc("filename"));
+// -o: prefix shared by the names of the files written by this tool.
+static cl::opt<std::string>
+    OutputFilenamePrefix("o", cl::desc("output filename prefix"),
+                         cl::value_desc("filename prefix"), cl::init("output"),
+                         cl::cat(SplitCategory));
+// -S: copied into ModuleSplitterSettings::OutputAssembly in main().
+cl::opt<bool> OutputAssembly{"S", cl::desc("Write output as LLVM assembly"),
+                             cl::cat(SplitCategory)};
+// -split=<source|kernel|auto>: splitting mode; SPLIT_NONE when not given.
+cl::opt<IRSplitMode> SplitMode(
+    "split", cl::desc("split input module"), cl::Optional, cl::init(SPLIT_NONE),
+    cl::values(clEnumValN(module_split::SPLIT_PER_TU, "source",
+                          "1 output module per source (translation unit)"),
+               clEnumValN(module_split::SPLIT_PER_KERNEL, "kernel",
+                          "1 output module per kernel"),
+               clEnumValN(module_split::SPLIT_AUTO, "auto",
+                          "Choose split mode automatically")),
+    cl::cat(SplitCategory));
+
+/// Writes \p Content and a newline to \p Path; exits with 1 if open fails.
+void writeStringToFile(const std::string &Content, StringRef Path) {
+  std::error_code EC;
+  raw_fd_ostream OS(Path, EC);
+  if (EC) {
+    errs() << formatv("error opening file: {0}: {1}\n", Path, EC.message());
+    exit(1);
+  }
+  OS << Content << "\n";
+}
+
+/// Serializes \p Properties to \p Path; exits with 1 if it cannot be opened.
+void writePropertiesToFile(const PropertySetRegistry &Properties,
+                           StringRef Path) {
+  std::error_code EC;
+  raw_fd_ostream OS(Path, EC);
+  if (EC) {
+    errs() << formatv("error opening file: {0}: {1}\n", Path, EC.message());
+    exit(1);
+  }
+  Properties.write(OS);
+}
+
+/// Writes "<Path>_<I>.prop" and "<Path>_<I>.sym" for each split module, then
+/// "<Path>.table" listing every module's code, properties and symbols files.
+void dumpModulesAsTable(const std::vector<SplitModule> &SplitModules,
+                        StringRef Path) {
+  std::vector<StringRef> Columns = {"Code", "Properties", "Symbols"};
+  auto TableOrErr = SimpleTable::create(Columns);
+  if (!TableOrErr) {
+    errs() << "can't create a table\n";
+    exit(1);
+  }
+  std::unique_ptr<SimpleTable> Table = std::move(*TableOrErr);
+  for (const auto &[I, SM] : enumerate(SplitModules)) {
+    std::string SymbolsFile = (Twine(Path) + "_" + Twine(I) + ".sym").str();
+    std::string PropertiesFile = (Twine(Path) + "_" + Twine(I) + ".prop").str();
+    writePropertiesToFile(SM.Properties, PropertiesFile);
+    writeStringToFile(SM.Symbols, SymbolsFile);
+    SmallVector<StringRef, 3> Row = {SM.ModuleFilePath, PropertiesFile,
+                                     SymbolsFile};
+    Table->addRow(Row);
+  }
+
+  std::error_code EC;
+  raw_fd_ostream OS((Path + ".table").str(), EC);
+  if (EC) {
+    errs() << formatv("error opening file: {0}.table\n", Path);
+    exit(1);
+  }
+  Table->write(OS);
+}
+
+/// Parses the input IR, splits it per the command line and dumps the table.
+int main(int argc, char *argv[]) {
+  LLVMContext C;
+  SMDiagnostic Err;
+  cl::ParseCommandLineOptions(argc, argv, "SYCL Module Splitter\n");
+  std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, C);
+  if (!M) {
+    Err.print(argv[0], errs());
+    return 1;
+  }
+
+  ModuleSplitterSettings Settings;
+  Settings.Mode = SplitMode;
+  Settings.OutputAssembly = OutputAssembly;
+  Settings.OutputPrefix = OutputFilenamePrefix;
+  auto SplitModulesOrErr = splitSYCLModule(std::move(M), Settings);
+  if (!SplitModulesOrErr) {
+    // Err is only filled in by parseIRFile; report the splitter's own Error.
+    logAllUnhandledErrors(SplitModulesOrErr.takeError(), errs(), "error: ");
+    return 1;
+  }
+  dumpModulesAsTable(*SplitModulesOrErr, OutputFilenamePrefix);
+}
diff --git a/llvm/tools/sycl-post-link/CMakeLists.txt b/llvm/tools/sycl-post-link/CMakeLists.txt
index cfb9b1a27560f..aa98f4942edbc 100644
--- a/llvm/tools/sycl-post-link/CMakeLists.txt
+++ b/llvm/tools/sycl-post-link/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(LLVM_LINK_COMPONENTS
   BitWriter
   Core
+  Demangle
   IPO
   IRPrinter
   IRReader
diff --git a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
index 8f8dd8267c771..1554e81751668 100644
--- a/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
+++ b/llvm/tools/sycl-post-link/SYCLDeviceLibReqMask.cpp
@@ -668,6 +668,30 @@ SYCLDeviceLibFuncMap SDLMap = {
      DeviceLibExt::cl_intel_devicelib_bfloat16},
     {"__devicelib_ConvertBF16ToFINTEL",
      DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertFToBF16INTELVec1",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertBF16ToFINTELVec1",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertFToBF16INTELVec2",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertBF16ToFINTELVec2",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertFToBF16INTELVec3",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertBF16ToFINTELVec3",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertFToBF16INTELVec4",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertBF16ToFINTELVec4",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertFToBF16INTELVec8",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertBF16ToFINTELVec8",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertFToBF16INTELVec16",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
+    {"__devicelib_ConvertBF16ToFINTELVec16",
+     DeviceLibExt::cl_intel_devicelib_bfloat16},
 };
 
 // Each fallback device library corresponds to one bit in "require mask" which
diff --git a/llvm/tools/sycl-post-link/sycl-post-link.cpp b/llvm/tools/sycl-post-link/sycl-post-link.cpp
index 6c6db956c383a..9afa25c3a6552 100644
--- a/llvm/tools/sycl-post-link/sycl-post-link.cpp
+++ b/llvm/tools/sycl-post-link/sycl-post-link.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/BitcodeWriterPass.h"
+#include "llvm/Demangle/Demangle.h"
 #include "llvm/GenXIntrinsics/GenXSPIRVWriterAdaptor.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/LLVMContext.h"
@@ -228,6 +229,10 @@ cl::opt<bool> EmitExportedSymbols{"emit-exported-symbols",
                                   cl::desc("emit exported symbols"),
                                   cl::cat(PostLinkCat)};
 
+cl::opt<bool> EmitImportedSymbols{"emit-imported-symbols",
+                                  cl::desc("emit imported symbols"),
+                                  cl::cat(PostLinkCat)};
+
 cl::opt<bool> EmitOnlyKernelsAsEntryPoints{
     "emit-only-kernels-as-entry-points",
     cl::desc("Consider only sycl_kernel functions as entry points for "
@@ -250,6 +255,7 @@ struct GlobalBinImageProps {
   bool EmitKernelParamInfo;
   bool EmitProgramMetadata;
   bool EmitExportedSymbols;
+  bool EmitImportedSymbols;
   bool EmitDeviceGlobalPropSet;
 };
 
@@ -411,6 +417,25 @@ std::string saveModuleIR(Module &M, int I, StringRef Suff) {
   return OutFilename;
 }
 
+/// Returns true when \p F is a non-intrinsic declaration that passes
+/// isSYCLExternalFunction and whose demangled name does not start with "__".
+bool isImportedFunction(const Function &F) {
+  const bool IsCandidate = F.isDeclaration() && !F.isIntrinsic() &&
+                           llvm::sycl::utils::isSYCLExternalFunction(&F);
+  if (!IsCandidate)
+    return false;
+  // Module splitting cleanup runs StripDeadPrototypes, so any declaration
+  // still present here is expected to have at least one use.
+  assert(!F.use_empty() && "Function F has no uses");
+  // A name that does not demangle is treated as imported.
+  char *Demangled = itaniumDemangle(F.getName());
+  if (!Demangled)
+    return true;
+  const bool HasDoubleUnderscore = StringRef(Demangled).starts_with("__");
+  free(Demangled);
+  return !HasDoubleUnderscore;
+}
+
 std::string saveModuleProperties(module_split::ModuleDesc &MD,
                                  const GlobalBinImageProps &GlobProps, int I,
                                  StringRef Suff) {
@@ -474,10 +499,21 @@ std::string saveModuleProperties(module_split::ModuleDesc &MD,
       // so they won't make it into the export list. Should the check be
       // F->getCallingConv() != CallingConv::SPIR_KERNEL?
       if (F->getCallingConv() == CallingConv::SPIR_FUNC) {
-        PropSet.add(PropSetRegTy::SYCL_EXPORTED_SYMBOLS, F->getName(), true);
+        PropSet.add(PropSetRegTy::SYCL_EXPORTED_SYMBOLS, F->getName(),
+                    /*PropVal=*/true);
       }
     }
   }
+
+  if (GlobProps.EmitImportedSymbols) {
+    // record imported functions in the property set
+    for (const auto &F : M) {
+      if (isImportedFunction(F))
+        PropSet.add(PropSetRegTy::SYCL_IMPORTED_SYMBOLS, F.getName(),
+                    /*PropVal=*/true);
+    }
+  }
+
   // Metadata names may be composite so we keep them alive until the
   // properties have been written.
   SmallVector<std::string, 4> MetadataNames;
@@ -730,7 +766,8 @@ IrPropSymFilenameTriple saveModule(module_split::ModuleDesc &MD, int I,
     Res.Ir = saveModuleIR(MD.getModule(), I, Suffix);
   }
   GlobalBinImageProps Props = {EmitKernelParamInfo, EmitProgramMetadata,
-                               EmitExportedSymbols, DeviceGlobals};
+                               EmitExportedSymbols, EmitImportedSymbols,
+                               DeviceGlobals};
   Res.Prop = saveModuleProperties(MD, Props, I, Suffix);
 
   if (DoSymGen) {
@@ -1014,41 +1051,12 @@ bool isTargetCompatibleWithModule(const std::optional<std::string> &Target,
       DeviceConfigFile::TargetTable[*Target];
   const SYCLDeviceRequirements &ModuleReqs =
       IrMD.getOrComputeDeviceRequirements();
-  // The device config file data stores the target's supported
-  // aspects as a vector of the strings, so we need to translate
-  // the values to a common format.
-  const NamedMDNode *Node = IrMD.getModule().getNamedMetadata("sycl_aspects");
-  if (Node) {
-    SmallMapVector<StringRef, int, 32> AspectNameToValue;
-    for (const MDNode *N : Node->operands()) {
-      assert(N->getNumOperands() == 2 &&
-             "Each operand of sycl_aspects must be a pair.");
-
-      // The aspect's name is the first operand.
-      const auto *AspectName = cast<MDString>(N->getOperand(0));
-
-      // The aspect's integral value is the second operand.
-      const auto *AspectCAM = cast<ConstantAsMetadata>(N->getOperand(1));
-      const Constant *AspectC = AspectCAM->getValue();
-
-      AspectNameToValue[AspectName->getString()] =
-          cast<ConstantInt>(AspectC)->getSExtValue();
-    }
-
-    // Make the set of aspects values the target supports.
-    SmallSet<int64_t, 32> TargetAspectValueSet;
-    for (const auto &Aspect : TargetInfo.aspects) {
-      auto It = AspectNameToValue.find(Aspect);
-      assert(It != AspectNameToValue.end() && "Aspect value mapping unknown!");
-      TargetAspectValueSet.insert(It->second);
-    }
 
-    // Now check to see if all the requirements of the input module
-    // are compatbile with the target.
-    for (const auto &Aspect : ModuleReqs.Aspects) {
-      if (!TargetAspectValueSet.contains(Aspect))
-        return false;
-    }
+  // Check to see if all the requirements of the input module
+  // are compatible with the target.
+  for (const auto &Aspect : ModuleReqs.Aspects) {
+    if (!is_contained(TargetInfo.aspects, Aspect.Name))
+      return false;
   }
 
   // Check if module sub group size is compatible with the target.
@@ -1278,13 +1286,14 @@ int main(int argc, char **argv) {
   bool DoParamInfo = EmitKernelParamInfo.getNumOccurrences() > 0;
   bool DoProgMetadata = EmitProgramMetadata.getNumOccurrences() > 0;
   bool DoExportedSyms = EmitExportedSymbols.getNumOccurrences() > 0;
+  bool DoImportedSyms = EmitImportedSymbols.getNumOccurrences() > 0;
   bool DoDeviceGlobals = DeviceGlobals.getNumOccurrences() > 0;
   bool DoGenerateDeviceImageWithDefaulValues =
       GenerateDeviceImageWithDefaultSpecConsts.getNumOccurrences() > 0;
 
   if (!DoSplit && !DoSpecConst && !DoSymGen && !DoParamInfo &&
-      !DoProgMetadata && !DoSplitEsimd && !DoExportedSyms && !DoDeviceGlobals &&
-      !DoLowerEsimd) {
+      !DoProgMetadata && !DoSplitEsimd && !DoExportedSyms && !DoImportedSyms &&
+      !DoDeviceGlobals && !DoLowerEsimd) {
     errs() << "no actions specified; try --help for usage info\n";
     return 1;
   }
@@ -1318,6 +1327,11 @@ int main(int argc, char **argv) {
            << " -" << IROutputOnly.ArgStr << "\n";
     return 1;
   }
+  if (IROutputOnly && DoImportedSyms) {
+    errs() << "error: -" << EmitImportedSymbols.ArgStr << " can't be used with"
+           << " -" << IROutputOnly.ArgStr << "\n";
+    return 1;
+  }
   if (IROutputOnly && DoGenerateDeviceImageWithDefaulValues) {
     errs() << "error: -" << GenerateDeviceImageWithDefaultSpecConsts.ArgStr
            << " can't be used with -" << IROutputOnly.ArgStr << "\n";
diff --git a/llvm/utils/git/requirements.txt b/llvm/utils/git/requirements.txt
index de84e17104954..656f64bb7315e 100644
--- a/llvm/utils/git/requirements.txt
+++ b/llvm/utils/git/requirements.txt
@@ -240,7 +240,7 @@ smmap==5.0.1 \
     --hash=sha256:dceeb6c0028fdb6734471eb07c0cd2aae706ccaecab45965ee83f11c8d3b1f62 \
     --hash=sha256:e6d8668fa5f93e706934a62d7b4db19c8d9eb8cf2adbb75ef1b675aa332b69da
     # via gitdb
-urllib3==2.2.1 \
+urllib3==2.2.2 \
     --hash=sha256:450b20ec296a467077128bff42b73080516e71b56ff59a60a02bef2232c4fa9d \
     --hash=sha256:d0570876c61ab9e520d776c38acbbb5b05a776d3f9ff98a5c8fd5162a444cf19
     # via requests
diff --git a/llvm/utils/git/requirements_formatting.txt b/llvm/utils/git/requirements_formatting.txt
index 6fadbaffcb7c1..2741c03fa26b7 100644
--- a/llvm/utils/git/requirements_formatting.txt
+++ b/llvm/utils/git/requirements_formatting.txt
@@ -46,7 +46,7 @@ requests==2.32.0
     # via pygithub
 toml==0.10.2
     # via darker
-urllib3==2.2.1
+urllib3==2.2.2
     # via requests
 wrapt==1.16.0
     # via deprecated
diff --git a/mlir/utils/vscode/package-lock.json b/mlir/utils/vscode/package-lock.json
index 9f4d8f51f31c6..11edbbbf968f4 100644
--- a/mlir/utils/vscode/package-lock.json
+++ b/mlir/utils/vscode/package-lock.json
@@ -285,11 +285,11 @@
       }
     },
     "node_modules/braces": {
-      "version": "3.0.2",
-      "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz",
-      "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==",
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz",
+      "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==",
       "dependencies": {
-        "fill-range": "^7.0.1"
+        "fill-range": "^7.1.1"
       },
       "engines": {
         "node": ">=8"
@@ -724,9 +724,9 @@
       }
     },
     "node_modules/fill-range": {
-      "version": "7.0.1",
-      "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz",
-      "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==",
+      "version": "7.1.1",
+      "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
+      "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==",
       "dependencies": {
         "to-regex-range": "^5.0.1"
       },
@@ -2208,11 +2208,11 @@
       }
     },
     "braces": {
-      "version": "3.0.2",
-      "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.2.tgz",
-      "integrity": "sha512-b8um+L1RzM3WDSzvhm6gIz1yfTbBt6YTlcEKAvsmqCZZFw46z626lVj9j1yEPW33H5H+lBQpZMP1k8l+78Ha0A==",
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz",
+      "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==",
       "requires": {
-        "fill-range": "^7.0.1"
+        "fill-range": "^7.1.1"
       }
     },
     "buffer": {
@@ -2533,9 +2533,9 @@
       }
     },
     "fill-range": {
-      "version": "7.0.1",
-      "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.0.1.tgz",
-      "integrity": "sha512-qOo9F+dMUmC2Lcb4BbVvnKJxTPjCm+RRpe4gDuGrzkL7mEVl/djYSu2OdQ2Pa302N4oqkSg9ir6jaLWJ2USVpQ==",
+      "version": "7.1.1",
+      "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
+      "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==",
       "requires": {
         "to-regex-range": "^5.0.1"
       }
diff --git a/sycl/ReleaseNotes.md b/sycl/ReleaseNotes.md
index b80e6640b9d18..bb592c570db92 100644
--- a/sycl/ReleaseNotes.md
+++ b/sycl/ReleaseNotes.md
@@ -1,3 +1,153 @@
+# Mar'24 release notes
+Release notes for commit range [f4e0d3177338](https://github.com/intel/llvm/commit/f4ed132f243ab43816ebe826669d978139964df2).. [d2817d6d317db1](https://github.com/intel/llvm/commit/d2817d6d317db1143bb227168e85c409d5ab7c82)
+
+## New Features
+### SYCL Compiler
+
+- Added more available CPU for `-march` option in OpenCL AOT compiler. [7911773c]
+- Added support for additional AMD GPU targets. [c1ce15944]
+- Supported detecting out-of-bound errors on CPU device, static local memory, and device globals via AddressSanitizer. [f331ba2063] [a14cfdd7999]
+- Provide a preprocessor macro to locate the CUPTI library when XPTI tracing is enabled during compiler build. [e15ebd08] [acf89a6c90]
+- Made `-fsycl-dump-device-code` save PTX files generated for the CUDA backend. [16e06ff]
+- When multiple floating point accuracy-related options are specified on the CLI, made the last option take precedence over others. [69e2b91]
+- Added a new `-fsycl-dump-device-code` option to dump device code generated during SYCL compilation into a user-specified directory. [96ce6ea]
+- Added support for `-fsycl-link` with ahead-of-time (AOT) compilation. [22fab5a]
+- Added support for `-O3` on Windows when using `clang-cl`. [0af4ac7]
+
+### SYCL Library
+
+- Implemented [ext_oneapi_kernel_compiler](https://github.com/intel/llvm/blob/096676e8d4d87475860723ed8a4d8c256bcd98c2/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler.asciidoc) SYCL extension.  [096676e8] [e5826540] [67086100]
+- Implemented [ext_intel_fp_control](https://github.com/intel/llvm/blob/bf8ea96f/sycl/doc/extensions/experimental/sycl_ext_intel_fp_control.asciidoc) SYCL extension. [bf8ea96f]
+- Implemented [ext_oneapi_kernel_compiler_opencl](https://github.com/intel/llvm/blob/6344ead19/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler_opencl.asciidoc) SYCL extension. [6344ead19]
+- Enabled kernel fusion with heterogeneous ND ranges for HIP targets. [e44888873]
+- Enabled [ext_oneapi_graph](https://github.com/intel/llvm/blob/5d7524543/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension for OpenCL and HIP backend. [5d7524543] [897b27076]
+- Supported graph partitioning for host task dependencies in [ext_oneapi_graph](https://github.com/intel/llvm/blob/d53f123a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension. [d53f123a]
+- Added ESIMD APIs for stochastic rounding, property-based gather, masked-gather, and ReaD timestamp counting. [aa4e87801] [3eca2d473] [1261e0518]
+- Added out-of-bounds `load`,`store`,`fill` and overloads accepting annotated pointers in [ext_oneapi_matrix](https://github.com/intel/llvm/blob/4c17a7f39/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc) SYCL extension [4c17a7f39] [f3137e99]
+- Added support for `queue::mem_advise` on HIP backends. [a669374b7] [ab86d0db]
+- Supported `fill` and `memset` nodes in [ext_oneapi_graph](https://github.com/intel/llvm/blob/8ea022954/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension. [8ea022954]
+- Implemented [ext_oneapi_in_order_queue_events](https://github.com/intel/llvm/blob/19072756e/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc) SYCL extension. [19072756e]
+- Implemented [ext_oneapi_address_cast](https://github.com/intel/llvm/blob/123705190/sycl/doc/extensions/proposed/sycl_ext_oneapi_address_cast.asciidoc) SYCL extension. [123705190]
+- Implemented [ext_oneapi_kernel_compiler_spirv](https://github.com/intel/llvm/blob/36e123d3e1/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler_spirv.asciidoc) SYCL extension. [36e123d3e1]
+- Implemented [ext_oneapi_composite_device](https://github.com/intel/llvm/blob/2db1a4f6a5/sycl/doc/extensions/experimental/sycl_ext_oneapi_composite_device.asciidoc) SYCL extension. [2db1a4f6a5]
+- Implemented joint matrix query from [ext_oneapi_matrix](https://github.com/intel/llvm/blob/00eebe1e4/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc) SYCL extension on CUDA and HIP backends. [00eebe1e4]
+- Added support for unsampled image arrays in [ext_oneapi_bindless_images](https://github.com/intel/llvm/blob/76ec3f0f7/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc) SYCL extension. [76ec3f0f7]
+- Added `__imf_rcp64h` - equivalent to CUDA's `__nv_rcp64h` - and `sqrt` function with selectable rounding modes to Intel math libdevice. [ce70cb521] [6c1dde4243b5]
+- Integrated OneAPI construction kit's vectorizer to Native CPU backend. [330ac57d6]
+- Added ability to compare device architecture and support for PVC-VG to [ext_oneapi_device_architecture](https://github.com/intel/llvm/blob/68445467/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc) SYCL extension. [68445467] [ac0e142e12]
+- Added `sycl::length` wrapper and helper functions in SYCLCompat library for occupancy calculation in Intel GPUs. [b209b321] [2525570]
+- Added support for SYCL barriers on Native CPU. [3c39d132a]
+- Added support for `bfloat16` to `sycl::vec`. [bbbe8839]
+- Added vectorized binary and unary operations through callable structs in the SYCLCompat library. [5505e03]
+- Supported profiling information for default-constructed events when `ext_oneapi_barrier` is submitted to an empty in-order queue. [200694b]
+- Implemented `ext_oneapi_private_alloca` by adding code generation capabilities for `private_alloca`. [f4e0d31]
+- Added support for memory attributes on `non-const` device global variables on FPGA. [3bb5f40] [3fc6708]
+- Added `set_default_queue` functionality to SYCLCompat library to enable changing the default queue of the current device. [e72b85c]
+- Propagate annotations from `annotated_ptr` to the underlying raw pointers to enable additional optimization opportunities. [8f182cd]
+
+### Documentation
+- Proposed [ext_intel_fp_control](https://github.com/intel/llvm/blob/bf8ea96f4/sycl/doc/extensions/experimental/sycl_ext_intel_fp_control.asciidoc) extension to allow specifying the rounding and denorm mode for floating-point operations in SYCL kernels. [bf8ea96f4]
+- Proposed [ext_oneapi_raw_kernel_arg](https://github.com/intel/llvm/blob/4168793978/sycl/doc/extensions/proposed/sycl_ext_oneapi_raw_kernel_arg.asciidoc) SYCL extension to allow opaque types to be passed to SYCL kernels. [4168793978]
+- Proposed [ext_oneapi_composite_device](https://github.com/intel/llvm/blob/9a1b9084/sycl/doc/extensions/experimental/sycl_ext_oneapi_composite_device.asciidoc) SYCL extension to allow card-level device access on PVC GPUs. [9a1b9084]
+- Proposed [ext_oneapi_in_order_queue_events](https://github.com/intel/llvm/blob/19072756e/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc) SYCL extension to allow getting event from the last submitted command and setting an external event as an implicit dependence on the next command submitted to the queue [19072756e]
+- Proposed [ext_oneapi_profiling_tag](https://github.com/intel/llvm/blob/b4ade420/sycl/doc/extensions/proposed/sycl_ext_oneapi_profiling_tag.asciidoc) SYCL extension to time commands submitted to the queue. [b4ade420]
+- Proposed [ext_oneapi_private_alloca](https://github.com/intel/llvm/blob/aaf7a58863/sycl/doc/extensions/experimental/sycl_ext_oneapi_private_alloca.asciidoc) SYCL extension to have specialization constant-length private memory allocations. [aaf7a58863]
+- Added `joint_matrix_prefetch` and overloads of load and store with `annotated_ptr` in [ext_intel_matrix](https://github.com/intel/llvm/blob/04a222f7bb3022f3623ad40c9de70fd97579061a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_intel_matrix.asciidoc) and [ext_oneapi_matrix](https://github.com/intel/llvm/blob/04a222f7bb3022f3623ad40c9de70fd97579061a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc) SYCL extensions. [04a222f]
+
+### Other changes
+- Created an additional version-agnostic copy of the SYCL import library during compiler build. [2d2e418c]
+
+## Improvements
+### SYCL Compiler
+- Enabled default selection of general register file (GRF) size on Linux for PVC GPUs. [8083f8a8]
+- Disabled passing `-sycl-opt` for NativeCPU to enable the original full LLVM optimization pipeline. [3fe77b9]
+- Enabled `-fsycl-esimd-force-stateless-mem` flag by default. [f316273]
+- Enabled `-emit-only-kernels-as-entry-points` by default on Intel backends for `sycl-post-link` to prevent device code bloating. [70fddbb]
+
+
+### SYCL Library
+- Improved error messages for invalid properties specified on non pointer types. [728b132a5]
+- Adopted a unified and scalable way to pass alignment and cache flags to all ESIMD functions. [a2208484ab] [960d898c] [5ef8df837d] [a57a96c77] [19cd6144a] [646ab086e5] [0bf2e666c]
+- Added default constructor to bindless sampler and image handler in [ext_oneapi_bindless_images](https://github.com/intel/llvm/blob/d65f3aa560/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc) SYCL extension. [d65f3aa560] [7bfdcfd4cabf]
+- Added `SYCL_CACHE_IN_MEM` environment variable to disable in-memory caching of programs and facilitated automatic program cache cleaning when running out of memory. [9322d14ce] [6cf1ae081ac]
+- Improved templated and convertible builtins after clarification in SYCL 2020 revision 8. [92861835]
+- Allowed generic_space `multi_ptr` in math builtins. [eda8a587f1]
+- Improved error message when writing beyond the bounds of `simd_view` object. [197c33a2b]
+- Optimized `ext_oneapi_submit_barrier` from [ext_oneapi_enqueue_barrier](https://github.com/intel/llvm/blob/7e08c15dd/sycl/doc/extensions/supported/sycl_ext_oneapi_enqueue_barrier.asciidoc) into `NOP` for in-order queues with empty waitlist. [7e08c15dd]
+- Supported prefetch, memory advise, and automatic management of dependencies for multiple command-buffer submissions in [ext_oneapi_graph](https://github.com/intel/llvm/blob/c6fbac59/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension. [c6fbac59] [56f8d38c]
+- Added support for profiling command buffers. [b04f894dbd06b]
+- Implemented ESIMD APIs that accept compile-time properties. [655ab100] [5582ce4db] [d286f4ab1c] [961793913] [0cfe7e35] [656b8be7]
+- Removed deprecated esimd_emulators from device filters and deprecated `SYCL_DEVICE_FILTER` in favor of `ONEAPI_DEVICE_SELECTOR`. [9d0888ca3] [8d0fa9875]
+- Improved error message when trying to fuse kernels with incompatible ND-Ranges in [ext_codeplay_kernel_fusion](https://github.com/intel/llvm/blob/7d492f87ec97/sycl/doc/extensions/experimental/sycl_ext_codeplay_kernel_fusion.asciidoc). [7d492f87ec97]
+- Made user functions always inline in SYCL kernels to reduce overhead in SYCLCompat library. [e121c8811]
+- Made runtime choose device image with inlined specialization constant when `-fsycl-add-default-spec-consts-image` option is used. [73d34739b]
+- Made `nd_item` stateless to reduce initialization overhead. [7999e27b]
+- Improved warning messages and added `-ignore-device-selector` flag to `sycl-ls` to ignore device selection environment variables. [6e3aa218]
+- Improved error handling when calling `matrix_combinations` query on platforms unsupported by [ext_oneapi_device_architecture](https://github.com/intel/llvm/blob/c00305b73/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc) SYCL extension. [c00305b73]
+- Made default `sycl::queue` context reusable on Windows. [491e6e4ea]
+- Changed default cache hints for `prefetch` ESIMD API. [984c88c]
+- Limited `bfloat16` ESIMD operations to data types convertible to `float`, as required by the SPEC. [f81b5a2]
+- Removed the implicitly passed `-ze-take-global-address` IGC option as it is by default enabled on newer IGC versions. [7e414a9]
+- Improved product security by ensuring that `pi_win_proxy_loader.dll` is loaded only from trusted directories. [85b7145] [218d9fe] [9c504a5]
+- Aligned `sycl-ls` output with `ONEAPI_DEVICE_SELECTOR` environment variable syntax. [38ce764] [f720291]
+- Improved error message when kernel compilation fails. [eba7b7e]
+
+
+### Documentation
+- Updated [ext_oneapi_kernel_compiler_opencl](https://github.com/intel/llvm/blob/6344ead19e/sycl/doc/extensions/experimental/sycl_ext_oneapi_kernel_compiler_opencl.asciidoc) SYCL extension to allow querying OpenCL version. [6344ead19e]
+- Updated [ext_intel_data_flow_pipes_properties](https://github.com/intel/llvm/blob/2a0911892/sycl/doc/extensions/experimental/sycl_ext_intel_data_flow_pipes_properties.asciidoc) to include AXI streaming as a protocol choice on FPGAs. [2a0911892]
+- Updated [KernelFusionJIT](https://github.com/intel/llvm/blob/b9854a12/sycl/doc/design/KernelFusionJIT.md) to include details on local/private memory allocation size, different promotion hints, etc. [b9854a12]
+- Updated [ext_oneapi_in_order_queue_events](https://github.com/intel/llvm/blob/b0f584c675f9/sycl/doc/extensions/experimental/sycl_ext_oneapi_in_order_queue_events.asciidoc) to make external events wait when queue is waited on. [b0f584c675f9]
+- Improved [ext_oneapi_address_cast](https://github.com/intel/llvm/blob/84a92e03/sycl/doc/extensions/proposed/sycl_ext_oneapi_address_cast.asciidoc) SYCL extension to allow casting raw pointers to multi_ptr. [84a92e03]
+
+## Bug Fixes
+### SYCL Compiler
+- Made the device binary generated by `-fsycl-link=image` linkable by adding more information into the binary. [219d4ef54]
+- Fixed linking error when separately compiling and linking a SYCL program with SYCL libraries. [d6eecfa]
+- Fixed `clangd` parsing crash with `-fsycl` flag when using `!nullptr` asserts. [f42bbcc]
+
+### SYCL Library
+- Fixed computation of submit time based on host timestamps. [254756369c]
+- Fixed SYCL CTS failures for Unified Runtime's OpenCL adapter. [4c0780e76]
+- Fixed strict aliasing violations in `sycl::vec` routines. [a9d0e1b8]
+- Fixed logical operations and integer conversions among sycl::vec types. [3d5e41fddf] [ff48612f] [7868596d]
+- Fixed compound operators on `annotated_ptr` when the user-defined type only defines a compound operator. [c43a90f2]
+- Fixed exponential slowdown in multiple calls to `queue::ext_oneapi_submit_barrier`. [079fc97b]
+- Fixed input handling for `ONEAPI_DEVICE_SELECTOR` environment variable. [90b6aee46]
+- Fixed in-order dependency filtering for isolated kernels. [8e7995df]
+- Fixed double-free bug in kernel-program cache. [04ff5b81]
+- Fixed resource leak in `SYCL_FALLBACK_ASSERT`. [b478d2fa]
+- Fixed deadlock in in-order queue when submitting a host task and simultaneously accessing stream service events. [3031733]
+- Made `sycl::vec` interface consistent with `sycl::marray` and `sycl::buffer` by defining `value_type` alias. [33e5b10]
+- Fixed handling of enumeration specialization constants. [1f0dc36]
+- Fixed `-O0 -fno-inline-functions` ESIMD failures by inlining some non-inline functions due to VC limitations. [89327e0]
+
+### Documentation
+- Clarified [ext_oneapi_graph](https://github.com/intel/llvm/blob/2581123a1/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc) SYCL extension to make it illegal for graph nodes to depend on events from outside the graph. [2581123a1]
+- Updated [ext_oneapi_non_uniform_groups](https://github.com/intel/llvm/blob/90a55a5/sycl/doc/extensions/experimental/sycl_ext_oneapi_non_uniform_groups.asciidoc) to invert group numbering for ballot groups. [90a55a5]
+- Updated [ext_oneapi_free_function_kernels](https://github.com/intel/llvm/blob/a452e06a0ebcbabbfecbeb2ca05675265bddbf8d/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc) to remove `range kernels` from the extension. [a452e06]
+
+## Known Issues
+- On Windows, the Unified Runtime's Level Zero leak check does not work correctly with
+the default contexts. This is because on Windows the release
+of the plugin DLLs races against the release of static global variables
+(like the default context).
+- Intel Graphics Compiler's Vector Compute backend does not support O0 code, which often gets miscompiled, producing wrong answers and crashes. This issue directly affects ESIMD code at O0. As a temporary workaround, we optimize ESIMD code even in O0 mode. [00749b1e8](https://github.com/intel/llvm/commit/00749b1e8e3085acfdc63108f073a255842533e2)
+- `multi_ptr` relational operators assume the lowest possible value of `std::null_ptr` which might cause issues with the CUDA and AMDGPU backends. This will be fixed in the next release. ([13201](https://github.com/intel/llvm/pull/13201))
+- When `-fsycl-device-code-split=off` is set, having kernels with different `reqd_work_group_size` attributes could lead to runtime errors about local size mismatching the attribute value. The issue is also reproducible when there is a kernel with `reqd_work_group_size` attribute, but other kernels don't have that attribute set. This will be fixed in the next release. ([#13523](https://github.com/intel/llvm/pull/13523))
+- Having default-constructed `local_accessor` as unused kernel argument could lead to runtime errors during kernel arguments setting. The issue is reproducible when optimizations are explicitly disabled through `-O0`, or when optimizations failed to remove that unused kernel argument. This will be fixed in the next release. ([#13382](https://github.com/intel/llvm/pull/13382))
+- `ONEAPI_DEVICE_SELECTOR` incorrectly parses `!` from discard filters. This will be fixed in the next release by "Fix ONEAPI_DEVICE_SELECTOR handling of discard filters". ([#13927](https://github.com/intel/llvm/pull/13927))
+
+## API/ABI breaking changes
+- Renamed and removed some APIs from [ext_oneapi_free_function_queries](https://github.com/intel/llvm/commit/287fd3733#diff-4ab48d4a7f26c356939d42c6aed9c67d4d59aafac11565f3bfe71d7e053a4db4) SYCL extension. [287fd3733]
+
+## Upcoming API/ABI breakages
+The following changes are only in effect if the `-fpreview-breaking-changes` flag is set.
+- Changed return type of `abs_diff` to be same as that of the input. [2a3e1ab82]
+- Added a preview of pre-C++11 ABI support for GCC on Linux. This feature allows users to set the GCC compiler flag `-D_GLIBCXX_USE_CXX11_ABI=0` to use the pre-C++11 ABI. Details about the GCC C++11 ABI are available at https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html. In this release, this feature is enabled under the `-fpreview-breaking-changes` flag, and the support is incomplete and may not work in some cases. [459e122a]
+- Removed some sub-group class APIs that do not appear in SYCL 2020 Spec. [2985395]
+
+
 # Nov'23 release notes
 Release notes for commit range f4e0d3177338..f4ed132f243a
 
diff --git a/sycl/doc/GetStartedGuide.md b/sycl/doc/GetStartedGuide.md
index 0f36f0d38eeb4..de14612ba53e5 100644
--- a/sycl/doc/GetStartedGuide.md
+++ b/sycl/doc/GetStartedGuide.md
@@ -439,17 +439,17 @@ run the following commands
     # Extract OpenCL CPU RT
     mkdir -p /opt/intel/oclcpuexp_<cpu_version>
     cd /opt/intel/oclcpuexp_<cpu_version>
-    tar -zxvf oclcpu_rt_<cpu_version>.tar.gz
+    tar -zxvf oclcpuexp_<cpu_version>.tar.gz
     ```
 
-2) Create ICD file pointing to the new runtime (requires root access)
+2) Create ICD file pointing to the new runtime (requires sudo access)
 
     ```bash
     # OpenCL FPGA emulation RT
-    echo  /opt/intel/oclfpgaemu_<fpga_version>/x64/libintelocl_emu.so >
+    echo  /opt/intel/oclfpgaemu_<fpga_version>/x64/libintelocl_emu.so | sudo tee
       /etc/OpenCL/vendors/intel_fpgaemu.icd
     # OpenCL CPU RT
-    echo /opt/intel/oclcpuexp_<cpu_version>/x64/libintelocl.so >
+    echo /opt/intel/oclcpuexp_<cpu_version>/x64/libintelocl.so | sudo tee
       /etc/OpenCL/vendors/intel_expcpu.icd
     ```
 
@@ -469,32 +469,32 @@ folder:
     ```bash
     # OpenCL FPGA emulation RT
     ln -s /opt/intel/oneapi-tbb-<tbb_version>/lib/intel64/gcc4.8/libtbb.so
-      /opt/intel/oclfpgaemu_<fpga_version>/x64
+      /opt/intel/oclfpgaemu_<fpga_version>/x64/libtbb.so
     ln -s /opt/intel/oneapi-tbb-<tbb_version>/lib/intel64/gcc4.8/libtbbmalloc.so
-      /opt/intel/oclfpgaemu_<fpga_version>/x64
+      /opt/intel/oclfpgaemu_<fpga_version>/x64/libtbbmalloc.so
     ln -s /opt/intel/oneapi-tbb-<tbb_version>/lib/intel64/gcc4.8/libtbb.so.12
-      /opt/intel/oclfpgaemu_<fpga_version>/x64
+      /opt/intel/oclfpgaemu_<fpga_version>/x64/libtbb.so.12
     ln -s /opt/intel/oneapi-tbb-<tbb_version>/lib/intel64/gcc4.8/libtbbmalloc.so.2
-      /opt/intel/oclfpgaemu_<fpga_version>/x64
+      /opt/intel/oclfpgaemu_<fpga_version>/x64/libtbbmalloc.so.2
     # OpenCL CPU RT
     ln -s /opt/intel/oneapi-tbb-<tbb_version>/lib/intel64/gcc4.8/libtbb.so
-      /opt/intel/oclcpuexp_<cpu_version>/x64
+      /opt/intel/oclcpuexp_<cpu_version>/x64/libtbb.so
     ln -s /opt/intel/oneapi-tbb-<tbb_version>/lib/intel64/gcc4.8/libtbbmalloc.so
-      /opt/intel/oclcpuexp_<cpu_version>/x64
+      /opt/intel/oclcpuexp_<cpu_version>/x64/libtbbmalloc.so
     ln -s /opt/intel/oneapi-tbb-<tbb_version>/lib/intel64/gcc4.8/libtbb.so.12
-      /opt/intel/oclcpuexp_<cpu_version>/x64
+      /opt/intel/oclcpuexp_<cpu_version>/x64/libtbb.so.12
     ln -s /opt/intel/oneapi-tbb-<tbb_version>/lib/intel64/gcc4.8/libtbbmalloc.so.2
-      /opt/intel/oclcpuexp_<cpu_version>/x64
+      /opt/intel/oclcpuexp_<cpu_version>/x64/libtbbmalloc.so.2
     ```
 
-5) Configure library paths (requires root access)
+5) Configure library paths (requires sudo access)
 
     ```bash
-    echo /opt/intel/oclfpgaemu_<fpga_version>/x64 >
+    echo /opt/intel/oclfpgaemu_<fpga_version>/x64 | sudo tee
       /etc/ld.so.conf.d/libintelopenclexp.conf
-    echo /opt/intel/oclcpuexp_<cpu_version>/x64 >>
+    echo /opt/intel/oclcpuexp_<cpu_version>/x64 | sudo tee -a
       /etc/ld.so.conf.d/libintelopenclexp.conf
-    ldconfig -f /etc/ld.so.conf.d/libintelopenclexp.conf
+    sudo ldconfig -f /etc/ld.so.conf.d/libintelopenclexp.conf
     ```
 
 **Windows (64-bit)**:
diff --git a/sycl/doc/UsersManual.md b/sycl/doc/UsersManual.md
index e354f2e605a6f..638d4e2c2f080 100644
--- a/sycl/doc/UsersManual.md
+++ b/sycl/doc/UsersManual.md
@@ -91,6 +91,7 @@ and not recommended to use in production environment.
     * nvidia_gpu_sm_87 - NVIDIA Jetson/Drive AGX Orin architecture
     * nvidia_gpu_sm_89 - NVIDIA Ada Lovelace architecture
     * nvidia_gpu_sm_90 - NVIDIA Hopper architecture
+    * nvidia_gpu_sm_90a - NVIDIA Hopper architecture (with wgmma and setmaxnreg instructions)
     * amd_gpu_gfx700 - AMD GCN GFX7 (Sea Islands (CI)) architecture
     * amd_gpu_gfx701 - AMD GCN GFX7 (Sea Islands (CI)) architecture
     * amd_gpu_gfx702 - AMD GCN GFX7 (Sea Islands (CI)) architecture
diff --git a/sycl/doc/design/CommandGraph.md b/sycl/doc/design/CommandGraph.md
index 9519067a00484..f36c40af07403 100644
--- a/sycl/doc/design/CommandGraph.md
+++ b/sycl/doc/design/CommandGraph.md
@@ -438,6 +438,24 @@ Level Zero:
 Future work will include exploring L0 API extensions to improve the mapping of
 UR command-buffer to L0 command-list.
 
+#### Copy Engine
+
+For performance considerations, the Unified Runtime Level Zero adapter uses
+different Level Zero command-queues to submit compute kernels and memory
+operations when the device has a dedicated copy engine. To take advantage of the
+copy engine when available, the graph workload can also be split between memory
+operations and compute kernels. To achieve this, two graph workload
+command-lists live simultaneously in a command-buffer.
+
+When the command-buffer is finalized, memory operations (e.g. buffer copy,
+buffer fill, ...) are enqueued in the *copy* command-list while the other
+commands are enqueued in the compute command-list. On submission, if not empty,
+the *copy* command-list is sent to the main copy command-queue while the compute
+command-list is sent to the compute command-queue.
+
+Both are executed concurrently. Synchronization between the command-lists is
+handled by Level Zero events.
+
 ### CUDA
 
 The SYCL Graph CUDA backend relies on the
diff --git a/sycl/doc/design/DeviceIf.md b/sycl/doc/design/DeviceIf.md
index b9cbb1cf3de73..93a92934842b7 100644
--- a/sycl/doc/design/DeviceIf.md
+++ b/sycl/doc/design/DeviceIf.md
@@ -183,6 +183,7 @@ one of the following corresponding C++ macro names:
 * `__SYCL_TARGET_NVIDIA_GPU_SM87__`
 * `__SYCL_TARGET_NVIDIA_GPU_SM89__`
 * `__SYCL_TARGET_NVIDIA_GPU_SM90__`
+* `__SYCL_TARGET_NVIDIA_GPU_SM90A__`
 * `__SYCL_TARGET_AMD_GPU_GFX700__`
 * `__SYCL_TARGET_AMD_GPU_GFX701__`
 * `__SYCL_TARGET_AMD_GPU_GFX702__`
diff --git a/sycl/doc/design/SYCLNativeCPU.md b/sycl/doc/design/SYCLNativeCPU.md
index d2fc7d3b484e8..28d19de097e76 100644
--- a/sycl/doc/design/SYCLNativeCPU.md
+++ b/sycl/doc/design/SYCLNativeCPU.md
@@ -38,6 +38,12 @@ python buildbot/configure.py \
 # other options here
 ```
 
+### libclc target triples
+
+SYCL Native CPU uses [libclc](https://github.com/intel/llvm/tree/sycl/libclc) to implement many SPIRV builtins. When Native CPU is enabled, the default target triple for libclc will be `LLVM_TARGET_TRIPLE` (same as the default target triple used by `clang`). This can be overridden by setting the `--native-cpu-libclc-targets` option in `configure.py`.
+
+### oneAPI Construction Kit
+
 SYCL Native CPU uses the [oneAPI Construction Kit](https://github.com/codeplaysoftware/oneapi-construction-kit) (OCK) in order to support some core SYCL functionalities and improve performances, the OCK is fetched by default when SYCL Native CPU is enabled, and can optionally be disabled using the `NATIVECPU_USE_OCK` CMake variable (please note that disabling the OCK will result in limited functionalities and performances on the SYCL Native CPU backend):
 
 ```
diff --git a/sycl/doc/developer/ContributeToDPCPP.md b/sycl/doc/developer/ContributeToDPCPP.md
index ee60eb5a59d70..a096d99b33397 100644
--- a/sycl/doc/developer/ContributeToDPCPP.md
+++ b/sycl/doc/developer/ContributeToDPCPP.md
@@ -164,3 +164,39 @@ These tests verify SYCL specification conformance. All implementation details
 are out of scope for the tests.
 See DPC++ compiler invocation definitions at
 [FindIntel_SYCL](https://github.com/KhronosGroup/SYCL-CTS/blob/SYCL-1.2.1/master/cmake/FindIntel_SYCL.cmake))
+
+## Unified Runtime Updates
+
+To integrate changes from the [Unified Runtime][ur] project into DPC++ there
+are two main options, which depend on the scope of those changes and the
+current state of DPC++.
+
+1. Synchronized update:
+  * When: If the Unified Runtime change touches the API/ABI, more than one
+    adapter, or common code such as the loader.
+  * How: Update the `UNIFIED_RUNTIME_TAG` to point at the desired commit or tag
+    name in the Unified Runtime repository and ensure that any tags for specific
+    adapters are set to use `${UNIFIED_RUNTIME_TAG}`.
+
+2. Decoupled update:
+  * When: If only a single Unified Runtime adapter has changed.
+  * How: Update the tag used in the `fetch_adapter_source()` call for a
+    specific Unified Runtime adapter, e.g. Level Zero, OpenCL, CUDA, HIP, or
+    Native CPU.
+
+In general, a synchronized update should be the default. However, when there
+are a lot of changes in flight in parallel, always synchronizing the tag can be
+troublesome. This is when a decoupled update can help sustain the merge
+velocity of Unified Runtime changes.
+
+The [intel/unified-runtime-reviewers][ur-reviewers-team] team is responsible
+for ensuring that the Unified Runtime tag is updated correctly and will only
+provide code owner approval to pull requests once the following criteria are
+met:
+
+* Tags are pointing to a valid commit or tag on Unified Runtime main branch.
+* Changes to additional code owned files are in a good state.
+* GitHub Actions checks are passing.
+
+[ur]: https://github.com/oneapi-src/unified-runtime
+[ur-reviewers-team]: https://github.com/orgs/intel/teams/unified-runtime-reviewers
diff --git a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
index e755c6f9ee414..9fb1dd1503237 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc
@@ -1085,6 +1085,13 @@ multiple of 4 when `T` is `float`; where `T` is the type of the
 `joint_matrix` elements. When `T` is not `half` or `float` there are
 no restrictions to `stride`.
 
+IMPORTANT: For some devices it is important to use the sm version
+(Compute Capability) corresponding to the device that will run the
+program when specifying e.g. `-fsycl-targets=nvidia_gpu_sm_xx` during
+compilation. This particularly affects matrix operations using `half`.
+For more information on this issue consult
+https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-restrictions
+
 ==== AMD Matrix Cores Supported Combinations
 The complete set of matrix data types and dimensions that are supported by
 the `ext_oneapi_hip` backend are represented in the following
@@ -1139,4 +1146,5 @@ supported combinations
 load/store overloads
 |11  |2024-04-29 |Yury Plyakhin | Add 1x64x16 supported combination for
 Intel XMX (intel_gpu_pvc)
+|12  |2024-06-14 |Jack Kirk | Add note on sm version device matching issue.
 |======================
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc
index 3261a94b17cdf..6359515a67b9d 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bfloat16_math_functions.asciidoc
@@ -103,286 +103,556 @@ then it supports the `bfloat16` math functions described in the next section.
 
 === Math Functions
 
-The following functions are only available when `T` is `bfloat16` or
-`sycl::marray<bfloat16, {N}>`, where `{N}` means any positive value of
-`size_t` type.
-
 ==== isnan
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
 
-bool isnan(bfloat16 x);
+bool isnan(bfloat16 x);                                                 (1)
 
-template <size_t N>
-sycl::marray<bool, N> isnan(sycl::marray<bfloat16, N> x);
+template<typename NonScalar> 
+/*return-type*/ isnan(NonScalar x);                                     (2)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
 ===== Description
+====== Overload (1)
+
+Returns `true` if `x` is a NaN value, otherwise returns `false`.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+ - `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+ - The element type is `bfloat16`.
 
-Returns true if x is NAN value, otherwise returns false.
+*Returns:* If `NonScalar` is `marray`, returns `true` for each element of `x` only if `x[i]` has a NaN value. If `NonScalar` is `vec` or the `[code]#+__swizzled_vec__+#` type, returns -1 for each element of `x` if `x[i]` is a NaN value and returns 0 otherwise.
+
+The return type depends on `NonScalar`. For `marray`, the return type is `marray<bool, N>`, and for `vec` or the `[code]#+__swizzled_vec__+#` type, the return type is `vec<int16_t, N>`.
 
 ==== fma
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
 
-template <typename T>
-T fma(T a, T b, T c);
+bfloat16 fma(bfloat16 a, bfloat16 b, bfloat16 c);                          (1)
+
+template<typename NonScalar1, typename NonScalar2, typename NonScalar3>    (2)
+/*return-type*/ fma(NonScalar1 a, NonScalar2 b, NonScalar3 c)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
 ===== Description
 
-Returns the correctly rounded floating-point representation of the
+====== Overload (1)
+
+*Returns:* Returns the correctly rounded floating-point representation of the
 sum of `c` with the infinitely precise product of `a` and `b`.
 Rounding of intermediate products shall not occur. The mantissa
 LSB rounds to the nearest even. Subnormal numbers are supported.
 
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* One of the following conditions must hold for `NonScalar1`, `NonScalar2`, and `NonScalar3`:
+** `NonScalar1`, `NonScalar2`, and `NonScalar3` are each `marray`; or
+** `NonScalar1`, `NonScalar2`, and `NonScalar3` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type;
+* `NonScalar1`, `NonScalar2`, and `NonScalar3` have the same number of elements;
+* `NonScalar1`, `NonScalar2`, and `NonScalar3` have the same element type; and
+* The element type of `NonScalar1`, `NonScalar2`, and `NonScalar3` is `bfloat16`.
+
+*Returns:* For each element of `a`, `b`, and `c`; the correctly rounded floating-point representation of the sum of `c[i]` with the infinitely precise product of `a[i]` and `b[i]`. Rounding of intermediate products shall not occur. Edge case behavior is per the IEEE 754-2008 standard.
+
+The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
+
 ==== fmax
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T fmax(T x, T y);
+bfloat16 fmax(bfloat16 x, bfloat16 y);                                           (1)
+
+template<typename NonScalar1, typename NonScalar2>                               (2)
+/*return-type*/ fmax(NonScalar1 x, NonScalar2 y)
+
+template<typename NonScalar>                                                     (3)
+/*return-type*/ fmax(NonScalar x, bfloat16 y)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
 ===== Description
 
-Returns `y` if
-`x < y`, otherwise it
-returns `x`. If one argument is a
-NaN, `fmax()` returns the other
-argument. If both arguments are
-NaNs, `fmax()` returns a NaN.
+====== Overload (1)
+
+Returns `y` if `x < y`, otherwise it returns `x`. If one argument is a NaN, `fmax()` returns the other
+argument. If both arguments are NaNs, `fmax()` returns a NaN.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* One of the following conditions must hold for `NonScalar1` and `NonScalar2`:
+** Both `NonScalar1` and `NonScalar2` are `marray`; or
+** `NonScalar1` and `NonScalar2` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type;
+* `NonScalar1` and `NonScalar2` have the same number of elements;
+* `NonScalar1` and `NonScalar2` have the same element type; and
+* The element type of `NonScalar1` and `NonScalar2` is bfloat16.
+
+*Returns:* For each element of `x` and `y`, the value `y[i]` if `x[i] < y[i]`, otherwise `x[i]`. If one element is a NaN, the result is the other element. If both elements are NaNs, the result is NaN.
+
+The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
+
+====== Overload (3)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* NonScalar is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is bfloat16.
+
+*Returns:* For each element of `x`, the value `y` if `x[i] < y`, otherwise `x[i]`. If one value is a NaN, the result is the other value. If both values are NaNs, the result is a NaN.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== fmin
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T fmin(T x, T y);
+bfloat16 fmin(bfloat16 x, bfloat16 y);                                           (1)
+
+template<typename NonScalar1, typename NonScalar2>                               (2)
+/*return-type*/ fmin(NonScalar1 x, NonScalar2 y)
+
+template<typename NonScalar>                                                     (3)
+/*return-type*/ fmin(NonScalar x, bfloat16 y)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
 ===== Description
 
-Returns `y` if
-`y < x`, otherwise it
-returns `x`. If one argument is a
-NaN, `fmax()` returns the other
-argument. If both arguments are
-NaNs, `fmax()` returns a NaN.
+====== Overload (1)
+
+Returns `x` if `x < y`, otherwise it returns `y`. If one argument is a
+NaN, `fmin()` returns the other argument. If both arguments are NaNs, `fmin()` returns a NaN.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* One of the following conditions must hold for `NonScalar1` and `NonScalar2`:
+** Both `NonScalar1` and `NonScalar2` are `marray`; or
+** `NonScalar1` and `NonScalar2` are any combination of `vec` and the `[code]#+__swizzled_vec__+#` type;
+* `NonScalar1` and `NonScalar2` have the same number of elements;
+* `NonScalar1` and `NonScalar2` have the same element type; and
+* The element type of `NonScalar1` and `NonScalar2` is bfloat16.
+
+*Returns:* For each element of `x` and `y`, the value `x[i]` if `x[i] < y[i]`, otherwise `y[i]`. If one element is a NaN, the result is the other element. If both elements are NaNs, the result is NaN.
+
+The return type is `NonScalar1` unless `NonScalar1` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
+
+====== Overload (3)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* NonScalar is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is bfloat16.
+
+*Returns:* For each element of `x`, the value `x[i]` if `x[i] < y`, otherwise `y`. If one value is a NaN, the result is the other value. If both values are NaNs, the result is a NaN.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== fabs
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T fabs(T x);
+bfloat16 fabs(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                          (2)
+/*return-type*/ fabs(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the absolute value of `x`.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
 
-Compute absolute value of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+*Returns:* For each element of `x`, the absolute value of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== ceil
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T ceil(T x);
+bfloat16 ceil(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                          (2)
+/*return-type*/ ceil(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the value `x` rounded to an integral value using the round to positive infinity rounding mode.
+
+====== Overload (2)
 
-Returns `x` rounded to an integral value using the round to positive infinity rounding mode
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to positive infinity rounding mode.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== floor
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T floor(T x);
+bfloat16 floor(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                           (2)
+/*return-type*/ floor(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the value `x` rounded to an integral value using the round to negative infinity rounding mode.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
 
-Returns `x` rounded to an integral value using the round to negative infinity rounding mode
-for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to negative infinity rounding mode.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== cos
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T cos(T x);
+bfloat16 cos(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                         (2)
+/*return-type*/ cos(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the cosine of `x`.
 
-Compute cosine of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the cosine of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== sin
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T sin(T x);
+bfloat16 sin(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                         (2)
+/*return-type*/ sin(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the sine of `x`.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
 
-Compute sine of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
 
+*Returns:* For each element of `x`, the sine of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== exp
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T exp(T x);
+bfloat16 exp(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                         (2)
+/*return-type*/ exp(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the base-e exponential of `x`.
+
+====== Overload (2)
 
-Compute the base-e exponential of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the base-e exponential of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== exp2
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T exp2(T x);
+bfloat16 exp2(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                         (2)
+/*return-type*/ exp2(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the base-2 exponential of `x`.
+
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
 
-Compute the base-2 exponential of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+*Returns:* For each element of `x`, the base-2 exponential of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== exp10
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T exp10(T x);
+bfloat16 exp10(bfloat16 x);                                            (1)
+
+template<typename NonScalar>                                           (2)
+/*return-type*/ exp10(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the base-10 exponential of `x`.
 
-Compute the base-10 exponential of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+====== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the base-10 exponential of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== log
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T log(T x);
+bfloat16 log(bfloat16 x);                                              (1)
+
+template<typename NonScalar>                                           (2)
+/*return-type*/ log(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the natural logarithm of `x`.
+
+===== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
 
-Compute natural logarithm of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the natural logarithm of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== log2
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T log2(T x);
+bfloat16 log2(bfloat16 x);                                              (1)
+
+template<typename NonScalar>                                            (2)
+/*return-type*/ log2(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the base-2 logarithm of `x`.
+
+===== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the base-2 logarithm of `x[i]`.
 
-Compute base-2 logarithm of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== log10
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T log10(T x);
+bfloat16 log10(bfloat16 x);                                              (1)
+
+template<typename NonScalar>                                             (2)
+/*return-type*/ log10(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the base-10 logarithm of `x`.
+
+===== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
 
-Compute base-10 logarithm of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the base-10 logarithm of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 
 ==== rint
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T rint(T x);
+bfloat16 rint(bfloat16 x);                                              (1)
+
+template<typename NonScalar>                                            (2)
+/*return-type*/ rint(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
 
-Returns `x` rounded to an integral value using the round to nearest even rounding mode
-for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Returns the value `x` rounded to an integral value (using round to nearest even rounding mode) in floating-point format. Refer to section 7.1 of the OpenCL 1.2 specification document: https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#opencl12 for a description of the rounding modes.
+
+===== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value (using round to nearest even rounding mode) in floating-point format.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== sqrt
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T sqrt(T x);
+bfloat16 sqrt(bfloat16 x);                                              (1)
+
+template<typename NonScalar>                                            (2)
+/*return-type*/ sqrt(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the square root of `x`.
+
+===== Overload (2)
 
-Compute square root of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the square root of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== rsqrt
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T rsqrt(T x);
+bfloat16 rsqrt(bfloat16 x);                                             (1)
+
+template<typename NonScalar>                                            (2)
+/*return-type*/ rsqrt(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
+
+Returns the inverse square root of `x`.
+
+===== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
+
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
 
-Compute inverse square root of a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+*Returns:* For each element of `x`, the inverse square root of `x[i]`.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
 
 ==== trunc
 
 ```c++
 namespace sycl::ext::oneapi::experimental {
-template <typename T>
-T trunc(T x);
+bfloat16 trunc(bfloat16 x);                                             (1)
+
+template<typename NonScalar>                                            (2)
+/*return-type*/ trunc(NonScalar x)
 } // namespace sycl::ext::oneapi::experimental
 ```
 
-===== Description
+===== Overload (1)
 
-Returns `x` rounded to an integral value using the round to zero rounding mode
-for a `bfloat16` value or `sycl::marray<bfloat16, N>`.
+Returns the value `x` rounded to an integral value using the round to zero rounding mode.
 
-== Issues
+===== Overload (2)
+
+*Constraints:* Available only if all of the following conditions are met:
 
-1. The CUDA backend does not have a use case that would necessitate support
-of the `vec` class in bfloat16 math functions, and `marray` would always be
-preferred over `vec` if `vec` support were to be added in the CUDA backend.
-For portability reasons, support for the `vec` class can be easily added if
-other backends require it.
+* `NonScalar` is `marray`, `vec`, or the `[code]#+__swizzled_vec__+#` type; and
+* The element type is `bfloat16`.
+
+*Returns:* For each element of `x`, the value `x[i]` rounded to an integral value using the round to zero rounding mode.
+
+The return type is `NonScalar` unless `NonScalar` is the `[code]#+__swizzled_vec__+#` type, in which case the return type is the corresponding `vec`.
+
+== Issues
 
-2. We should decide on a roadmap to extend support of `bfloat16` to other
+1. We should decide on a roadmap to extend support of `bfloat16` to other
 SYCL 2020 math functions.
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc
index 8c47f17f3adfc..71bc0dc031d64 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_bindless_images.asciidoc
@@ -1364,7 +1364,7 @@ and the array index.
 // Fetch an unsampled image array
 template <typename DataT, typename HintT = DataT, typename CoordT>
 DataT fetch_image_array(const unsampled_image_handle &ImageHandle,
-                       const CoordT &Coords, const unsigned int ArrayLayer);
+                       const CoordT &Coords, unsigned int ArrayLayer);
 ```
 
 Fetching an image array follows the same restrictions on what coordinate types 
@@ -1390,7 +1390,7 @@ provided that type is trivially copyable.
 // Write to an unsampled image array
 template <typename DataT, typename CoordT>
 DataT write_image_array(unsampled_image_handle ImageHandle,
-                        const CoordT &Coords, const unsigned int ArrayLayer
+                        const CoordT &Coords, unsigned int ArrayLayer,
                         const DataT &Color);
 ```
 
@@ -1495,7 +1495,7 @@ sampling depends on the sampler attributes passed upon creation of the cubemap.
 template <typename DataT, typename HintT = DataT>
 DataT fetch_cubemap(const unsampled_image_handle &ImageHandle,
                     const int2 &Coords,
-                    const int Face);
+                    int Face);
 
 // Sampled cubemap read
 template <typename DataT, typename HintT = DataT>
@@ -1506,7 +1506,7 @@ DataT sample_cubemap(const sampled_image_handle &ImageHandle,
 template <typename DataT>
 void write_cubemap(unsampled_image_handle ImageHandle,
                    const int2 &Coords,
-                   const int Face, 
+                   int Face, 
                    const DataT &Color);
 ```
 
@@ -1594,10 +1594,18 @@ struct.
 ```cpp
 namespace sycl::ext::oneapi::experimental {
 
+// Types of external memory handles
+enum class external_mem_handle_type {
+  opaque_fd = 0,
+  win32_nt_handle = 1,
+  win32_nt_dx12_resource = 2,
+};
+
 // Descriptor templated on specific resource type
 template <typename ResourceType>
 struct external_mem_descriptor {
   ResourceType external_resource;
+  external_mem_handle_type handle_type;
   size_t size_in_bytes;
 };
 
@@ -1609,9 +1617,13 @@ handle type, `ResourceType`, for their purposes, e.g. `resource_fd` to describe
 a POSIX file descriptor resource on Linux systems, or a `resource_win32_handle` 
 for Windows NT resource handles.
 
-Once the user populates the `external_mem_descriptor` with the appropriate 
-`ResourceType` values, and the size of the external memory in bytes,
-they can then import that memory into SYCL through `import_external_memory`.
+The user must populate the `external_mem_descriptor` with the appropriate 
+`ResourceType` values, a `handle_type`, and the size of the external memory in 
+bytes, before they can then import that memory into SYCL through 
+`import_external_memory`. Note that some handle types can only be used in 
+combination with certain resource types, for example the `opaque_fd` handle type
+is only used on Linux systems and is only compatible with the `resource_fd` 
+resource type.
 
 ```cpp
 namespace sycl::ext::oneapi::experimental {
@@ -1690,16 +1702,32 @@ memory resources handles can take different forms of structure and type
 depending on the API and operating system, so do external semaphore resource 
 handles.
 
+It is important to note that the use of imported external semaphore objects
+within SYCL is restricted: imported external semaphores can only be used in
+conjunction with SYCL queues that have been constructed with the
+`property::queue::in_order` property. The semaphore synchronization mechanism
+is not supported for the default SYCL out-of-order queues. Use of the semaphore
+synchronization mechanism with SYCL queues which were not constructed with the
+`property::queue::in_order` property will result in undefined behaviour.
+
 External semaphore import is facilitated through the following proposed 
 descriptor struct.
 
 ```cpp
 namespace sycl::ext::oneapi::experimental {
 
+// Types of external semaphore handles
+enum class external_semaphore_handle_type {
+  opaque_fd = 0,
+  win32_nt_handle = 1,
+  win32_nt_dx12_fence = 2,
+};
+
 // Descriptor templated on specific resource type
 template <typename ResourceType>
 struct external_semaphore_descriptor {
   ResourceType external_resource;
+  external_semaphore_handle_type handle_type;
 };
 
 }
@@ -1710,9 +1738,12 @@ appropriate handle type, `ResourceType`, for their purposes, e.g. `resource_fd`
 to describe a POSIX file descriptor resource on Linux systems, or a 
 `resource_win32_handle` for Windows NT resource handles.
 
-Once the user populates the `external_semaphore_descriptor` with the appropriate 
-`ResourceType` values, they can then import that semaphore into SYCL through 
-`import_external_semaphore`.
+The user must populate the `external_semaphore_descriptor` with the appropriate
+`ResourceType` values and `handle_type` before they can import that
+semaphore into SYCL through `import_external_semaphore`. Note that some handle
+types can only be used in combination with certain resource types, for example 
+the `opaque_fd` handle type is only used on Linux systems and is only 
+compatible with the `resource_fd` resource type.
 
 ```cpp
 namespace sycl::ext::oneapi::experimental {
@@ -1728,7 +1759,6 @@ interop_semaphore_handle import_external_semaphore(
         externalSemaphoreDescriptor,
     const sycl::device &syclDevice,
     const sycl::context &syclContext);
-}
 
 template <typename ResourceType>
 interop_semaphore_handle import_external_semaphore(
@@ -1739,8 +1769,11 @@ interop_semaphore_handle import_external_semaphore(
 ```
 
 The resulting `interop_semaphore_handle` can then be used in a SYCL command 
-group, to either wait until the semaphore is in the signaled state, or set the 
-semaphore to a signaled state.
+group, to either wait until the semaphore is signalled, or signal the semaphore.
+
+If the type of semaphore imported supports setting a discrete semaphore state
+value (the semaphore type is `win32_nt_dx12_fence`), then the user can specify
+which value the semaphore operation should wait on, or signal.
 
 We propose to extend the SYCL queue and handler classes with semaphore waiting 
 and signalling operations.
@@ -1754,9 +1787,19 @@ public:
       ext::oneapi::experimental::interop_semaphore_handle
           interop_semaphore_handle);
 
+  void ext_oneapi_wait_external_semaphore(
+      ext::oneapi::experimental::interop_semaphore_handle
+          interop_semaphore_handle,
+      uint64_t wait_value);
+
   void ext_oneapi_signal_external_semaphore(
       ext::oneapi::experimental::interop_semaphore_handle
           interop_semaphore_handle);
+
+  void ext_oneapi_signal_external_semaphore(
+      ext::oneapi::experimental::interop_semaphore_handle
+          interop_semaphore_handle,
+      uint64_t signal_value);
 };
 
 class queue {
@@ -1773,6 +1816,21 @@ public:
           interop_semaphore_handle,
       const std::vector<event> &DepEvents);
 
+  event ext_oneapi_wait_external_semaphore(
+      ext::oneapi::experimental::interop_semaphore_handle
+          interop_semaphore_handle,
+      uint64_t wait_value);
+  event ext_oneapi_wait_external_semaphore(
+      ext::oneapi::experimental::interop_semaphore_handle
+          interop_semaphore_handle,
+      uint64_t wait_value, 
+      event DepEvent);
+  event ext_oneapi_wait_external_semaphore(
+      ext::oneapi::experimental::interop_semaphore_handle
+          interop_semaphore_handle,
+      uint64_t wait_value, 
+      const std::vector<event> &DepEvents);
+
   event ext_oneapi_signal_external_semaphore(
       ext::oneapi::experimental::interop_semaphore_handle
           interop_semaphore_handle);
@@ -1784,17 +1842,46 @@ public:
       ext::oneapi::experimental::interop_semaphore_handle
           interop_semaphore_handle,
       const std::vector<event> &DepEvents);
+
+  event ext_oneapi_signal_external_semaphore(
+      ext::oneapi::experimental::interop_semaphore_handle
+          interop_semaphore_handle,
+      uint64_t signal_value);
+  event ext_oneapi_signal_external_semaphore(
+      ext::oneapi::experimental::interop_semaphore_handle
+          interop_semaphore_handle,
+      uint64_t signal_value,
+      event DepEvent);
+  event ext_oneapi_signal_external_semaphore(
+      ext::oneapi::experimental::interop_semaphore_handle
+          interop_semaphore_handle,
+      uint64_t signal_value,
+      const std::vector<event> &DepEvents);
 };
 }
 ```
 
-Any operations submitted to the queue after a 
-`ext_oneapi_wait_external_semaphore` call will not begin until the imported 
-semaphore is in a signaled state.
+The behaviour of waiting on a semaphore will depend on the type of the 
+semaphore which was imported.
+
+If the semaphore does not support setting of a discrete state value (the 
+semaphore type is not `win32_nt_dx12_fence`), then any operations submitted to 
+the queue after a `ext_oneapi_wait_external_semaphore` call will not begin 
+until the imported semaphore is in a signalled state. After this, the semaphore 
+will be reset to a non-signalled state.
+
+If the semaphore does support setting of a discrete state value (the semaphore 
+type is `win32_nt_dx12_fence`), then any operations submitted to the queue 
+after a `ext_oneapi_wait_external_semaphore` call will not begin until the 
+imported semaphore is in a state greater than or equal to the `wait_value`. The 
+state of this type of semaphore will not be altered by the call to 
+`ext_oneapi_wait_external_semaphore`.
 
 When `ext_oneapi_signal_external_semaphore` is called, the external semaphore 
-will be set to the signaled state after all commands submitted to the queue 
-prior to the `ext_oneapi_signal_external_semaphore` call complete.
+will either be set to a signalled state, or the state of the semaphore will be 
+set to `signal_value`, depending on the type of semaphore which was imported.
+This signalling will be done after all commands submitted to the queue prior to 
+the `ext_oneapi_signal_external_semaphore` call complete.
 
 `ext_oneapi_wait_external_semaphore` and `ext_oneapi_signal_external_semaphore` 
 are non-blocking, asynchronous operations.
@@ -2366,13 +2453,17 @@ int external_output_image_file_descriptor = /* passed from external API */
 // Extension: populate external memory descriptors
 sycl::ext::oneapi::experimental::external_mem_descriptor<
     sycl::ext::oneapi::experimental::resource_fd>
-    input_ext_mem_desc{external_input_image_file_descriptor, 
-                       img_size_in_bytes};
+    input_ext_mem_desc{
+      external_input_image_file_descriptor,
+      sycl::ext::oneapi::experimental::external_mem_handle_type::opaque_fd,
+      img_size_in_bytes};
 
 sycl::ext::oneapi::experimental::external_mem_descriptor<
     sycl::ext::oneapi::experimental::resource_fd>
-    output_ext_mem_desc{external_output_image_file_descriptor,
-                        img_size_in_bytes};
+    output_ext_mem_desc{
+      external_output_image_file_descriptor,
+      sycl::ext::oneapi::experimental::external_mem_handle_type::opaque_fd,
+      img_size_in_bytes};
 
 // An external API semaphore will signal this semaphore before our SYCL commands
 // can begin execution
@@ -2386,11 +2477,13 @@ int done_semaphore_file_descriptor = /* passed from external API */;
 //            We assume POSIX file descriptor resource types
 sycl::ext::oneapi::experimental::external_semaphore_descriptor<
     sycl::ext::oneapi::experimental::resource_fd>
-    wait_external_semaphore_desc{wait_semaphore_file_descriptor};
+    wait_external_semaphore_desc{wait_semaphore_file_descriptor,
+    sycl::ext::oneapi::experimental::external_semaphore_handle_type::opaque_fd};
 
 sycl::ext::oneapi::experimental::external_semaphore_descriptor<
     sycl::ext::oneapi::experimental::resource_fd>
-    done_external_semaphore_desc{done_semaphore_file_descriptor};
+    done_external_semaphore_desc{done_semaphore_file_descriptor,
+    sycl::ext::oneapi::experimental::external_semaphore_handle_type::opaque_fd};
 
 try {
   // Extension: import external semaphores
@@ -2682,4 +2775,15 @@ These features still need to be handled:
                     This function is redundant since images don't have a notion
                     of channel order, only the channel size. Use
                     `get_num_channels()` instead.
+|5.11|2024-05-27| - Added `external_mem_handle_type` and 
+                    `external_semaphore_handle_type` enums. These will allow 
+                    multiple handle types to be consumed by the same interop API.
+                  - Added `handle_type` field to the `external_mem_descriptor`
+                    and `external_semaphore_descriptor` structs. This allows
+                    multiple handle types to be consumed by the API, such as 
+                    file descriptors, Windows NT handles, and other handles in 
+                    the future.
+                  - Added semaphore operations which can accept values. These
+                    are only supported for certain semaphore types 
+                    (e.g. `win32_nt_dx12_fence`).
 |======================
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc
index 8006bf651b1ad..f2ebcc5944462 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc
@@ -121,6 +121,16 @@ which version of this extension first included each of these enumerators.
 |Added in version
 |Description
 
+3+^|*Unknown architecture*
+
+a|
+[source]
+----
+unknown
+----
+|-
+|Some architecture which is not one of those listed below.
+
 3+^|*Intel CPU family*
 
 a|
@@ -528,6 +538,7 @@ a|
 [source]
 ----
 nvidia_gpu_sm_90
+nvidia_gpu_sm_90a
 ----
 |-
 |NVIDIA Hopper architecture.
@@ -1018,7 +1029,8 @@ struct architecture;
 
 _Return type:_ `sycl::ext::oneapi::experimental::architecture`
 
-_Returns:_ The architecture of the device.
+_Returns:_ The architecture of the device if the architecture is supported,
+otherwise `ext::oneapi::experimental::architecture::unknown`.
 |====
 
 
@@ -1106,6 +1118,9 @@ They currently exist only for use with the
 link:sycl_ext_matrix/sycl_ext_oneapi_matrix.asciidoc[sycl_ext_oneapi_matrix]
 extension.
 
+The architecture enumeration `unknown` is not currently supported with the 
+`if_architecture_is` function.
+
 
 == Implementation notes
 
diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc
similarity index 97%
rename from sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc
rename to sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc
index ed85566c99fbc..70898ecf61a10 100644
--- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_enqueue_functions.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_enqueue_functions.asciidoc
@@ -44,11 +44,12 @@ SYCL specification refer to that revision.
 
 == Status
 
-This is a proposed extension specification, intended to gather community
-feedback.  Interfaces defined in this specification may not be implemented yet
-or may be in a preliminary state.  The specification itself may also change in
-incompatible ways before it is finalized.  *Shipping software products should
-not rely on APIs defined in this specification.*
+This is an experimental extension specification, intended to provide early
+access to features and gather community feedback.  Interfaces defined in this
+specification are implemented in {dpcpp}, but they are not finalized and may
+change incompatibly in future versions of {dpcpp} without prior notice.
+*Shipping software products should not rely on APIs defined in this
+specification.*
 
 
 == Overview
@@ -79,7 +80,7 @@ This extension makes SYCL simpler and easier to document. It is also expected
 to improve the performance of many SYCL applications, where `event` objects are
 not required to describe application behavior.
 
-All functions proposed in this extension accept as their first argument an
+All functions in this extension accept as their first argument an
 object that represents where a command should be submitted, allowing the new
 functions to be used either at command-group scope or as a replacement for
 existing queue shortcuts. A future version of this extension may adjust this
@@ -89,7 +90,7 @@ by accepting a scheduler and returning a sender).
 
 === Usage example
 
-The example below demonstrates that the syntax proposed here requires only
+The example below demonstrates that the syntax here requires only
 minor changes to existing applications, while retaining their structure.
 
 
@@ -117,7 +118,7 @@ sycl::free(output, q);
 ----
 
 
-==== Proposed syntax
+==== Syntax
 
 [source,c++]
 ----
diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc
index 961f87462af6c..77fab2ebe5fb1 100644
--- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc
+++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc
@@ -1047,9 +1047,11 @@ Constraints:
 
 Parameters:
 
-* `propList` - Optional parameter for passing properties. The only property
-  that is valid to pass here is `property::graph::updatable`, to enable the
+* `propList` - Optional parameter for passing properties. Two properties
+  are valid to pass here. One is `property::graph::updatable` to enable the
   returned executable graph to be <<executable-graph-update, updated>>.
+  The other is <<enable-profiling, `property::graph::enable_profiling`>>
+  to enable profiling events returned from submissions of the executable graph.
 
 Returns: A new executable graph object which can be submitted to a queue.
 
diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc
index f2832edc31156..7a471b7fa36c6 100644
--- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc
+++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_free_function_kernels.asciidoc
@@ -773,6 +773,62 @@ int main() {
 ```
 
 
+== {dpcpp} guaranteed compatibility with Level Zero and OpenCL backends
+
+The contents of this section are non-normative and apply only to the {dpcpp}
+implementation.
+Kernels written using the free function kernel syntax can be submitted to a
+device by using the Level Zero or OpenCL backends, without going through the
+SYCL host runtime APIs.
+This works only when the kernel is AOT compiled to native device code using the
+`-fsycl-targets` compiler option.
+
+The interface to the kernel in the native device code module is only guaranteed
+when the kernel adheres to the following restrictions:
+
+* The kernel is written in the free function kernel syntax;
+* The kernel function is declared as `extern "C"`;
+* Each formal argument to the kernel is either a {cpp} trivially copyable type
+  or the `work_group_memory` type (see
+  link:../proposed/sycl_ext_oneapi_work_group_memory.asciidoc[
+  sycl_ext_oneapi_work_group_memory]); and
+* The translation unit containing the kernel is compiled with the
+  `-fno-sycl-dead-args-optimization` option.
+
+Both Level Zero and OpenCL identify a kernel via a _name_ string.
+(See `zeKernelCreate` and `clCreateKernel` in their respective specifications.)
+When a kernel is defined according to the restrictions above, the _name_ is
+guaranteed to be the same as the name of the kernel's function in the {cpp}
+source code but with "++__sycl_kernel_++" prefixed.
+For example, if the function name is "foo", the kernel's name in the native
+device code module is "++__sycl_kernel_foo++".
+
+Both Level Zero and OpenCL set kernel argument values using three pieces of
+information:
+
+* The index of the argument;
+* The size (in bytes) of the value; and
+* A pointer to the start of the value.
+
+(See `zeKernelSetArgumentValue` and `clSetKernelArg` in their respective
+specifications.)
+
+When a kernel is defined according to the restrictions above, the argument
+indices are the same as the positions of the formal kernel arguments in the
+{cpp} source code.
+The first argument has index 0, the next has index 1, etc.
+
+If an argument has a trivially copyable type, the size must be the size of that
+type, and the pointer must point to a memory region that has the same size and
+representation as that trivially copyable type.
+
+If an argument has the type `work_group_memory`, the size must be the size (in
+bytes) of the device local memory that is represented by the
+`work_group_memory` argument.
+The pointer passed to `zeKernelSetArgumentValue` or `clSetKernelArg` must be
+NULL in this case.
+
+
 == Implementation notes
 
 === Compiler diagnostics
diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_profiling_tag.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_profiling_tag.asciidoc
index 9d971079b0e56..30ead43b272e3 100644
--- a/sycl/doc/extensions/proposed/sycl_ext_oneapi_profiling_tag.asciidoc
+++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_profiling_tag.asciidoc
@@ -162,10 +162,10 @@ Implementations are encouraged to transition the event directly from the
 "submitted" status to the "complete" status and are encouraged to set the
 "command_start" timestamp to the same value as the "command_end" timestamp.
 
-_Throws:_ A synchronous `exception` with the `errc::invalid` error code if the
-queue was not constructed with the `property::queue::enable_profiling` property
-and if the queue's device does not have the aspect
-`ext_oneapi_queue_profiling_tag`.
+_Throws:_ A synchronous `exception` with the `errc::invalid` error code if
+the queue's device does not have the aspect `ext_oneapi_queue_profiling_tag`
+and the queue was not constructed with the `property::queue::enable_profiling`
+property.
 
 [_Note:_ In order to understand why the "command_start" and "command_end"
 timestamps are encouraged to be the same, think of the barrier as an empty
diff --git a/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc b/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc
new file mode 100644
index 0000000000000..9a7875c6987ab
--- /dev/null
+++ b/sycl/doc/extensions/proposed/sycl_ext_oneapi_work_group_memory.asciidoc
@@ -0,0 +1,553 @@
+= sycl_ext_oneapi_work_group_memory
+
+:source-highlighter: coderay
+:coderay-linenums-mode: table
+
+// This section needs to be after the document title.
+:doctype: book
+:toc2:
+:toc: left
+:encoding: utf-8
+:lang: en
+:dpcpp: pass:[DPC++]
+:endnote: &#8212;{nbsp}end{nbsp}note
+
+// Set the default source code type in this document to C++,
+// for syntax highlighting purposes.  This is needed because
+// docbook uses c++ and html5 uses cpp.
+:language: {basebackend@docbook:c++:cpp}
+
+
+== Notice
+
+[%hardbreaks]
+Copyright (C) 2024 Intel Corporation.  All rights reserved.
+
+Khronos(R) is a registered trademark and SYCL(TM) and SPIR(TM) are trademarks
+of The Khronos Group Inc.  OpenCL(TM) is a trademark of Apple Inc. used by
+permission by Khronos.
+
+
+== Contact
+
+To report problems with this extension, please open a new issue at:
+
+https://github.com/intel/llvm/issues
+
+
+== Dependencies
+
+This extension is written against the SYCL 2020 revision 8 specification.
+All references below to the "core SYCL specification" or to section numbers in
+the SYCL specification refer to that revision.
+
+This extension also depends on the following other SYCL extensions:
+
+* link:../experimental/sycl_ext_oneapi_properties.asciidoc[
+  sycl_ext_oneapi_properties]
+
+
+== Status
+
+This is a proposed extension specification, intended to gather community
+feedback.
+Interfaces defined in this specification may not be implemented yet or may be
+in a preliminary state.
+The specification itself may also change in incompatible ways before it is
+finalized.
+*Shipping software products should not rely on APIs defined in this
+specification.*
+
+
+== Overview
+
+This extension adds a lower overhead way to allocate device local memory,
+memory which is shared by all work-items in a work-group.
+The `local_accessor` class in the core SYCL specification provides a mechanism
+to do this also, but `local_accessor` has higher overhead because it
+encapsulates both a pointer to the memory and the size of that memory.
+When a `local_accessor` has multiple dimensions, it contains the size in
+each dimension.
+By comparison, the `work_group_memory` class in this extension encapsulates
+only a pointer to the memory without any size information.
+The functionality of `work_group_memory` is, of course, less than
+`local_accessor`, but many applications do not need the extra features.
+
+
+== Specification
+
+=== Feature test macro
+
+This extension provides a feature-test macro as described in the core SYCL
+specification.
+An implementation supporting this extension must predefine the macro
+`SYCL_EXT_ONEAPI_WORK_GROUP_MEMORY` to one of the values defined in the table
+below.
+Applications can test for the existence of this macro to determine if the
+implementation supports this feature, or applications can test the macro's
+value to determine which of the extension's features the implementation
+supports.
+
+[%header,cols="1,5"]
+|===
+|Value
+|Description
+
+|1
+|The APIs of this experimental extension are not versioned, so the
+ feature-test macro always has this value.
+|===
+
+=== New `work_group_memory` class
+
+This extension adds the following new class:
+
+[source,c++]
+----
+namespace sycl::ext::oneapi::experimental {
+
+template<typename DataT, typename PropertyListT = empty_properties_t>
+class work_group_memory {
+ public:
+  using value_type = std::remove_all_extents_t<DataT>;
+
+  work_group_memory();
+  work_group_memory(const work_group_memory& rhs);
+  work_group_memory(handler& cgh);
+  work_group_memory(size_t num, handler& cgh);
+  work_group_memory& operator=(const work_group_memory& rhs);
+
+  operator DataT&() const;
+  const work_group_memory& operator=(const DataT& value) const;
+  DataT* operator&() const;
+
+  template<access::decorated IsDecorated = access::decorated::no>
+  multi_ptr<value_type, access::address_space::local_space, IsDecorated> get_multi_ptr() const;
+};
+
+} // namespace sycl::ext::oneapi::experimental
+----
+
+The `work_group_memory` class allocates device local memory and provides access
+to this memory from within a SYCL kernel function.
+The local memory that is allocated is shared between all work-items of a
+work-group.
+If multiple work-groups execute simultaneously, each of those work-groups
+receives its own independent copy of the allocated local memory.
+
+The `work_group_memory` type is a legal kernel parameter type as defined in
+section 4.12.4 "Rules for parameter passing to kernels" of the core SYCL
+specification.
+Applications typically construct an object of type `work_group_memory` in
+command group scope, pass the object as a kernel parameter, and then reference
+the object inside the kernel in order to access the device local memory that it
+contains.
+
+The `work_group_memory` class may only be used in an nd-range kernel.
+If an application passes a `work_group_memory` object as an argument to a
+single-task kernel or to a simple "range" kernel, the implementation must throw
+a synchronous `exception` with the `errc::kernel_argument` error code when the
+kernel is enqueued.
+
+The `DataT` template parameter identifies the type of the objects created in
+device local memory, and this type must be one of the types that is supported
+in device code.
+In order to create an array of objects, `DataT` should be an array type.
+For example, `work_group_memory<float[10]>` creates an array of 10 `float`
+objects in device local memory.
+In order to create an array of objects where the number of elements is
+determined at runtime, specify an unbounded array type such as
+`work_group_memory<float[]>` and use the constructor overload that takes a
+`num` parameter.
+
+If `DataT` is an implicit-lifetime type as defined in the {cpp} core language,
+`work_group_memory` implicitly creates objects of that type with indeterminate
+values.
+For other types, `work_group_memory` merely allocates uninitialized memory, and
+the application is responsible for constructing objects in that memory (e.g. by
+calling placement-new).
+
+The `PropertyListT` template parameter currently has no meaning and must have
+its default value of `empty_properties_t`.
+This template parameter may be used in the future to associate compile-time
+properties with the `work_group_memory`.
+
+==== Type aliases
+
+[frame=all,grid=none,separator="@"]
+!====
+a@
+[source,c++]
+----
+using value_type = std::remove_all_extents_t<DataT>;
+----
+!====
+
+This type alias provides the data type of the device local memory with all
+array extents removed.
+
+==== Constructors and copy assignment
+
+[frame=all,grid=none,separator="@"]
+!====
+a@
+[source,c++]
+----
+work_group_memory();
+----
+!====
+
+_Effects:_ Constructs a "dummy" `work_group_memory` object that does not
+represent any device local memory.
+The only valid operation for a dummy object is the copy-assignment operator,
+which overwrites the object with the right-hand-side of the assignment.
+Passing a dummy object as a kernel argument or calling any of its other
+member functions or operators produces undefined behavior.
+
+[_Note:_ This constructor may be called in either host code or device code.
+_{endnote}_]
+
+'''
+
+[frame=all,grid=none,separator="@"]
+!====
+a@
+[source,c++]
+----
+work_group_memory(const work_group_memory& rhs);
+----
+!====
+
+_Effects:_ Constructs a `work_group_memory` object which is a copy of the
+`rhs` object.
+The new object represents the same underlying device local memory as `rhs`.
+
+[_Note:_ This constructor may be called in either host code or device code.
+_{endnote}_]
+
+[_Note:_ The copied object does not always represent the same underlying device
+local memory when the copy constructor is called in host code.
+See the open issues.
+_{endnote}_]
+
+'''
+
+[frame=all,grid=none,separator="@"]
+!====
+a@
+[source,c++]
+----
+work_group_memory(handler& cgh);              (1)
+work_group_memory(size_t num, handler& cgh);  (2)
+----
+!====
+
+_Preconditions:_ These constructors must be called from host code.
+
+_Constraints (1):_ Available only when `DataT` is not an unbounded array.
+
+_Constraints (2):_ Available only when `DataT` is an unbounded array.
+
+_Effects:_ Constructs a `work_group_memory` object which represents device
+local memory of type `DataT` in the kernel that is enqueued via the `cgh`
+handler.
+Overload (2) uses `num` to determine the number of elements in the unbounded
+array `DataT`.
+
+_Remarks:_ Attempting to pass the `work_group_memory` object as an argument
+to a kernel that is _not_ launched via the `cgh` handler produces undefined
+behavior.
+
+'''
+
+[frame=all,grid=none,separator="@"]
+!====
+a@
+[source,c++]
+----
+work_group_memory& operator=(const work_group_memory& rhs);
+----
+!====
+
+_Effects:_ Replaces the `work_group_memory` object with a copy of the `rhs` object.
+The replaced object represents the same underlying device local memory as `rhs`.
+
+_Returns:_ A reference to the `work_group_memory` object.
+
+[_Note:_ This operator may be called in either host code or device code.
+_{endnote}_]
+
+[_Note:_ The replaced object does not always represent the same underlying
+device local memory when the assignment operator is called in host code.
+See the open issues.
+_{endnote}_]
+
+==== Member functions and operators
+
+[frame=all,grid=none,separator="@"]
+!====
+a@
+[source,c++]
+----
+operator DataT&() const;
+----
+!====
+
+_Preconditions:_ This operator must be called from device code.
+
+_Effects:_ Implicit conversion to the underlying `DataT`.
+
+'''
+
+[frame=all,grid=none,separator="@"]
+!====
+a@
+[source,c++]
+----
+const work_group_memory& operator=(const DataT& value) const;
+----
+!====
+
+_Preconditions:_ This operator must be called from device code.
+
+_Constraints:_ Available only when `DataT` is not an array.
+
+_Effects:_ Assigns the value `value` to the underlying device local memory
+object.
+
+_Returns:_ A reference to the `work_group_memory` object.
+
+'''
+
+[frame=all,grid=none,separator="@"]
+!====
+a@
+[source,c++]
+----
+DataT* operator&() const;
+----
+!====
+
+_Preconditions:_ This operator must be called from device code.
+
+_Returns:_ A pointer to the underlying device local memory object.
+
+'''
+
+[frame=all,grid=none,separator="@"]
+!====
+a@
+[source,c++]
+----
+template<access::decorated IsDecorated = access::decorated::no>
+multi_ptr<value_type, access::address_space::local_space, IsDecorated> get_multi_ptr() const;
+----
+!====
+
+_Preconditions:_ This function must be called from device code.
+
+_Returns:_ A `multi_ptr` to the underlying device local memory object.
+
+
+== Examples
+
+=== Basic usage
+
+The following example illustrates a typical use of the `work_group_memory`
+class.
+
+[source,c++]
+----
+#include <sycl/sycl.hpp>
+namespace syclexp = sycl::ext::oneapi::experimental;
+
+constexpr size_t SIZE = 4096;
+constexpr size_t WGSIZE = 256;
+
+int main() {
+  sycl::queue q;
+
+  q.submit([&](sycl::handler &cgh) {
+    // Allocate one element for each work-item in the work-group.
+    syclexp::work_group_memory<int[WGSIZE]> mem{cgh};
+
+    sycl::nd_range ndr{{SIZE}, {WGSIZE}};
+    cgh.parallel_for(ndr, [=](sycl::nd_item<> it) {
+      size_t id = it.get_local_linear_id();
+
+      // Each work-item has its own dedicated element of the array.
+      mem[id] = /*...*/;
+    });
+  }).wait();
+}
+----
+
+=== Operations on types
+
+The following example illustrates various operations that can be done with the
+`work_group_memory` class when it is templated with different `DataT` types.
+
+[source,c++]
+----
+#include <sycl/sycl.hpp>
+namespace syclexp = sycl::ext::oneapi::experimental;
+
+constexpr size_t SIZE = 4096;
+constexpr size_t WGSIZE = 256;
+
+struct point {
+  int x;
+  int y;
+};
+
+int main() {
+  sycl::queue q;
+
+  q.submit([&](sycl::handler &cgh) {
+    syclexp::work_group_memory<int>       mem1{cgh};    // scalar
+    syclexp::work_group_memory<int[10]>   mem2{cgh};    // bounded array
+    syclexp::work_group_memory<int[]>     mem3{5, cgh}; // unbounded array
+    syclexp::work_group_memory<int[][10]> mem4{2, cgh}; // multi-dimensional array
+    syclexp::work_group_memory<point[10]> mem5{cgh};    // array of struct
+
+    sycl::nd_range ndr{{SIZE}, {WGSIZE}};
+    cgh.parallel_for(ndr, [=](sycl::nd_item<> it) {
+      if (it.get_group().leader()) {
+        // A "work_group_memory" templated on a scalar type acts much like the
+        // enclosed scalar type.
+        ++mem1;
+        mem1++;
+        mem1 += 1;
+        mem1 = mem1 + 1;
+        int *p1 = &mem1;
+
+        // A "work_group_memory" templated on an array type (either bounded or
+        // unbounded) acts like an array.
+        ++mem2[4];
+        mem2[4]++;
+        mem2[4] = mem2[4] + 1;
+        int *p2 = &mem2[4];
+
+        // A multi-dimensional array works as expected.
+        mem4[1][5] = mem4[1][5] + 1;
+        mem4[1][7] = mem4[1][7] + 1;
+
+        // An array of structs works as expected too.
+        mem5[1].x++;
+        mem5[1].y = mem5[1].y + 1;
+      }
+    });
+  }).wait();
+}
+----
+
+=== Usage with a free function kernel
+
+The following example illustrates usage of `work_group_memory` in a free
+function kernel.
+
+[source,c++]
+----
+#include <sycl/sycl.hpp>
+namespace syclexp = sycl::ext::oneapi::experimental;
+namespace syclext = sycl::ext::oneapi;
+
+constexpr size_t SIZE = 4096;
+constexpr size_t WGSIZE = 256;
+
+SYCL_EXT_ONEAPI_FUNCTION_PROPERTY((syclexp::nd_range_kernel<1>))
+void mykernel(syclexp::work_group_memory<int[WGSIZE]> mem) {
+  size_t id = syclext::this_work_item::get_nd_item().get_local_linear_id();
+
+  // Each work-item has its own dedicated element of the device local memory
+  // array.
+  mem[id] = /*...*/;
+}
+
+int main() {
+  sycl::queue q;
+  sycl::context ctxt = q.get_context();
+
+  // Get the kernel object for the "mykernel" kernel.
+  auto exe_bndl =
+    syclexp::get_kernel_bundle<mykernel, sycl::bundle_state::executable>(ctxt);
+  sycl::kernel k_mykernel = exe_bndl.ext_oneapi_get_kernel<mykernel>();
+
+  q.submit([&](sycl::handler &cgh) {
+    // Allocate an array of device local memory with one element for each
+    // work-item in the work-group.
+    syclexp::work_group_memory<int[WGSIZE]> mem{cgh};
+    cgh.set_args(mem);
+
+    sycl::nd_range ndr{{SIZE}, {WGSIZE}};
+    cgh.parallel_for(ndr, k_mykernel);
+  }).wait();
+}
+----
+
+
+== Issues
+
+* We have not agreed on the way in which `work_group_memory` should be created
+  when there is a property list.
+  One option is to add a new constructor that takes a `PropertyListT` parameter
+  and use CTAD to deduce the class template parameters.
+  However, we need some way to deduce `DataT` because CTAD does not work unless
+  it deduces all of the template parameters.
+  This leads to a constructor that requires a tag-type parameter like:
++
+[source,c++]
+----
+template<typename T>
+struct type_tag {};
+
+template<typename T>
+inline constexpr type_tag<T> type;
+
+template<typename DataT, typename PropertyListT = empty_properties_t>
+class work_group_memory {
+  work_group_memory(const type_tag<DataT>&, handler& cgh,
+                    const PropertyListT& props = {});
+};
+
+// Deduction guide for the constructor that takes "type_tag".
+template<typename DataT, typename PropertyListT>
+work_group_memory(const type_tag<DataT>&, handler&, const PropertyListT&) ->
+  work_group_memory<DataT, PropertyListT>;
+----
++
+Usage would be like:
++
+[source,c++]
+----
+syclexp::work_group_memory mem{syclexp::type<int[10]>, cgh, props};
+----
++
+Another option is to add a factory function like:
++
+[source,c++]
+----
+template<typename DataT, typename PropertyListT = empty_properties_t>
+work_group_memory<DataT, PropertyListT>
+make_work_group_memory(handler& cgh, const PropertyListT& props = {});
+----
++
+In which case, usage would be like:
++
+[source,c++]
+----
+auto mem = syclexp::make_work_group_memory<int[10]>(cgh, props);
+----
++
+We decided to defer this decision for now because we don't have any properties
+defined for this class yet anyways.
+
+* The copy constructor and copy assignment operator say that the copied object
+  "represents the same underlying device local memory as ``rhs``".
+  This is not currently the case in {dpcpp} when the copy happens in host code.
+  If you pass two `work_group_memory` objects as kernel parameters, each object
+  creates a unique device local memory region, even if one `work_group_memory`
+  object is a copy of the other.
+  The `local_accessor` class behaves the same way.
+  See https://github.com/KhronosGroup/SYCL-Docs/issues/552[this issue] against
+  the SYCL specification.
diff --git a/sycl/doc/syclcompat/README.md b/sycl/doc/syclcompat/README.md
index 6b776cfb777b3..a35d69cb2a0fa 100644
--- a/sycl/doc/syclcompat/README.md
+++ b/sycl/doc/syclcompat/README.md
@@ -399,7 +399,7 @@ static void destroy_event(event_ptr event);
 } // syclcompat
 ```
 
-### Memory Allocation
+### Memory Operations
 
 This library provides interfaces to allocate memory to be accessed within kernel
 functions and on the host. The `syclcompat::malloc` function allocates device
@@ -489,10 +489,12 @@ sycl::event memset_async(pitched_data pitch, int val,
                          sycl::range<3> size,
                          sycl::queue q = get_default_queue()); // 3D matrix
 
+// Free
+void wait_and_free(void *ptr, sycl::queue q = get_default_queue());
 void free(void *ptr, sycl::queue q = get_default_queue());
-sycl::event free_async(const std::vector<void *> &pointers,
-                       const std::vector<sycl::event> &events,
-                       sycl::queue q = get_default_queue());
+sycl::event enqueue_free(const std::vector<void *> &pointers,
+                         const std::vector<sycl::event> &events,
+                         sycl::queue q = get_default_queue());
 
 // Queries pointer allocation type
 class pointer_attributes {
@@ -508,6 +510,64 @@ public:
 } // syclcompat
 ```
 
+The `syclcompat::experimental` namespace contains currently unsupported `memcpy` overloads which take a `syclcompat::experimental::memcpy_parameter` argument. These are included for forwards compatibility and currently throw a `std::runtime_error`.
+
+```cpp
+namespace syclcompat {
+namespace experimental {
+// Forward declarations for types relating to unsupported memcpy_parameter API:
+
+enum memcpy_direction {
+  host_to_host,
+  host_to_device,
+  device_to_host,
+  device_to_device,
+  automatic
+};
+
+#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
+class image_mem_wrapper;
+#endif
+class image_matrix;
+
+/// Memory copy parameters for 2D/3D memory data.
+struct memcpy_parameter {
+  struct data_wrapper {
+    pitched_data pitched{};
+    sycl::id<3> pos{};
+#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
+    experimental::image_mem_wrapper *image_bindless{nullptr};
+#endif
+    image_matrix *image{nullptr};
+  };
+  data_wrapper from{};
+  data_wrapper to{};
+  sycl::range<3> size{};
+  syclcompat::detail::memcpy_direction direction{syclcompat::detail::memcpy_direction::automatic};
+};
+
+/// [UNSUPPORTED] Synchronously copies 2D/3D memory data specified by \p param .
+/// The function will return after the copy is completed.
+///
+/// \param param Memory copy parameters.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void memcpy(const memcpy_parameter &param,
+                          sycl::queue q = get_default_queue());
+
+/// [UNSUPPORTED] Asynchronously copies 2D/3D memory data specified by
+/// \p param. The return of the function does NOT guarantee the copy is completed.
+///
+/// \param param Memory copy parameters.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void memcpy_async(const memcpy_parameter &param,
+                                sycl::queue q = get_default_queue());
+
+} // namespace experimental
+} // namespace syclcompat
+```
+
 Finally, the class `pitched_data`, which manages memory allocation for 3D
 spaces, padded to avoid uncoalesced memory accesses.
 
@@ -760,7 +820,9 @@ public:
   unsigned int get_global_mem_cache_size() const;
   int get_image1d_max() const;
   auto get_image2d_max() const;
+  auto get_image2d_max();
   auto get_image3d_max() const;
+  auto get_image3d_max();
 
   void set_name(const char *name);
   void set_max_work_item_sizes(const sycl::range<3> max_work_item_sizes);
@@ -844,9 +906,24 @@ static inline sycl::context get_default_context();
 // Util function to get a CPU device.
 static inline device_ext &cpu_device();
 
+/// Filter out devices; only keep the devices whose names contain one of the
+/// subnames in \p dev_subnames.
+/// May break device id mapping and change current device. It's better to be
+/// called before other SYCLcompat or SYCL APIs.
+static inline void filter_device(const std::vector<std::string> &dev_subnames);
+
+/// Print all the devices (and their IDs) in the dev_mgr
+static inline void list_devices();
+
 // Util function to select a device by its id
 static inline unsigned int select_device(unsigned int id);
 
+// Util function to get the device id from a device
+static inline unsigned int get_device_id(const sycl::device &dev);
+
+// Util function to get the number of available devices
+static inline unsigned int device_count();
+
 } // syclcompat
 ```
 
@@ -861,13 +938,19 @@ independently of what is set in this parameter.
 Devices are managed through a helper class, `device_ext`. The `device_ext` class
 associates a vector of `sycl::queues` with its `sycl::device`. The `device_ext`
 destructor waits on a set of `sycl::event` which can be added to via
-`add_event`. This is used, for example, to implement `syclcompat::free_async` to
+`add_event`. This is used, for example, to implement `syclcompat::enqueue_free` to
 schedule release of memory after a kernel or `mempcy`. SYCL device properties
 can be queried through `device_ext` as well.
 `device_ext` also provides the `has_capability_or_fail` member function, which
 throws a `sycl::exception` if the device does not have the specified list of
 `sycl::aspect`.
 
+Devices can be listed and filtered using `syclcompat::list_devices()` and
+`syclcompat::filter_device()`. If `SYCLCOMPAT_VERBOSE` is defined at compile
+time, the available SYCL devices are printed to the standard output both at
+initialization time, and when the device list is filtered using
+`syclcompat::filter_device`.
+
 Users can manage queues through the `syclcompat::set_default_queue(sycl::queue
 q)` free function, and the `device_ext` `set_saved_queue`, `set_default_queue`,
 and `get_saved_queue` member functions.
@@ -1511,6 +1594,13 @@ without modulo overflow for vector types.
 The functions `cmul`,`cdiv`,`cabs`, `cmul_add`, and `conj` define complex math
 operations which accept `sycl::vec<T,2>` arguments representing complex values.
 
+The `dp4a` function returns the 4-way 8-bit dot product accumulate for unsigned
+and signed 32-bit integer values. The `dp2a_lo` and `dp2a_hi` functions return the
+two-way 16-bit to 8-bit dot product using the second and first 16 bits of the
+second operand, respectively. These three APIs return a single 32-bit value with
+the accumulated result, which is unsigned if both operands are `uint32_t` and
+signed otherwise.
+
 ```cpp
 inline unsigned int funnelshift_l(unsigned int low, unsigned int high,
                                   unsigned int shift); 
@@ -1692,6 +1782,24 @@ inline sycl::marray<ValueT, 2> cmul_add(const sycl::marray<ValueT, 2> a,
 template <typename T> sycl::vec<T, 2> conj(sycl::vec<T, 2> x);
 
 template <typename ValueT> inline ValueT reverse_bits(ValueT a);
+
+
+template <typename T1, typename T2>
+using dot_product_acc_t =
+    std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
+                       uint32_t, int32_t>;
+
+template <typename T1, typename T2>
+inline dot_product_acc_t<T1, T2> dp2a_lo(T1 a, T2 b,
+                                         dot_product_acc_t<T1, T2> c);
+
+template <typename T1, typename T2>
+inline dot_product_acc_t<T1, T2> dp2a_hi(T1 a, T2 b,
+                                         dot_product_acc_t<T1, T2> c);
+
+template <typename T1, typename T2>
+inline dot_product_acc_t<T1, T2> dp4a(T1 a, T2 b,
+                                      dot_product_acc_t<T1, T2> c);
 ```
 
 `vectorized_binary` computes the `BinaryOperation` for two operands,
@@ -1754,7 +1862,7 @@ struct sub_sat {
 } // namespace syclcompat
 ```
 
-Finally, the math header provides a set of functions to extend 32-bit operations
+The math header provides a set of functions to extend 32-bit operations
 to 33 bit, and handle sign extension internally. There is support for `add`,
 `sub`, `absdiff`, `min` and `max` operations. Each operation provides overloads
 to include a second, separate, `BinaryOperation` after the first, and include
@@ -1838,6 +1946,591 @@ inline constexpr RetT extend_max_sat(AT a, BT b, CT c,
                                      BinaryOperation second_op);
 ```
 
+Another set of vectorized extend 32-bit operations is provided in the math
+header. These APIs treat each of the 32-bit operands as a 2-element vector
+(16 bits each) while handling sign extension to 17 bits internally. There is
+support for `add`, `sub`, `absdiff`, `min`, `max` and `avg` binary operations.
+Each operation has a `_sat` variant which determines if the returning
+value is saturated or not, and a `_add` variant that computes the binary sum
+of the initial operation outputs and a third operand.
+
+```cpp
+/// Compute vectorized addition of \p a and \p b, with each value treated as a
+/// 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized addition of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd2(AT a, BT b, RetT c);
+
+/// Compute vectorized addition of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized addition of the two
+/// values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd2_add(AT a, BT b, RetT c);
+
+/// Compute vectorized addition of \p a and \p b with saturation, with each
+/// value treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized addition of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd2_sat(AT a, BT b, RetT c);
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized subtraction of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub2(AT a, BT b, RetT c);
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 2 elements vector type and extend each element to 17 bit. Then add each
+/// half of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized subtraction of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub2_add(AT a, BT b, RetT c);
+
+/// Compute vectorized subtraction of \p a and \p b with saturation, with each
+/// value treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized subtraction of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub2_sat(AT a, BT b, RetT c);
+
+/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized abs_diff of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff2(AT a, BT b, RetT c);
+
+/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized abs_diff of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff2_add(AT a, BT b, RetT c);
+
+/// Compute vectorized abs_diff of \p a and \p b with saturation, with each
+/// value treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized abs_diff of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff2_sat(AT a, BT b, RetT c);
+
+/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized minimum of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin2(AT a, BT b, RetT c);
+
+/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized minimum of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin2_add(AT a, BT b, RetT c);
+
+/// Compute vectorized minimum of \p a and \p b with saturation, with each value
+/// treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized minimum of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin2_sat(AT a, BT b, RetT c);
+
+/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized maximum of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax2(AT a, BT b, RetT c);
+
+/// Compute vectorized maximum of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized maximum of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax2_add(AT a, BT b, RetT c);
+
+/// Compute vectorized maximum of \p a and \p b with saturation, with each value
+/// treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized maximum of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax2_sat(AT a, BT b, RetT c);
+
+/// Compute vectorized average of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized average of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vavrg2(AT a, BT b, RetT c);
+
+/// Compute vectorized average of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized average of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vavrg2_add(AT a, BT b, RetT c);
+
+/// Compute vectorized average of \p a and \p b with saturation, with each value
+/// treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized average of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vavrg2_sat(AT a, BT b, RetT c);
+```
+
+Similarly, a set of vectorized extend 32-bit operations is provided in the math 
+header treating each of the 32-bit operands as 4-elements vector (8-bits each) 
+while handling sign extension to 9-bits internally. There is support for `add`,
+`sub`, `absdiff`, `min`, `max` and `avg` binary operations. 
+Each operation has a `_sat` variant which determines if the returning
+value is saturated or not, and a `_add` variant that computes the binary sum
+of the initial operation outputs and a third operand.
+
+```cpp
+/// Compute vectorized addition of \p a and \p b, with each value treated as a
+/// 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized addition of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd4(AT a, BT b, RetT c);
+
+/// Compute vectorized addition of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized addition of the two
+/// values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd4_add(AT a, BT b, RetT c);
+
+/// Compute vectorized addition of \p a and \p b with saturation, with each
+/// value treated as a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized addition of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd4_sat(AT a, BT b, RetT c);
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized subtraction of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub4(AT a, BT b, RetT c);
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 4 elements vector type and extend each element to 9 bit. Then add each
+/// half of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized subtraction of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub4_add(AT a, BT b, RetT c);
+
+/// Compute vectorized subtraction of \p a and \p b with saturation, with each
+/// value treated as a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized subtraction of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub4_sat(AT a, BT b, RetT c);
+
+/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized abs_diff of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff4(AT a, BT b, RetT c);
+
+/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized abs_diff of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff4_add(AT a, BT b, RetT c);
+
+/// Compute vectorized abs_diff of \p a and \p b with saturation, with each
+/// value treated as a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized abs_diff of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff4_sat(AT a, BT b, RetT c);
+
+/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized minimum of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin4(AT a, BT b, RetT c);
+
+/// Compute vectorized minimum of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized minimum of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin4_add(AT a, BT b, RetT c);
+
+/// Compute vectorized minimum of \p a and \p b with saturation, with each value
+/// treated as a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized minimum of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin4_sat(AT a, BT b, RetT c);
+
+/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized maximum of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax4(AT a, BT b, RetT c);
+
+/// Compute vectorized maximum of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized maximum of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax4_add(AT a, BT b, RetT c);
+
+/// Compute vectorized maximum of \p a and \p b with saturation, with each value
+/// treated as a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized maximum of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax4_sat(AT a, BT b, RetT c);
+
+/// Compute vectorized average of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized average of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vavrg4(AT a, BT b, RetT c);
+
+/// Compute vectorized average of \p a and \p b, with each value treated as a 4
+/// elements vector type and extend each element to 9 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized average of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vavrg4_add(AT a, BT b, RetT c);
+
+/// Compute vectorized average of \p a and \p b with saturation, with each value
+/// treated as a 4 elements vector type and extend each element to 9 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized average of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vavrg4_sat(AT a, BT b, RetT c);
+```
+
+Vectorized comparison APIs also provided in the math header behave similarly 
+and support a `std` comparison operator parameter which can be `greater`, 
+`less`, `greater_equal`, `less_equal`, `equal_to` or `not_equal_to`. These APIs 
+cover both the 2-elements *(16-bits each)* and 4-elements *(8-bits each)* 
+variants, as well as an additional `_add` variant that computes the sum of the 
+2/4 output elements.
+
+```cpp
+/// Extend \p a and \p b to 33 bit and vectorized compare input values using
+/// specified comparison \p cmp .
+///
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the compare operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] cmp The comparison operator
+/// \returns The comparison result of the two extended values.
+template <typename AT, typename BT, typename BinaryOperation>
+inline constexpr unsigned extend_vcompare2(AT a, BT b, BinaryOperation cmp);
+
+/// Extend Inputs to 33 bit, and vectorized compare input values using specified
+/// comparison \p cmp , then add the result with \p c .
+///
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the compare operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] cmp The comparison operator
+/// \returns The comparison result of the two extended values, and add the
+/// result with \p c .
+template <typename AT, typename BT, typename BinaryOperation>
+inline constexpr unsigned extend_vcompare2_add(AT a, BT b, unsigned c,
+                                               BinaryOperation cmp);
+
+/// Extend \p a and \p b to 33 bit and vectorized compare input values using
+/// specified comparison \p cmp .
+///
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the compare operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] cmp The comparison operator
+/// \returns The comparison result of the two extended values.
+template <typename AT, typename BT, typename BinaryOperation>
+inline constexpr unsigned extend_vcompare4(AT a, BT b, BinaryOperation cmp);
+
+/// Extend Inputs to 33 bit, and vectorized compare input values using specified
+/// comparison \p cmp , then add the result with \p c .
+///
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the compare operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] cmp The comparison operator
+/// \returns The comparison result of the two extended values, and add the
+/// result with \p c .
+template <typename AT, typename BT, typename BinaryOperation>
+inline constexpr unsigned extend_vcompare4_add(AT a, BT b, unsigned c,
+                                               BinaryOperation cmp);
+```
+
+The math header file provides APIs for bit-field insertion (`bfi_safe`) and
+bit-field extraction (`bfe_safe`). These are bounds-checked variants of
+underlying `detail` APIs (`detail::bfi`, `detail::bfe`) which, in future
+releases, will be exposed to the user.
+
+```c++
+
+/// Bitfield-insert with boundary checking.
+///
+/// Align and insert a bit field from \param x into \param y . Source \param
+/// bit_start gives the starting bit position for the insertion, and source
+/// \param num_bits gives the bit field length in bits.
+///
+/// \tparam T The type of \param x and \param y , must be an unsigned integer.
+/// \param x The source of the bitfield.
+/// \param y The source where bitfield is inserted.
+/// \param bit_start The position to start insertion.
+/// \param num_bits The number of bits to insert.
+template <typename T>
+inline T bfi_safe(const T x, const T y, const uint32_t bit_start,
+                  const uint32_t num_bits);
+
+/// Bitfield-extract with boundary checking.
+///
+/// Extract bit field from \param source and return the zero or sign-extended
+/// result. Source \param bit_start gives the bit field starting bit position,
+/// and source \param num_bits gives the bit field length in bits.
+///
+/// The result is padded with the sign bit of the extracted field. If `num_bits`
+/// is zero, the result is zero. If the start position is beyond the msb of the
+/// input, the result is filled with the replicated sign bit of the extracted
+/// field.
+///
+/// \tparam T The type of \param source value, must be an integer.
+/// \param source The source value to extract from.
+/// \param bit_start The position to start extracting.
+/// \param num_bits The number of bits to extract.
+template <typename T>
+inline T bfe_safe(const T source, const uint32_t bit_start,
+                  const uint32_t num_bits);
+```
+
 ## Sample Code
 
 Below is a simple linear algebra sample, which computes `y = mx + b` implemented
diff --git a/sycl/include/CL/__spirv/spirv_ops.hpp b/sycl/include/CL/__spirv/spirv_ops.hpp
index 6fd4b9ebf63db..3031e73a3c113 100644
--- a/sycl/include/CL/__spirv/spirv_ops.hpp
+++ b/sycl/include/CL/__spirv/spirv_ops.hpp
@@ -1019,10 +1019,16 @@ extern __DPCPP_SYCL_EXTERNAL void
 __spirv_ocl_prefetch(const __attribute__((opencl_global)) char *Ptr,
                      size_t NumBytes) noexcept;
 
-extern __DPCPP_SYCL_EXTERNAL uint16_t
-__spirv_ConvertFToBF16INTEL(float) noexcept;
 extern __DPCPP_SYCL_EXTERNAL float
     __spirv_ConvertBF16ToFINTEL(uint16_t) noexcept;
+extern __DPCPP_SYCL_EXTERNAL uint16_t
+__spirv_ConvertFToBF16INTEL(float) noexcept;
+template <int N>
+extern __DPCPP_SYCL_EXTERNAL __ocl_vec_t<float, N>
+    __spirv_ConvertBF16ToFINTEL(__ocl_vec_t<uint16_t, N>) noexcept;
+template <int N>
+extern __DPCPP_SYCL_EXTERNAL __ocl_vec_t<uint16_t, N>
+    __spirv_ConvertFToBF16INTEL(__ocl_vec_t<float, N>) noexcept;
 
 __SYCL_CONVERGENT__ extern __DPCPP_SYCL_EXTERNAL
     __SYCL_EXPORT __ocl_vec_t<uint32_t, 4>
@@ -1280,6 +1286,7 @@ __CLC_BF16_SCAL_VEC(uint32_t)
 
 extern __DPCPP_SYCL_EXTERNAL int32_t __spirv_BuiltInGlobalHWThreadIDINTEL();
 extern __DPCPP_SYCL_EXTERNAL int32_t __spirv_BuiltInSubDeviceIDINTEL();
+extern __DPCPP_SYCL_EXTERNAL uint64_t __spirv_ReadClockKHR(int);
 
 template <typename from, typename to>
 extern __DPCPP_SYCL_EXTERNAL
diff --git a/sycl/include/sycl/atomic_ref.hpp b/sycl/include/sycl/atomic_ref.hpp
index 5c163cd5fe8e2..1a1c4a63000f7 100644
--- a/sycl/include/sycl/atomic_ref.hpp
+++ b/sycl/include/sycl/atomic_ref.hpp
@@ -568,9 +568,14 @@ class [[__sycl_detail__::__uses_aspects__(aspect::atomic64)]] atomic_ref_impl<
 // Partial specialization for pointer types
 // Arithmetic is emulated because target's representation of T* is unknown
 // TODO: Find a way to use intptr_t or uintptr_t atomics instead
-template <typename T, bool IsAspectAtomic64AttrUsed, memory_order DefaultOrder, memory_scope DefaultScope,
-          access::address_space AddressSpace>
-class atomic_ref_impl<T *, IsAspectAtomic64AttrUsed, DefaultOrder, DefaultScope, AddressSpace>
+template <typename T, bool IsAspectAtomic64AttrUsed, memory_order DefaultOrder,
+          memory_scope DefaultScope, access::address_space AddressSpace>
+#ifndef __SYCL_DEVICE_ONLY__
+class atomic_ref_impl<
+#else
+class [[__sycl_detail__::__uses_aspects__(aspect::atomic64)]] atomic_ref_impl<
+#endif
+    T *, IsAspectAtomic64AttrUsed, DefaultOrder, DefaultScope, AddressSpace>
     : public atomic_ref_base<uintptr_t, DefaultOrder, DefaultScope,
                              AddressSpace> {
 
diff --git a/sycl/include/sycl/detail/cg.hpp b/sycl/include/sycl/detail/cg.hpp
index f0616dcce51b9..8d823c109ee34 100644
--- a/sycl/include/sycl/detail/cg.hpp
+++ b/sycl/include/sycl/detail/cg.hpp
@@ -534,33 +534,41 @@ class CGCopyImage : public CG {
 /// "Semaphore Wait" command group class.
 class CGSemaphoreWait : public CG {
   sycl::detail::pi::PiInteropSemaphoreHandle MInteropSemaphoreHandle;
+  std::optional<uint64_t> MWaitValue;
 
 public:
   CGSemaphoreWait(
       sycl::detail::pi::PiInteropSemaphoreHandle InteropSemaphoreHandle,
-      CG::StorageInitHelper CGData, detail::code_location loc = {})
+      std::optional<uint64_t> WaitValue, CG::StorageInitHelper CGData,
+      detail::code_location loc = {})
       : CG(SemaphoreWait, std::move(CGData), std::move(loc)),
-        MInteropSemaphoreHandle(InteropSemaphoreHandle) {}
+        MInteropSemaphoreHandle(InteropSemaphoreHandle), MWaitValue(WaitValue) {
+  }
 
   sycl::detail::pi::PiInteropSemaphoreHandle getInteropSemaphoreHandle() const {
     return MInteropSemaphoreHandle;
   }
+  std::optional<uint64_t> getWaitValue() const { return MWaitValue; }
 };
 
 /// "Semaphore Signal" command group class.
 class CGSemaphoreSignal : public CG {
   sycl::detail::pi::PiInteropSemaphoreHandle MInteropSemaphoreHandle;
+  std::optional<uint64_t> MSignalValue;
 
 public:
   CGSemaphoreSignal(
       sycl::detail::pi::PiInteropSemaphoreHandle InteropSemaphoreHandle,
-      CG::StorageInitHelper CGData, detail::code_location loc = {})
+      std::optional<uint64_t> SignalValue, CG::StorageInitHelper CGData,
+      detail::code_location loc = {})
       : CG(SemaphoreSignal, std::move(CGData), std::move(loc)),
-        MInteropSemaphoreHandle(InteropSemaphoreHandle) {}
+        MInteropSemaphoreHandle(InteropSemaphoreHandle),
+        MSignalValue(SignalValue) {}
 
   sycl::detail::pi::PiInteropSemaphoreHandle getInteropSemaphoreHandle() const {
     return MInteropSemaphoreHandle;
   }
+  std::optional<uint64_t> getSignalValue() const { return MSignalValue; }
 };
 
 /// "Execute command-buffer" command group class.
diff --git a/sycl/include/sycl/detail/generic_type_traits.hpp b/sycl/include/sycl/detail/generic_type_traits.hpp
index 3b0ce7988f576..bca2fd27eeb49 100644
--- a/sycl/include/sycl/detail/generic_type_traits.hpp
+++ b/sycl/include/sycl/detail/generic_type_traits.hpp
@@ -17,6 +17,8 @@
 #include <sycl/half_type.hpp>                 // for BIsRepresentationT
 #include <sycl/multi_ptr.hpp>                 // for multi_ptr, address_spa...
 
+#include <sycl/ext/oneapi/bfloat16.hpp> // for bfloat16 storage type.
+
 #include <cstddef>     // for byte
 #include <cstdint>     // for uint8_t
 #include <limits>      // for numeric_limits
@@ -252,6 +254,16 @@ inline constexpr bool is_genfloatptr_marray_v =
     (IsDecorated == access::decorated::yes ||
      IsDecorated == access::decorated::no);
 
+template <typename T>
+using is_byte = typename
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+    std::is_same<T, std::byte>;
+#else
+    std::false_type;
+#endif
+
+template <typename T> inline constexpr bool is_byte_v = is_byte<T>::value;
+
 template <typename T>
 using make_floating_point_t = make_type_t<T, gtl::scalar_floating_list>;
 
@@ -332,6 +344,8 @@ template <typename T> auto convertToOpenCLType(T &&x) {
                                                    std::declval<ElemTy>()))>,
                             no_ref::size()>;
 #ifdef __SYCL_DEVICE_ONLY__
+
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
     // TODO: for some mysterious reasons on NonUniformGroups E2E tests fail if
     // we use the "else" version only. I suspect that's an issues with
     // non-uniform groups implementation.
@@ -340,6 +354,10 @@ template <typename T> auto convertToOpenCLType(T &&x) {
     else
       return static_cast<typename MatchingVec::vector_t>(
           x.template as<MatchingVec>());
+#else  // __INTEL_PREVIEW_BREAKING_CHANGES
+    return sycl::bit_cast<typename MatchingVec::vector_t>(x);
+#endif // __INTEL_PREVIEW_BREAKING_CHANGES
+
 #else
     return x.template as<MatchingVec>();
 #endif
@@ -370,7 +388,13 @@ template <typename T> auto convertToOpenCLType(T &&x) {
     static_assert(sizeof(OpenCLType) == sizeof(T));
     return static_cast<OpenCLType>(x);
   } else if constexpr (is_bfloat16_v<no_ref>) {
+    // On host, don't interpret BF16 as uint16.
+#ifdef __SYCL_DEVICE_ONLY__
+    using OpenCLType = sycl::ext::oneapi::detail::Bfloat16StorageT;
+    return sycl::bit_cast<OpenCLType>(x);
+#else
     return std::forward<T>(x);
+#endif
   } else if constexpr (std::is_floating_point_v<no_ref>) {
     static_assert(std::is_same_v<no_ref, float> ||
                       std::is_same_v<no_ref, double>,
diff --git a/sycl/include/sycl/detail/group_sort_impl.hpp b/sycl/include/sycl/detail/group_sort_impl.hpp
index af060edbbdc4c..6974413492a7c 100644
--- a/sycl/include/sycl/detail/group_sort_impl.hpp
+++ b/sycl/include/sycl/detail/group_sort_impl.hpp
@@ -15,11 +15,43 @@
 #include <sycl/builtins.hpp>
 #include <sycl/group_algorithm.hpp>
 #include <sycl/group_barrier.hpp>
+#include <sycl/sycl_span.hpp>
+
+#include <memory>
 
 namespace sycl {
 inline namespace _V1 {
 namespace detail {
 
+// Helpers for sorting algorithms
+#ifdef __SYCL_DEVICE_ONLY__
+template <typename T, typename Group>
+static __SYCL_ALWAYS_INLINE T *align_scratch(sycl::span<std::byte> scratch,
+                                             Group g,
+                                             size_t number_of_elements) {
+  // Adjust the scratch pointer based on alignment of the type T.
+  // Per extension specification if scratch size is less than the value
+  // returned by memory_required then behavior is undefined, so we don't check
+  // that the scratch size satisfies the requirement.
+  T *scratch_begin = nullptr;
+  // We must have a barrier here before array placement new because it is
+  // possible that scratch memory is already in use, so we need to synchronize
+  // work items.
+  sycl::group_barrier(g);
+  if (g.leader()) {
+    void *scratch_ptr = scratch.data();
+    size_t space = scratch.size();
+    scratch_ptr = std::align(alignof(T), number_of_elements * sizeof(T),
+                             scratch_ptr, space);
+    scratch_begin = ::new (scratch_ptr) T[number_of_elements];
+  }
+  // Broadcast leader's pointer (the beginning of the scratch) to all work
+  // items in the group.
+  scratch_begin = sycl::group_broadcast(g, scratch_begin);
+  return scratch_begin;
+}
+#endif
+
 // ---- merge sort implementation
 
 // following two functions could be useless if std::[lower|upper]_bound worked
@@ -68,22 +100,10 @@ struct GetValueType<sycl::multi_ptr<ElementType, Space, IsDecorated>> {
   using type = ElementType;
 };
 
-// since we couldn't assign data to raw memory, it's better to use placement
-// for first assignment
-template <typename Acc, typename T>
-void set_value(Acc ptr, const size_t idx, const T &val, bool is_first) {
-  if (is_first) {
-    ::new (ptr + idx) T(val);
-  } else {
-    ptr[idx] = val;
-  }
-}
-
 template <typename InAcc, typename OutAcc, typename Compare>
 void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1,
            const size_t start_1, const size_t end_1, const size_t end_2,
-           const size_t start_out, Compare comp, const size_t chunk,
-           bool is_first) {
+           const size_t start_out, Compare comp, const size_t chunk) {
   const size_t start_2 = end_1;
   // Borders of the sequences to merge within this call
   const size_t local_start_1 =
@@ -111,8 +131,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1,
     const size_t l_shift_1 = local_start_1 - start_1;
     const size_t l_shift_2 = l_search_bound_2 - start_2;
 
-    set_value(out_acc1, start_out + l_shift_1 + l_shift_2, local_l_item_1,
-              is_first);
+    out_acc1[start_out + l_shift_1 + l_shift_2] = local_l_item_1;
 
     size_t r_search_bound_2{};
     // find right border in 2nd sequence
@@ -123,8 +142,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1,
       const auto r_shift_1 = local_end_1 - 1 - start_1;
       const auto r_shift_2 = r_search_bound_2 - start_2;
 
-      set_value(out_acc1, start_out + r_shift_1 + r_shift_2, local_r_item_1,
-                is_first);
+      out_acc1[start_out + r_shift_1 + r_shift_2] = local_r_item_1;
     }
 
     // Handle intermediate items
@@ -138,8 +156,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1,
       const size_t shift_1 = idx - start_1;
       const size_t shift_2 = l_search_bound_2 - start_2;
 
-      set_value(out_acc1, start_out + shift_1 + shift_2, intermediate_item_1,
-                is_first);
+      out_acc1[start_out + shift_1 + shift_2] = intermediate_item_1;
     }
   }
   // Process 2nd sequence
@@ -152,8 +169,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1,
     const size_t l_shift_1 = l_search_bound_1 - start_1;
     const size_t l_shift_2 = local_start_2 - start_2;
 
-    set_value(out_acc1, start_out + l_shift_1 + l_shift_2, local_l_item_2,
-              is_first);
+    out_acc1[start_out + l_shift_1 + l_shift_2] = local_l_item_2;
 
     size_t r_search_bound_1{};
     // find right border in 1st sequence
@@ -164,8 +180,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1,
       const size_t r_shift_1 = r_search_bound_1 - start_1;
       const size_t r_shift_2 = local_end_2 - 1 - start_2;
 
-      set_value(out_acc1, start_out + r_shift_1 + r_shift_2, local_r_item_2,
-                is_first);
+      out_acc1[start_out + r_shift_1 + r_shift_2] = local_r_item_2;
     }
 
     // Handle intermediate items
@@ -179,8 +194,7 @@ void merge(const size_t offset, InAcc &in_acc1, OutAcc &out_acc1,
       const size_t shift_1 = l_search_bound_1 - start_1;
       const size_t shift_2 = idx - start_2;
 
-      set_value(out_acc1, start_out + shift_1 + shift_2, intermediate_item_2,
-                is_first);
+      out_acc1[start_out + shift_1 + shift_2] = intermediate_item_2;
     }
   }
 }
@@ -200,10 +214,9 @@ void bubble_sort(Iter first, const size_t begin, const size_t end,
   }
 }
 
-template <typename Group, typename Iter, typename Compare>
+template <typename Group, typename Iter, typename T, typename Compare>
 void merge_sort(Group group, Iter first, const size_t n, Compare comp,
-                std::byte *scratch) {
-  using T = typename GetValueType<Iter>::type;
+                T *scratch) {
   const size_t idx = group.get_local_linear_id();
   const size_t local = group.get_local_range().size();
   const size_t chunk = (n - 1) / local + 1;
@@ -212,9 +225,7 @@ void merge_sort(Group group, Iter first, const size_t n, Compare comp,
   bubble_sort(first, idx * chunk, sycl::min((idx + 1) * chunk, n), comp);
   sycl::group_barrier(group);
 
-  T *temp = reinterpret_cast<T *>(scratch);
-  bool data_in_temp = false;
-  bool is_first = true;
+  bool data_in_scratch = false;
   size_t sorted_size = 1;
   while (sorted_size * chunk < n) {
     const size_t start_1 =
@@ -223,26 +234,24 @@ void merge_sort(Group group, Iter first, const size_t n, Compare comp,
     const size_t end_2 = sycl::min(end_1 + sorted_size * chunk, n);
     const size_t offset = chunk * (idx % sorted_size);
 
-    if (!data_in_temp) {
-      merge(offset, first, temp, start_1, end_1, end_2, start_1, comp, chunk,
-            is_first);
+    if (!data_in_scratch) {
+      merge(offset, first, scratch, start_1, end_1, end_2, start_1, comp,
+            chunk);
     } else {
-      merge(offset, temp, first, start_1, end_1, end_2, start_1, comp, chunk,
-            /*is_first*/ false);
+      merge(offset, scratch, first, start_1, end_1, end_2, start_1, comp,
+            chunk);
     }
     sycl::group_barrier(group);
 
-    data_in_temp = !data_in_temp;
+    data_in_scratch = !data_in_scratch;
     sorted_size *= 2;
-    if (is_first)
-      is_first = false;
   }
 
   // copy back if data is in a temporary storage
-  if (data_in_temp) {
+  if (data_in_scratch) {
     for (size_t i = 0; i < chunk; ++i) {
       if (idx * chunk + i < n) {
-        first[idx * chunk + i] = temp[idx * chunk + i];
+        first[idx * chunk + i] = scratch[idx * chunk + i];
       }
     }
     sycl::group_barrier(group);
@@ -601,7 +610,7 @@ template <size_t items_per_work_item, uint32_t radix_bits, bool is_comp_asc,
           typename ValsT, typename GroupT>
 void performRadixIterStaticSize(GroupT group, const uint32_t radix_iter,
                                 const uint32_t last_iter, KeysT *keys,
-                                ValsT vals, const ScratchMemory &memory) {
+                                ValsT *vals, const ScratchMemory &memory) {
   const uint32_t radix_states = getStatesInBits(radix_bits);
   const size_t wgsize = group.get_local_linear_range();
   const size_t idx = group.get_local_linear_id();
diff --git a/sycl/include/sycl/detail/pi.def b/sycl/include/sycl/detail/pi.def
index 66efb7c750ebe..995579d612afb 100644
--- a/sycl/include/sycl/detail/pi.def
+++ b/sycl/include/sycl/detail/pi.def
@@ -206,9 +206,11 @@ _PI_API(piextMemMipmapFree)
 
 // Interop
 _PI_API(piextMemImportOpaqueFD)
+_PI_API(piextImportExternalMemory)
 _PI_API(piextMemReleaseInterop)
 _PI_API(piextMemMapExternalArray)
 _PI_API(piextImportExternalSemaphoreOpaqueFD)
+_PI_API(piextImportExternalSemaphore)
 _PI_API(piextDestroyExternalSemaphore)
 _PI_API(piextWaitExternalSemaphore)
 _PI_API(piextSignalExternalSemaphore)
diff --git a/sycl/include/sycl/detail/pi.h b/sycl/include/sycl/detail/pi.h
index 6f6821360207a..f4e67f7ba6113 100644
--- a/sycl/include/sycl/detail/pi.h
+++ b/sycl/include/sycl/detail/pi.h
@@ -176,9 +176,24 @@
 // piextMemSampledImageCreate
 // 15.52 Added piEnqueueTimestampRecordingExp and
 //       PI_EXT_ONEAPI_DEVICE_INFO_TIMESTAMP_RECORDING_SUPPORT.
+// 15.53 Added new extension functions that enable importing various external
+//       handle types:
+//         - piextImportExternalMemory
+//         - piextImportExternalSemaphore
+//       Deprecated no longer necessary functions:
+//         - piextImportExternalSemaphoreOpaqueFD
+//         - piextMemImportOpaqueFD
+//       The following interop semaphore related functions now take extra
+//       `bool` and `pi_uint64` values:
+//         - `piextWaitExternalSemaphore`
+//         - `piextSignalExternalSemaphore`
+//       The `pi_external_mem_handle_type` enum now has a new
+//       `win32_nt_dx12_resource` value.
+//       The `pi_external_semaphore_handle_type` enum now has a new
+//       `win32_nt_dx12_fence` value.
 
 #define _PI_H_VERSION_MAJOR 15
-#define _PI_H_VERSION_MINOR 52
+#define _PI_H_VERSION_MINOR 53
 
 #define _PI_STRING_HELPER(a) #a
 #define _PI_CONCAT(a, b) _PI_STRING_HELPER(a.b)
@@ -1161,6 +1176,60 @@ struct pi_device_binaries_struct {
 };
 using pi_device_binaries = pi_device_binaries_struct *;
 
+// This union encapsulates the two external handles we currently support.
+// When choosing the correct field from the union we need to look at the value
+// of the enum `pi_external_mem_handle_type` or
+// `pi_external_semaphore_handle_type`.
+union pi_external_handle {
+  // Used universally for all Linux based interoperability functionality.
+  // The associated enum `pi_external_mem_handle_type` in
+  // `pi_external_mem_descriptor` should always be set to
+  // `pi_external_mem_handle_type::opaque_fd`. Likewise for semaphore handles.
+  int file_descriptor;
+
+  // Could be Win32 NT, KMT, or various DX12 handle types.
+  // The `void *` type is used for all of these.
+  // The exact handle type depends on the enum `pi_external_mem_handle_type`.
+  // This enum is found in `pi_external_mem_descriptor`.
+  // It could be a regular NT handle type (`win32_nt_handle`) or a DX12 specific
+  // resource handle type (`win32_nt_dx12_resource`), etc.
+  void *win32_handle;
+};
+
+// This enum enumerates the specific external memory handles types that we want
+// to import.
+enum class pi_external_mem_handle_type {
+  opaque_fd = 0,
+  win32_nt_handle = 1,
+  win32_nt_dx12_resource = 2,
+};
+
+// This struct holds all the information required to import external memory.
+struct pi_external_mem_descriptor {
+  // The type of the external memory handle.
+  pi_external_mem_handle_type handleType;
+  // Union encapsulates both Opaque FD (linux) and Win32 handles (Windows).
+  pi_external_handle handle;
+  // Size of the external memory in bytes.
+  size_t memorySizeBytes;
+};
+
+// This enum enumerates the specific external semaphore handles types that we
+// want to import.
+enum class pi_external_semaphore_handle_type {
+  opaque_fd = 0,
+  win32_nt_handle = 1,
+  win32_nt_dx12_fence = 2,
+};
+
+// This struct holds all the information required to import external semaphores.
+struct pi_external_semaphore_descriptor {
+  // The type of the external semaphore handle.
+  pi_external_semaphore_handle_type handleType;
+  // Union encapsulates both Opaque FD (linux) and Win32 handles (Windows).
+  pi_external_handle handle;
+};
+
 // Opaque types that make reading build log errors easier.
 struct _pi_platform;
 struct _pi_device;
@@ -2856,6 +2925,9 @@ __SYCL_EXPORT pi_result piextMemImageGetInfo(
     const pi_image_mem_handle mem_handle, pi_image_info param_name,
     void *param_value, size_t *param_value_size_ret);
 
+/// [DEPRECATED] This function is deprecated in favor of
+/// `piextImportExternalMemory`
+///
 /// API to import external memory in the form of a file descriptor.
 ///
 /// \param context is the pi_context
@@ -2864,9 +2936,23 @@ __SYCL_EXPORT pi_result piextMemImageGetInfo(
 /// \param file_descriptor is the file descriptor
 /// \param ret_handle is the returned interop memory handle to the external
 /// memory
+__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of "
+                         "`piextImportExternalMemory`")
+pi_result piextMemImportOpaqueFD(pi_context context, pi_device device,
+                                 size_t size, int file_descriptor,
+                                 pi_interop_mem_handle *ret_handle);
+
+/// API to import external memory
+///
+/// \param context is the pi_context
+/// \param device is the pi_device
+/// \param mem_descriptor is the interop memory descriptor
+/// \param ret_handle is the returned interop memory handle to the external
+/// memory
 __SYCL_EXPORT pi_result
-piextMemImportOpaqueFD(pi_context context, pi_device device, size_t size,
-                       int file_descriptor, pi_interop_mem_handle *ret_handle);
+piextImportExternalMemory(pi_context context, pi_device device,
+                          pi_external_mem_descriptor *mem_descriptor,
+                          pi_interop_mem_handle *ret_handle);
 
 /// API to map an interop memory handle to an image memory handle.
 ///
@@ -2890,6 +2976,9 @@ __SYCL_EXPORT pi_result piextMemMapExternalArray(
 __SYCL_EXPORT pi_result piextMemReleaseInterop(
     pi_context context, pi_device device, pi_interop_mem_handle memory_handle);
 
+/// [DEPRECATED] This function is deprecated in favor of
+/// `piextImportExternalSemaphore`
+///
 /// API to import an external semaphore in the form of a file descriptor.
 ///
 /// \param context is the pi_context
@@ -2897,9 +2986,24 @@ __SYCL_EXPORT pi_result piextMemReleaseInterop(
 /// \param file_descriptor is the file descriptor
 /// \param ret_handle is the returned interop semaphore handle to the external
 /// semaphore
-__SYCL_EXPORT pi_result piextImportExternalSemaphoreOpaqueFD(
-    pi_context context, pi_device device, int file_descriptor,
-    pi_interop_semaphore_handle *ret_handle);
+__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of "
+                         "`piextImportExternalSemaphore`")
+pi_result
+piextImportExternalSemaphoreOpaqueFD(pi_context context, pi_device device,
+                                     int file_descriptor,
+                                     pi_interop_semaphore_handle *ret_handle);
+
+/// API to import an external semaphore
+///
+/// \param context is the pi_context
+/// \param device is the pi_device
+/// \param sem_descriptor is the interop semaphore descriptor
+/// \param ret_handle is the returned interop semaphore handle to the external
+/// semaphore
+__SYCL_EXPORT pi_result
+piextImportExternalSemaphore(pi_context context, pi_device device,
+                             pi_external_semaphore_descriptor *sem_descriptor,
+                             pi_interop_semaphore_handle *ret_handle);
 
 /// API to destroy the external semaphore handle.
 ///
@@ -2915,12 +3019,20 @@ piextDestroyExternalSemaphore(pi_context context, pi_device device,
 ///
 /// \param command_queue is the queue instructed to wait
 /// \param sem_handle is the interop semaphore handle
+/// \param has_wait_value indicates whether the semaphore is capable of setting
+///                       user defined state passed through `wait_value`.
+///                       Otherwise `wait_value` is ignored.
+/// \param wait_value is the user defined value of the semaphore state for
+///                   which this operation will wait upon, provided the
+///                   semaphore type has this capability, and
+///                   `has_wait_value` is `true`.
 /// \param num_events_in_wait_list is the number of events in the wait list
 /// \param event_wait_list is the list of events to wait on before this
 /// operation
 /// \param event is the returned event representing this operation
 __SYCL_EXPORT pi_result piextWaitExternalSemaphore(
     pi_queue command_queue, pi_interop_semaphore_handle sem_handle,
+    bool has_wait_value, pi_uint64 wait_value,
     pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
     pi_event *event);
 
@@ -2929,12 +3041,19 @@ __SYCL_EXPORT pi_result piextWaitExternalSemaphore(
 ///
 /// \param command_queue is the queue instructed to signal
 /// \param sem_handle is the interop semaphore handle to signal
+/// \param has_signal_value indicates whether the semaphore is capable of
+///                         setting user defined state passed through
+///                         `signal_value`. Otherwise `signal_value` is ignored.
+/// \param signal_value is the user defined value to which the state of the
+///                     semaphore will be set, provided the semaphore type has
+///                     this capability, and `has_signal_value` is `true`.
 /// \param num_events_in_wait_list is the number of events in the wait list
 /// \param event_wait_list is the list of events to wait on before this
 /// operation
 /// \param event is the returned event representing this operation
 __SYCL_EXPORT pi_result piextSignalExternalSemaphore(
     pi_queue command_queue, pi_interop_semaphore_handle sem_handle,
+    bool has_signal_value, pi_uint64 signal_value,
     pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
     pi_event *event);
 
diff --git a/sycl/include/sycl/detail/pi.hpp b/sycl/include/sycl/detail/pi.hpp
index 9442d6f2a86bc..0a6713dab1096 100644
--- a/sycl/include/sycl/detail/pi.hpp
+++ b/sycl/include/sycl/detail/pi.hpp
@@ -146,6 +146,8 @@ using PiImageMemHandle = ::pi_image_mem_handle;
 using PiImageCopyFlags = ::pi_image_copy_flags;
 using PiInteropMemHandle = ::pi_interop_mem_handle;
 using PiInteropSemaphoreHandle = ::pi_interop_semaphore_handle;
+using PiExternalMemDescriptor = ::pi_external_mem_descriptor;
+using PiExternalSemaphoreDescriptor = ::pi_external_semaphore_descriptor;
 using PiImageOffset = ::pi_image_offset_struct;
 using PiImageRegion = ::pi_image_region_struct;
 
diff --git a/sycl/include/sycl/detail/vector_arith.hpp b/sycl/include/sycl/detail/vector_arith.hpp
new file mode 100644
index 0000000000000..7a2bce152c1d3
--- /dev/null
+++ b/sycl/include/sycl/detail/vector_arith.hpp
@@ -0,0 +1,394 @@
+//=== vector_arith.hpp --- Implementation of arithmetic ops on sycl::vec  ===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <sycl/aliases.hpp>                    // for half, cl_char, cl_int
+#include <sycl/detail/generic_type_traits.hpp> // for is_sigeninteger, is_s...
+#include <sycl/detail/type_list.hpp>           // for is_contained
+#include <sycl/detail/type_traits.hpp>         // for is_floating_point
+
+#include <sycl/ext/oneapi/bfloat16.hpp> // bfloat16
+
+#include <cstddef>
+#include <type_traits> // for enable_if_t, is_same
+
+namespace sycl {
+inline namespace _V1 {
+
+template <typename DataT, int NumElem> class vec;
+
+namespace detail {
+
+template <typename VecT> class VecAccess;
+
+// Macros to populate binary operation on sycl::vec.
+#if defined(__SYCL_BINOP) || defined(BINOP_BASE)
+#error "Undefine __SYCL_BINOP and BINOP_BASE macro"
+#endif
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define BINOP_BASE(BINOP, OPASSIGN, CONVERT, COND)                             \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs,     \
+                                                        const vec_t & Rhs) {   \
+    vec_t Ret;                                                                 \
+    if constexpr (vec_t::IsBfloat16) {                                         \
+      for (size_t I = 0; I < NumElements; ++I) {                               \
+        Ret[I] = Lhs[I] BINOP Rhs[I];                                          \
+      }                                                                        \
+    } else {                                                                   \
+      auto ExtVecLhs = sycl::bit_cast<typename vec_t::vector_t>(Lhs);          \
+      auto ExtVecRhs = sycl::bit_cast<typename vec_t::vector_t>(Rhs);          \
+      Ret = vec<DataT, NumElements>(ExtVecLhs BINOP ExtVecRhs);                \
+      if constexpr (std::is_same_v<DataT, bool> && CONVERT) {                  \
+        vec_arith_common<bool, NumElements>::ConvertToDataT(Ret);              \
+      }                                                                        \
+    }                                                                          \
+    return Ret;                                                                \
+  }
+#else // __SYCL_DEVICE_ONLY__
+
+#define BINOP_BASE(BINOP, OPASSIGN, CONVERT, COND)                             \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs,     \
+                                                        const vec_t & Rhs) {   \
+    vec_t Ret{};                                                               \
+    for (size_t I = 0; I < NumElements; ++I) {                                 \
+      Ret[I] = Lhs[I] BINOP Rhs[I];                                            \
+    }                                                                          \
+    return Ret;                                                                \
+  }
+#endif // __SYCL_DEVICE_ONLY__
+
+#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT, COND)                           \
+  BINOP_BASE(BINOP, OPASSIGN, CONVERT, COND)                                   \
+                                                                               \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_t> operator BINOP(const vec_t & Lhs,     \
+                                                        const DataT & Rhs) {   \
+    return Lhs BINOP vec_t(Rhs);                                               \
+  }                                                                            \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_t> operator BINOP(const DataT & Lhs,     \
+                                                        const vec_t & Rhs) {   \
+    return vec_t(Lhs) BINOP Rhs;                                               \
+  }                                                                            \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_t> &operator OPASSIGN(                   \
+      vec_t & Lhs, const vec_t & Rhs) {                                        \
+    Lhs = Lhs BINOP Rhs;                                                       \
+    return Lhs;                                                                \
+  }                                                                            \
+  template <int Num = NumElements, typename T = DataT>                         \
+  friend std::enable_if_t<(Num != 1) && (COND), vec_t &> operator OPASSIGN(    \
+      vec_t & Lhs, const DataT & Rhs) {                                        \
+    Lhs = Lhs BINOP vec_t(Rhs);                                                \
+    return Lhs;                                                                \
+  }
+
+/****************************************************************
+ *                       vec_arith_common
+ *                 /           |             \
+ *                /            |               \
+ *     vec_arith<int>     vec_arith<float> ...   vec_arith<byte>
+ *                \            |               /
+ *                 \           |              /
+ *                        sycl::vec<T>
+ *
+ * vec_arith_common is the base class for vec_arith. It contains
+ * the common math operators of sycl::vec for all types.
+ * vec_arith is the derived class that contains the math operators
+ * specialized for certain types. sycl::vec inherits from vec_arith.
+ * *************************************************************/
+template <typename DataT, int NumElements> class vec_arith_common;
+template <typename DataT> struct vec_helper;
+
+template <typename DataT, int NumElements>
+class vec_arith : public vec_arith_common<DataT, NumElements> {
+protected:
+  using vec_t = vec<DataT, NumElements>;
+  using ocl_t = detail::select_cl_scalar_integral_signed_t<DataT>;
+  template <typename T> using vec_data = vec_helper<T>;
+
+  // operator!.
+  friend vec<ocl_t, NumElements> operator!(const vec_t &Rhs) {
+#ifdef __SYCL_DEVICE_ONLY__
+    if constexpr (!vec_t::IsBfloat16) {
+      auto extVec = sycl::bit_cast<typename vec_t::vector_t>(Rhs);
+      vec<ocl_t, NumElements> Ret{
+          (typename vec<ocl_t, NumElements>::vector_t) !extVec};
+      return Ret;
+    } else
+#endif // __SYCL_DEVICE_ONLY__
+    {
+      vec<ocl_t, NumElements> Ret{};
+      for (size_t I = 0; I < NumElements; ++I) {
+        // static_cast is safe here: operator! yields 0 or 1, so the negated
+        // value is either 0 or -1.
+        Ret[I] = static_cast<ocl_t>(-1 * (!Rhs[I]));
+      }
+      return Ret;
+    }
+  }
+
+  // operator +.
+  friend vec_t operator+(const vec_t &Lhs) {
+#ifdef __SYCL_DEVICE_ONLY__
+    auto extVec = sycl::bit_cast<typename vec_t::vector_t>(Lhs);
+    return vec_t{+extVec};
+#else
+    vec_t Ret{};
+    for (size_t I = 0; I < NumElements; ++I)
+      Ret[I] = +Lhs[I];
+    return Ret;
+#endif
+  }
+
+  // operator -.
+  friend vec_t operator-(const vec_t &Lhs) {
+    vec_t Ret{};
+    if constexpr (vec_t::IsBfloat16) {
+      for (size_t I = 0; I < NumElements; I++)
+        Ret[I] = -Lhs[I];
+    } else {
+#ifndef __SYCL_DEVICE_ONLY__
+      for (size_t I = 0; I < NumElements; ++I)
+        Ret[I] = -Lhs[I];
+#else
+      auto extVec = sycl::bit_cast<typename vec_t::vector_t>(Lhs);
+      Ret = vec_t{-extVec};
+      if constexpr (std::is_same_v<DataT, bool>) {
+        vec_arith_common<bool, NumElements>::ConvertToDataT(Ret);
+      }
+#endif
+    }
+    return Ret;
+  }
+
+// Unary operations on sycl::vec
+// FIXME: Don't allow Unary operators on vec<bool> after
+// https://github.com/KhronosGroup/SYCL-CTS/issues/896 gets fixed.
+#ifdef __SYCL_UOP
+#error "Undefine __SYCL_UOP macro"
+#endif
+#define __SYCL_UOP(UOP, OPASSIGN)                                              \
+  friend vec_t &operator UOP(vec_t & Rhs) {                                    \
+    Rhs OPASSIGN DataT{1};                                                     \
+    return Rhs;                                                                \
+  }                                                                            \
+  friend vec_t operator UOP(vec_t &Lhs, int) {                                 \
+    vec_t Ret(Lhs);                                                            \
+    Lhs OPASSIGN DataT{1};                                                     \
+    return Ret;                                                                \
+  }
+
+  __SYCL_UOP(++, +=)
+  __SYCL_UOP(--, -=)
+#undef __SYCL_UOP
+
+  // Logical operations on scalar types result in 0/1, while for vec<>,
+  // logical operations should result in 0 and -1 (similar to OpenCL vectors).
+  // That's why, for vec<DataT, 1>, we need to negate the result of the logical
+  // operations since we store vec<DataT, 1> as scalar type on the device.
+#if defined(__SYCL_RELLOGOP) || defined(RELLOGOP_BASE)
+#error "Undefine __SYCL_RELLOGOP and RELLOGOP_BASE macro."
+#endif
+
+#ifdef __SYCL_DEVICE_ONLY__
+#define RELLOGOP_BASE(RELLOGOP, COND)                                          \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec<ocl_t, NumElements>> operator RELLOGOP(  \
+      const vec_t & Lhs, const vec_t & Rhs) {                                  \
+    vec<ocl_t, NumElements> Ret{};                                             \
+    /* ext_vector_type does not support bfloat16, so for these   */            \
+    /* we do element-by-element operation on the underlying std::array.  */    \
+    if constexpr (vec_t::IsBfloat16) {                                         \
+      for (size_t I = 0; I < NumElements; ++I) {                               \
+        Ret[I] = static_cast<ocl_t>(-(Lhs[I] RELLOGOP Rhs[I]));                \
+      }                                                                        \
+    } else {                                                                   \
+      auto ExtVecLhs = sycl::bit_cast<typename vec_t::vector_t>(Lhs);          \
+      auto ExtVecRhs = sycl::bit_cast<typename vec_t::vector_t>(Rhs);          \
+      /* Cast required to convert unsigned char ext_vec_type to */             \
+      /* char ext_vec_type. */                                                 \
+      Ret = vec<ocl_t, NumElements>(                                           \
+          (typename vec<ocl_t, NumElements>::vector_t)(                        \
+              ExtVecLhs RELLOGOP ExtVecRhs));                                  \
+      /* For NumElements == 1, we use scalar instead of ext_vector_type. */    \
+      if constexpr (NumElements == 1) {                                        \
+        Ret *= -1;                                                             \
+      }                                                                        \
+    }                                                                          \
+    return Ret;                                                                \
+  }
+#else // __SYCL_DEVICE_ONLY__
+#define RELLOGOP_BASE(RELLOGOP, COND)                                          \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec<ocl_t, NumElements>> operator RELLOGOP(  \
+      const vec_t & Lhs, const vec_t & Rhs) {                                  \
+    vec<ocl_t, NumElements> Ret{};                                             \
+    for (size_t I = 0; I < NumElements; ++I) {                                 \
+      Ret[I] = static_cast<ocl_t>(-(Lhs[I] RELLOGOP Rhs[I]));                  \
+    }                                                                          \
+    return Ret;                                                                \
+  }
+#endif
+
+#define __SYCL_RELLOGOP(RELLOGOP, COND)                                        \
+  RELLOGOP_BASE(RELLOGOP, COND)                                                \
+                                                                               \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec<ocl_t, NumElements>> operator RELLOGOP(  \
+      const vec_t & Lhs, const DataT & Rhs) {                                  \
+    return Lhs RELLOGOP vec_t(Rhs);                                            \
+  }                                                                            \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec<ocl_t, NumElements>> operator RELLOGOP(  \
+      const DataT & Lhs, const vec_t & Rhs) {                                  \
+    return vec_t(Lhs) RELLOGOP Rhs;                                            \
+  }
+
+  // OP is: ==, !=, <, >, <=, >=, &&, ||
+  // vec<RET, NumElements> operatorOP(const vec<DataT, NumElements> &Rhs) const;
+  // vec<RET, NumElements> operatorOP(const DataT &Rhs) const;
+  __SYCL_RELLOGOP(==, true)
+  __SYCL_RELLOGOP(!=, true)
+  __SYCL_RELLOGOP(>, true)
+  __SYCL_RELLOGOP(<, true)
+  __SYCL_RELLOGOP(>=, true)
+  __SYCL_RELLOGOP(<=, true)
+
+  // Only available to integral types.
+  __SYCL_RELLOGOP(&&, (!detail::is_vgenfloat_v<T>))
+  __SYCL_RELLOGOP(||, (!detail::is_vgenfloat_v<T>))
+#undef __SYCL_RELLOGOP
+#undef RELLOGOP_BASE
+
+  // Binary operations on sycl::vec<> for all types except std::byte.
+  __SYCL_BINOP(+, +=, true, true)
+  __SYCL_BINOP(-, -=, true, true)
+  __SYCL_BINOP(*, *=, false, true)
+  __SYCL_BINOP(/, /=, false, true)
+
+  // The following OPs are available only when: DataT != cl_float &&
+  // DataT != cl_double && DataT != cl_half && DataT != BF16.
+  __SYCL_BINOP(%, %=, false, (!detail::is_vgenfloat_v<T>))
+  // Bitwise operations are allowed for std::byte.
+  __SYCL_BINOP(|, |=, false, (!detail::is_vgenfloat_v<DataT>))
+  __SYCL_BINOP(&, &=, false, (!detail::is_vgenfloat_v<DataT>))
+  __SYCL_BINOP(^, ^=, false, (!detail::is_vgenfloat_v<DataT>))
+  __SYCL_BINOP(>>, >>=, false, (!detail::is_vgenfloat_v<DataT>))
+  __SYCL_BINOP(<<, <<=, true, (!detail::is_vgenfloat_v<DataT>))
+
+  // friends
+  template <typename T1, int T2> friend class vec;
+}; // class vec_arith<>
+
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+template <int NumElements>
+class vec_arith<std::byte, NumElements>
+    : public vec_arith_common<std::byte, NumElements> {
+protected:
+  // NumElements can never be zero. Still using the redundant check to avoid
+  // incomplete type errors.
+  using DataT = typename std::conditional_t<NumElements == 0, int, std::byte>;
+  using vec_t = vec<DataT, NumElements>;
+  template <typename T> using vec_data = vec_helper<T>;
+
+  // Special <<, >> operators for std::byte.
+  // std::byte is not an arithmetic type and it only supports the following
+  // overloads of >> and << operators.
+  //
+  // 1 template <class IntegerType>
+  //   constexpr std::byte operator<<( std::byte b, IntegerType shift )
+  //   noexcept;
+  friend vec_t operator<<(const vec_t &Lhs, int shift) {
+    vec_t Ret;
+    for (size_t I = 0; I < NumElements; ++I) {
+      Ret[I] = Lhs[I] << shift;
+    }
+    return Ret;
+  }
+  friend vec_t &operator<<=(vec_t &Lhs, int shift) {
+    Lhs = Lhs << shift;
+    return Lhs;
+  }
+
+  // 2 template <class IntegerType>
+  //   constexpr std::byte operator>>( std::byte b, IntegerType shift )
+  //   noexcept;
+  friend vec_t operator>>(const vec_t &Lhs, int shift) {
+    vec_t Ret;
+    for (size_t I = 0; I < NumElements; ++I) {
+      Ret[I] = Lhs[I] >> shift;
+    }
+    return Ret;
+  }
+  friend vec_t &operator>>=(vec_t &Lhs, int shift) {
+    Lhs = Lhs >> shift;
+    return Lhs;
+  }
+
+  __SYCL_BINOP(|, |=, false, true)
+  __SYCL_BINOP(&, &=, false, true)
+  __SYCL_BINOP(^, ^=, false, true)
+
+  // friends
+  template <typename T1, int T2> friend class vec;
+};
+#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+
+template <typename DataT, int NumElements> class vec_arith_common {
+protected:
+  using vec_t = vec<DataT, NumElements>;
+
+  static constexpr bool IsBfloat16 =
+      std::is_same_v<DataT, sycl::ext::oneapi::bfloat16>;
+
+  // operator~() available only when: dataT != float && dataT != double
+  // && dataT != half
+  template <typename T = DataT>
+  friend std::enable_if_t<!detail::is_vgenfloat_v<T>, vec_t>
+  operator~(const vec_t &Rhs) {
+#ifdef __SYCL_DEVICE_ONLY__
+    auto extVec = sycl::bit_cast<typename vec_t::vector_t>(Rhs);
+    vec_t Ret{~extVec};
+    if constexpr (std::is_same_v<DataT, bool>) {
+      ConvertToDataT(Ret);
+    }
+    return Ret;
+#else
+    vec_t Ret{};
+    for (size_t I = 0; I < NumElements; ++I) {
+      Ret[I] = ~Rhs[I];
+    }
+    return Ret;
+#endif
+  }
+
+#ifdef __SYCL_DEVICE_ONLY__
+  using vec_bool_t = vec<bool, NumElements>;
+  // Required only for bool.
+  static void ConvertToDataT(vec_bool_t &Ret) {
+    for (size_t I = 0; I < NumElements; ++I) {
+      Ret[I] = bit_cast<int8_t>(Ret[I]) != 0;
+    }
+  }
+#endif
+
+  // friends
+  template <typename T1, int T2> friend class vec;
+};
+
+#undef __SYCL_BINOP
+#undef BINOP_BASE
+
+} // namespace detail
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/include/sycl/detail/vector_convert.hpp b/sycl/include/sycl/detail/vector_convert.hpp
index c018fce5bcfa3..e459c59f79202 100644
--- a/sycl/include/sycl/detail/vector_convert.hpp
+++ b/sycl/include/sycl/detail/vector_convert.hpp
@@ -57,12 +57,100 @@
 #include <sycl/detail/generic_type_traits.hpp> // for is_sigeninteger, is_s...
 #include <sycl/exception.hpp>                  // for errc
 
+#include <sycl/ext/oneapi/bfloat16.hpp> // bfloat16
+
 #ifndef __SYCL_DEVICE_ONLY__
 #include <cfenv> // for fesetround, fegetround
 #endif
 
 #include <type_traits>
 
+// Enable only on Intel devices.
+#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__))
+extern "C" {
+// For converting BF16 to other types.
+extern __DPCPP_SYCL_EXTERNAL float __imf_bfloat162float(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_rd(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_rn(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_ru(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned int __imf_bfloat162uint_rz(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned short
+__imf_bfloat162ushort_rd(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned short
+__imf_bfloat162ushort_rn(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned short
+__imf_bfloat162ushort_ru(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned short
+__imf_bfloat162ushort_rz(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned long long
+__imf_bfloat162ull_rd(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned long long
+__imf_bfloat162ull_rn(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned long long
+__imf_bfloat162ull_ru(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned long long
+__imf_bfloat162ull_rz(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_rd(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_rn(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_ru(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL int __imf_bfloat162int_rz(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_rd(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_rn(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_ru(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat162short_rz(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_rd(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_rn(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_ru(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL long long __imf_bfloat162ll_rz(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL short __imf_bfloat16_as_short(uint16_t x);
+extern __DPCPP_SYCL_EXTERNAL unsigned short
+__imf_bfloat16_as_ushort(uint16_t x);
+
+// For converting other types to BF16.
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16(float x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rd(float x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rn(float x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_ru(float x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_float2bfloat16_rz(float x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t
+__imf_ushort2bfloat16_rd(unsigned short x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t
+__imf_ushort2bfloat16_rn(unsigned short x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t
+__imf_ushort2bfloat16_ru(unsigned short x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t
+__imf_ushort2bfloat16_rz(unsigned short x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_rd(unsigned int x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_rn(unsigned int x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_ru(unsigned int x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_uint2bfloat16_rz(unsigned int x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t
+__imf_ull2bfloat16_rd(unsigned long long x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t
+__imf_ull2bfloat16_rn(unsigned long long x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t
+__imf_ull2bfloat16_ru(unsigned long long x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t
+__imf_ull2bfloat16_rz(unsigned long long x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_rd(short x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_rn(short x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_ru(short x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short2bfloat16_rz(short x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_rd(int x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_rn(int x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_ru(int x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_int2bfloat16_rz(int x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_rd(long long x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_rn(long long x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_ru(long long x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_ll2bfloat16_rz(long long x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_double2bfloat16(double x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t __imf_short_as_bfloat16(short x);
+extern __DPCPP_SYCL_EXTERNAL uint16_t
+__imf_ushort_as_bfloat16(unsigned short x);
+}
+#endif // __SYCL_DEVICE_ONLY__ && (defined(__SPIR__) || defined(__SPIRV__))
+
 namespace sycl {
 
 enum class rounding_mode { automatic = 0, rte = 1, rtz = 2, rtp = 3, rtn = 4 };
@@ -81,6 +169,10 @@ inline double trunc(double);
 #endif
 namespace detail {
 
+template <typename FromT, typename ToT, sycl::rounding_mode RoundingMode,
+          int VecSize, typename NativeFromT, typename NativeToT>
+NativeToT convertImpl(NativeFromT);
+
 template <typename T, typename R>
 using is_sint_to_sint =
     std::bool_constant<is_sigeninteger_v<T> && is_sigeninteger_v<R>>;
@@ -123,6 +215,8 @@ using is_float_to_float =
     std::bool_constant<detail::is_floating_point<T>::value &&
                        detail::is_floating_point<R>::value>;
 
+using bfloat16 = sycl::ext::oneapi::bfloat16;
+
 #ifndef __SYCL_DEVICE_ONLY__
 template <typename From, typename To, int VecSize,
           typename Enable = std::enable_if_t<VecSize == 1>>
@@ -196,8 +290,29 @@ template <typename From, typename To, int VecSize,
 To ConvertFToU(From Value) {
   return ConvertFToS<From, To, VecSize, Enable, roundingMode>(Value);
 }
-#else
 
+template <typename NativeToT, sycl::rounding_mode RoundingMode>
+inline NativeToT ConvertFromBF16Scalar(bfloat16 val) {
+  // On host, NativeBF16T is bfloat16. Convert BF16 to float losslessly.
+  float fval = static_cast<float>(val);
+
+  if constexpr (std::is_same_v<NativeToT, float>)
+    return fval;
+  else
+    // Convert float to the desired type.
+    return convertImpl<float, NativeToT, RoundingMode, 1, float, NativeToT>(
+        fval);
+}
+
+template <typename NativeFromT, sycl::rounding_mode RoundingMode>
+bfloat16 ConvertToBF16Scalar(NativeFromT val) {
+
+  constexpr int rm = static_cast<int>(RoundingMode);
+  return sycl::ext::oneapi::detail::ConvertToBfloat16::
+      getBfloat16WithRoundingMode<NativeFromT, rm>(val);
+}
+
+#else
 // Bunch of helpers to "specialize" each template for its own destination type
 // and vector size.
 
@@ -498,8 +613,188 @@ __SYCL_FLOAT_FLOAT_CONVERT_FOR_TYPE(double)
 #undef __SYCL_FLOAT_FLOAT_CONVERT
 #undef __SYCL_FLOAT_FLOAT_CONVERT_FOR_TYPE
 
+template <typename NativeBFT, typename NativeFloatT, int VecSize>
+inline NativeFloatT ConvertBF16ToFVec(NativeBFT vec) {
+  bfloat16 *src = sycl::bit_cast<bfloat16 *>(&vec);
+
+  // OpenCL vector of 3 elements is aligned to 4 multiplied by
+  // the size of data type.
+  constexpr int AdjustedSize = (VecSize == 3) ? 4 : VecSize;
+  float dst[AdjustedSize];
+  sycl::ext::oneapi::detail::BF16VecToFloatVec<VecSize>(src, dst);
+
+  return sycl::bit_cast<NativeFloatT>(dst);
+}
+
+template <typename NativeFloatT, typename NativeBFT, int VecSize>
+inline NativeBFT ConvertFToBF16Vec(NativeFloatT vec) {
+  float *src = sycl::bit_cast<float *>(&vec);
+
+  // OpenCL vector of 3 elements is aligned to 4 multiplied by
+  // the size of data type.
+  constexpr int AdjustedSize = (VecSize == 3) ? 4 : VecSize;
+  bfloat16 dst[AdjustedSize];
+
+  sycl::ext::oneapi::detail::FloatVecToBF16Vec<VecSize>(src, dst);
+  return sycl::bit_cast<NativeBFT>(dst);
+}
+
+/* Emit __imf_* funcs only on Intel hardware.  */
+#if defined(__SPIR__) || defined(__SPIRV__)
+#define EXPAND_BF16_ROUNDING_MODE(type, type_str, rmode, rmode_str)            \
+  template <typename NativeToT, sycl::rounding_mode RoundingMode>              \
+  std::enable_if_t<(std::is_same_v<NativeToT, type> && RoundingMode == rmode), \
+                   NativeToT>                                                  \
+  ConvertFromBF16Scalar(uint16_t val) {                                        \
+    return __imf_bfloat162##type_str##_##rmode_str(val);                       \
+  }                                                                            \
+  template <typename NativeFromT, sycl::rounding_mode RoundingMode>            \
+  std::enable_if_t<                                                            \
+      (std::is_same_v<NativeFromT, type> && RoundingMode == rmode), uint16_t>  \
+  ConvertToBF16Scalar(NativeFromT val) {                                       \
+    return __imf_##type_str##2bfloat16_##rmode_str(val);                       \
+  }
+
+#else // __SYCL_DEVICE_ONLY__ && (defined(__SPIR__) || defined(__SPIRV__))
+// Without the __imf_* builtins (non-Intel HW): widen BF16 to float, which is
+// lossless, then convert float to the target type with the rounding mode.
+#define EXPAND_BF16_ROUNDING_MODE(type, type_str, rmode, rmode_str)            \
+  template <typename NativeToT, sycl::rounding_mode RoundingMode>              \
+  std::enable_if_t<(std::is_same_v<NativeToT, type> && RoundingMode == rmode), \
+                   NativeToT>                                                  \
+  ConvertFromBF16Scalar(uint16_t val) {                                        \
+    bfloat16 bfval = sycl::bit_cast<bfloat16>(val);                            \
+    float fval = static_cast<float>(bfval);                                    \
+    return convertImpl<float, NativeToT, RoundingMode, 1, float, NativeToT>(   \
+        fval);                                                                 \
+  }                                                                            \
+  template <typename NativeFromT, sycl::rounding_mode RoundingMode>            \
+  std::enable_if_t<                                                            \
+      (std::is_same_v<NativeFromT, type> && RoundingMode == rmode), uint16_t>  \
+  ConvertToBF16Scalar(NativeFromT val) {                                       \
+    constexpr int rm = static_cast<int>(RoundingMode);                         \
+    bfloat16 bfval = sycl::ext::oneapi::detail::ConvertToBfloat16::            \
+        getBfloat16WithRoundingMode<NativeFromT, rm>(val);                     \
+    return sycl::bit_cast<uint16_t>(bfval);                                    \
+  }
+#endif // __SYCL_DEVICE_ONLY__ && (defined(__SPIR__) || defined(__SPIRV__))
+
+#define EXPAND_BF16_TYPE(type, type_str)                                       \
+  EXPAND_BF16_ROUNDING_MODE(type, type_str, sycl::rounding_mode::automatic,    \
+                            rn)                                                \
+  EXPAND_BF16_ROUNDING_MODE(type, type_str, sycl::rounding_mode::rte, rn)      \
+  EXPAND_BF16_ROUNDING_MODE(type, type_str, sycl::rounding_mode::rtp, ru)      \
+  EXPAND_BF16_ROUNDING_MODE(type, type_str, sycl::rounding_mode::rtn, rd)      \
+  EXPAND_BF16_ROUNDING_MODE(type, type_str, sycl::rounding_mode::rtz, rz)
+
+EXPAND_BF16_TYPE(uint, uint)
+EXPAND_BF16_TYPE(int, int)
+EXPAND_BF16_TYPE(ushort, ushort)
+EXPAND_BF16_TYPE(short, short)
+EXPAND_BF16_TYPE(long, ll)
+EXPAND_BF16_TYPE(unsigned long long, ull)
+
+#undef EXPAND_BF16_TYPE
+#undef EXPAND_BF16_ROUNDING_MODE
+
+// Mapping from BF16 to float is 1:1, lossless, so we accept all
+// rounding modes.
+template <typename NativeToT, sycl::rounding_mode RoundingMode>
+std::enable_if_t<std::is_same_v<NativeToT, float>, NativeToT>
+ConvertFromBF16Scalar(uint16_t val) {
+  bfloat16 bfval = sycl::bit_cast<bfloat16>(val);
+  return static_cast<float>(bfval);
+}
+
+template <typename NativeFromT, sycl::rounding_mode RoundingMode>
+std::enable_if_t<std::is_same_v<NativeFromT, double>, uint16_t>
+ConvertToBF16Scalar(NativeFromT val) {
+#if defined(__SPIR__) || defined(__SPIRV__)
+  return __imf_double2bfloat16(val);
+#else
+  constexpr int rm = static_cast<int>(RoundingMode);
+  bfloat16 bfval =
+      sycl::ext::oneapi::detail::ConvertToBfloat16::getBfloat16WithRoundingMode<
+          NativeFromT, rm>(val);
+  return sycl::bit_cast<uint16_t>(bfval);
+#endif
+}
+
+template <typename NativeFromT, sycl::rounding_mode RoundingMode>
+std::enable_if_t<std::is_same_v<NativeFromT, float>, uint16_t>
+ConvertToBF16Scalar(NativeFromT val) {
+
+#if defined(__SPIR__) || defined(__SPIRV__)
+  if constexpr (RoundingMode == sycl::rounding_mode::automatic ||
+                RoundingMode == sycl::rounding_mode::rte)
+    return __imf_float2bfloat16_rn(val);
+  else if constexpr (RoundingMode == sycl::rounding_mode::rtp)
+    return __imf_float2bfloat16_ru(val);
+  else if constexpr (RoundingMode == sycl::rounding_mode::rtn)
+    return __imf_float2bfloat16_rd(val);
+  else if constexpr (RoundingMode == sycl::rounding_mode::rtz)
+    return __imf_float2bfloat16_rz(val);
+  else
+    static_assert(false, "Invalid rounding mode.");
+#else
+  constexpr int rm = static_cast<int>(RoundingMode);
+  bfloat16 bfval =
+      sycl::ext::oneapi::detail::ConvertToBfloat16::getBfloat16WithRoundingMode<
+          float, rm>(val);
+  return sycl::bit_cast<uint16_t>(bfval);
+#endif
+}
+
 #endif // __SYCL_DEVICE_ONLY__
 
+// Wrapper function for scalar and vector conversions from BF16 type.
+template <typename ToT, typename NativeFromT, typename NativeToT,
+          sycl::rounding_mode RoundingMode, int VecSize>
+NativeToT ConvertFromBF16(NativeFromT val) {
+#ifdef __SYCL_DEVICE_ONLY__
+  //  Use vector conversion from BF16 to float for all rounding modes.
+  if constexpr (std::is_same_v<ToT, float> && VecSize > 1)
+    return ConvertBF16ToFVec<NativeFromT, NativeToT, VecSize>(val);
+  else
+#endif
+    // For VecSize > 1. Only for device.
+    if constexpr (VecSize > 1) {
+      NativeToT retval;
+      for (int i = 0; i < VecSize; i++) {
+        retval[i] = ConvertFromBF16Scalar<ToT, RoundingMode>(val[i]);
+      }
+      return retval;
+    }
+    // For VecSize == 1.
+    else
+      return ConvertFromBF16Scalar<NativeToT, RoundingMode>(val);
+}
+
+// Wrapper function for scalar and vector conversions to BF16 type.
+template <typename FromT, typename NativeFromT, typename NativeToT,
+          sycl::rounding_mode RoundingMode, int VecSize>
+NativeToT ConvertToBF16(NativeFromT val) {
+#ifdef __SYCL_DEVICE_ONLY__
+  //  Use vector conversion to BF16 from float for RNE rounding mode.
+  if constexpr (std::is_same_v<FromT, float> && VecSize > 1 &&
+                (RoundingMode == sycl::rounding_mode::automatic ||
+                 RoundingMode == sycl::rounding_mode::rte))
+    return ConvertFToBF16Vec<NativeFromT, NativeToT, VecSize>(val);
+  else
+#endif
+    // For VecSize > 1. Only for device.
+    if constexpr (VecSize > 1) {
+      NativeToT retval;
+      for (int i = 0; i < VecSize; i++) {
+        retval[i] = ConvertToBF16Scalar<FromT, RoundingMode>(val[i]);
+      }
+      return retval;
+    }
+    // For VecSize == 1.
+    else
+      return ConvertToBF16Scalar<NativeFromT, RoundingMode>(val);
+}
+
 /// Entry point helper for all kinds of converts between scalars and vectors, it
 /// dispatches to a right function depending on source and destination types.
 ///
@@ -537,6 +832,14 @@ NativeToT convertImpl(NativeFromT Value) {
   else if constexpr (is_float_to_float<FromT, ToT>::value)
     return FConvert<NativeFromT, NativeToT, VecSize, ElemTy, RoundingMode>(
         Value);
+  // BF16 conversion to other types.
+  else if constexpr (std::is_same_v<FromT, bfloat16>)
+    return ConvertFromBF16<ToT, NativeFromT, NativeToT, RoundingMode, VecSize>(
+        Value);
+  // conversion from other types to BF16.
+  else if constexpr (std::is_same_v<ToT, bfloat16>)
+    return ConvertToBF16<FromT, NativeFromT, NativeToT, RoundingMode, VecSize>(
+        Value);
   else if constexpr (is_float_to_sint<FromT, ToT>::value)
     return ConvertFToS<NativeFromT, NativeToT, VecSize, ElemTy, RoundingMode>(
         Value);
@@ -558,6 +861,15 @@ NativeToT convertImpl(NativeFromT Value) {
   }
 }
 
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+template <typename FromT, typename ToT, sycl::rounding_mode RoundingMode,
+          int VecSize, typename NativeFromT, typename NativeToT>
+auto ConvertImpl(std::byte val) { // std::byte source, converted as int8_t
+  return convertImpl<FromT, ToT, RoundingMode, VecSize, NativeFromT, NativeToT>(
+      static_cast<std::int8_t>(val));
+}
+#endif // !defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0
+
 } // namespace detail
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp b/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp
index 2f6584a4bd640..d8022f48a9a1d 100644
--- a/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/detail/math_intrin.hpp
@@ -244,8 +244,6 @@ __ESIMD_INTRIN __ESIMD_raw_vec_t(T1, N)
     __esimd_ssdp4a_sat(__ESIMD_raw_vec_t(T2, N) src0,
                        __ESIMD_raw_vec_t(T3, N) src1,
                        __ESIMD_raw_vec_t(T4, N) src2) __ESIMD_INTRIN_END;
-__ESIMD_INTRIN __ESIMD_raw_vec_t(uint32_t, 4)
-    __esimd_timestamp() __ESIMD_INTRIN_END;
 
 template <typename T0, typename T1, int SZ>
 __ESIMD_INTRIN __ESIMD_raw_vec_t(T0, SZ)
diff --git a/sycl/include/sycl/ext/intel/esimd/math.hpp b/sycl/include/sycl/ext/intel/esimd/math.hpp
index 096c33a2fda93..67bcaace80673 100644
--- a/sycl/include/sycl/ext/intel/esimd/math.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/math.hpp
@@ -1844,8 +1844,11 @@ __ESIMD_API uint32_t subb(uint32_t &borrow, uint32_t src0, uint32_t src1) {
 /// rdtsc - get the value of timestamp counter.
 /// @return the current value of timestamp counter
 __ESIMD_API uint64_t rdtsc() {
-  __ESIMD_NS::simd<uint32_t, 4> retv = __esimd_timestamp();
-  return retv.template bit_cast_view<uint64_t>()[0];
+#ifdef __SYCL_DEVICE_ONLY__
+  return __spirv_ReadClockKHR(0); // NOTE(review): 0 = scope operand? confirm
+#else // !__SYCL_DEVICE_ONLY__
+  __ESIMD_UNSUPPORTED_ON_HOST;
+#endif // __SYCL_DEVICE_ONLY__
 }
 
 /// @} sycl_esimd_math
diff --git a/sycl/include/sycl/ext/intel/esimd/memory.hpp b/sycl/include/sycl/ext/intel/esimd/memory.hpp
index b8f36e8f57255..6272d8ce97d10 100644
--- a/sycl/include/sycl/ext/intel/esimd/memory.hpp
+++ b/sycl/include/sycl/ext/intel/esimd/memory.hpp
@@ -550,7 +550,7 @@ gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
 ///             simd<OffsetT, N / VS> byte_offsets,
 ///             simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
 ///             PropertyListT props = {});
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters. Loads ("gathers") elements of the type 'T'
 /// from memory locations addressed by the base pointer \p p and byte offsets \p
 /// byte_offsets, and returns the loaded elements. Access to any element's
@@ -591,7 +591,7 @@ gather(const T *p, simd<OffsetT, N / VS> byte_offsets, simd_mask<N / VS> mask,
 ///             OffsetSimdViewT byte_offsets,
 ///             simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
 ///             PropertyListT props = {});
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters. Loads ("gathers") elements of the type 'T'
 /// from memory locations addressed by the base pointer \p p and byte offsets \p
 /// byte_offsets, and returns the loaded elements. Access to any element's
@@ -637,7 +637,7 @@ gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
 ///             OffsetSimdViewT byte_offsets,
 ///             simd_mask<N / VS> mask, simd<T, N> pass_thru,
 ///             PropertyListT props = {});
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters. Loads ("gathers") elements of the type 'T'
 /// from memory locations addressed by the base pointer \p p and byte offsets \p
 /// byte_offsets, and returns the loaded elements. Access to any element's
@@ -711,7 +711,7 @@ gather(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
 /// simd <T, N> gather(const T *p,
 ///             OffsetSimdViewT byte_offsets,
 ///             simd_mask<N / VS> mask, PropertyListT props = {});
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters. Loads ("gathers") elements of the type 'T'
 /// from memory locations addressed by the base pointer \p p and byte offsets \p
 /// byte_offsets, and returns the loaded elements. Access to any element's
@@ -772,7 +772,7 @@ gather(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
 /// simd <T, N> gather(const T *p,
 ///             OffsetSimdViewT byte_offsets,
 ///             PropertyListT props = {});
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters.  Loads ("gathers") elements of the type 'T'
 /// from memory locations addressed by the base pointer \p p and byte offsets \p
 /// byte_offsets, and returns the loaded elements.
@@ -925,7 +925,7 @@ scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
 /// void scatter(T *p, simd<OffsetT, N / VS> byte_offsets, ValuesSimdViewT vals,
 /// simd_mask<N / VS> mask, PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters.
 /// Writes ("scatters") elements of the input vector to different memory
 /// locations. Each memory location is base address plus an offset - a
@@ -993,7 +993,7 @@ scatter(T *p, simd<OffsetT, N / VS> byte_offsets, simd<T, N> vals,
 /// void scatter(T *p, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
 /// 	simd_mask<N / VS> mask, PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters.
 /// Writes ("scatters") elements of the input vector to different memory
 /// locations. Each memory location is base address plus an offset - a
@@ -1033,7 +1033,7 @@ scatter(T *p, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
 /// void scatter(T *p, simd<OffsetT, N / VS> byte_offsets, ValuesSimdViewT vals,
 /// 	PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters.
 /// Writes ("scatters") elements of the input vector to different memory
 /// locations. Each memory location is base address plus an offset - a
@@ -1101,7 +1101,7 @@ scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 /// void scatter(T *p, OffsetSimdViewT byte_offsets, simd<T,N> vals,
 /// 	simd_mask<N / VS> mask, PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters.
 /// Writes ("scatters") elements of the input vector to different memory
 /// locations. Each memory location is base address plus an offset - a
@@ -1140,7 +1140,7 @@ scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 /// void scatter(T *p, OffsetSimdViewT byte_offsets, simd<T,N> vals,
 /// 	PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters.
 /// Writes ("scatters") elements of the input vector to different memory
 /// locations. Each memory location is base address plus an offset - a
@@ -1214,7 +1214,7 @@ scatter(T *p, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 /// void scatter(T *p, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
 ///      PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters.
 /// Writes ("scatters") elements of the input vector to different memory
 /// locations. Each memory location is base address plus an offset - a
@@ -1971,6 +1971,53 @@ block_load(const T *ptr, simd_mask<1> pred, simd<T, N> pass_thru,
   return detail::block_load_impl<T, N, NewPropertyListT>(ptr, pred, pass_thru);
 }
 
+/// simd<T, N> block_load(const T* ptr, simd_mask<1> pred,
+///                       PassThruSimdViewT pass_thru, props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters; pass_thru.read() is forwarded to the
+/// simd-based overload, which loads a contiguous memory block from USM
+/// pointer \p ptr. If the predicate \p pred is set to 0, then the load is
+/// omitted and the vector \p pass_thru is returned.
+///
+/// This function has temporary restrictions. See details in the 'Restrictions'
+/// section below. The restrictions will be relaxed in the future.
+///
+/// The parameter \p props specifies the optional compile-time properties
+/// of the type esimd::properties and may include esimd::cache_hint_L1,
+/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored.
+///
+/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then
+/// the cache_hint::none value is assumed by default.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default assumed alignment is the minimally required element-size
+/// alignment. Note that additional/temporary restrictions are applied
+/// (see Restrictions below).
+///
+/// Restrictions - cache hint and mask imposed - temporary:
+/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or
+///     smaller and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64,
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128,
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256,
+///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename PassThruSimdViewT, typename T,
+    int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<PassThruSimdViewT> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+block_load(const T *ptr, simd_mask<1> pred, PassThruSimdViewT pass_thru,
+           PropertyListT props = {}) {
+  return block_load<T, N>(ptr, pred, pass_thru.read(), props);
+}
+
 /// simd<T, N> block_load(const T* ptr, size_t byte_offset,
 ///                       simd_mask<1> pred, simd<T, N> pass_thru,
 ///                       props={});                               // (usm-bl-6)
@@ -2017,6 +2064,55 @@ block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred,
   return block_load<T, N>(AdjustedPtr, pred, pass_thru, props);
 }
 
+/// simd<T, N> block_load(const T* ptr, size_t byte_offset,
+///                       simd_mask<1> pred, PassThruSimdViewT pass_thru,
+///                       props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters; pass_thru.read() is forwarded to the
+/// simd-based overload, which loads a contiguous memory block from the
+/// address referenced by USM pointer \p ptr and the given \p byte_offset.
+/// If the predicate \p pred is set to 0, then the load is omitted and the
+/// vector \p pass_thru is returned.
+///
+/// This function has temporary restrictions. See details in the 'Restrictions'
+/// section below. The restrictions will be relaxed in the future.
+///
+/// The parameter \p props specifies the optional compile-time properties
+/// of the type esimd::properties and may include esimd::cache_hint_L1,
+/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored.
+///
+/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then
+/// the cache_hint::none value is assumed by default.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default assumed alignment is the minimally required element-size
+/// alignment. Note that additional/temporary restrictions are applied
+/// (see Restrictions below).
+///
+/// Restrictions - cache hint and mask imposed - temporary:
+/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or
+///     smaller and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64,
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128,
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256,
+///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename PassThruSimdViewT, typename T,
+    int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<PassThruSimdViewT> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+block_load(const T *ptr, size_t byte_offset, simd_mask<1> pred,
+           PassThruSimdViewT pass_thru, PropertyListT props = {}) {
+  return block_load<T, N>(ptr, byte_offset, pred, pass_thru.read(), props);
+}
+
 /// Loads a contiguous block of memory from the given memory address \p addr
 /// and returns the loaded data as a vector.
 /// The generated code depends on the combination {T, N, Flags}.
@@ -2294,6 +2390,57 @@ block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
                                                          pass_thru);
 }
 
+/// simd<T, N>
+/// block_load(AccessorT acc, OffsetT byte_offset, simd_mask<1> pred,
+///            PassThruSimdViewT pass_thru, props = {});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters; both default from \c PassThruSimdViewT.
+/// This function forwards pass_thru.read() to the simd-based overload,
+/// which loads a contiguous memory block referenced by accessor \p acc
+/// and the given \p byte_offset. If the predicate \p pred is set to 0,
+/// then the load is omitted and the \p pass_thru value is returned.
+///
+/// The parameter \p props specifies the optional compile-time properties
+/// of the type esimd::properties and may include esimd::cache_hint_L1,
+/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored.
+///
+/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then
+/// the cache_hint::none value is assumed by default.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the \p byte_offset must be at least 4-byte aligned for elements of 4-bytes
+/// or smaller and 8-byte aligned for 8-byte elements.
+///
+/// Restrictions - cache hint and predicate imposed - temporary:
+/// R1: \p byte_offset must be at least 4-byte aligned for elements of 4-bytes
+///     or smaller and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements must be:
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
+///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename PassThruSimdViewT,
+    typename T = typename PassThruSimdViewT::value_type::element_type,
+    int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
+    typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<PassThruSimdViewT> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_device_accessor_with_v<AccessorT,
+                                          detail::accessor_mode_cap::can_read>,
+    simd<T, N>>
+block_load(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
+           simd_mask<1> pred, PassThruSimdViewT pass_thru,
+           PropertyListT props = {}) {
+  return block_load<T, N>(acc, byte_offset, pred, pass_thru.read(), props);
+}
+
 /// simd<T, N>
 /// block_load(AccessorT acc, OffsetT byte_offset, simd_mask<1> pred,
 ///            props = {});                                        // (acc-bl-4)
@@ -2383,6 +2530,53 @@ block_load(AccessorT acc, simd_mask<1> pred, simd<T, N> pass_thru,
   return block_load<T, N>(acc, 0, pred, pass_thru, NewPropertyListT{});
 }
 
+/// simd<T, N> block_load(AccessorT acc, simd_mask<1> pred,
+///                       PassThruSimdViewT pass_thru, props = {});
+///
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters; both default from \c PassThruSimdViewT.
+/// This function forwards pass_thru.read() to the simd-based overload,
+/// which loads a contiguous memory block referenced by accessor \p acc
+/// and implied offset=0. If the predicate \p pred is set to 0, then the
+/// load is omitted and the \p pass_thru value is returned.
+///
+/// The parameter \p props specifies the optional compile-time properties
+/// of the type esimd::properties and may include esimd::cache_hint_L1,
+/// esimd::cache_hint_L2. Other properties are ignored. If \p props
+/// specifies the alignment property, then it is ignored because this
+/// variant implies zero offset, which means the most favourable 16-byte
+/// alignment is used.
+///
+/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then
+/// the cache_hint::none value is assumed by default.
+///
+/// Restrictions - cache hint and predicate imposed - temporary:
+/// R1: The number of elements must be:
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
+///                      or 512(only if alignment is 8-bytes or more).
+/// R2: The target device must be DG2, PVC or newer GPU.
+template <
+    typename PassThruSimdViewT,
+    typename T = typename PassThruSimdViewT::value_type::element_type,
+    int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
+    typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<PassThruSimdViewT> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_device_accessor_with_v<AccessorT,
+                                          detail::accessor_mode_cap::can_read>,
+    simd<T, N>>
+block_load(AccessorT acc, simd_mask<1> pred, PassThruSimdViewT pass_thru,
+           PropertyListT props = {}) {
+  return block_load<T, N>(acc, pred, pass_thru.read(), props);
+}
+
 /// simd<T, N>
 /// block_load(AccessorT acc, simd_mask<1> pred, props = {});      // (acc-bl-6)
 /// This function loads a contiguous memory block referenced
@@ -2638,29 +2832,15 @@ block_store(T *ptr, size_t byte_offset, simd<T, N> vals, simd_mask<1> pred,
   block_store<T, N>(AdjustedPtr, vals, pred, props);
 }
 
-/// Each of the following block_store functions stores the vector 'vals' to a
-/// contiguous memory block at the address referenced by accessor 'acc', or from
-/// 'acc + byte_offset', The parameter 'pred' is the one element predicate. If
-/// it is set to 1, then all 'N' elements are stored. Otherwise, the block store
-/// operation is a NO-OP. The parameter 'props' specifies the optional
-/// compile-time properties of the type esimd::properties and may include
-/// esimd::cache_hint_L1, esimd::cache_hint_L2, esimd::cache_hint_L3,
-/// esimd::alignment.
-
-/// void block_store(AccessorT acc, OffsetT byte_offset,          // (acc-bs-1)
-///                   simd<T, N> vals, props = {});
-
-/// void block_store(AccessorT acc, simd<T, N> vals, props = {}); // (acc-bs-2)
-/// void block_store(AccessorT acc, OffsetT byte_offset,          // (acc-bs-3)
-///     simd<T, N> vals, simd_mask<1> pred, props = {});
-
-/// void block_store(AccessorT acc, simd<T, N> vals,              // (acc-bs-4)
-///                  simd_mask<1> pred, props = {});
-
-/// void block_store(AccessorT acc, OffsetT byte_offset,          // (acc-bs-1)
-///                   simd<T, N> vals, props = {});
-/// This function stores a contiguous memory block to
-/// accessor \p acc and \p byte_offset with data specified by \p vals.
+/// void block_store(T* ptr, ValuesSimdViewT vals, props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// This function stores a contiguous memory block to USM pointer \p ptr
+/// with data specified by \p vals.
+///
+/// There may be temporary restrictions depending on L1, L2 cache hints.
+/// See details in the 'Restrictions' section below. The restrictions will be
+/// relaxed in the future.
 ///
 /// The parameter \p props specifies the optional compile-time properties
 /// of the type esimd::properties and may include esimd::cache_hint_L1,
@@ -2670,61 +2850,263 @@ block_store(T *ptr, size_t byte_offset, simd<T, N> vals, simd_mask<1> pred,
 /// the cache_hint::none value is assumed by default.
 ///
 /// Alignment: If \p props does not specify the 'alignment' property, then
-/// the \p byte_offset must be at least 16-byte aligned if (!(b) && (c))
-/// from the below restrictions, and must be at least 4-byte aligned for
-/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements
-/// otherwise. If the 'alignment' property is specified as less than 16 bytes,
-/// then the target device must be DG2 or PVC (not Gen12). The alignment
-/// requirement may be less strict if stateless memory mode is ON, see
-/// block_store(usm_ptr, props) (aka usm-bs-01) for details/requirements.
-///
-/// Restrictions: there may be some extra restrictions depending on
-///    a) stateless memory mode enforcement is ON,
-///    b) cache hints are used,
-///    c) number of bytes stored is either 16,32,64, or 128.
-///    d) the 'alignment' property is specified as less than 16 bytes.
-///
-/// If (b) || !(c) || (d), then the target device must be DG2 or PVC (not
-/// Gen12).
-/// If (a) && !(b), then there is no restriction on the number of
-/// elements to be stored and \p byte_offset must be only element-aligned.
+/// the default assumed alignment is 16 bytes if \p props does not specify any
+/// L1 or L2 cache hints, and the minimally required element-size
+/// alignment otherwise. Note that additional/temporary restrictions may apply
+/// (see Restrictions below).
 ///
-/// Gen12 requirements: !(b) && (c) && !(d).
-///   It can store 16-, 32-, 64-, or 128-bytes only.
-/// DG2/PVC requirements:
-///   It can store such number of elements depending on the type 'T':
-///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
-///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
+/// Restrictions - cache hint imposed - temporary:
+/// If L1 or L2 cache hint is passed, then:
+/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or
+///     smaller and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64,
 ///                      or 128(only if alignment is 8-bytes or more);
-///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128,
 ///                      or 256(only if alignment is 8-bytes or more);
-///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256,
 ///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
 template <
-    typename T, int N, typename AccessorT,
+    typename ValuesSimdViewT, typename T,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
-__ESIMD_API std::enable_if_t<
-    ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
-    detail::is_device_accessor_with_v<AccessorT,
-                                      detail::accessor_mode_cap::can_write>>
-block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
-            simd<T, N> vals, PropertyListT props = {}) {
-#ifdef __ESIMD_FORCE_STATELESS_MEM
-  block_store<T, N>(detail::accessorToPointer<T>(acc, byte_offset), vals,
-                    props);
-#else
-  constexpr int DefaultLSCAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
-  constexpr size_t Alignment =
-      detail::getPropertyValue<PropertyListT, alignment_key>(
-          DefaultLSCAlignment);
-  constexpr bool AlignmentRequiresLSC =
-      PropertyListT::template has_property<alignment_key>() && Alignment < 16;
-  using Tx = detail::__raw_t<T>;
-  constexpr unsigned Sz = sizeof(Tx) * N;
-  constexpr bool SzRequiresLSC =
-      Sz < detail::OperandSize::OWORD || Sz % detail::OperandSize::OWORD != 0 ||
-      !detail::isPowerOf2(Sz / detail::OperandSize::OWORD) ||
-      Sz > 8 * detail::OperandSize::OWORD;
+__ESIMD_API std::enable_if_t<detail::is_simd_view_type_v<ValuesSimdViewT> &&
+                             detail::is_property_list_v<PropertyListT>>
+block_store(T *ptr, ValuesSimdViewT vals, PropertyListT props = {}) {
+  block_store<T, N>(ptr, vals.read(), props);
+}
+
+/// void block_store(T* ptr, size_t byte_offset,
+///                          ValuesSimdViewT vals, props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// This function stores a contiguous memory block to USM pointer \p ptr and
+/// byte-offset \p byte_offset with data specified by \p vals.
+///
+/// There may be temporary restrictions depending on L1, L2 cache hints.
+/// See details in the 'Restrictions' section below. The restrictions will be
+/// relaxed in the future.
+///
+/// The parameter \p props specifies the optional compile-time properties
+/// of the type esimd::properties and may include esimd::cache_hint_L1,
+/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored.
+///
+/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then
+/// the cache_hint::none value is assumed by default.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default assumed alignment is 16 bytes if \p props does not specify any
+/// L1 or L2 cache hints, and the minimally required element-size
+/// alignment otherwise. Note that additional/temporary restrictions may apply
+/// (see Restrictions below).
+///
+/// Restrictions - cache hint imposed - temporary:
+/// If L1 or L2 cache hint is passed, then:
+/// R1: The pointer plus byte offset must be at least 4-byte aligned for
+///     elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64,
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128,
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256,
+///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename ValuesSimdViewT, typename T,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<ValuesSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+block_store(T *ptr, size_t byte_offset, ValuesSimdViewT vals,
+            PropertyListT props = {}) {
+  block_store<T, N>(ptr, byte_offset, vals.read(), props);
+}
+
+/// void block_store(T* ptr, ValuesSimdViewT vals,
+///                  simd_mask<1> pred, props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// This function stores a contiguous memory block to USM pointer \p ptr
+/// with data specified by \p vals. If the predicate \p pred is set to 0,
+/// then the store is omitted.
+///
+/// There are temporary restrictions. See details in the 'Restrictions'
+/// section below. The restrictions will be relaxed in the future.
+///
+/// The parameter \p props specifies the optional compile-time properties
+/// of the type esimd::properties and may include esimd::cache_hint_L1,
+/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored.
+///
+/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then
+/// the cache_hint::none value is assumed by default.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default assumed alignment is the minimally required element-size
+/// alignment. Note that additional/temporary restrictions apply (see
+/// Restrictions below).
+///
+/// Restrictions - predicate imposed - temporary:
+/// R1: The pointer must be at least 4-byte aligned for elements of 4-bytes or
+///     smaller and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64,
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128,
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256,
+///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename ValuesSimdViewT, typename T,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<detail::is_simd_view_type_v<ValuesSimdViewT> &&
+                             detail::is_property_list_v<PropertyListT>>
+block_store(T *ptr, ValuesSimdViewT vals, simd_mask<1> pred,
+            PropertyListT props = {}) {
+  block_store<T, N>(ptr, vals.read(), pred, props);
+}
+
+/// void block_store(T* ptr, size_t byte_offset,
+///                  ValuesSimdViewT vals, simd_mask<1> pred, props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// This function stores a contiguous memory block to USM pointer \p ptr
+/// and byte-offset \p byte_offset with data specified by \p vals.
+/// If the predicate \p pred is set to 0, then the store is omitted.
+///
+/// There may be temporary restrictions depending on L1, L2 cache hints.
+/// See details in the 'Restrictions' section below. The restrictions will be
+/// relaxed in the future.
+///
+/// The parameter \p props specifies the optional compile-time properties
+/// of the type esimd::properties and may include esimd::cache_hint_L1,
+/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored.
+///
+/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then
+/// the cache_hint::none value is assumed by default.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default assumed alignment is 16 bytes if \p props does not specify any
+/// L1 or L2 cache hints and \p pred is set to 1, and
+/// the minimally required element-size alignment otherwise.
+/// Note that additional/temporary restrictions may apply
+/// (see Restrictions below).
+///
+/// Restrictions - cache hint or predicate imposed - temporary:
+/// If a predicate, L1 or L2 cache hint is passed, then:
+/// R1: The pointer plus byte offset must be at least 4-byte aligned for
+///     elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements for 8-byte data: 1, 2, 3, 4, 8, 16, 32, 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64,
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128,
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256,
+///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename ValuesSimdViewT, typename T,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<ValuesSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+block_store(T *ptr, size_t byte_offset, ValuesSimdViewT vals, simd_mask<1> pred,
+            PropertyListT props = {}) {
+  block_store<T, N>(ptr, byte_offset, vals.read(), pred, props);
+}
+
+/// Each of the following block_store functions stores the vector 'vals' to a
+/// contiguous memory block at the address referenced by accessor 'acc', or at
+/// 'acc + byte_offset'. The parameter 'pred' is the one-element predicate. If
+/// it is set to 1, then all 'N' elements are stored. Otherwise, the block store
+/// operation is a NO-OP. The parameter 'props' specifies the optional
+/// compile-time properties of the type esimd::properties and may include
+/// esimd::cache_hint_L1, esimd::cache_hint_L2, esimd::cache_hint_L3,
+/// esimd::alignment.
+
+/// void block_store(AccessorT acc, OffsetT byte_offset,          // (acc-bs-1)
+///                   simd<T, N> vals, props = {});
+
+/// void block_store(AccessorT acc, simd<T, N> vals, props = {}); // (acc-bs-2)
+/// void block_store(AccessorT acc, OffsetT byte_offset,          // (acc-bs-3)
+///     simd<T, N> vals, simd_mask<1> pred, props = {});
+
+/// void block_store(AccessorT acc, simd<T, N> vals,              // (acc-bs-4)
+///                  simd_mask<1> pred, props = {});
+
+/// void block_store(AccessorT acc, OffsetT byte_offset,          // (acc-bs-1)
+///                   simd<T, N> vals, props = {});
+/// This function stores a contiguous memory block to
+/// accessor \p acc and \p byte_offset with data specified by \p vals.
+///
+/// The parameter \p props specifies the optional compile-time properties
+/// of the type esimd::properties and may include esimd::cache_hint_L1,
+/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored.
+///
+/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then
+/// the cache_hint::none value is assumed by default.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the \p byte_offset must be at least 16-byte aligned if (!(b) && (c))
+/// from the below restrictions, and must be at least 4-byte aligned for
+/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements
+/// otherwise. If the 'alignment' property is specified as less than 16 bytes,
+/// then the target device must be DG2 or PVC (not Gen12). The alignment
+/// requirement may be less strict if stateless memory mode is ON, see
+/// block_store(usm_ptr, props) (aka usm-bs-01) for details/requirements.
+///
+/// Restrictions: there may be some extra restrictions depending on
+///    a) stateless memory mode enforcement is ON,
+///    b) cache hints are used,
+///    c) number of bytes stored is either 16,32,64, or 128.
+///    d) the 'alignment' property is specified as less than 16 bytes.
+///
+/// If (b) || !(c) || (d), then the target device must be DG2 or PVC (not
+/// Gen12).
+/// If (a) && !(b), then there is no restriction on the number of
+/// elements to be stored and \p byte_offset must be only element-aligned.
+///
+/// Gen12 requirements: !(b) && (c) && !(d).
+///   It can store 16-, 32-, 64-, or 128-bytes only.
+/// DG2/PVC requirements:
+///   It can store such number of elements depending on the type 'T':
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
+///                      or 512(only if alignment is 8-bytes or more).
+template <
+    typename T, int N, typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_write>>
+block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
+            simd<T, N> vals, PropertyListT props = {}) {
+#ifdef __ESIMD_FORCE_STATELESS_MEM
+  block_store<T, N>(detail::accessorToPointer<T>(acc, byte_offset), vals,
+                    props);
+#else
+  constexpr int DefaultLSCAlignment = (sizeof(T) <= 4) ? 4 : sizeof(T);
+  constexpr size_t Alignment =
+      detail::getPropertyValue<PropertyListT, alignment_key>(
+          DefaultLSCAlignment);
+  constexpr bool AlignmentRequiresLSC =
+      PropertyListT::template has_property<alignment_key>() && Alignment < 16;
+  using Tx = detail::__raw_t<T>;
+  constexpr unsigned Sz = sizeof(Tx) * N;
+  constexpr bool SzRequiresLSC =
+      Sz < detail::OperandSize::OWORD || Sz % detail::OperandSize::OWORD != 0 ||
+      !detail::isPowerOf2(Sz / detail::OperandSize::OWORD) ||
+      Sz > 8 * detail::OperandSize::OWORD;
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
                 AlignmentRequiresLSC || SzRequiresLSC) {
     using NewPropertyListT =
@@ -2871,49 +3253,247 @@ block_store(AccessorT acc, simd<T, N> vals, simd_mask<1> pred,
   block_store<T, N>(acc, 0, vals, pred, NewPropertyListT{});
 }
 
-/// @} sycl_esimd_memory_block
-
-/// @} sycl_esimd_memory
-
-/// @cond ESIMD_DETAIL
-
-// Implementations of accessor-based gather and scatter functions
-namespace detail {
-template <typename T, int N, typename AccessorTy>
-ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
-    std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
-    is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>>
-scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
-             uint32_t glob_offset, simd_mask<N> mask) {
-
-  static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length");
-  if constexpr (sizeof(T) == 8) {
-    scatter_impl<uint32_t, N>(
-        acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(0),
-        offsets, glob_offset, mask);
-    scatter_impl<uint32_t, N>(
-        acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(1),
-        offsets, glob_offset + sizeof(uint32_t), mask);
-  } else {
-    constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
-    // TODO (performance) use hardware-supported scale once BE supports it
-    constexpr int16_t scale = 0;
-    const auto si = __ESIMD_NS::get_surface_index(acc);
-
-    if constexpr (sizeof(T) < 4) {
-      using Tint = std::conditional_t<std::is_integral_v<T>, T,
-                                      detail::uint_type_t<sizeof(T)>>;
-      using Treal = __raw_t<T>;
-      simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
-      using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
-                                                 int32_t, uint32_t>;
-      const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
-      __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
-          mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
-    } else {
-      using Treal = __raw_t<T>;
-      if constexpr (!std::is_same_v<Treal, T>) {
-        simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
+/// void block_store(AccessorT acc, OffsetT byte_offset,
+///                   ValuesSimdViewT vals, props = {});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// This function stores a contiguous memory block to
+/// accessor \p acc and \p byte_offset with data specified by \p vals.
+///
+/// The parameter \p props specifies the optional compile-time properties
+/// of the type esimd::properties and may include esimd::cache_hint_L1,
+/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored.
+///
+/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then
+/// the cache_hint::none value is assumed by default.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the \p byte_offset must be at least 16-byte aligned if (!(b) && (c))
+/// from the below restrictions, and must be at least 4-byte aligned for
+/// elements of 4-bytes or smaller and 8-byte aligned for 8-byte elements
+/// otherwise. If the 'alignment' property is specified as less than 16 bytes,
+/// then the target device must be DG2 or PVC (not Gen12). The alignment
+/// requirement may be less strict if stateless memory mode is ON, see
+/// block_store(usm_ptr, props) (aka usm-bs-01) for details/requirements.
+///
+/// Restrictions: there may be some extra restrictions depending on
+///    a) stateless memory mode enforcement is ON,
+///    b) cache hints are used,
+///    c) number of bytes stored is either 16,32,64, or 128.
+///    d) the 'alignment' property is specified as less than 16 bytes.
+///
+/// If (b) || !(c) || (d), then the target device must be DG2 or PVC (not
+/// Gen12).
+/// If (a) && !(b), then there is no restriction on the number of
+/// elements to be stored and \p byte_offset must be only element-aligned.
+///
+/// Gen12 requirements: !(b) && (c) && !(d).
+///   It can store 16-, 32-, 64-, or 128-bytes only.
+/// DG2/PVC requirements:
+///   It can store such number of elements depending on the type 'T':
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
+///                      or 512(only if alignment is 8-bytes or more).
+template <
+    typename ValuesSimdViewT,
+    typename T = typename ValuesSimdViewT::value_type::element_type,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<ValuesSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_write>>
+block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
+            ValuesSimdViewT vals, PropertyListT props = {}) {
+  block_store<T, N>(acc, byte_offset, vals.read(), props);
+}
+
+/// void block_store(AccessorT acc, ValuesSimdViewT vals, props = {});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// This function stores a contiguous memory block to
+/// accessor \p acc with data specified by \p vals and implied offset=0.
+///
+/// The parameter \p props specifies the optional compile-time properties
+/// of the type esimd::properties and may include esimd::cache_hint_L1,
+/// esimd::cache_hint_L2. Other properties are ignored. If \p props specifies
+/// the alignment property, then it is ignored because this variant implies
+/// zero offset, which means the most favourable 16-byte alignment is used.
+///
+/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then
+/// the cache_hint::none value is assumed by default.
+///
+/// Restrictions: there may be some extra restrictions depending on
+///    a) stateless memory mode enforcement is ON,
+///    b) cache hints are used,
+///    c) number of bytes stored is either 16,32,64, or 128.
+/// If (b) || !(c), then the target device must be DG2 or PVC (not Gen12).
+/// If (a) && !(b), then there is no restriction on the number of elements
+/// to be stored.
+///
+/// Gen12 requirements: !(b) && (c).
+///   It can store 16-, 32-, 64-, or 128-bytes only.
+/// DG2/PVC requirements:
+///   It can store such number of elements depending on the type 'T':
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), or 128;
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), or 256;
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), or 512.
+template <
+    typename ValuesSimdViewT,
+    typename T = typename ValuesSimdViewT::value_type::element_type,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<ValuesSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_write>>
+block_store(AccessorT acc, ValuesSimdViewT vals, PropertyListT props = {}) {
+  block_store<T, N>(acc, vals.read(), props);
+}
+
+/// void block_store(AccessorT acc, OffsetT byte_offset,
+///     ValuesSimdViewT vals, simd_mask<1> pred, props = {});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// This function stores a contiguous memory block to
+/// accessor \p acc and \p byte_offset with data specified by \p vals.
+/// If the predicate \p pred is set to 0, then the store is omitted.
+///
+/// The parameter \p props specifies the optional compile-time properties
+/// of the type esimd::properties and may include esimd::cache_hint_L1,
+/// esimd::cache_hint_L2, esimd::alignment. Other properties are ignored.
+///
+/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then
+/// the cache_hint::none value is assumed by default.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the \p byte_offset must be at least 4-byte aligned for elements of 4-bytes
+/// or smaller and 8-byte aligned for 8-byte elements.
+/// The alignment requirement may be less strict if stateless memory mode is ON,
+/// see block_store(usm_ptr, props) (aka usm-bs-01) for details/requirements.
+///
+/// Restrictions:
+/// R1: The target device must be DG2 or PVC (not Gen12).
+///
+/// R2:
+///   It can store such number of elements depending on the type 'T':
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
+///                      or 512(only if alignment is 8-bytes or more).
+template <
+    typename ValuesSimdViewT,
+    typename T = typename ValuesSimdViewT::value_type::element_type,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<ValuesSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_write>>
+block_store(AccessorT acc, detail::DeviceAccessorOffsetT byte_offset,
+            ValuesSimdViewT vals, simd_mask<1> pred, PropertyListT props = {}) {
+  block_store<T, N>(acc, byte_offset, vals.read(), pred, props);
+}
+
+/// void block_store(AccessorT acc, ValuesSimdViewT vals,
+///                  simd_mask<1> pred, props = {});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// This function stores a contiguous memory block to
+/// accessor \p acc with data specified by \p vals and implied offset=0.
+/// If the predicate \p pred is set to 0, then the store is omitted.
+///
+/// The parameter \p props specifies the optional compile-time properties
+/// of the type esimd::properties and may include esimd::cache_hint_L1,
+/// esimd::cache_hint_L2. Other properties are ignored. If \p props specifies
+/// the alignment property, then it is ignored because this variant implies
+/// zero offset, which means the most favourable 16-byte alignment is used.
+///
+/// Cache hints: If \p props does not specify any L1 or L2 cache hints, then
+/// the cache_hint::none value is assumed by default.
+///
+/// Restrictions:
+/// R1: The target device must be DG2 or PVC (not Gen12).
+///
+/// R2:
+///   It can store such number of elements depending on the type 'T':
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), or 128;
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), or 256;
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), or 512.
+template <
+    typename ValuesSimdViewT,
+    typename T = typename ValuesSimdViewT::value_type::element_type,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<ValuesSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+    detail::is_device_accessor_with_v<AccessorT,
+                                      detail::accessor_mode_cap::can_write>>
+block_store(AccessorT acc, ValuesSimdViewT vals, simd_mask<1> pred,
+            PropertyListT props = {}) {
+  block_store<T, N>(acc, vals.read(), pred, props);
+}
+
+/// @} sycl_esimd_memory_block
+
+/// @} sycl_esimd_memory
+
+/// @cond ESIMD_DETAIL
+
+// Implementations of accessor-based gather and scatter functions
+namespace detail {
+template <typename T, int N, typename AccessorTy>
+ESIMD_INLINE ESIMD_NODEBUG std::enable_if_t<
+    std::is_same_v<detail::LocalAccessorMarker, AccessorTy> ||
+    is_accessor_with_v<AccessorTy, detail::accessor_mode_cap::can_write>>
+scatter_impl(AccessorTy acc, simd<T, N> vals, simd<uint32_t, N> offsets,
+             uint32_t glob_offset, simd_mask<N> mask) {
+
+  static_assert(detail::isPowerOf2(N, 32), "Unexpected vector length");
+  if constexpr (sizeof(T) == 8) {
+    scatter_impl<uint32_t, N>(
+        acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(0),
+        offsets, glob_offset, mask);
+    scatter_impl<uint32_t, N>(
+        acc, vals.template bit_cast_view<uint32_t>().template select<N, 2>(1),
+        offsets, glob_offset + sizeof(uint32_t), mask);
+  } else {
+    constexpr int TypeSizeLog2 = detail::ElemsPerAddrEncoding<sizeof(T)>();
+    // TODO (performance) use hardware-supported scale once BE supports it
+    constexpr int16_t scale = 0;
+    const auto si = __ESIMD_NS::get_surface_index(acc);
+
+    if constexpr (sizeof(T) < 4) {
+      using Tint = std::conditional_t<std::is_integral_v<T>, T,
+                                      detail::uint_type_t<sizeof(T)>>;
+      using Treal = __raw_t<T>;
+      simd<Tint, N> vals_int = bitcast<Tint, Treal, N>(std::move(vals).data());
+      using PromoT = typename std::conditional_t<std::is_signed<Tint>::value,
+                                                 int32_t, uint32_t>;
+      const simd<PromoT, N> promo_vals = convert<PromoT>(std::move(vals_int));
+      __esimd_scatter_scaled<PromoT, N, decltype(si), TypeSizeLog2, scale>(
+          mask.data(), si, glob_offset, offsets.data(), promo_vals.data());
+    } else {
+      using Treal = __raw_t<T>;
+      if constexpr (!std::is_same_v<Treal, T>) {
+        simd<Treal, N> Values = vals.template bit_cast_view<Treal>();
         __esimd_scatter_scaled<Treal, N, decltype(si), TypeSizeLog2, scale>(
             mask.data(), si, glob_offset, offsets.data(), Values.data());
       } else {
@@ -4327,7 +4907,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 /// 	         simd_mask<N / VS> mask,
 ///              PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to memory locations addressed
 /// by the accessor \p acc and byte offsets \p byte_offsets. Access to any
@@ -4368,7 +4948,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 /// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 /// 	         PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to memory locations addressed
 /// by the accessor \p acc and byte offsets \p byte_offsets.
@@ -4409,7 +4989,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 ///              ValuesSimdViewT vals, simd_mask<N / VS> mask,
 ///              PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to memory locations addressed
 /// by the accessor \p acc and byte offsets \p byte_offsets. Access to any
@@ -4457,7 +5037,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
 /// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets,
 ///              ValuesSimdViewT vals, PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to memory locations addressed
 /// by the accessor \p acc and byte offsets \p byte_offsets.
@@ -4502,7 +5082,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
 ///              ValuesSimdViewT vals, simd_mask<N / VS> mask,
 ///              PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to memory locations addressed
 /// by the accessor \p acc and byte offsets \p byte_offsets. Access to any
@@ -4545,7 +5125,7 @@ scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets,
 /// void scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets,
 ///              ValuesSimdViewT vals, PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to memory locations addressed
 /// by the accessor \p acc and byte offsets \p byte_offsets.
@@ -5469,7 +6049,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
 ///             OffsetSimdViewT byte_offsets,
 ///             simd_mask<N / VS> mask, simd<T, N> pass_thru,
 ///             PropertyListT props = {});
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters.
 /// Loads ("gathers") elements of the type 'T' from Shared Local Memory
 /// locations addressed by byte offsets \p byte_offsets, and returns the loaded
@@ -5514,7 +6094,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
 ///             OffsetSimdViewT byte_offsets,
 ///             simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
 ///             PropertyListT props = {});
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters.
 /// Loads ("gathers") elements of the type 'T' from Shared Local Memory
 /// locations addressed by byte offsets \p byte_offsets, and returns the loaded
@@ -5563,7 +6143,7 @@ slm_gather(OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
 ///             OffsetSimdViewT byte_offsets,
 ///             simd_mask<N / VS> mask, PassThruSimdViewT pass_thru,
 ///             PropertyListT props = {});
-/// Variation of the API that allows to use \c simd_view without specifying \c T
+/// Variation of the API that allows using \c simd_view without specifying \c T
 /// and \c N template parameters.
 /// Loads ("gathers") elements of the type 'T' from Shared Local Memory
 /// locations addressed by byte offsets \p byte_offsets, and returns the loaded
@@ -5844,7 +6424,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
 /// void slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
 ///	         simd_mask<N / VS> mask, PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to Shared Local Memory
 /// locations addressed by byte offsets \p byte_offsets.
@@ -5876,7 +6456,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
 /// void slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
 ///	         PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to Shared Local Memory
 /// locations addressed by byte offsets \p byte_offsets.
@@ -5911,7 +6491,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, simd<T, N> vals,
 ///              ValuesSimdViewT vals, simd_mask<N / VS> mask,
 ///              PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to Shared Local Memory
 /// locations addressed by byte offsets \p byte_offsets.
@@ -5948,7 +6528,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
 /// void slm_scatter(OffsetSimdViewT byte_offsets,
 ///              ValuesSimdViewT vals, PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to Shared Local Memory
 /// locations addressed by byte offsets \p byte_offsets.
@@ -5986,7 +6566,7 @@ slm_scatter(OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
 ///              ValuesSimdViewT vals, simd_mask<N / VS> mask,
 ///              PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to Shared Local Memory
 /// locations addressed by byte offsets \p byte_offsets.
@@ -6018,7 +6598,7 @@ slm_scatter(simd<OffsetT, N / VS> byte_offsets, ValuesSimdViewT vals,
 /// void slm_scatter(simd<OffsetT, N / VS> byte_offsets,
 ///              ValuesSimdViewT vals, PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to Shared Local Memory
 /// locations addressed by byte offsets \p byte_offsets.
@@ -6350,6 +6930,51 @@ slm_block_load(uint32_t offset, simd_mask<1> pred, simd<T, N> pass_thru,
   return Result.template bit_cast_view<T>();
 }
 
+/// simd<T, N> slm_block_load(uint32_t byte_offset,
+///                           simd_mask<1> pred,
+///                           PassThruSimdViewT pass_thru, props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Loads a contiguous memory block from SLM (Shared Local Memory) at the
+/// given \p byte_offset.
+/// The parameter \p pred is the one-element predicate. If it is set to 1,
+/// then all 'N' elements are loaded. Otherwise, the block load operation
+/// is a NO-OP.
+/// The parameter 'pass_thru' specifies the values being copied to the returned
+/// result if 'pred' is set to 0.
+///
+/// The parameter 'props' specifies the optional compile-time properties
+/// list. Only esimd::alignment property is used. Other properties are ignored.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default expected alignment is the minimally required (see (R1) below).
+///
+/// Restrictions - predicate imposed - temporary:
+/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller
+///     elements and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements must be:
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
+///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename PassThruSimdViewT,
+    typename T = PassThruSimdViewT::value_type::element_type,
+    int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<PassThruSimdViewT> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+slm_block_load(uint32_t offset, simd_mask<1> pred, PassThruSimdViewT pass_thru,
+               PropertyListT props = {}) {
+  return slm_block_load<T, N>(offset, pred, pass_thru.read(), props);
+}
+
 /// simd<T, N> block_load(local_accessor lacc, uint32_t byte_offset,
 ///                       props={});                              // (lacc-bl-1)
 /// Loads a contiguous memory block from SLM (Shared Local Memory) associated
@@ -6539,7 +7164,53 @@ block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred,
   return slm_block_load<T, N>(byte_offset, pred, pass_thru, props);
 }
 
-/// simd<T, N> block_load(local_accessor lacc,
+/// simd<T, N> block_load(local_accessor lacc, uint32_t byte_offset,
+///                       simd_mask<1> pred, PassThruSimdViewT pass_thru,
+///                       props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Loads a contiguous memory block from SLM (Shared Local Memory) associated
+/// with the local accessor \p lacc at the given \p byte_offset.
+/// The parameter \p pred is the one-element predicate. If it is set to 1,
+/// then all 'N' elements are loaded. Otherwise, the block load operation
+/// is a NO-OP, and \p pass_thru value is returned.
+///
+/// The parameter 'props' specifies the optional compile-time properties
+/// list. Only esimd::alignment property is used. Other properties are ignored.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default expected alignment is the minimally required (see (R1) below).
+///
+/// Restrictions - predicate imposed - temporary:
+/// R1: The \p lacc + \p byte_offset must be at least 4-byte aligned for 4-byte
+///     or smaller elements and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements must be:
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
+///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename PassThruSimdViewT,
+    typename T = PassThruSimdViewT::value_type::element_type,
+    int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
+    typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<PassThruSimdViewT> &&
+        detail::is_local_accessor_with_v<AccessorT,
+                                         detail::accessor_mode_cap::can_read> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+block_load(AccessorT lacc, uint32_t byte_offset, simd_mask<1> pred,
+           PassThruSimdViewT pass_thru, PropertyListT props = {}) {
+  return block_load<T, N>(lacc, byte_offset, pred, pass_thru.read(), props);
+}
+
+/// simd<T, N> block_load(local_accessor lacc,
 ///                       simd_mask<1> pred, simd<T, N> pass_thru,
 ///                       props={});                              // (lacc-bl-6)
 /// Loads a contiguous memory block from SLM (Shared Local Memory) associated
@@ -6579,6 +7250,51 @@ block_load(AccessorT lacc, simd_mask<1> pred, simd<T, N> pass_thru,
                               pass_thru, props);
 }
 
+/// simd<T, N> block_load(local_accessor lacc,
+///                       simd_mask<1> pred, PassThruSimdViewT pass_thru,
+///                       props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Loads a contiguous memory block from SLM (Shared Local Memory) associated
+/// with the local accessor \p lacc at zero offset.
+///
+/// The parameter \p pred is the one-element predicate. If it is set to 1,
+/// then all 'N' elements are loaded. Otherwise, the block load operation
+/// is a NO-OP, and \p pass_thru value is returned.
+///
+/// The parameter 'props' specifies the optional compile-time properties
+/// list. Only esimd::alignment property is used. Other properties are ignored.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default expected alignment is the minimally required (see (R1) below).
+///
+/// Restrictions - predicate imposed - temporary:
+/// R1: The local accessor \p lacc must point to memory at least 4-byte aligned
+///     for elements of 4-bytes or smaller and 8-byte aligned for 8-byte
+///     elements.
+/// R2: The number of elements must be:
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2), or 128;
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2), or 256;
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2), or 512.
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename PassThruSimdViewT,
+    typename T = PassThruSimdViewT::value_type::element_type,
+    int N = PassThruSimdViewT::getSizeX() * PassThruSimdViewT::getSizeY(),
+    typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<PassThruSimdViewT> &&
+        detail::is_local_accessor_with_v<AccessorT,
+                                         detail::accessor_mode_cap::can_read> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+block_load(AccessorT lacc, simd_mask<1> pred, PassThruSimdViewT pass_thru,
+           PropertyListT props = {}) {
+  return block_load<T, N>(lacc, pred, pass_thru.read(), props);
+}
+
 /// Stores elements of the vector \p vals to a contiguous block of SLM memory
 /// at the given byte-offset \p offset.
 /// The generated code depends on the combination {T, N, Flags}.
@@ -6745,6 +7461,76 @@ slm_block_store(uint32_t byte_offset, simd<T, N> vals,
       sycl::bit_cast<__ESIMD_DNS::vector_type_t<StoreElemT, N>>(vals.data()));
 }
 
+/// void slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals,
+///                      simd_mask<1> pred, props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
+/// Memory) at the given \p byte_offset. The parameter \p pred is the
+/// one-element predicate. If it is set to 1, then all 'N' elements are stored.
+/// Otherwise, the block store operation is a NO-OP.
+///
+/// The parameter 'props' specifies the optional compile-time properties
+/// list. Only esimd::alignment property is used. Other properties are ignored.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default expected alignment is the minimally required (see (R1) below).
+///
+/// Restrictions - predicate imposed - temporary:
+/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller
+///     elements and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements must be:
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
+///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename ValuesSimdViewT,
+    typename T = ValuesSimdViewT::value_type::element_type,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<ValuesSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals, simd_mask<1> pred,
+                PropertyListT props = {}) {
+  slm_block_store<T, N>(byte_offset, vals.read(), pred, props);
+}
+
+/// void slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals,
+///                      props = {});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Stores the vector \p vals to a contiguous memory block in SLM
+/// (Shared Local Memory) at the given \p byte_offset. The parameter 'props'
+/// specifies the optional compile-time properties list. Only esimd::alignment
+/// property is used. Other properties are ignored.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default expected alignment is 16-bytes to generate block_store
+/// instruction on all known target devices (Gen12, DG2, PVC, etc).
+/// On Gen12 (as opposed to DG2 and PVC) the alignment smaller than 8-bytes
+/// is valid, but requires JIT compiler generating a slower SCATTER instead
+/// of faster BLOCK_STORE.
+/// !!! Passing \p byte_offset not aligned by 16-bytes and not specifying
+/// the actual alignment in \p props produces incorrect store results on Gen12.
+template <
+    typename ValuesSimdViewT,
+    typename T = ValuesSimdViewT::value_type::element_type,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<ValuesSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+slm_block_store(uint32_t byte_offset, ValuesSimdViewT vals,
+                PropertyListT props = {}) {
+  slm_block_store<T, N>(byte_offset, vals.read(), props);
+}
+
 /// void block_store(local_accessor lacc, uint32_t byte_offset, // (lacc-bs-1)
 ///                  simd<T, N> vals, props={});
 /// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
@@ -6876,79 +7662,232 @@ block_store(AccessorT lacc, simd<T, N> vals, simd_mask<1> pred,
             PropertyListT props = {}) {
   slm_block_store<T, N>(detail::localAccessorToOffset(lacc), vals, pred, props);
 }
-namespace detail {
 
-// lsc_atomic_update() operations may share atomic_op values for data types
-// of the same (fp vs integral) class for convenience (e.g. re-use 'fmax' for
-// all FP types). In fact those data types may require using different internal
-// opcodes. This function returns the corresponding internal opcode for
-// the input type 'T' and operation 'Op'.
-template <typename T, __ESIMD_NS::atomic_op Op>
-constexpr int lsc_to_internal_atomic_op() {
-  constexpr __ESIMD_NS::native::lsc::atomic_op LSCOp =
-      __ESIMD_DNS::to_lsc_atomic_op<Op>();
-  return static_cast<int>(LSCOp);
+/// void block_store(local_accessor lacc, uint32_t byte_offset,
+///                  ValuesSimdViewT vals, props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
+/// Memory) associated with the local accessor \p lacc at the given \p
+/// byte_offset. The parameter 'props' specifies the optional compile-time
+/// properties list. Only esimd::alignment property is used. Other properties
+/// are ignored.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default expected alignment is 16-bytes to generate block_store
+/// instruction on all known target devices (Gen12, DG2, PVC, etc).
+/// On Gen12 (as opposed to DG2 and PVC) the alignment smaller than 8-bytes
+/// is valid, but requires JIT compiler generating a slower SCATTER instead
+/// of faster BLOCK_STORE.
+/// !!! Passing \p byte_offset not aligned by 16-bytes and not specifying
+/// the actual alignment in \p props produces incorrect store results on Gen12.
+template <
+    typename ValuesSimdViewT,
+    typename T = ValuesSimdViewT::value_type::element_type,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<ValuesSimdViewT> &&
+    detail::is_local_accessor_with_v<AccessorT,
+                                     detail::accessor_mode_cap::can_write> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+block_store(AccessorT lacc, uint32_t byte_offset, ValuesSimdViewT vals,
+            PropertyListT props = {}) {
+  block_store<T, N>(lacc, byte_offset, vals.read(), props);
 }
 
-/// SLM atomic.
-/// Supported platforms: DG2, PVC
-/// VISA instruction: lsc_atomic_<OP>.slm
-///
-/// @tparam Op is operation type.
-/// @tparam T is element type.
-/// @tparam N is the number of channels (platform dependent).
-/// @tparam DS is the data size.
-/// @param offsets is the zero-based offsets.
-/// @param pred is predicate.
+/// void block_store(local_accessor lacc, ValuesSimdViewT vals,
+///                  props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Stores the vector \p vals to a contiguous memory block in SLM
+/// (Shared Local Memory) associated with the local accessor \p lacc. The
+/// parameter 'props' specifies the optional compile-time properties list. Only
+/// esimd::alignment property is used. Other properties are ignored.
 ///
-/// @return A vector of the old values at the memory locations before the
-///   update.
-
-template <atomic_op Op, typename T, int N, lsc_data_size DS>
-__ESIMD_API std::enable_if_t<get_num_args<Op>() == 0, simd<T, N>>
-slm_atomic_update_impl(simd<uint32_t, N> offsets, simd_mask<N> pred) {
-  check_lsc_data_size<T, DS>();
-  check_atomic<Op, T, N, 0, /*IsLSC*/ true>();
-  constexpr uint16_t AddressScale = 1;
-  constexpr int ImmOffset = 0;
-  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
-  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
-  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
-  using MsgT = typename lsc_expand_type<T>::type;
-  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
-  simd<MsgT, N> Tmp =
-      __esimd_lsc_xatomic_slm_0<MsgT, IOp, cache_hint::none, cache_hint::none,
-                                AddressScale, ImmOffset, EDS, VS, Transposed,
-                                N>(pred.data(), offsets.data());
-  return lsc_format_ret<T>(Tmp);
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default expected alignment is 16-bytes to generate block_store
+/// instruction on all known target devices (Gen12, DG2, PVC, etc).
+/// On Gen12 (as opposed to DG2 and PVC) the alignment smaller than 8-bytes
+/// is valid, but requires JIT compiler generating a slower SCATTER instead
+/// of faster BLOCK_STORE.
+/// !!! Passing \p lacc not aligned by 16-bytes and not specifying
+/// the actual alignment in \p props produces incorrect store results on Gen12.
+template <
+    typename ValuesSimdViewT,
+    typename T = ValuesSimdViewT::value_type::element_type,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<ValuesSimdViewT> &&
+    detail::is_local_accessor_with_v<AccessorT,
+                                     detail::accessor_mode_cap::can_write> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+block_store(AccessorT lacc, ValuesSimdViewT vals, PropertyListT props = {}) {
+  block_store<T, N>(lacc, vals.read(), props);
 }
 
-/// SLM atomic.
-/// Supported platforms: DG2, PVC
-/// VISA instruction: lsc_atomic_<OP>.slm
+/// void block_store(local_accessor lacc, uint32_t byte_offset,
+///                  ValuesSimdViewT vals, simd_mask<1> pred, props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
+/// Memory) associated with the local accessor \p lacc at the given \p
+/// byte_offset. The parameter \p pred is the one-element predicate. If it is
+/// set to 1, then all 'N' elements are stored. Otherwise, the block store
+/// operation is a NO-OP.
 ///
-/// @tparam Op is operation type.
-/// @tparam T is element type.
-/// @tparam N is the number of channels (platform dependent).
-/// @tparam DS is the data size.
-/// @param offsets is the zero-based offsets.
-/// @param src0 is the first atomic operand.
-/// @param pred is predicate.
+/// The parameter 'props' specifies the optional compile-time properties
+/// list. Only esimd::alignment property is used. Other properties are ignored.
 ///
-/// @return A vector of the old values at the memory locations before the
-///   update.
-template <atomic_op Op, typename T, int N, lsc_data_size DS>
-__ESIMD_API std::enable_if_t<get_num_args<Op>() == 1, simd<T, N>>
-slm_atomic_update_impl(simd<uint32_t, N> offsets, simd<T, N> src0,
-                       simd_mask<N> pred) {
-  check_lsc_data_size<T, DS>();
-  check_atomic<Op, T, N, 1, /*IsLSC*/ true>();
-  constexpr uint16_t AddressScale = 1;
-  constexpr int ImmOffset = 0;
-  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
-  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
-  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
-  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default expected alignment is the minimally required (see (R1) below).
+///
+/// Restrictions - predicate imposed - temporary:
+/// R1: The \p byte_offset must be at least 4-byte aligned for 4-byte or smaller
+///     elements and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements must be:
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
+///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename ValuesSimdViewT,
+    typename T = ValuesSimdViewT::value_type::element_type,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<ValuesSimdViewT> &&
+    detail::is_local_accessor_with_v<AccessorT,
+                                     detail::accessor_mode_cap::can_write> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+block_store(AccessorT lacc, uint32_t byte_offset, ValuesSimdViewT vals,
+            simd_mask<1> pred, PropertyListT props = {}) {
+  block_store<T, N>(lacc, byte_offset, vals.read(), pred, props);
+}
+
+/// void block_store(local_accessor lacc, ValuesSimdViewT vals,
+///                  simd_mask<1> pred, props={});
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Stores the vector \p vals to a contiguous memory block in SLM (Shared Local
+/// Memory) associated with the local accessor \p lacc. The parameter \p pred is
+/// the one-element predicate. If it is set to 1, then all 'N' elements are
+/// stored. Otherwise, the block store operation is a NO-OP.
+///
+/// The parameter 'props' specifies the optional compile-time properties
+/// list. Only esimd::alignment property is used. Other properties are ignored.
+///
+/// Alignment: If \p props does not specify the 'alignment' property, then
+/// the default expected alignment is the minimally required (see (R1) below).
+///
+/// Restrictions - predicate imposed - temporary:
+/// R1: \p lacc must point to memory at least 4-byte aligned for 4-byte or
+///     smaller elements and 8-byte aligned for 8-byte elements.
+/// R2: The number of elements must be:
+///     for 8-byte data: 1, 2, 3, 4, 8, 16, 32(max for DG2), 64;
+///     for 4-byte data: 1, 2, 3, 4, 8, 16, 32, 64(max for DG2),
+///                      or 128(only if alignment is 8-bytes or more);
+///     for 2-byte data: 2, 4, 6, 8, 16, 32, 64, 128(max for DG2),
+///                      or 256(only if alignment is 8-bytes or more);
+///     for 1-byte data: 4, 8, 12, 16, 32, 64, 128, 256(max for DG2),
+///                      or 512(only if alignment is 8-bytes or more).
+/// R3: The target device must be DG2, PVC or newer GPU.
+template <
+    typename ValuesSimdViewT,
+    typename T = ValuesSimdViewT::value_type::element_type,
+    int N = ValuesSimdViewT::getSizeX() * ValuesSimdViewT::getSizeY(),
+    typename AccessorT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<ValuesSimdViewT> &&
+    detail::is_local_accessor_with_v<AccessorT,
+                                     detail::accessor_mode_cap::can_write> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+block_store(AccessorT lacc, ValuesSimdViewT vals, simd_mask<1> pred,
+            PropertyListT props = {}) {
+  block_store<T, N>(lacc, vals.read(), pred, props);
+}
+namespace detail {
+
+// lsc_atomic_update() operations may share atomic_op values for data types
+// of the same (fp vs integral) class for convenience (e.g. re-use 'fmax' for
+// all FP types). In fact those data types may require using different internal
+// opcodes. This function returns the corresponding internal opcode for
+// the input type 'T' and operation 'Op'.
+template <typename T, __ESIMD_NS::atomic_op Op>
+constexpr int lsc_to_internal_atomic_op() {
+  constexpr __ESIMD_NS::native::lsc::atomic_op LSCOp =
+      __ESIMD_DNS::to_lsc_atomic_op<Op>();
+  return static_cast<int>(LSCOp);
+}
+
+/// SLM atomic.
+/// Supported platforms: DG2, PVC
+/// VISA instruction: lsc_atomic_<OP>.slm
+///
+/// @tparam Op is operation type.
+/// @tparam T is element type.
+/// @tparam N is the number of channels (platform dependent).
+/// @tparam DS is the data size.
+/// @param offsets is the zero-based offsets.
+/// @param pred is predicate.
+///
+/// @return A vector of the old values at the memory locations before the
+///   update.
+
+template <atomic_op Op, typename T, int N, lsc_data_size DS>
+__ESIMD_API std::enable_if_t<get_num_args<Op>() == 0, simd<T, N>>
+slm_atomic_update_impl(simd<uint32_t, N> offsets, simd_mask<N> pred) {
+  check_lsc_data_size<T, DS>();
+  check_atomic<Op, T, N, 0, /*IsLSC*/ true>();
+  constexpr uint16_t AddressScale = 1;
+  constexpr int ImmOffset = 0;
+  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
+  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
+  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
+  using MsgT = typename lsc_expand_type<T>::type;
+  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
+  simd<MsgT, N> Tmp =
+      __esimd_lsc_xatomic_slm_0<MsgT, IOp, cache_hint::none, cache_hint::none,
+                                AddressScale, ImmOffset, EDS, VS, Transposed,
+                                N>(pred.data(), offsets.data());
+  return lsc_format_ret<T>(Tmp);
+}
+
+/// SLM atomic.
+/// Supported platforms: DG2, PVC
+/// VISA instruction: lsc_atomic_<OP>.slm
+///
+/// @tparam Op is operation type.
+/// @tparam T is element type.
+/// @tparam N is the number of channels (platform dependent).
+/// @tparam DS is the data size.
+/// @param offsets is the zero-based offsets.
+/// @param src0 is the first atomic operand.
+/// @param pred is predicate.
+///
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename T, int N, lsc_data_size DS>
+__ESIMD_API std::enable_if_t<get_num_args<Op>() == 1, simd<T, N>>
+slm_atomic_update_impl(simd<uint32_t, N> offsets, simd<T, N> src0,
+                       simd_mask<N> pred) {
+  check_lsc_data_size<T, DS>();
+  check_atomic<Op, T, N, 1, /*IsLSC*/ true>();
+  constexpr uint16_t AddressScale = 1;
+  constexpr int ImmOffset = 0;
+  constexpr lsc_data_size EDS = expand_data_size(finalize_data_size<T, DS>());
+  constexpr lsc_vector_size VS = to_lsc_vector_size<1>();
+  constexpr lsc_data_order Transposed = lsc_data_order::nontranspose;
+  constexpr int IOp = lsc_to_internal_atomic_op<T, Op>();
   if constexpr (std::is_same_v<T, double> || std::is_same_v<T, float>) {
     return __esimd_lsc_xatomic_slm_1<T, IOp, cache_hint::none, cache_hint::none,
                                      AddressScale, ImmOffset, EDS, VS,
@@ -7107,7 +8046,7 @@ atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset,
 /// atomic_update(local_accessor lacc,
 ///               simd<uint32_t, N> byte_offset,
 ///               simd<T, N> src0,
-///               simd_mask<1> pred = 1);                       // (lacc-au1-1)
+///               simd_mask<N> mask = 1);                       // (lacc-au1-1)
 ///
 
 /// Usage of cache hints or non-standard operation width N requires DG2 or PVC.
@@ -7158,11 +8097,102 @@ slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
   }
 }
 
+/// simd<T, N>
+/// slm_atomic_update(simd<uint32_t, N> byte_offset,
+///                   SrcSimdViewT src0,
+///                   simd_mask<N> mask = 1)
+///
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Atomically updates \c N memory locations in SLM indicated by
+/// a vector of offsets, and returns a vector of old
+/// values found at the memory locations before update.
+/// @tparam Op The atomic operation.
+/// @param byte_offset The vector of 32-bit offsets.
+/// @param src0 is the first atomic operand.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename SrcSimdViewT,
+          typename T = SrcSimdViewT::value_type::element_type, int N>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT>,
+                             simd<T, N>>
+slm_atomic_update(simd<uint32_t, N> byte_offset, SrcSimdViewT src0,
+                  simd_mask<N> mask = 1) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return slm_atomic_update<Op, T, N>(byte_offset, src0.read(), mask);
+}
+
+/// simd<T, N>
+/// slm_atomic_update(OffsetSimdViewT byte_offset,
+///                   simd<T, N> src0,
+///                   simd_mask<N> mask = 1)
+///
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Atomically updates \c N memory locations in SLM indicated by
+/// a vector of offsets, and returns a vector of old
+/// values found at the memory locations before update.
+/// @tparam Op The atomic operation.
+/// @param byte_offset The vector of 32-bit offsets.
+/// @param src0 is the first atomic operand.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename OffsetSimdViewT, typename T, int N>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
+                                 detail::is_simd_view_type_v<OffsetSimdViewT>,
+                             simd<T, N>>
+slm_atomic_update(OffsetSimdViewT byte_offset, simd<T, N> src0,
+                  simd_mask<N> mask = 1) {
+  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return slm_atomic_update<Op, T, N>(byte_offset.read(), src0, mask);
+}
+
+/// simd<T, N>
+/// slm_atomic_update(OffsetSimdViewT byte_offset,
+///                   SrcSimdViewT src0,
+///                   simd_mask<N> mask = 1)
+///
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Atomically updates \c N memory locations in SLM indicated by
+/// a vector of offsets, and returns a vector of old
+/// values found at the memory locations before update.
+/// @tparam Op The atomic operation.
+/// @param byte_offset The vector of 32-bit offsets.
+/// @param src0 is the first atomic operand.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
+          typename T = SrcSimdViewT::value_type::element_type,
+          int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY()>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
+                                 detail::is_simd_view_type_v<OffsetSimdViewT> &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT>,
+                             simd<T, N>>
+slm_atomic_update(OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+                  simd_mask<N> mask = 1) {
+  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return slm_atomic_update<Op, T, N>(byte_offset.read(), src0.read(), mask);
+}
+
 /// simd<T, N>
 /// atomic_update(local_accessor lacc,
 ///               simd<uint32_t, N> byte_offset,
 ///               simd<T, N> src0,
-///               simd_mask<1> pred = 1);                       // (lacc-au1-1)
+///               simd_mask<N> mask = 1);                       // (lacc-au1-1)
 ///
 /// Atomically updates \c N memory locations in SLM indicated by
 /// local accessor \p lacc and a vector of offsets, and returns a vector of old
@@ -7186,6 +8216,105 @@ atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, simd<T, N> src0,
   return slm_atomic_update<Op, T, N>(byte_offset, src0, mask);
 }
 
+/// simd<T, N>
+/// atomic_update(local_accessor lacc,
+///               OffsetSimdViewT byte_offset,
+///               simd<T, N> src0,
+///               simd_mask<N> mask = 1);
+///
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Atomically updates \c N memory locations in SLM indicated by local accessor
+/// \p lacc and a vector of offsets, and returns the old values found there.
+/// @tparam Op The atomic operation; it must take exactly one operand.
+/// @param lacc The local accessor; must satisfy \c is_rw_local_accessor_v.
+/// @param byte_offset The \c simd_view of 32-bit offsets.
+/// @param src0 The first atomic operand.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename OffsetSimdViewT, typename T, int N,
+          typename AccessorT>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
+                                 detail::is_simd_view_type_v<OffsetSimdViewT> &&
+                                 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
+                             simd<T, N>>
+atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, simd<T, N> src0,
+              simd_mask<N> mask = 1) {
+  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(lacc, byte_offset.read(), src0, mask);
+}
+
+/// simd<T, N>
+/// atomic_update(local_accessor lacc,
+///               simd<uint32_t, N> byte_offset,
+///               SrcSimdViewT src0,
+///               simd_mask<N> mask = 1);
+///
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Atomically updates \c N memory locations in SLM indicated by local accessor
+/// \p lacc and a vector of offsets, and returns the old values found there.
+/// @tparam Op The atomic operation; it must take exactly one operand.
+/// @param lacc The local accessor; must satisfy \c is_rw_local_accessor_v.
+/// @param byte_offset The vector of 32-bit offsets.
+/// @param src0 The \c simd_view holding the first atomic operand.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename SrcSimdViewT,
+          typename T = SrcSimdViewT::value_type::element_type, int N,
+          typename AccessorT>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT> &&
+                                 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
+                             simd<T, N>>
+atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, SrcSimdViewT src0,
+              simd_mask<N> mask = 1) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(lacc, byte_offset, src0.read(), mask);
+}
+
+/// simd<T, N>
+/// atomic_update(local_accessor lacc,
+///               OffsetSimdViewT byte_offset,
+///               SrcSimdViewT src0,
+///               simd_mask<N> mask = 1);
+///
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters, which default from \c SrcSimdViewT.
+/// Atomically updates \c N memory locations in SLM indicated by local accessor
+/// \p lacc and a vector of offsets, and returns the old values found there.
+/// @tparam Op The atomic operation; it must take exactly one operand.
+/// @param lacc The local accessor; must satisfy \c is_rw_local_accessor_v.
+/// @param byte_offset The \c simd_view of 32-bit offsets.
+/// @param src0 The \c simd_view holding the first atomic operand.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT,
+          typename T = SrcSimdViewT::value_type::element_type,
+          int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+          typename AccessorT>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 1 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT> &&
+                                 detail::is_simd_view_type_v<OffsetSimdViewT> &&
+                                 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
+                             simd<T, N>>
+atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+              simd_mask<N> mask = 1) {
+  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(lacc, byte_offset.read(), src0.read(), mask);
+}
 /// Two argument variant of the atomic update operation.
 
 /// simd<T, N>
@@ -7198,7 +8327,7 @@ atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, simd<T, N> src0,
 ///               simd<uint32_t, N> byte_offset,
 ///               simd<T, N> src0,
 ///               simd<T, N> src1,
-///               simd_mask<1> pred = 1);                      // (lacc-au2-1)
+///               simd_mask<1> mask = 1);                      // (lacc-au2-1)
 ///
 
 /// simd<T, N>
@@ -7241,11 +8370,232 @@ slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
 }
 
 /// simd<T, N>
-/// atomic_update(local_accessor lacc,
+/// slm_atomic_update(simd<uint32_t, N> byte_offset,
+///                   SrcSimdViewT src0, simd<T, N> src1,
+///                   simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Atomically updates \c N memory locations in SLM indicated by a vector of
+/// offsets, and returns the old values found there before the update.
+/// The size of \p src0 is checked at compile time against \c N.
+/// @tparam Op The atomic operation; it must take exactly two operands.
+/// @param byte_offset The vector of 32-bit offsets.
+/// @param src0 The first atomic operand (new value), as a \c simd_view.
+/// @param src1 The second atomic operand (expected value).
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename SrcSimdViewT, typename T, int N>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT>,
+                             simd<T, N>>
+slm_atomic_update(simd<uint32_t, N> byte_offset, SrcSimdViewT src0,
+                  simd<T, N> src1, simd_mask<N> mask = 1) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset and src1 parameters.");
+  return slm_atomic_update<Op, T, N>(byte_offset, src0.read(), src1, mask);
+}
+
+/// simd<T, N>
+/// slm_atomic_update(simd<uint32_t, N> byte_offset,
+///                   simd<T, N> src0, SrcSimdViewT src1,
+///                   simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Atomically updates \c N memory locations in SLM indicated by a vector of
+/// offsets, and returns the old values found there before the update.
+/// The size of \p src1 is checked at compile time against \c N.
+/// @tparam Op The atomic operation; it must take exactly two operands.
+/// @param byte_offset The vector of 32-bit offsets.
+/// @param src0 The first atomic operand (new value).
+/// @param src1 The second atomic operand (expected value), as a \c simd_view.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename SrcSimdViewT, typename T, int N>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT>,
+                             simd<T, N>>
+slm_atomic_update(simd<uint32_t, N> byte_offset, simd<T, N> src0,
+                  SrcSimdViewT src1, simd_mask<N> mask = 1) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src1 parameter must correspond to the size of "
+                "byte_offset and src0 parameters.");
+  return slm_atomic_update<Op, T, N>(byte_offset, src0, src1.read(), mask);
+}
+
+/// simd<T, N>
+/// slm_atomic_update(simd<uint32_t, N> byte_offset,
+///                   SrcSimdViewT src0, SrcSimdViewT src1,
+///                   simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Atomically updates \c N memory locations in SLM indicated by a vector of
+/// offsets, and returns the old values found there before the update.
+/// The size of \c SrcSimdViewT is checked at compile time against \c N.
+/// @tparam Op The atomic operation; it must take exactly two operands.
+/// @param byte_offset The vector of 32-bit offsets.
+/// @param src0 The first atomic operand (new value), as a \c simd_view.
+/// @param src1 The second atomic operand (expected value), as a \c simd_view.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename SrcSimdViewT,
+          typename T = SrcSimdViewT::value_type::element_type, int N>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT>,
+                             simd<T, N>>
+slm_atomic_update(simd<uint32_t, N> byte_offset, SrcSimdViewT src0,
+                  SrcSimdViewT src1, simd_mask<N> mask = 1) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+      "Size of src1 and src0 parameters must correspond to the size of "
+      "byte_offset parameter.");
+  return slm_atomic_update<Op, T, N>(byte_offset, src0.read(), src1.read(),
+                                     mask);
+}
+
+/// simd<T, N>
+/// slm_atomic_update(OffsetSimdViewT byte_offset,
+///                   simd<T, N> src0, simd<T, N> src1,
+///                   simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Atomically updates \c N memory locations in SLM indicated by a vector of
+/// offsets, and returns the old values found there before the update.
+/// The size of \p byte_offset is checked at compile time against \c N.
+/// @tparam Op The atomic operation; it must take exactly two operands.
+/// @param byte_offset The \c simd_view of 32-bit offsets.
+/// @param src0 The first atomic operand (new value).
+/// @param src1 The second atomic operand (expected value).
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename OffsetSimdViewT, typename T, int N>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<OffsetSimdViewT>,
+                             simd<T, N>>
+slm_atomic_update(OffsetSimdViewT byte_offset, simd<T, N> src0, simd<T, N> src1,
+                  simd_mask<N> mask = 1) {
+  static_assert(
+      N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src1 and src0 parameters must correspond to the size of "
+      "byte_offset parameter.");
+  return slm_atomic_update<Op, T, N>(byte_offset.read(), src0, src1, mask);
+}
+
+/// simd<T, N>
+/// slm_atomic_update(OffsetSimdViewT byte_offset,
+///                   SrcSimdViewT src0, simd<T, N> src1,
+///                   simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Atomically updates \c N memory locations in SLM indicated by a vector of
+/// offsets, and returns the old values found there before the update.
+/// Both \p byte_offset and \p src0 are statically checked to have size \c N.
+/// @tparam Op The atomic operation; it must take exactly two operands.
+/// @param byte_offset The \c simd_view of 32-bit offsets.
+/// @param src0 The first atomic operand (new value), as a \c simd_view.
+/// @param src1 The second atomic operand (expected value).
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
+          typename T, int N>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT> &&
+                                 detail::is_simd_view_type_v<OffsetSimdViewT>,
+                             simd<T, N>>
+slm_atomic_update(OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+                  simd<T, N> src1, simd_mask<N> mask = 1) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+                    N == OffsetSimdViewT::getSizeX() *
+                             OffsetSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset and src1 parameters.");
+  return slm_atomic_update<Op, T, N>(byte_offset.read(), src0.read(), src1,
+                                     mask);
+}
+
+/// simd<T, N>
+/// slm_atomic_update(OffsetSimdViewT byte_offset,
+///                   simd<T, N> src0, SrcSimdViewT src1,
+///                   simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Atomically updates \c N memory locations in SLM indicated by a vector of
+/// offsets, and returns the old values found there before the update.
+/// Both \p byte_offset and \p src1 are statically checked to have size \c N.
+/// @tparam Op The atomic operation; it must take exactly two operands.
+/// @param byte_offset The \c simd_view of 32-bit offsets.
+/// @param src0 The first atomic operand (new value).
+/// @param src1 The second atomic operand (expected value), as a \c simd_view.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
+          typename T, int N>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT> &&
+                                 detail::is_simd_view_type_v<OffsetSimdViewT>,
+                             simd<T, N>>
+slm_atomic_update(OffsetSimdViewT byte_offset, simd<T, N> src0,
+                  SrcSimdViewT src1, simd_mask<N> mask = 1) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+                    N == OffsetSimdViewT::getSizeX() *
+                             OffsetSimdViewT::getSizeY(),
+                "Size of src1 parameter must correspond to the size of "
+                "byte_offset and src0 parameters.");
+  return slm_atomic_update<Op, T, N>(byte_offset.read(), src0, src1.read(),
+                                     mask);
+}
+
+/// simd<T, N>
+/// slm_atomic_update(OffsetSimdViewT byte_offset,
+///                   SrcSimdViewT src0, SrcSimdViewT src1,
+///                   simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters, which default from \c SrcSimdViewT.
+/// Atomically updates \c N memory locations in SLM indicated by a vector of
+/// offsets, and returns the old values found there before the update.
+/// @tparam Op The atomic operation; it must take exactly two operands.
+/// @param byte_offset The \c simd_view of 32-bit offsets.
+/// @param src0 The first atomic operand (new value), as a \c simd_view.
+/// @param src1 The second atomic operand (expected value), as a \c simd_view.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
+          typename T = SrcSimdViewT::value_type::element_type,
+          int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY()>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT> &&
+                                 detail::is_simd_view_type_v<OffsetSimdViewT>,
+                             simd<T, N>>
+slm_atomic_update(OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+                  SrcSimdViewT src1, simd_mask<N> mask = 1) {
+  static_assert(
+      N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src1 and src0 parameters must correspond to the size of "
+      "byte_offset parameter.");
+  return slm_atomic_update<Op, T, N>(byte_offset.read(), src0.read(),
+                                     src1.read(), mask);
+}
+
+/// simd<T, N>
+/// atomic_update(local_accessor lacc,
 ///               simd<uint32_t, N> byte_offset,
 ///               simd<T, N> src0,
 ///               simd<T, N> src1,
-///               simd_mask<1> pred = 1);                      // (lacc-au2-1)
+///               simd_mask<N> mask = 1);                      // (lacc-au2-1)
 template <atomic_op Op, typename T, int N, typename AccessorT>
 __ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
                                  __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
@@ -7256,6 +8606,175 @@ atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, simd<T, N> src0,
   return slm_atomic_update<Op, T, N>(byte_offset, src0, src1, mask);
 }
 
+/// simd<T, N>
+/// atomic_update(local_accessor lacc, simd<uint32_t, N> byte_offset,
+///               SrcSimdViewT src0, simd<T, N> src1,
+///               simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters. \p src0 is statically checked to have
+/// size \c N, then read into a \c simd vector, and the call is forwarded to
+/// \c atomic_update<Op, T, N>.
+template <atomic_op Op, typename SrcSimdViewT, typename T, int N,
+          typename AccessorT>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT> &&
+                                 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
+                             simd<T, N>>
+atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, SrcSimdViewT src0,
+              simd<T, N> src1, simd_mask<N> mask = 1) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset and src1 parameters.");
+  return atomic_update<Op, T, N>(lacc, byte_offset, src0.read(), src1, mask);
+}
+
+/// simd<T, N>
+/// atomic_update(local_accessor lacc, simd<uint32_t, N> byte_offset,
+///               simd<T, N> src0, SrcSimdViewT src1,
+///               simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters. \p src1 is statically checked to have
+/// size \c N, then read into a \c simd vector, and the call is forwarded to
+/// \c atomic_update<Op, T, N>.
+template <atomic_op Op, typename SrcSimdViewT, typename T, int N,
+          typename AccessorT>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT> &&
+                                 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
+                             simd<T, N>>
+atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, simd<T, N> src0,
+              SrcSimdViewT src1, simd_mask<N> mask = 1) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src1 parameter must correspond to the size of "
+                "byte_offset and src0 parameters.");
+  return atomic_update<Op, T, N>(lacc, byte_offset, src0, src1.read(), mask);
+}
+
+/// simd<T, N>
+/// atomic_update(local_accessor lacc, simd<uint32_t, N> byte_offset,
+///               SrcSimdViewT src0, SrcSimdViewT src1,
+///               simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters. The size of \c SrcSimdViewT is
+/// statically checked against \c N, both views are read into \c simd
+/// vectors, and the call is forwarded to \c atomic_update<Op, T, N>.
+template <atomic_op Op, typename SrcSimdViewT,
+          typename T = SrcSimdViewT::value_type::element_type, int N,
+          typename AccessorT>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT> &&
+                                 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
+                             simd<T, N>>
+atomic_update(AccessorT lacc, simd<uint32_t, N> byte_offset, SrcSimdViewT src0,
+              SrcSimdViewT src1, simd_mask<N> mask = 1) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+      "Size of src1 and src0 parameters must correspond to the size of "
+      "byte_offset parameter.");
+  return atomic_update<Op, T, N>(lacc, byte_offset, src0.read(), src1.read(),
+                                 mask);
+}
+
+/// simd<T, N>
+/// atomic_update(local_accessor lacc, OffsetSimdViewT byte_offset,
+///               simd<T, N> src0, simd<T, N> src1,
+///               simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters. The size of \p byte_offset is
+/// statically checked against \c N, the view is read into a \c simd
+/// vector, and the call is forwarded to \c atomic_update<Op, T, N>.
+template <atomic_op Op, typename OffsetSimdViewT, typename T, int N,
+          typename AccessorT>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<OffsetSimdViewT> &&
+                                 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
+                             simd<T, N>>
+atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, simd<T, N> src0,
+              simd<T, N> src1, simd_mask<N> mask = 1) {
+  static_assert(
+      N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src1 and src0 parameters must correspond to the size of "
+      "byte_offset parameter.");
+  return atomic_update<Op, T, N>(lacc, byte_offset.read(), src0, src1, mask);
+}
+
+/// simd<T, N>
+/// atomic_update(local_accessor lacc, OffsetSimdViewT byte_offset,
+///               SrcSimdViewT src0, simd<T, N> src1, simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters. Both \p byte_offset and \p src0 are
+/// statically checked to have size \c N before being read and forwarded.
+template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
+          typename T, int N, typename AccessorT>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT> &&
+                                 detail::is_simd_view_type_v<OffsetSimdViewT> &&
+                                 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
+                             simd<T, N>>
+atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+              simd<T, N> src1, simd_mask<N> mask = 1) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+                    N == OffsetSimdViewT::getSizeX() *
+                             OffsetSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset and src1 parameters.");
+  return atomic_update<Op, T, N>(lacc, byte_offset.read(), src0.read(), src1,
+                                 mask);
+}
+
+/// simd<T, N>
+/// atomic_update(local_accessor lacc, OffsetSimdViewT byte_offset,
+///               simd<T, N> src0, SrcSimdViewT src1,
+///               simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters. The sizes of \p byte_offset and
+/// \p src1 are statically checked against \c N, both views are read into
+/// \c simd vectors, and the call is forwarded to \c atomic_update<Op, T, N>.
+template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
+          typename T, int N, typename AccessorT>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT> &&
+                                 detail::is_simd_view_type_v<OffsetSimdViewT> &&
+                                 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
+                             simd<T, N>>
+atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, simd<T, N> src0,
+              SrcSimdViewT src1, simd_mask<N> mask = 1) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+                    N == OffsetSimdViewT::getSizeX() *
+                             OffsetSimdViewT::getSizeY(),
+                "Size of src1 parameter must correspond to the size of "
+                "byte_offset and src0 parameters.");
+  return atomic_update<Op, T, N>(lacc, byte_offset.read(), src0, src1.read(),
+                                 mask);
+}
+
+/// simd<T, N>
+/// atomic_update(local_accessor lacc, OffsetSimdViewT byte_offset,
+///               SrcSimdViewT src0, SrcSimdViewT src1,
+///               simd_mask<N> mask = 1);
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters, which default from \c SrcSimdViewT.
+/// The size of \p byte_offset is statically checked against \c N; all views
+/// are read into \c simd vectors before forwarding to \c atomic_update.
+template <atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT,
+          typename T = SrcSimdViewT::value_type::element_type,
+          int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+          typename AccessorT>
+__ESIMD_API std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 2 &&
+                                 detail::is_simd_view_type_v<SrcSimdViewT> &&
+                                 detail::is_simd_view_type_v<OffsetSimdViewT> &&
+                                 __ESIMD_DNS::is_rw_local_accessor_v<AccessorT>,
+                             simd<T, N>>
+atomic_update(AccessorT lacc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+              SrcSimdViewT src1, simd_mask<N> mask = 1) {
+  static_assert(
+      N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src1 and src0 parameters must correspond to the size of "
+      "byte_offset parameter.");
+  return atomic_update<Op, T, N>(lacc, byte_offset.read(), src0.read(),
+                                 src1.read(), mask);
+}
+
 /// @} sycl_esimd_memory_slm
 
 namespace detail {
@@ -7744,8 +9263,37 @@ __ESIMD_API std::enable_if_t<
         detail::is_simd_view_type_v<OffsetSimdViewT>,
     simd<T, N>>
 atomic_update(T *p, OffsetSimdViewT byte_offset, PropertyListT props = {}) {
-  simd_mask<N> mask = 1;
-  return atomic_update<Op, T, N>(p, byte_offset.read(), mask, props);
+  return atomic_update<Op, T, N>(p, byte_offset.read(), props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *p, OffsetSimdViewT byte_offset,
+///               props = {});
+///
+/// A variation of \c atomic_update API with \p byte_offset represented as
+/// a \c simd_view object and without mask operand. \c T is deduced from \p p
+/// and \c N defaults to the size of \p byte_offset, so neither is specified.
+///
+/// @tparam Op The atomic operation - can be \c atomic_op::inc,
+///   \c atomic_op::dec, or \c atomic_op::load.
+/// @param p The USM pointer.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <
+    atomic_op Op, typename OffsetSimdViewT, typename T,
+    int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 0 &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT>,
+    simd<T, N>>
+atomic_update(T *p, OffsetSimdViewT byte_offset, PropertyListT props = {}) {
+  return atomic_update<Op, T, N>(p, byte_offset.read(), props);
 }
 
 /// A variation of \c atomic_update API with \c offset represented as
@@ -7881,21 +9429,27 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
 
 /// simd<T, N>
 /// atomic_update(T *ptr, simd<Toffset, N> byte_offset,
-///               simd<T, N> src0, props = {});                  // (usm-au1-2)
-
-/// A variation of \c atomic_update API without mask operand.
-
+///               SrcSimdViewT src0, simd_mask<N> mask, props = {});
+///
+/// Atomically updates \c N memory locations represented by a USM pointer and
+/// a vector of offsets relative to the pointer, and returns a vector of old
+/// values found at the memory locations before update. The update operation
+/// has 1 additional argument.
+/// A variation of \c atomic_update API with \c src0 represented as
+/// \c simd_view object and allows the use without specifying \c T and \c N
+/// template parameters.
+///
 /// @tparam Op The atomic operation - can be one of the following:
 /// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
 /// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
 /// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
 /// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c
 /// atomic_op::fsub, \c atomic_op::store.
-/// @tparam T The vector element type.
-/// @tparam N The number of memory locations to update.
 /// @param p The USM pointer.
 /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
 /// @param src0 The additional argument.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
 /// @param props The parameter 'props' specifies the optional compile-time
 ///   properties list. Only L1/L2 properties are used. Other properties are
 ///   ignored.
@@ -7903,20 +9457,100 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
 ///   update.
 ///
 template <
-    atomic_op Op, typename T, int N, typename Toffset,
+    atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
     __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
         ext::oneapi::experimental::is_property_list_v<PropertyListT>,
     simd<T, N>>
-atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
-              PropertyListT props = {}) {
-  simd_mask<N> mask = 1;
-  return atomic_update<Op, T, N>(p, byte_offset, src0, mask, props);
+atomic_update(T *p, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
+              simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset, src0.read(), mask, props);
 }
 
 /// simd<T, N>
-/// atomic_update(T *p, OffsetSimdViewT byte_offset,
+/// atomic_update(T *ptr, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, props = {});                  // (usm-au1-2)
+///
+/// A variation of \c atomic_update API without mask operand.
+/// Forwards to the masked overload with a mask initialized from 1.
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
+/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
+/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
+/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
+/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @param p The USM pointer.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The additional argument.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used. Other properties are
+///   ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename T, int N, typename Toffset,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
+              PropertyListT props = {}) {
+  simd_mask<N> mask = 1;
+  return atomic_update<Op, T, N>(p, byte_offset, src0, mask, props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *ptr, simd<Toffset, N> byte_offset,
+///               SrcSimdViewT src0, props = {});
+///
+/// A variation of \c atomic_update API with \c src0 represented as
+/// \c simd_view object and no mask operand and allows the use without
+/// specifying \c T and \c N template parameters.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
+/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
+/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
+/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
+/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @param p The USM pointer.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The additional argument (a \c simd_view of \c N elements).
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used. Other properties are
+///   ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(T *p, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
+              PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset, src0.read(), props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *p, OffsetSimdViewT byte_offset,
 ///               simd<T, N> src0,
 ///               simd_mask<N> mask, props = {});                // (usm-au1-3)
 ///
@@ -7955,6 +9589,54 @@ atomic_update(T *p, OffsetSimdViewT offsets, simd<T, N> src0, simd_mask<N> mask,
   return atomic_update<Op, T, N>(p, offsets.read(), src0, mask, props);
 }
 
+/// simd<T, N>
+/// atomic_update(T *p, OffsetSimdViewT offsets,
+///               SrcSimdViewT src0,
+///               simd_mask<N> mask, props = {});
+///
+/// A variation of \c atomic_update API with \c offsets and \c src0
+/// represented as \c simd_view objects and allows the use without specifying
+/// \c T and \c N template parameters.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
+/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
+/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
+/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
+/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @param p The USM pointer.
+/// @param offsets The simd_view of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The additional argument (a \c simd_view of \c N elements).
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used. Other properties are
+///   ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT, typename T,
+    int N,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
+    simd<T, N>>
+atomic_update(T *p, OffsetSimdViewT offsets, SrcSimdViewT src0,
+              simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(
+      N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() &&
+          N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+      "Size of src0 and offsets parameters must correspond to the size of "
+      "mask parameter.");
+  return atomic_update<Op, T, N>(p, offsets.read(), src0.read(), mask, props);
+}
+
 /// simd<T, N>
 /// atomic_update(T *p, OffsetSimdViewT byte_offset,
 ///               simd<T, N> src0,
@@ -7994,6 +9676,48 @@ atomic_update(T *p, OffsetSimdViewT offsets, simd<T, N> src0,
   return atomic_update<Op, T, N>(p, offsets.read(), src0, mask, props);
 }
 
+/// simd<T, N>
+/// atomic_update(T *p, OffsetSimdViewT offsets,
+///               SrcSimdViewT src0,
+///               props = {});
+///
+/// A variation of \c atomic_update API with \c offsets and \c src0
+/// represented as \c simd_view objects and no mask operand; allows the use
+/// without specifying \c T and \c N template parameters.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
+/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
+/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
+/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
+/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store.
+/// @param p The USM pointer.
+/// @param offsets The simd_view of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The additional argument (a \c simd_view; its size defines N).
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used. Other properties are
+///   ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename OffsetSimdViewT, typename SrcSimdViewT, typename T,
+    int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
+    simd<T, N>>
+atomic_update(T *p, OffsetSimdViewT offsets, SrcSimdViewT src0,
+              PropertyListT props = {}) {
+  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "offsets parameter.");
+  return atomic_update<Op, T, N>(p, offsets.read(), src0.read(), props);
+}
+
 /// A variation of \c atomic_update API with \c offset represented as
 /// scalar object.
 ///
@@ -8125,17 +9849,21 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
 
 /// simd<T, N>
 /// atomic_update(T *p, simd<Toffset, N> byte_offset,
-///               simd<T, N> src0, simd<T, N> src1,
-///               props = {});                                  // (usm-au2-2)
-//
+///               SrcSimdViewT src0, simd<T, N> src1,
+///               simd_mask<N> mask, props = {});
+///
+/// A variation of \c atomic_update API with \c src0 represented as
+/// \c simd_view object and allows the use without specifying \c T and \c N
+/// template parameters.
+///
 /// @tparam Op The atomic operation - can be one of the following:
 ///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
-/// @tparam T The vector element type.
-/// @tparam N The number of memory locations to update.
 /// @param p The USM pointer.
 /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
 /// @param src0 The first additional argument (new value).
 /// @param src1 The second additional argument (expected value).
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
 /// @param props The parameter 'props' specifies the optional compile-time
 ///   properties list. Only L1/L2 properties are used.
 //    Other properties are ignored.
@@ -8143,27 +9871,73 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
 ///   update.
 ///
 template <
-    atomic_op Op, typename T, int N, typename Toffset,
+    atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(T *p, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
+              simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset, src0.read(), src1, mask,
+                                 props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *p, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, SrcSimdViewT src1,
+///               simd_mask<N> mask, props = {});
+///
+/// A variation of \c atomic_update API with \c src1 represented as
+/// \c simd_view object and allows the use without specifying \c T and \c N
+/// template parameters.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @param p The USM pointer.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
     __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
         ext::oneapi::experimental::is_property_list_v<PropertyListT>,
     simd<T, N>>
 atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
-              simd<T, N> src1, PropertyListT props = {}) {
-  simd_mask<N> mask = 1;
-  return atomic_update<Op, T, N>(p, byte_offset, src0, src1, mask, props);
+              SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src1 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset, src0, src1.read(), mask,
+                                 props);
 }
 
 /// simd<T, N>
-/// atomic_update(T *p, OffsetSimdViewT byte_offset,
-///               simd<T, N> src0, simd<T, N> src1,
-///               simd_mask<N> mask, props = {})                // (usm-au2-3)
+/// atomic_update(T *p, simd<Toffset, N> byte_offset,
+///               SrcSimdViewT src0, SrcSimdViewT src1,
+///               simd_mask<N> mask, props = {});
 ///
+/// A variation of \c atomic_update API with \c src0 and \c src1 represented as
+/// \c simd_view object and allows the use without specifying \c T and \c N
+/// template parameters.
+///
 /// @tparam Op The atomic operation - can be one of the following:
 ///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
-/// @tparam T The vector element type.
-/// @tparam N The number of memory locations to update.
 /// @param p The USM pointer.
 /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
 /// @param src0 The first additional argument (new value).
@@ -8175,25 +9949,30 @@ atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
 //    Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
+///
 template <
-    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
+    atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
     __ESIMD_DNS::get_num_args<Op>() == 2 &&
-        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
-        detail::is_simd_view_type_v<OffsetSimdViewT>,
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
     simd<T, N>>
-atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
-              simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
-  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1, mask,
+atomic_update(T *p, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
+              SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+      "Size of src1 and src0 parameters must correspond to the size of "
+      "byte_offset parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset, src0.read(), src1.read(), mask,
                                  props);
 }
 
 /// simd<T, N>
-/// atomic_update(T *p, OffsetSimdViewT byte_offset,
+/// atomic_update(T *p, simd<Toffset, N> byte_offset,
 ///               simd<T, N> src0, simd<T, N> src1,
-///               props = {})                                   // (usm-au2-4)
-///
+///               props = {});                                  // (usm-au2-2)
+///
 /// @tparam Op The atomic operation - can be one of the following:
 ///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
 /// @tparam T The vector element type.
@@ -8207,82 +9986,1255 @@ atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
 //    Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
+///
 template <
-    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
+    atomic_op Op, typename T, int N, typename Toffset,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
     __ESIMD_DNS::get_num_args<Op>() == 2 &&
-        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
-        detail::is_simd_view_type_v<OffsetSimdViewT>,
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
     simd<T, N>>
-atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
+atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
               simd<T, N> src1, PropertyListT props = {}) {
   simd_mask<N> mask = 1;
-  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1, mask,
-                                 props);
+  return atomic_update<Op, T, N>(p, byte_offset, src0, src1, mask, props);
 }
 
-/// A variation of \c atomic_update API with \c byte_offset represented as
-/// scalar.
+/// simd<T, N>
+/// atomic_update(T *p, simd<Toffset, N> byte_offset,
+///               SrcSimdViewT src0, simd<T, N> src1,
+///               props = {});
 ///
+/// A variation of \c atomic_update API with \c src0 represented as
+/// \c simd_view object without \c mask operand and allows the use without
+/// specifying \c T and \c N template parameters.
+///
 /// @tparam Op The atomic operation - can be one of the following:
 ///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
-/// @tparam Tx The vector element type.
-/// @tparam N The number of memory locations to update.
 /// @param p The USM pointer.
-/// @param byte_offset The scalar 32-bit or 64-bit offset in bytes.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
 /// @param src0 The first additional argument (new value).
 /// @param src1 The second additional argument (expected value).
-/// @param mask Operation mask, only locations with non-zero in the
-///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
 ///
-template <atomic_op Op, typename Tx, int N, typename Toffset>
-__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
-atomic_update(Tx *p, Toffset byte_offset, simd<Tx, N> src0, simd<Tx, N> src1,
-              simd_mask<N> mask) {
-  return atomic_update<Op, Tx, N>(p, simd<Toffset, N>(byte_offset), src0, src1,
-                                  mask);
+template <
+    atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(T *p, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
+              simd<T, N> src1, PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset, src0.read(), src1, props);
 }
 
-/// @anchor accessor_atomic_update0
-/// @brief No-argument variant of the atomic update operation.
-///
-/// simd<T, N>
-/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
-///               simd_mask<N> mask, props = {});               /// (acc-au0-1)
-/// simd<T, N>
-/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
-///               props = {});                                  /// (acc-au0-2)
-/// simd<T, N>
-/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset,
-///               simd_mask<N> mask, props = {});               /// (acc-au0-3)
 /// simd<T, N>
-/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset,
-///               props = {});                                  /// (acc-au0-4)
+/// atomic_update(T *p, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, SrcSimdViewT src1,
+///               props = {});
 ///
+/// A variation of \c atomic_update API with \c src1 represented as
+/// \c simd_view object without \c mask operand and allows the use without
+/// specifying \c T and \c N template parameters.
 
-/// Usage of cache hints or non-standard operation width N requires DG2 or PVC.
-///
-/// simd<T, N>
-/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
-///               simd_mask<N> mask, props = {});               /// (acc-au0-1)
-///
-/// Atomically updates \c N memory locations represented by an accessor and
-/// a vector of offsets, and returns a vector of old values found at the
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @param p The USM pointer.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(T *p, simd<Toffset, N> byte_offset, simd<T, N> src0,
+              SrcSimdViewT src1, PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src1 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset, src0, src1.read(), props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *p, simd<Toffset, N> byte_offset,
+///               SrcSimdViewT src0, SrcSimdViewT src1,
+///               props = {});
+///
+/// A variation of \c atomic_update API with \c src0 and \c src1 represented as
+/// \c simd_view object without \c mask operand and allows the use without
+/// specifying \c T and \c N template parameters.
+/// \c src0 and \c src1 share one \c SrcSimdViewT, i.e. the same view type.
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @param p The USM pointer.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value), a \c simd_view.
+/// @param src1 The second additional argument (expected value), a \c simd_view.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(T *p, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
+              SrcSimdViewT src1, PropertyListT props = {}) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+      "Size of src1 and src0 parameters must correspond to the size of "
+      "byte_offset parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset, src0.read(), src1.read(),
+                                 props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *p, OffsetSimdViewT byte_offset,
+///               simd<T, N> src0, simd<T, N> src1,
+///               simd_mask<N> mask, props = {})                // (usm-au2-3)
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @param p The USM pointer.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <
+    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT>,
+    simd<T, N>>
+atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
+              simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
+  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1, mask,
+                                 props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *p, OffsetSimdViewT byte_offset,
+///               SrcSimdViewT src0, simd<T, N> src1,
+///               simd_mask<N> mask, props = {})
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @param p The USM pointer.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value), a \c simd_view.
+/// @param src1 The second additional argument (expected value).
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
+    int N,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
+    simd<T, N>>
+atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+              simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+          N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src0 and byte_offset parameters must correspond to the size of "
+      "mask parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset.read(), src0.read(), src1, mask,
+                                 props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *p, OffsetSimdViewT byte_offset,
+///               simd<T, N> src0, SrcSimdViewT src1,
+///               simd_mask<N> mask, props = {})
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @param p The USM pointer.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value), a \c simd_view.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
+    int N,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
+    simd<T, N>>
+atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
+              SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+          N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src1 and byte_offset parameters must correspond to the size of "
+      "mask parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1.read(), mask,
+                                 props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *p, OffsetSimdViewT byte_offset,
+///               SrcSimdViewT src0, SrcSimdViewT src1,
+///               simd_mask<N> mask, props = {})
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @param p The USM pointer.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value), a \c simd_view.
+/// @param src1 The second additional argument (expected value), a \c simd_view.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
+    int N,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
+    simd<T, N>>
+atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+              SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+                    N == OffsetSimdViewT::getSizeX() *
+                             OffsetSimdViewT::getSizeY(),
+                "Size of src0, src1 and byte_offset parameters must correspond "
+                "to the size of "
+                "mask parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset.read(), src0.read(),
+                                 src1.read(), mask, props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *p, OffsetSimdViewT byte_offset,
+///               simd<T, N> src0, simd<T, N> src1,
+///               props = {})                                   // (usm-au2-4)
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @param p The USM pointer.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <
+    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT>,
+    simd<T, N>>
+atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
+              simd<T, N> src1, PropertyListT props = {}) {
+  simd_mask<N> mask = 1;
+  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1, mask,
+                                 props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *p, OffsetSimdViewT byte_offset,
+///               SrcSimdViewT src0, simd<T, N> src1,
+///               props = {})
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @param p The USM pointer.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
+    int N,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
+    simd<T, N>>
+atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+              simd<T, N> src1, PropertyListT props = {}) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+          N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src0 and byte_offset parameters must correspond to the size of "
+      "src1 parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset.read(), src0.read(), src1,
+                                 props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *p, OffsetSimdViewT byte_offset,
+///               simd<T, N> src0, SrcSimdViewT src1,
+///               props = {})
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @param p The USM pointer.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
+    int N,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
+    simd<T, N>>
+atomic_update(T *p, OffsetSimdViewT byte_offset, simd<T, N> src0,
+              SrcSimdViewT src1, PropertyListT props = {}) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+          N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src1 and byte_offset parameters must correspond to the size of "
+      "src0 parameter.");
+  return atomic_update<Op, T, N>(p, byte_offset.read(), src0, src1.read(),
+                                 props);
+}
+
+/// simd<T, N>
+/// atomic_update(T *p, OffsetSimdViewT byte_offset,
+///               SrcSimdViewT src0, SrcSimdViewT src1,
+///               props = {})
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @param p The USM pointer.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
+    int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
+    simd<T, N>>
+atomic_update(T *p, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+              SrcSimdViewT src1, PropertyListT props = {}) {
+  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of src0, src1 and byte_offset parameters must be equal.");
+  return atomic_update<Op, T, N>(p, byte_offset.read(), src0.read(),
+                                 src1.read(), props);
+}
+
+/// A variation of \c atomic_update API with \c byte_offset represented as
+/// scalar.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @tparam Tx The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @param p The USM pointer.
+/// @param byte_offset The scalar 32-bit or 64-bit offset in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <atomic_op Op, typename Tx, int N, typename Toffset>
+__ESIMD_API std::enable_if_t<std::is_integral_v<Toffset>, simd<Tx, N>>
+atomic_update(Tx *p, Toffset byte_offset, simd<Tx, N> src0, simd<Tx, N> src1,
+              simd_mask<N> mask) {
+  const simd<Toffset, N> offsets(byte_offset);
+  return atomic_update<Op, Tx, N>(p, offsets, src0, src1, mask);
+}
+
+/// @anchor accessor_atomic_update0
+/// @brief No-argument variant of the atomic update operation.
+///
+/// simd<T, N>
+/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
+///               simd_mask<N> mask, props = {});               /// (acc-au0-1)
+/// simd<T, N>
+/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
+///               props = {});                                  /// (acc-au0-2)
+/// simd<T, N>
+/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset,
+///               simd_mask<N> mask, props = {});               /// (acc-au0-3)
+/// simd<T, N>
+/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset,
+///               props = {});                                  /// (acc-au0-4)
+///
+
+/// Usage of cache hints or non-standard operation width N requires DG2 or PVC.
+///
+/// simd<T, N>
+/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
+///               simd_mask<N> mask, props = {});               /// (acc-au0-1)
+///
+/// Atomically updates \c N memory locations represented by an accessor and
+/// a vector of offsets, and returns a vector of old values found at the
 /// memory locations before update. The update operation has no arguments
 /// in addition to the value at the memory location.
 ///
-/// @tparam Op The atomic operation - can be \c atomic_op::inc,
-/// \c atomic_op::dec, or \c atomic_op::load.
+/// @tparam Op The atomic operation - can be \c atomic_op::inc,
+/// \c atomic_op::dec, or \c atomic_op::load.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
+/// offsets are supported only when stateless memory accesses are enforced, i.e.
+/// accessor based accesses are automatically converted to stateless accesses.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 0 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd_mask<N> mask,
+              PropertyListT props = {}) {
+#ifdef __ESIMD_FORCE_STATELESS_MEM
+  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
+                                 byte_offset, mask, props);
+#else
+  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
+
+  if constexpr (detail::has_cache_hints<PropertyListT>() ||
+                !detail::isPowerOf2(N, 32) || sizeof(T) < 4) {
+    return detail::atomic_update_impl<
+        Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
+        acc, byte_offset, mask);
+  } else {
+    if constexpr (Op == atomic_op::load) {
+      if constexpr (std::is_integral_v<T>) {
+        return atomic_update<atomic_op::bit_or, T, N>(
+            acc, byte_offset, simd<T, N>(0), mask, props);
+      } else {
+        using Tint = detail::uint_type_t<sizeof(T)>;
+        simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
+            acc, byte_offset, simd<Tint, N>(0), mask, props);
+        return Res.template bit_cast_view<T>();
+      }
+    } else {
+      detail::check_atomic<Op, T, N, 0>();
+      static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
+
+      static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
+      const auto si = get_surface_index(acc);
+      using Tx = typename detail::__raw_t<T>;
+      return __esimd_dword_atomic0<Op, Tx, N>(mask.data(), si,
+                                              byte_offset.data());
+    }
+  }
+#endif
+}
+
+/// simd<T, N>
+/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
+///               props = {});                                  /// (acc-au0-2)
+/// A variation of \c atomic_update API without mask operand.
+///
+/// @tparam Op The atomic operation - can be \c atomic_op::inc,
+/// \c atomic_op::dec, or \c atomic_op::load.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
+/// offsets are supported only when stateless memory accesses are enforced, i.e.
+/// accessor based accesses are automatically converted to stateless accesses.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 0 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
+              PropertyListT props = {}) {
+  simd_mask<N> mask = 1;
+  return atomic_update<Op, T, N>(acc, byte_offset, mask, props);
+}
+
+/// simd<T, N>
+/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset,
+///               simd_mask<N> mask, props = {});               /// (acc-au0-3)
+/// A variation of \c atomic_update API with \c byte_offset represented as
+/// \c simd_view object.
+///
+/// @tparam Op The atomic operation - can be \c atomic_op::inc,
+/// \c atomic_op::dec, or \c atomic_op::load.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// 64-bit offsets are supported only when stateless memory accesses are
+/// enforced, i.e. accessor based accesses are automatically converted to
+/// stateless accesses.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
+    typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 0 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd_mask<N> mask,
+              PropertyListT props = {}) {
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), mask, props);
+}
+
+/// simd<T, N>
+/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset,
+///               props = {});                                  /// (acc-au0-4)
+/// A variation of \c atomic_update API with \c byte_offset represented as
+/// \c simd_view object and no mask operand.
+///
+/// @tparam Op The atomic operation - can be \c atomic_op::inc,
+/// \c atomic_op::dec, or \c atomic_op::load.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// 64-bit offsets are supported only when stateless memory accesses are
+/// enforced, i.e. accessor based accesses are automatically converted to
+/// stateless accesses.
+/// @param props The optional compile-time properties list (L1/L2 only).
+/// @return A vector of the old values at the memory locations before the
+///   update.
+template <
+    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
+    typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 0 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset,
+              PropertyListT props = {}) {
+  simd_mask<N> mask = 1;
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), mask, props);
+}
+
+/// A variation of \c atomic_update API with \c byte_offset represented as
+/// scalar.
+///
+/// @tparam Op The atomic operation - can be \c atomic_op::inc,
+/// \c atomic_op::dec, or \c atomic_op::load.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The scalar 32-bit or 64-bit offset in bytes. 64-bit
+/// offsets are supported only when stateless memory accesses are enforced,
+/// i.e. accessor based accesses are automatically converted to stateless
+/// accesses.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+/// Participates in overload resolution only for integral \c Toffset.
+template <atomic_op Op, typename T, int N, typename Toffset,
+          typename AccessorTy>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 0 && std::is_integral_v<Toffset> &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, Toffset byte_offset, simd_mask<N> mask) {
+  return atomic_update<Op, T, N>(acc, simd<Toffset, N>(byte_offset), mask);
+}
+
+/// A variation of \c atomic_update API with \p byte_offset represented as
+/// scalar using \c local_accessor.
+///
+/// @tparam Op The atomic operation - can be \c atomic_op::inc,
+/// \c atomic_op::dec, or \c atomic_op::load.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @tparam AccessorTy type of the SYCL local accessor.
+/// @param acc The SYCL local accessor.
+/// @param byte_offset The scalar offset in bytes. This overload takes it as
+/// \c uint32_t, so only 32-bit offsets can be passed. The value is used to
+/// construct the \c simd<uint32_t, N> offsets vector that is forwarded to
+/// the vector-offset overload.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <atomic_op Op, typename T, int N, typename AccessorTy>
+__ESIMD_API
+    std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
+                         __ESIMD_DNS::is_rw_local_accessor_v<AccessorTy>,
+                     simd<T, N>>
+    atomic_update(AccessorTy acc, uint32_t byte_offset, simd_mask<N> mask) {
+  return atomic_update<Op, T, N>(acc, simd<uint32_t, N>(byte_offset), mask);
+}
+
+/// @anchor accessor_atomic_update1
+/// @brief Single-argument variant of the atomic update operation.
+///
+/// simd<T, N>
+/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, simd_mask<N> mask, props = {});//(acc-au1-1)
+/// simd<T, N>
+/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, props = {});                  // (acc-au1-2)
+///
+/// simd<T, N>
+/// atomic_update(AccessorT acc,
+///               OffsetSimdViewT byte_offset,
+///               simd<T, N> src0,
+///               simd_mask<N> mask, props = {});                // (acc-au1-3)
+/// simd<T, N>
+/// atomic_update(AccessorT acc,
+///               OffsetSimdViewT byte_offset,
+///               simd<T, N> src0,
+///               props = {});                                   // (acc-au1-4)
+///
+
+/// simd<T, N>
+/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, simd_mask<N> mask, props = {});//(acc-au1-1)
+///
+/// Atomically updates \c N memory locations represented by an accessor and
+/// a vector of offsets, and returns a vector of old values found at the
+/// memory locations before update. The update operation has 1 additional
+/// argument.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
+/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
+/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
+/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c
+/// atomic_op::fsub, \c atomic_op::store.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
+/// offsets are supported only when stateless memory accesses are enforced, i.e.
+/// accessor based accesses are automatically converted to stateless accesses.
+/// @param src0 The additional argument.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used. Other properties are
+///   ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+
+template <
+    atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
+              simd_mask<N> mask, PropertyListT props = {}) {
+#ifdef __ESIMD_FORCE_STATELESS_MEM
+  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
+                                 byte_offset, src0, mask, props);
+#else
+  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
+  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
+  // LSC path: cache hints, FP ops, !isPowerOf2(N, 32), or sizeof(T) < 4.
+  if constexpr (detail::has_cache_hints<PropertyListT>() ||
+                Op == atomic_op::fmin || Op == atomic_op::fmax ||
+                Op == atomic_op::fadd || Op == atomic_op::fsub ||
+                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
+    return detail::atomic_update_impl<
+        Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
+        acc, byte_offset, src0, mask);
+  } else if constexpr (Op == atomic_op::store) {
+    if constexpr (std::is_integral_v<T>) {
+      return atomic_update<atomic_op::xchg, T, N>(acc, byte_offset, src0, mask,
+                                                  props);
+    } else {
+      using Tint = detail::uint_type_t<sizeof(T)>;
+      simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
+          acc, byte_offset, src0.template bit_cast_view<Tint>(), mask, props);
+      return Res.template bit_cast_view<T>();
+    }
+  } else {
+    detail::check_atomic<Op, T, N, 1>();
+    static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
+    const auto si = __ESIMD_NS::get_surface_index(acc);
+    using Tx = typename detail::__raw_t<T>;
+    return __esimd_dword_atomic1<Op, Tx, N>(
+        mask.data(), si, byte_offset.data(),
+        sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src0.data()));
+  }
+#endif
+}
+
+/// simd<T, N>
+/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
+///               SrcSimdViewT src0, simd_mask<N> mask, props = {});
+///
+/// Atomically updates \c N memory locations represented by an accessor and
+/// a vector of offsets, and returns a vector of old values found at the
+/// memory locations before update. The update operation has 1 additional
+/// argument.
+/// A variation of \c atomic_update API with \c src0 represented as
+/// \c simd_view object and allows the use without
+/// specifying \c T and \c N template parameters.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
+/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
+/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
+/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c
+/// atomic_op::fsub, \c atomic_op::store.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
+/// offsets are supported only when stateless memory accesses are enforced, i.e.
+/// accessor based accesses are automatically converted to stateless accesses.
+/// @param src0 The additional argument.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used. Other properties are
+///   ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+
+template <
+    atomic_op Op, typename SrcSimdViewT, typename Toffset,
+    typename T = typename SrcSimdViewT::value_type::element_type, int N,
+    typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
+              simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset, src0.read(), mask, props);
+}
+
+/// simd<T, N>
+/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, props = {});                  // (acc-au1-2)
+///
+/// A variation of \c atomic_update API with no mask operand.
+///
+/// Atomically updates \c N memory locations represented by an accessor and
+/// a vector of offsets, and returns a vector of old values found at the
+/// memory locations before update. The update operation has 1 additional
+/// argument.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
+/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
+/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
+/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
+/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// 64-bit offsets are supported only when stateless memory accesses are
+/// enforced, i.e. accessor based accesses are automatically converted to
+/// stateless accesses.
+/// @param src0 The additional argument.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used. Other properties are
+///   ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
+              PropertyListT props = {}) {
+  const simd_mask<N> default_mask = 1;
+  return atomic_update<Op, T, N>(acc, byte_offset, src0, default_mask, props);
+}
+
+/// simd<T, N>
+/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
+///               SrcSimdViewT src0, props = {});
+///
+/// A variation of \c atomic_update API with no mask operand and \c src0
+/// represented as \c simd_view object that allows the use without specifying
+/// \c T and \c N template parameters.
+///
+/// Atomically updates \c N memory locations represented by an accessor and
+/// a vector of offsets, and returns a vector of old values found at the
+/// memory locations before update. The update operation has 1 additional
+/// argument.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
+/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
+/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
+/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
+/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// 64-bit offsets are supported only when stateless memory accesses are
+/// enforced, i.e. accessor based accesses are automatically converted to
+/// stateless accesses.
+/// @param src0 The additional argument.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used. Other properties are
+///   ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename SrcSimdViewT, typename Toffset,
+    typename T = SrcSimdViewT::value_type::element_type, int N,
+    typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
+              PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset, src0.read(), props);
+}
+
+/// simd<T, N>
+/// atomic_update(AccessorT acc,
+///               OffsetSimdViewT byte_offset,
+///               simd<T, N> src0,
+///               simd_mask<N> mask, props = {});                // (acc-au1-3)
+///
+/// A variation of \c atomic_update API with \c byte_offset represented as
+/// \c simd_view object.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
+/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
+/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
+/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
+/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @tparam OffsetSimdViewT type of the \c simd_view passed as \c byte_offset.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// 64-bit offsets are supported only when stateless memory accesses are
+/// enforced (accessor accesses are automatically converted to stateless).
+/// @param src0 The additional argument.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used. Other properties are
+///   ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
+    typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
+              simd_mask<N> mask, PropertyListT props = {}) {
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, mask, props);
+}
+
+/// simd<T, N>
+/// atomic_update(AccessorT acc,
+///               OffsetSimdViewT byte_offset,
+///               SrcSimdViewT src0,
+///               simd_mask<N> mask, props = {});
+///
+/// A variation of \c atomic_update API with \c byte_offset and \c src0
+/// represented as \c simd_view object that allows the use without specifying
+/// \c T and \c N template parameters.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
+/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
+/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
+/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
+/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store.
+/// @tparam SrcSimdViewT type of the \c simd_view passed as \c src0.
+/// @tparam OffsetSimdViewT type of the \c simd_view passed as \c byte_offset.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// 64-bit offsets are supported only when stateless memory accesses are
+/// enforced (accessor accesses are automatically converted to stateless).
+/// @param src0 The additional argument.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The optional compile-time properties list. Only L1/L2
+///   properties are used. Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT,
+    typename T = typename SrcSimdViewT::value_type::element_type,
+    int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+    typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+              simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(), mask,
+                                 props);
+}
+
+/// simd<T, N>
+/// atomic_update(AccessorT acc,
+///               OffsetSimdViewT byte_offset,
+///               simd<T, N> src0,
+///               props = {});                                   // (acc-au1-4)
+///
+/// A variation of \c atomic_update API with \c byte_offset represented as
+/// \c simd_view object and no mask operand.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
+/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
+/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
+/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
+/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store.
+/// @tparam T The vector element type.
+/// @tparam N The number of memory locations to update.
+/// @tparam OffsetSimdViewT type of the \c simd_view passed as \c byte_offset.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// 64-bit offsets are supported only when stateless memory accesses are
+/// enforced (accessor accesses are automatically converted to stateless).
+/// @param src0 The additional argument.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used. Other properties are
+///   ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
+    typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
+              PropertyListT props = {}) {
+  simd_mask<N> mask = 1; // No mask operand; 1 presumably enables all lanes.
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, mask, props);
+}
+
+/// simd<T, N>
+/// atomic_update(AccessorT acc,
+///               OffsetSimdViewT byte_offset,
+///               SrcSimdViewT src0,
+///               props = {});
+///
+/// A variation of \c atomic_update API with \c byte_offset and \c src0
+/// represented as \c simd_view object and no \c mask operand that allows the
+/// use without specifying \c T and \c N template parameters.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
+/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
+/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
+/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
+/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store.
+/// @tparam SrcSimdViewT type of the \c simd_view passed as \c src0.
+/// @tparam OffsetSimdViewT type of the \c simd_view passed as \c byte_offset.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// 64-bit offsets are supported only when stateless memory accesses are
+/// enforced (accessor accesses are automatically converted to stateless).
+/// @param src0 The additional argument.
+/// @param props The optional compile-time properties list. Only L1/L2
+///   properties are used. Other properties are ignored.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT,
+    typename T = typename SrcSimdViewT::value_type::element_type,
+    int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+    typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+              PropertyListT props = {}) {
+  static_assert(N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(), props);
+}
+
+/// A variation of \c atomic_update API with \c offset represented as
+/// scalar object.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
+/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
+/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
+/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::store.
+/// @tparam T The vector element type.
+/// @tparam N Number of memory locations to update; must be 1 for xchg/store.
+/// @tparam AccessorTy type of the SYCL accessor.
+/// @param acc The SYCL accessor.
+/// @param offset The scalar 32-bit or 64-bit offset in bytes. 64-bit
+/// offsets are supported only when stateless memory accesses are enforced, i.e.
+/// accessor based accesses are automatically converted to stateless accesses.
+/// @param src0 The additional argument.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <atomic_op Op, typename T, int N, typename Toffset,
+          typename AccessorTy>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
+    simd<T, N>>
+atomic_update(AccessorTy acc, Toffset offset, simd<T, N> src0,
+              simd_mask<N> mask) {
+  return atomic_update<Op, T, N>(acc, simd<Toffset, N>(offset), src0, mask);
+}
+
+/// A variation of \c atomic_update API with \c offset represented as
+/// scalar object that uses \c local_accessor.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
+/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
+/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
+/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::store.
+/// @tparam Tx The vector element type.
+/// @tparam N Number of memory locations to update; must be 1 for xchg/store.
+/// @tparam AccessorTy type of the SYCL local accessor.
+/// @param acc The SYCL local accessor.
+/// @param offset The scalar 32-bit offset in bytes.
+/// @param src0 The additional argument.
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @return A vector of the old values at the memory locations before the
+///   update.
+///
+template <atomic_op Op, typename Tx, int N, typename AccessorTy>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::is_rw_local_accessor_v<AccessorTy> &&
+        ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
+    simd<Tx, N>>
+atomic_update(AccessorTy acc, uint32_t offset, simd<Tx, N> src0,
+              simd_mask<N> mask) {
+  return atomic_update<Op, Tx, N>(acc, simd<uint32_t, N>(offset), src0, mask);
+}
+
+/// @anchor accessor_atomic_update2
+/// @brief Two-argument variant of the atomic update operation.
+///
+/// simd<T, N>
+/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, simd<T, N> src1,
+///               simd_mask<N> mask, props = {});                // (acc-au2-1)
+///
+/// simd<T, N>
+/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, simd<T, N> src1,
+///               props = {});                                   // (acc-au2-2)
+/// simd<T, N>
+/// atomic_update(AccessorTy acc, OffsetSimdViewT
+///               byte_offset, simd<T, N> src0, simd<T, N> src1,
+///               simd_mask<N> mask, props = {});                // (acc-au2-3)
+///
+/// simd<T, N>
+/// atomic_update(AccessorTy acc,
+///               OffsetSimdViewT byte_offset,
+///               simd<T, N> src0, simd<T, N> src1, props = {}); // (acc-au2-4)
+///
+
+/// simd<T, N>
+/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, simd<T, N> src1,
+///               simd_mask<N> mask, props = {});                // (acc-au2-1)
+///
+/// Atomically updates \c N memory locations represented by an accessor and
+/// a vector of offsets, and returns a vector of old values found at the
+/// memory locations before update. The update operation has 2 additional
+/// arguments.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
 /// @tparam T The vector element type.
 /// @tparam N The number of memory locations to update.
 /// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
 /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
-/// offsets are supported only when stateless memory accesses are enforced, i.e.
-/// accessor based accesses are automatically converted to stateless accesses.
+/// offsets are supported only when stateless memory accesses are enforced,
+/// i.e. accessor based accesses are automatically converted to stateless
+/// accesses.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
 /// @param mask Operation mask, only locations with non-zero in the
 ///   corresponding mask element are updated.
 /// @param props The parameter 'props' specifies the optional compile-time
@@ -8295,62 +11247,69 @@ template <
     atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
-    __ESIMD_DNS::get_num_args<Op>() == 0 &&
+    __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
         ext::oneapi::experimental::is_property_list_v<PropertyListT>,
     simd<T, N>>
-atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd_mask<N> mask,
-              PropertyListT props = {}) {
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
+              simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
 #ifdef __ESIMD_FORCE_STATELESS_MEM
   return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
-                                 byte_offset, mask, props);
+                                 byte_offset, src0, src1, mask, props);
 #else
   static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
-
+  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
+  // Use LSC atomics when cache hints are present, an FP atomic is used, the
+  // length is not a power of two, the operation width is greater than 32, or
+  // the data size is less than 4 bytes.
   if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                !detail::isPowerOf2(N, 32) || sizeof(T) < 4) {
+                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
+                sizeof(T) < 4) {
+    // 2-argument lsc_atomic_update arguments order matches the standard one -
+    // expected value first, then new value. But atomic_update uses reverse
+    // order, hence the src1/src0 swap.
     return detail::atomic_update_impl<
         Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
-        acc, byte_offset, mask);
+        acc, byte_offset, src1, src0, mask);
   } else {
-    if constexpr (Op == atomic_op::load) {
-      if constexpr (std::is_integral_v<T>) {
-        return atomic_update<atomic_op::bit_or, T, N>(
-            acc, byte_offset, simd<T, N>(0), mask, props);
-      } else {
-        using Tint = detail::uint_type_t<sizeof(T)>;
-        simd<Tint, N> Res = atomic_update<atomic_op::bit_or, Tint, N>(
-            acc, byte_offset, simd<Tint, N>(0), mask, props);
-        return Res.template bit_cast_view<T>();
-      }
-    } else {
-      detail::check_atomic<Op, T, N, 0>();
-      static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
-
-      static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
-      const auto si = get_surface_index(acc);
-      using Tx = typename detail::__raw_t<T>;
-      return __esimd_dword_atomic0<Op, Tx, N>(mask.data(), si,
-                                              byte_offset.data());
-    }
+    detail::check_atomic<Op, T, N, 2>();
+    static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
+    const auto si = __ESIMD_NS::get_surface_index(acc);
+    using Tx = typename detail::__raw_t<T>;
+    return __esimd_dword_atomic2<Op, Tx, N>(
+        mask.data(), si, byte_offset.data(),
+        sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src0.data()),
+        sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src1.data()));
   }
 #endif
 }
 
 /// simd<T, N>
-/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
-///               props = {});                                  /// (acc-au0-2)
-/// A variation of \c atomic_update API without mask operand
+/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
+///               SrcSimdViewT src0, simd<T, N> src1,
+///               simd_mask<N> mask, props = {});
 ///
-/// @tparam Op The atomic operation - can be \c atomic_op::inc,
-/// \c atomic_op::dec, or \c atomic_op::load.
-/// @tparam T The vector element type.
-/// @tparam N The number of memory locations to update.
+/// Atomically updates \c N memory locations represented by an accessor and
+/// a vector of offsets, and returns a vector of old values found at the
+/// memory locations before update. The update operation has 2 additional
+/// arguments.
+///
+/// A variation of \c atomic_update API with \c src0 represented as
+/// \c simd_view object and allows the use without specifying \c T and \c N
+/// template parameters.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @tparam SrcSimdViewT type of the \c simd_view passed as \c src0.
 /// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
 /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
-/// offsets are supported only when stateless memory accesses are enforced, i.e.
-/// accessor based accesses are automatically converted to stateless accesses.
+/// offsets are supported only when stateless memory accesses are enforced
+/// (accessor accesses are automatically converted to stateless).
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
 /// @param props The parameter 'props' specifies the optional compile-time
 ///   properties list. Only L1/L2 properties are used.
 //    Other properties are ignored.
@@ -8358,35 +11317,48 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd_mask<N> mask,
 ///   update.
 ///
 template <
-    atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
+    atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
+    typename AccessorTy,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
-    __ESIMD_DNS::get_num_args<Op>() == 0 &&
+    __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
         ext::oneapi::experimental::is_property_list_v<PropertyListT>,
     simd<T, N>>
-atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
-              PropertyListT props = {}) {
-  simd_mask<N> mask = 1;
-  return atomic_update<Op, T, N>(acc, byte_offset, mask, props);
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
+              simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset, src0.read(), src1, mask,
+                                 props);
 }
 
 /// simd<T, N>
-/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset,
-///               simd_mask<N> mask, props = {});               /// (acc-au0-3)
-/// A variation of \c atomic_update API with \c offsets represented as
-/// \c simd_view object.
+/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, SrcSimdViewT src1,
+///               simd_mask<N> mask, props = {});
 ///
-/// @tparam Op The atomic operation - can be \c atomic_op::inc,
-/// \c atomic_op::dec, or \c atomic_op::load.
-/// @tparam T The vector element type.
-/// @tparam N The number of memory locations to update.
+/// Atomically updates \c N memory locations represented by an accessor and
+/// a vector of offsets, and returns a vector of old values found at the
+/// memory locations before update. The update operation has 2 additional
+/// arguments.
+///
+/// A variation of \c atomic_update API with \c src1 represented as
+/// \c simd_view object and allows the use without specifying \c T and \c N
+/// template parameters.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @tparam SrcSimdViewT type of the \c simd_view passed as \c src1.
 /// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
-/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
-/// 64-bit offsets are supported only when stateless memory accesses are
-/// enforced, i.e. accessor based accesses are automatically converted to
-/// stateless accesses.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
+/// offsets are supported only when stateless memory accesses are enforced
+/// (accessor accesses are automatically converted to stateless).
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
 /// @param mask Operation mask, only locations with non-zero in the
 ///   corresponding mask element are updated.
 /// @param props The parameter 'props' specifies the optional compile-time
@@ -8396,509 +11368,490 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
 ///   update.
 ///
 template <
-    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
+    atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
     typename AccessorTy,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
-    __ESIMD_DNS::get_num_args<Op>() == 0 &&
+    __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
-        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
-        detail::is_simd_view_type_v<OffsetSimdViewT>,
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
     simd<T, N>>
-atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd_mask<N> mask,
-              PropertyListT props = {}) {
-  return atomic_update<Op, T, N>(acc, byte_offset.read(), mask, props);
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
+              SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src1 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset, src0, src1.read(), mask,
+                                 props);
 }
 
 /// simd<T, N>
-/// atomic_update(AccessorT acc, OffsetSimdViewT byte_offset,
-///               props = {});                                  /// (acc-au0-4)
-/// A variation of \c atomic_update API with \c offsets represented as
-/// \c simd_view object and no mask operand.
+/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
+///               SrcSimdViewT src0, SrcSimdViewT src1,
+///               simd_mask<N> mask, props = {});
 ///
-/// @tparam Op The atomic operation - can be \c atomic_op::inc,
-/// \c atomic_op::dec, or \c atomic_op::load.
-/// @tparam T The vector element type.
-/// @tparam N The number of memory locations to update.
+/// Atomically updates \c N memory locations represented by an accessor and
+/// a vector of offsets, and returns a vector of old values found at the
+/// memory locations before update. The update operation has 2 additional
+/// arguments.
+///
+/// A variation of \c atomic_update API with \c src0 and \c src1 represented as
+/// \c simd_view object and allows the use without specifying \c T and \c N
+/// template parameters.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @tparam SrcSimdViewT type of the \c simd_view used for \c src0 and \c src1.
 /// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
-/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
-/// 64-bit offsets are supported only when stateless memory accesses are
-/// enforced, i.e. accessor based accesses are automatically converted to
-/// stateless accesses.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
+/// offsets are supported only when stateless memory accesses are enforced
+/// (accessor accesses are automatically converted to stateless).
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
 ///
 template <
-    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
-    typename AccessorTy,
+    atomic_op Op, typename SrcSimdViewT,
+    typename T = typename SrcSimdViewT::value_type::element_type, int N,
+    typename Toffset, typename AccessorTy,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
-    __ESIMD_DNS::get_num_args<Op>() == 0 &&
+    __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
-        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
-        detail::is_simd_view_type_v<OffsetSimdViewT>,
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
     simd<T, N>>
-atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset,
-              PropertyListT props = {}) {
-  simd_mask<N> mask = 1;
-  return atomic_update<Op, T, N>(acc, byte_offset.read(), mask, props);
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
+              SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+      "Size of src0 and src1 parameters must correspond to the size of "
+      "byte_offset parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset, src0.read(), src1.read(),
+                                 mask, props);
 }
 
-/// A variation of \c atomic_update API with \c offset represented as
-/// scalar.
+/// simd<T, N>
+/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, simd<T, N> src1,
+///               props = {});                                   // (acc-au2-2)
 ///
-/// @tparam Op The atomic operation - can be \c atomic_op::inc,
-/// \c atomic_op::dec, or \c atomic_op::load.
+/// A variation of \c atomic_update API with no mask operand.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
 /// @tparam T The vector element type.
 /// @tparam N The number of memory locations to update.
-/// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
-/// @param byte_offset The scalar 32-bit or 64-bit offset in bytes. 64-bit
-/// offset are supported only when stateless memory accesses are enforced,
-/// i.e. accessor based accesses are automatically converted to stateless
-/// accesses.
-/// @param mask Operation mask, only locations with non-zero in the
-///   corresponding mask element are updated.
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
 ///
-template <atomic_op Op, typename T, int N, typename Toffset,
-          typename AccessorTy>
-__ESIMD_API
-    std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
-                         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy>,
-                     simd<T, N>>
-    atomic_update(AccessorTy acc, Toffset byte_offset, simd_mask<N> mask) {
-  return atomic_update<Op, T, N>(acc, simd<Toffset, N>(byte_offset), mask);
+template <
+    atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
+              simd<T, N> src1, PropertyListT props = {}) {
+  simd_mask<N> mask = 1;
+  return atomic_update<Op, T, N>(acc, byte_offset, src0, src1, mask, props);
 }
 
-/// A variation of \c atomic_update API with \p byte_offset represented as
-/// scalar using \c local_accessor.
+/// simd<T, N>
+/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
+///               SrcSimdViewT src0, simd<T, N> src1,
+///               props = {});
 ///
-/// @tparam Op The atomic operation - can be \c atomic_op::inc,
-/// \c atomic_op::dec, or \c atomic_op::load.
-/// @tparam T The vector element type.
-/// @tparam N The number of memory locations to update.
+/// Atomically updates \c N memory locations represented by an accessor and
+/// a vector of offsets and returns a vector of old
+/// values found at the memory locations before update. The update operation
+/// has 2 additional arguments.
+///
+/// A variation of the \c atomic_update API with no \c mask operand and with
+/// \c src0 represented as a \c simd_view object, which allows it to be used
+/// without specifying the \c T and \c N template parameters.
+///
+/// @tparam Op The atomic operation - can be one of the following:
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
 /// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
-/// @param byte_offset The scalar 32-bit or 64-bit offset in bytes. 64-bit
-/// offset are supported only when stateless memory accesses are enforced,
+/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
+/// offsets are supported only when stateless memory accesses are enforced,
 /// i.e. accessor based accesses are automatically converted to stateless
 /// accesses.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
 /// @param mask Operation mask, only locations with non-zero in the
 ///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
 ///
-template <atomic_op Op, typename T, int N, typename AccessorTy>
-__ESIMD_API
-    std::enable_if_t<__ESIMD_DNS::get_num_args<Op>() == 0 &&
-                         __ESIMD_DNS::is_rw_local_accessor_v<AccessorTy>,
-                     simd<T, N>>
-    atomic_update(AccessorTy acc, uint32_t byte_offset, simd_mask<N> mask) {
-  return atomic_update<Op, T, N>(acc, simd<uint32_t, N>(byte_offset), mask);
+template <
+    atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
+    typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
+              simd<T, N> src1, PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src0 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset, src0.read(), src1, props);
 }
 
-/// @anchor accessor_atomic_update1
-/// @brief Single-argument variant of the atomic update operation.
-///
-/// simd<T, N>
-/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
-///               simd<T, N> src0, simd_mask<N> mask, props = {});//(acc-au1-1)
-/// simd<T, N>
-/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
-///               simd<T, N> src0, props = {});                  // (acc-au1-2)
-///
-/// simd<T, N>
-/// atomic_update(AccessorT acc,
-////              OffsetSimdViewT byte_offset,
-///               simd<T, N> src0,
-///               simd_mask<N> mask, props = {});                // (acc-au1-3)
-/// simd<T, N>
-/// atomic_update(AccessorT acc,
-///               OffsetSimdViewT byte_offset,
-///               simd<T, N> src0,
-///               props = {});                                   // (acc-au1-4)
-///
-
 /// simd<T, N>
-/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
-///               simd<T, N> src0, simd_mask<N> mask, props = {});//(acc-au1-1)
+/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
+///               simd<T, N> src0, SrcSimdViewT src1,
+///               props = {});
 ///
 /// Atomically updates \c N memory locations represented by an accessor and
-/// a vector of offsets, and returns a vector of old values found at the
-/// memory locations before update. The update operation has 1 additional
-/// argument.
+/// a vector of offsets and returns a vector of old
+/// values found at the memory locations before update. The update operation
+/// has 2 additional arguments.
+///
+/// A variation of the \c atomic_update API with no \c mask operand and with
+/// \c src1 represented as a \c simd_view object, which allows it to be used
+/// without specifying the \c T and \c N template parameters.
 ///
 /// @tparam Op The atomic operation - can be one of the following:
-/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
-/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
-/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
-/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c
-/// atomic_op::fsub, \c atomic_op::store.
-/// @tparam T The vector element type.
-/// @tparam N The number of memory locations to update.
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
 /// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
 /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
-/// offsets are supported only when stateless memory accesses are enforced, i.e.
-/// accessor based accesses are automatically converted to stateless accesses.
-/// @param src0 The additional argument.
+/// offsets are supported only when stateless memory accesses are enforced,
+/// i.e. accessor based accesses are automatically converted to stateless
+/// accesses.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
 /// @param mask Operation mask, only locations with non-zero in the
 ///   corresponding mask element are updated.
 /// @param props The parameter 'props' specifies the optional compile-time
-///   properties list. Only L1/L2 properties are used. Other properties are
-///   ignored.
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
 ///
-
 template <
-    atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
+    atomic_op Op, typename SrcSimdViewT, typename T, int N, typename Toffset,
+    typename AccessorTy,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
-    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+    __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
         ext::oneapi::experimental::is_property_list_v<PropertyListT>,
     simd<T, N>>
 atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
-              simd_mask<N> mask, PropertyListT props = {}) {
-#ifdef __ESIMD_FORCE_STATELESS_MEM
-  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
-                                 byte_offset, src0, mask, props);
-#else
-  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
-  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
-  // Auto-convert FP atomics to LSC version.
-  if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                Op == atomic_op::fmin || Op == atomic_op::fmax ||
-                Op == atomic_op::fadd || Op == atomic_op::fsub ||
-                !__ESIMD_DNS::isPowerOf2(N, 32) || sizeof(T) < 4) {
-    return detail::atomic_update_impl<
-        Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
-        acc, byte_offset, src0, mask);
-  } else if constexpr (Op == atomic_op::store) {
-    if constexpr (std::is_integral_v<T>) {
-      return atomic_update<atomic_op::xchg, T, N>(acc, byte_offset, src0, mask,
-                                                  props);
-    } else {
-      using Tint = detail::uint_type_t<sizeof(T)>;
-      simd<Tint, N> Res = atomic_update<atomic_op::xchg, Tint, N>(
-          acc, byte_offset, src0.template bit_cast_view<Tint>(), mask, props);
-      return Res.template bit_cast_view<T>();
-    }
-  } else {
-    detail::check_atomic<Op, T, N, 1>();
-    static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
-    const auto si = __ESIMD_NS::get_surface_index(acc);
-    using Tx = typename detail::__raw_t<T>;
-    return __esimd_dword_atomic1<Op, Tx, N>(
-        mask.data(), si, byte_offset.data(),
-        sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src0.data()));
-  }
-#endif
+              SrcSimdViewT src1, PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+                "Size of src1 parameter must correspond to the size of "
+                "byte_offset parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset, src0, src1.read(), props);
 }
 
 /// simd<T, N>
-/// atomic_update(AccessorT acc, simd<Toffset, N> byte_offset,
-///               simd<T, N> src0, props = {});                  // (acc-au1-2)
-///
-/// A variation of \c atomic_update API with no mask operand.
+/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
+///               SrcSimdViewT src0, SrcSimdViewT src1,
+///               props = {});
 ///
 /// Atomically updates \c N memory locations represented by an accessor and
-/// a vector of offsets, and returns a vector of old values found at the
-/// memory locations before update. The update operation has 1 additional
-/// argument.
+/// a vector of offsets and returns a vector of old
+/// values found at the memory locations before update. The update operation
+/// has 2 additional arguments.
+///
+/// A variation of the \c atomic_update API with no \c mask operand and with
+/// \c src0 and \c src1 represented as \c simd_view objects, which allows it to
+/// be used without specifying the \c T and \c N template parameters.
 ///
 /// @tparam Op The atomic operation - can be one of the following:
-/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
-/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
-/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
-/// \c atomic_op::fmax, \c atomic_op::fmin, \c atomic_op::fadd, \c
-/// atomic_op::fsub, \c atomic_op::store.
-/// @tparam T The vector element type.
-/// @tparam N The number of memory locations to update.
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
 /// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
 /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
-/// offsets are supported only when stateless memory accesses are enforced, i.e.
-/// accessor based accesses are automatically converted to stateless accesses.
-/// @param src0 The additional argument.
+/// offsets are supported only when stateless memory accesses are enforced,
+/// i.e. accessor based accesses are automatically converted to stateless
+/// accesses.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @note This overload has no \c mask parameter; it forwards to the unmasked
+///   variant, so no per-location masking is applied by the caller.
 /// @param props The parameter 'props' specifies the optional compile-time
-///   properties list. Only L1/L2 properties are used. Other properties are
-///   ignored.
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
 ///
 template <
-    atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
+    atomic_op Op, typename SrcSimdViewT,
+    typename T = SrcSimdViewT::value_type::element_type, int N,
+    typename Toffset, typename AccessorTy,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
-    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+    __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
+        detail::is_simd_view_type_v<SrcSimdViewT> &&
         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
         ext::oneapi::experimental::is_property_list_v<PropertyListT>,
     simd<T, N>>
-atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
-              PropertyListT props = {}) {
-  simd_mask<N> mask = 1;
-  return atomic_update<Op, T, N>(acc, byte_offset, src0, mask, props);
+atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, SrcSimdViewT src0,
+              SrcSimdViewT src1, PropertyListT props = {}) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
+      "Size of src0 and src1 parameters must correspond to the size of "
+      "byte_offset parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset, src0.read(), src1.read(),
+                                 props);
 }
 
 /// simd<T, N>
-/// atomic_update(AccessorT acc,
-///               OffsetSimdViewT byte_offset,
-///               simd<T, N> src0,
-///               simd_mask<N> mask, props = {});                // (acc-au1-3)
+/// atomic_update(AccessorTy acc, OffsetSimdViewT
+///               byte_offset, simd<T, N> src0, simd<T, N> src1,
+///               simd_mask<N> mask, props = {});              // (acc-au2-3)
 ///
 /// A variation of \c atomic_update API with \c byte_offset represented as
-/// \c simd_view object.
+/// a \c simd_view object.
 ///
 /// @tparam Op The atomic operation - can be one of the following:
-/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
-/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
-/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
-/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
-/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store.
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
 /// @tparam T The vector element type.
 /// @tparam N The number of memory locations to update.
-/// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
-/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
-/// 64-bit offsets are supported only when stateless memory accesses are
-/// enforced, i.e. accessor based accesses are automatically converted to
-/// stateless accesses.
-/// @param src0 The additional argument.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
 /// @param mask Operation mask, only locations with non-zero in the
 ///   corresponding mask element are updated.
 /// @param props The parameter 'props' specifies the optional compile-time
-///   properties list. Only L1/L2 properties are used. Other properties are
-///   ignored.
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
-///
 template <
     atomic_op Op, typename T, int N, typename OffsetSimdViewT,
     typename AccessorTy,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
-    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
         ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
         detail::is_simd_view_type_v<OffsetSimdViewT>,
     simd<T, N>>
 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
-              simd_mask<N> mask, PropertyListT props = {}) {
-  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, mask, props);
+              simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask,
+                                 props);
 }
 
 /// simd<T, N>
-/// atomic_update(AccessorT acc,
-///               OffsetSimdViewT byte_offset,
-///               simd<T, N> src0,
-///               props = {});                                   // (acc-au1-4)
+/// atomic_update(AccessorTy acc, OffsetSimdViewT
+///               byte_offset, SrcSimdViewT src0, simd<T, N> src1,
+///               simd_mask<N> mask, props = {});
 ///
-/// A variation of \c atomic_update API with \c byte_offset represented as
-/// \c simd_view object and no mask operand.
+/// A variation of the \c atomic_update API with \c byte_offset and \c src0
+/// represented as \c simd_view objects, which allows it to be used without
+/// specifying the \c T and \c N template parameters.
 ///
 /// @tparam Op The atomic operation - can be one of the following:
-/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c
-/// atomic_op::max, \c atomic_op::xchg, \c atomic_op::bit_and, \c
-/// atomic_op::bit_or, \c atomic_op::bit_xor, \c atomic_op::minsint, \c
-/// atomic_op::maxsint, \c atomic_op::fmax, \c atomic_op::fmin, \c
-/// atomic_op::fadd, \c atomic_op::fsub, \c atomic_op::store.
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
 /// @tparam T The vector element type.
 /// @tparam N The number of memory locations to update.
-/// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
-/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
-/// 64-bit offsets are supported only when stateless memory accesses are
-/// enforced, i.e. accessor based accesses are automatically converted to
-/// stateless accesses.
-/// @param src0 The additional argument.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
+/// @param mask Operation mask, only locations with non-zero in the
+///   corresponding mask element are updated.
 /// @param props The parameter 'props' specifies the optional compile-time
-///   properties list. Only L1/L2 properties are used. Other properties are
-///   ignored.
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
-///
 template <
-    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
-    typename AccessorTy,
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
+    int N, typename AccessorTy,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
-    __ESIMD_DNS::get_num_args<Op>() == 1 &&
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
         ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
-        detail::is_simd_view_type_v<OffsetSimdViewT>,
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
     simd<T, N>>
-atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
-              PropertyListT props = {}) {
-  simd_mask<N> mask = 1;
-  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, mask, props);
+atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+              simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+          N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src0 and byte_offset parameters must correspond to the size of "
+      "src1 parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(), src1,
+                                 mask, props);
 }
-
-/// A variation of \c atomic_update API with \c offset represented as
-/// scalar object.
+
+/// simd<T, N>
+/// atomic_update(AccessorTy acc, OffsetSimdViewT
+///               byte_offset, simd<T, N> src0, SrcSimdViewT src1,
+///               simd_mask<N> mask, props = {});
+///
+/// A variation of the \c atomic_update API with \c byte_offset and \c src1
+/// represented as \c simd_view objects, which allows it to be used without
+/// specifying the \c T and \c N template parameters.
 ///
 /// @tparam Op The atomic operation - can be one of the following:
-/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
-/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
-/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
-/// \c atomic_op::fmax, \c atomic_op::fmin \c atomic_op::store.
-/// @tparam Tx The vector element type.
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @tparam T The vector element type.
 /// @tparam N The number of memory locations to update.
-/// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
-/// @param offset The scalar 32-bit or 64-bit offset in bytes. 64-bit
-/// offset are supported only when stateless memory accesses are enforced, i.e.
-/// accessor based accesses are automatically converted to stateless accesses.
-/// @param src0 The additional argument.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
 /// @param mask Operation mask, only locations with non-zero in the
 ///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
-///
-template <atomic_op Op, typename T, int N, typename Toffset,
-          typename AccessorTy>
+template <
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
+    int N, typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
-    __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
-        ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
     simd<T, N>>
-atomic_update(AccessorTy acc, Toffset offset, simd<T, N> src0,
-              simd_mask<N> mask) {
-  return atomic_update<Op, T, N>(acc, simd<Toffset, N>(offset), src0, mask);
+atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
+              SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+          N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src1 and byte_offset parameters must correspond to the size of "
+      "src0 parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1.read(),
+                                 mask, props);
 }
 
-/// A variation of \c atomic_update API with \c offset represented as
-/// scalar object and uses \c local_accessor.
+/// simd<T, N>
+/// atomic_update(AccessorTy acc, OffsetSimdViewT
+///               byte_offset, SrcSimdViewT src0, SrcSimdViewT src1,
+///               simd_mask<N> mask, props = {});
+///
+/// A variation of the \c atomic_update API with \c byte_offset, \c src0 and
+/// \c src1 represented as \c simd_view objects, which allows it to be used
+/// without specifying the \c T and \c N template parameters.
 ///
 /// @tparam Op The atomic operation - can be one of the following:
-/// \c atomic_op::add, \c atomic_op::sub, \c atomic_op::min, \c atomic_op::max,
-/// \c atomic_op::xchg, \c atomic_op::bit_and, \c atomic_op::bit_or,
-/// \c atomic_op::bit_xor, \c atomic_op::minsint, \c atomic_op::maxsint,
-/// \c atomic_op::fmax, \c atomic_op::fmin \c atomic_op::store.
-/// @tparam Tx The vector element type.
+///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
+/// @tparam T The vector element type.
 /// @tparam N The number of memory locations to update.
-/// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
-/// @param offset The scalar 32-bit offset in bytes.
-/// @param src0 The additional argument.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
+/// @param src0 The first additional argument (new value).
+/// @param src1 The second additional argument (expected value).
 /// @param mask Operation mask, only locations with non-zero in the
 ///   corresponding mask element are updated.
+/// @param props The parameter 'props' specifies the optional compile-time
+///   properties list. Only L1/L2 properties are used.
+///   Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
-///
-template <atomic_op Op, typename Tx, int N, typename AccessorTy>
+template <
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT,
+    typename T = SrcSimdViewT::value_type::element_type, int N,
+    typename AccessorTy,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
-    __ESIMD_DNS::is_rw_local_accessor_v<AccessorTy> &&
-        ((Op != atomic_op::store && Op != atomic_op::xchg) || N == 1),
-    simd<Tx, N>>
-atomic_update(AccessorTy acc, uint32_t offset, simd<Tx, N> src0,
-              simd_mask<N> mask) {
-  return atomic_update<Op, Tx, N>(acc, simd<uint32_t, N>(offset), src0, mask);
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
+        __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
+    simd<T, N>>
+atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+              SrcSimdViewT src1, simd_mask<N> mask, PropertyListT props = {}) {
+  static_assert(N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+                    N == OffsetSimdViewT::getSizeX() *
+                             OffsetSimdViewT::getSizeY(),
+                "Size of src0, src1 and byte_offset parameters must correspond "
+                "to the size of "
+                "mask parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(),
+                                 src1.read(), mask, props);
 }
 
-/// @anchor accessor_atomic_update2
-/// @brief Two-argument variant of the atomic update operation.
-///
-/// simd<T, N>
-/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
-///               simd<T, N> src0, simd<T, N> src1,
-//                simd_mask<N> mask,props = {});                 // (acc-au2-1)
-///
-/// simd<T, N>
-/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
-///               simd<T, N> src0, simd<T, N> src1,
-///               props = {});                                   // (acc-au2-2)
-/// simd<T, N>
-/// atomic_update(AccessorTy acc, OffsetSimdViewT
-///               byte_offset, simd<T, N> src0, simd<T, N> src1,
-///               simd_mask<N> mask, props = {});                // (acc-au2-3)
-///
 /// simd<T, N>
 /// atomic_update(AccessorTy acc,
 ///               OffsetSimdViewT, byte_offset,
 ///               simd<T, N> src0, simd<T, N> src1, props = {}); // (acc-au2-4)
 ///
-
-/// simd<T, N>
-/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
-///               simd<T, N> src0, simd<T, N> src1,
-//                simd_mask<N> mask,props = {});                 // (acc-au2-1)
-///
-/// Atomically updates \c N memory locations represented by an accessor and
-/// a vector of offsets and returns a vector of old
-/// values found at the memory locations before update. The update operation
-/// has 2 additional arguments.
+/// A variation of \c atomic_update API with \c byte_offset represented as
+/// a \c simd_view object and no mask operand.
 ///
 /// @tparam Op The atomic operation - can be one of the following:
 ///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
 /// @tparam T The vector element type.
 /// @tparam N The number of memory locations to update.
-/// @tparam AccessorTy type of the SYCL accessor.
 /// @param acc The SYCL accessor.
-/// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes. 64-bit
-/// offsets are supported only when stateless memory accesses are enforced,
-/// i.e. accessor based accesses are automatically converted to stateless
-/// accesses.
+/// @param byte_offset The simd_view of 32-bit or 64-bit offsets in bytes.
 /// @param src0 The first additional argument (new value).
 /// @param src1 The second additional argument (expected value).
-/// @param mask Operation mask, only locations with non-zero in the
-///   corresponding mask element are updated.
 /// @param props The parameter 'props' specifies the optional compile-time
 ///   properties list. Only L1/L2 properties are used.
 //    Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
-///
 template <
-    atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
+    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
+    typename AccessorTy,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
-    __ESIMD_DNS::get_num_args<Op>() == 2 && std::is_integral_v<Toffset> &&
+    __ESIMD_DNS::get_num_args<Op>() == 2 &&
         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
-        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT>,
     simd<T, N>>
-atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
-              simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
-#ifdef __ESIMD_FORCE_STATELESS_MEM
-  return atomic_update<Op, T, N>(__ESIMD_DNS::accessorToPointer<T>(acc),
-                                 byte_offset, src0, src1, mask, props);
-#else
-  static_assert(std::is_integral_v<Toffset>, "Unsupported offset type");
-  static_assert(sizeof(Toffset) == 4, "Only 32 bit offset is supported");
-  // Use LSC atomic when cache hints are present, FP atomics is used,
-  // non-power of two length is used, operation width greater than 32, or the
-  // data size is less than 4 bytes,
-  if constexpr (detail::has_cache_hints<PropertyListT>() ||
-                Op == atomic_op::fcmpxchg || !__ESIMD_DNS::isPowerOf2(N, 32) ||
-                sizeof(T) < 4) {
-    // 2-argument lsc_atomic_update arguments order matches the standard one -
-    // expected value first, then new value. But atomic_update uses reverse
-    // order, hence the src1/src0 swap.
-    return detail::atomic_update_impl<
-        Op, T, N, detail::lsc_data_size::default_size, PropertyListT>(
-        acc, byte_offset, src1, src0, mask);
-  } else {
-    detail::check_atomic<Op, T, N, 2>();
-    static_assert(sizeof(T) == 4, "Only 32 bit data is supported");
-    const auto si = __ESIMD_NS::get_surface_index(acc);
-    using Tx = typename detail::__raw_t<T>;
-    return __esimd_dword_atomic2<Op, Tx, N>(
-        mask.data(), si, byte_offset.data(),
-        sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src0.data()),
-        sycl::bit_cast<__ESIMD_DNS::vector_type_t<Tx, N>>(src1.data()));
-  }
-#endif
+atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
+              simd<T, N> src1, PropertyListT props = {}) {
+  simd_mask<N> mask = 1;
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask,
+                                 props);
 }
 
 /// simd<T, N>
-/// atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset,
-///               simd<T, N> src0, simd<T, N> src1,
-///               props = {});                                   // (acc-au2-2)
+/// atomic_update(AccessorTy acc, OffsetSimdViewT
+///               byte_offset, SrcSimdViewT src0, simd<T, N> src1,
+///               props = {});
 ///
-/// A variation of \c atomic_update API with no mask operand.
+/// A variation of the \c atomic_update API with no mask operand and with
+/// \c byte_offset and \c src0 represented as \c simd_view objects, which allows
+/// it to be used without specifying the \c T and \c N template parameters.
 ///
 /// @tparam Op The atomic operation - can be one of the following:
 ///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
@@ -8913,28 +11866,36 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
 //    Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
-///
 template <
-    atomic_op Op, typename T, int N, typename Toffset, typename AccessorTy,
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
+    int N, typename AccessorTy,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
     __ESIMD_DNS::get_num_args<Op>() == 2 &&
         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
-        ext::oneapi::experimental::is_property_list_v<PropertyListT>,
+        ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
     simd<T, N>>
-atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
+atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
               simd<T, N> src1, PropertyListT props = {}) {
-  simd_mask<N> mask = 1;
-  return atomic_update<Op, T, N>(acc, byte_offset, src0, src1, mask, props);
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+          N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src0 and byte_offset parameters must correspond to the size of "
+      "src1 parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(), src1,
+                                 props);
 }
 
 /// simd<T, N>
 /// atomic_update(AccessorTy acc, OffsetSimdViewT
-///               byte_offset, simd<T, N> src0, simd<T, N> src1,
-///               simd_mask<N> mask, props = {});              // (acc-au2-3)
+///               byte_offset, simd<T, N> src0, SrcSimdViewT src1,
+///               props = {});
 ///
-/// A variation of \c atomic_update API with \c byte_offset represented as
-/// a \c simd_view object.
+/// A variation of \c atomic_update API with no mask operand and \c byte_offset
+/// and \c src1 represented as \c simd_view object and allows the use without
+/// specifying \c T and \c N template parameters.
 ///
 /// @tparam Op The atomic operation - can be one of the following:
 ///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
@@ -8944,36 +11905,41 @@ atomic_update(AccessorTy acc, simd<Toffset, N> byte_offset, simd<T, N> src0,
 /// @param byte_offset The vector of 32-bit or 64-bit offsets in bytes.
 /// @param src0 The first additional argument (new value).
 /// @param src1 The second additional argument (expected value).
-/// @param mask Operation mask, only locations with non-zero in the
-///   corresponding mask element are updated.
 /// @param props The parameter 'props' specifies the optional compile-time
 ///   properties list. Only L1/L2 properties are used.
 //    Other properties are ignored.
 /// @return A vector of the old values at the memory locations before the
 ///   update.
 template <
-    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
-    typename AccessorTy,
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT, typename T,
+    int N, typename AccessorTy,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
     __ESIMD_DNS::get_num_args<Op>() == 2 &&
         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
         ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
-        detail::is_simd_view_type_v<OffsetSimdViewT>,
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
     simd<T, N>>
 atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
-              simd<T, N> src1, simd_mask<N> mask, PropertyListT props = {}) {
-  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask,
+              SrcSimdViewT src1, PropertyListT props = {}) {
+  static_assert(
+      N == SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY() &&
+          N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src1 and byte_offset parameters must correspond to the size of "
+      "src0 parameter.");
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1.read(),
                                  props);
 }
 
 /// simd<T, N>
-/// atomic_update(AccessorTy acc,
-///               OffsetSimdViewT, byte_offset,
-///               simd<T, N> src0, simd<T, N> src1, props = {}); // (acc-au2-4)
+/// atomic_update(AccessorTy acc, OffsetSimdViewT
+///               byte_offset, SrcSimdViewT src0, SrcSimdViewT src1,
+///               props = {});
 ///
-/// A variation of \c atomic_update API with \c byte_offset represented as
-/// a \c simd_view object and no mask operand.
+/// A variation of \c atomic_update API with no mask operand and \c byte_offset,
+/// \c src0 and \c src1 represented as \c simd_view object and allows the use
+/// without specifying \c T and \c N template parameters.
 ///
 /// @tparam Op The atomic operation - can be one of the following:
 ///   \c atomic_op::cmpxchg, \c atomic_op::fcmpxchg.
@@ -8989,20 +11955,25 @@ atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
 /// @return A vector of the old values at the memory locations before the
 ///   update.
 template <
-    atomic_op Op, typename T, int N, typename OffsetSimdViewT,
+    atomic_op Op, typename SrcSimdViewT, typename OffsetSimdViewT,
+    typename T = SrcSimdViewT::value_type::element_type,
+    int N = SrcSimdViewT::getSizeX() * SrcSimdViewT::getSizeY(),
     typename AccessorTy,
     typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
 __ESIMD_API std::enable_if_t<
     __ESIMD_DNS::get_num_args<Op>() == 2 &&
         __ESIMD_DNS::is_rw_device_accessor_v<AccessorTy> &&
         ext::oneapi::experimental::is_property_list_v<PropertyListT> &&
-        detail::is_simd_view_type_v<OffsetSimdViewT>,
+        detail::is_simd_view_type_v<OffsetSimdViewT> &&
+        detail::is_simd_view_type_v<SrcSimdViewT>,
     simd<T, N>>
-atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, simd<T, N> src0,
-              simd<T, N> src1, PropertyListT props = {}) {
-  simd_mask<N> mask = 1;
-  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0, src1, mask,
-                                 props);
+atomic_update(AccessorTy acc, OffsetSimdViewT byte_offset, SrcSimdViewT src0,
+              SrcSimdViewT src1, PropertyListT props = {}) {
+  static_assert(
+      N == OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY(),
+      "Size of src0, src1 and byte_offset parameters must correspond.");
+  return atomic_update<Op, T, N>(acc, byte_offset.read(), src0.read(),
+                                 src1.read(), props);
 }
 
 /// A variation of \c atomic_update API with \c offsets represented as
@@ -9860,7 +12831,7 @@ scatter(AccessorT acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 /// 	         simd_mask<N / VS> mask,
 ///              PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to memory locations addressed
 /// by the local accessor \p acc and byte offsets \p byte_offsets. Access to any
@@ -9901,7 +12872,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 /// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 /// 	         PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to memory locations addressed
 /// by the local accessor \p acc and byte offsets \p byte_offsets.
@@ -9942,7 +12913,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, simd<T, N> vals,
 ///              ValuesSimdViewT vals, simd_mask<N / VS> mask,
 ///              PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to memory locations addressed
 /// by the local accessor \p acc and byte offsets \p byte_offsets. Access to any
@@ -9990,7 +12961,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
 /// void scatter(AccessorTy acc, OffsetSimdViewT byte_offsets,
 ///              ValuesSimdViewT vals, PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to memory locations addressed
 /// by the local accessor \p acc and byte offsets \p byte_offsets.
@@ -10035,7 +13006,7 @@ scatter(AccessorTy acc, OffsetSimdViewT byte_offsets, ValuesSimdViewT vals,
 ///              ValuesSimdViewT vals, simd_mask<N / VS> mask,
 ///              PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to memory locations addressed
 /// by the local accessor \p acc and byte offsets \p byte_offsets. Access to any
@@ -10078,7 +13049,7 @@ scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets,
 /// void scatter(AccessorTy acc, simd<OffsetT, N / VS> byte_offsets,
 ///              ValuesSimdViewT vals, PropertyListT props = {});
 ///
-/// Variation of the API that allows to use \c simd_view without specifying
+/// Variation of the API that allows using \c simd_view without specifying
 /// \c T and \c N template parameters.
 /// Stores ("scatters") elements of the type 'T' to memory locations addressed
 /// by the local accessor \p acc and byte offsets \p byte_offsets.
@@ -10388,6 +13359,66 @@ prefetch(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
   prefetch<T, N, VS>(p, byte_offsets.read(), props);
 }
 
+/// template <int VS = 1, typename OffsetSimdViewT, typename T, int N,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(const T *p, OffsetSimdViewT byte_offsets,
+///             simd_mask<N / VS> mask, PropertyListT props = {});
+/// Supported platforms: DG2, PVC only.
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Prefetches elements of the type 'T' from memory locations
+/// addressed by the base pointer \p p and byte offsets \p byte_offsets to the
+/// cache. Access to any element's memory location can be disabled via the input
+/// vector of predicates \p mask. If mask[i] is unset, then the load from (p +
+/// byte_offsets[i]) is skipped.
+/// @tparam VS Vector size. It can also be read as the number of reads per
+/// each address. The parameter 'N' must be divisible by 'VS'.
+/// @param p The base address.
+/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes.
+/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned.
+/// @param mask The access mask.
+/// @param props The optional compile-time properties. Only cache hint
+/// properties are used.
+template <
+    int VS = 1, typename OffsetSimdViewT, typename T,
+    int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() * VS,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+prefetch(const T *p, OffsetSimdViewT byte_offsets, simd_mask<N / VS> mask,
+         PropertyListT props = {}) {
+  prefetch<T, N, VS>(p, byte_offsets.read(), mask, props);
+}
+
+/// template <int VS = 1, typename OffsetSimdViewT, typename T, int N,
+///           typename PropertyListT = empty_properties_t>
+/// void prefetch(const T *p, OffsetSimdViewT byte_offsets,
+///             PropertyListT props = {});
+/// Supported platforms: DG2, PVC only.
+/// Variation of the API that allows using \c simd_view without specifying
+/// \c T and \c N template parameters.
+/// Prefetches elements of the type 'T' from memory locations
+/// addressed by the base pointer \p p and byte offsets \p byte_offsets to the
+/// cache.
+/// @tparam VS Vector size. It can also be read as the number of reads per
+/// each address. The parameter 'N' must be divisible by 'VS'.
+/// @param p The base address.
+/// @param byte_offsets the vector of 32-bit or 64-bit offsets in bytes.
+/// For each i, ((byte*)p + byte_offsets[i]) must be element size aligned.
+/// @param props The optional compile-time properties. Only cache hint
+/// properties are used.
+template <
+    int VS = 1, typename OffsetSimdViewT, typename T,
+    int N = OffsetSimdViewT::getSizeX() * OffsetSimdViewT::getSizeY() * VS,
+    typename PropertyListT = ext::oneapi::experimental::empty_properties_t>
+__ESIMD_API std::enable_if_t<
+    detail::is_simd_view_type_v<OffsetSimdViewT> &&
+    ext::oneapi::experimental::is_property_list_v<PropertyListT>>
+prefetch(const T *p, OffsetSimdViewT byte_offsets, PropertyListT props = {}) {
+  prefetch<T, N, VS>(p, byte_offsets.read(), props);
+}
+
 /// template <typename T, int VS = 1, typename OffsetT,
 ///           typename PropertyListT = empty_properties_t>
 /// void prefetch(const T *p, OffsetT byte_offset, simd_mask<1> mask,
diff --git a/sycl/include/sycl/ext/oneapi/bfloat16.hpp b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
index 3a16dcd244b4c..de97de176e53d 100644
--- a/sycl/include/sycl/ext/oneapi/bfloat16.hpp
+++ b/sycl/include/sycl/ext/oneapi/bfloat16.hpp
@@ -18,6 +18,30 @@ extern "C" __DPCPP_SYCL_EXTERNAL uint16_t
 __devicelib_ConvertFToBF16INTEL(const float &) noexcept;
 extern "C" __DPCPP_SYCL_EXTERNAL float
 __devicelib_ConvertBF16ToFINTEL(const uint16_t &) noexcept;
+extern "C" __DPCPP_SYCL_EXTERNAL void
+__devicelib_ConvertFToBF16INTELVec1(const float *, uint16_t *) noexcept;
+extern "C" __DPCPP_SYCL_EXTERNAL void
+__devicelib_ConvertBF16ToFINTELVec1(const uint16_t *, float *) noexcept;
+extern "C" __DPCPP_SYCL_EXTERNAL void
+__devicelib_ConvertFToBF16INTELVec2(const float *, uint16_t *) noexcept;
+extern "C" __DPCPP_SYCL_EXTERNAL void
+__devicelib_ConvertBF16ToFINTELVec2(const uint16_t *, float *) noexcept;
+extern "C" __DPCPP_SYCL_EXTERNAL void
+__devicelib_ConvertFToBF16INTELVec3(const float *, uint16_t *) noexcept;
+extern "C" __DPCPP_SYCL_EXTERNAL void
+__devicelib_ConvertBF16ToFINTELVec3(const uint16_t *, float *) noexcept;
+extern "C" __DPCPP_SYCL_EXTERNAL void
+__devicelib_ConvertFToBF16INTELVec4(const float *, uint16_t *) noexcept;
+extern "C" __DPCPP_SYCL_EXTERNAL void
+__devicelib_ConvertBF16ToFINTELVec4(const uint16_t *, float *) noexcept;
+extern "C" __DPCPP_SYCL_EXTERNAL void
+__devicelib_ConvertFToBF16INTELVec8(const float *, uint16_t *) noexcept;
+extern "C" __DPCPP_SYCL_EXTERNAL void
+__devicelib_ConvertBF16ToFINTELVec8(const uint16_t *, float *) noexcept;
+extern "C" __DPCPP_SYCL_EXTERNAL void
+__devicelib_ConvertFToBF16INTELVec16(const float *, uint16_t *) noexcept;
+extern "C" __DPCPP_SYCL_EXTERNAL void
+__devicelib_ConvertBF16ToFINTELVec16(const uint16_t *, float *) noexcept;
 
 namespace sycl {
 inline namespace _V1 {
@@ -29,9 +53,35 @@ namespace detail {
 using Bfloat16StorageT = uint16_t;
 Bfloat16StorageT bfloat16ToBits(const bfloat16 &Value);
 bfloat16 bitsToBfloat16(const Bfloat16StorageT Value);
+// Class to convert different data types to Bfloat16
+// with different rounding modes.
+class ConvertToBfloat16;
+
+template <int N> void BF16VecToFloatVec(const bfloat16 src[N], float dst[N]) {
+#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__))
+  const uint16_t *src_i16 = sycl::bit_cast<const uint16_t *>(src);
+  if constexpr (N == 1)
+    __devicelib_ConvertBF16ToFINTELVec1(src_i16, dst);
+  else if constexpr (N == 2)
+    __devicelib_ConvertBF16ToFINTELVec2(src_i16, dst);
+  else if constexpr (N == 3)
+    __devicelib_ConvertBF16ToFINTELVec3(src_i16, dst);
+  else if constexpr (N == 4)
+    __devicelib_ConvertBF16ToFINTELVec4(src_i16, dst);
+  else if constexpr (N == 8)
+    __devicelib_ConvertBF16ToFINTELVec8(src_i16, dst);
+  else if constexpr (N == 16)
+    __devicelib_ConvertBF16ToFINTELVec16(src_i16, dst);
+#else
+  for (int i = 0; i < N; ++i) {
+    dst[i] = (float)src[i];
+  }
+#endif
+}
 
 // sycl::vec support
 namespace bf16 {
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 #ifdef __SYCL_DEVICE_ONLY__
 using Vec2StorageT = Bfloat16StorageT __attribute__((ext_vector_type(2)));
 using Vec3StorageT = Bfloat16StorageT __attribute__((ext_vector_type(3)));
@@ -45,6 +95,7 @@ using Vec4StorageT = std::array<Bfloat16StorageT, 4>;
 using Vec8StorageT = std::array<Bfloat16StorageT, 8>;
 using Vec16StorageT = std::array<Bfloat16StorageT, 16>;
 #endif
+#endif // __INTEL_PREVIEW_BREAKING_CHANGES
 } // namespace bf16
 } // namespace detail
 
@@ -56,6 +107,7 @@ class bfloat16 {
   detail::bfloat16ToBits(const bfloat16 &Value);
   friend inline bfloat16
   detail::bitsToBfloat16(const detail::Bfloat16StorageT Value);
+  friend class detail::ConvertToBfloat16;
 
 public:
   bfloat16() = default;
@@ -237,6 +289,30 @@ class bfloat16 {
 
 namespace detail {
 
+template <int N> void FloatVecToBF16Vec(const float src[N], bfloat16 dst[N]) {
+#if defined(__SYCL_DEVICE_ONLY__) && (defined(__SPIR__) || defined(__SPIRV__))
+  uint16_t *dst_i16 = sycl::bit_cast<uint16_t *>(dst);
+  if constexpr (N == 1)
+    __devicelib_ConvertFToBF16INTELVec1(src, dst_i16);
+  else if constexpr (N == 2)
+    __devicelib_ConvertFToBF16INTELVec2(src, dst_i16);
+  else if constexpr (N == 3)
+    __devicelib_ConvertFToBF16INTELVec3(src, dst_i16);
+  else if constexpr (N == 4)
+    __devicelib_ConvertFToBF16INTELVec4(src, dst_i16);
+  else if constexpr (N == 8)
+    __devicelib_ConvertFToBF16INTELVec8(src, dst_i16);
+  else if constexpr (N == 16)
+    __devicelib_ConvertFToBF16INTELVec16(src, dst_i16);
+#else
+  for (int i = 0; i < N; ++i) {
+    // No explicit cast is needed: bfloat16 has an assignment operator
+    // overload that takes a float.
+    dst[i] = src[i];
+  }
+#endif
+}
+
 // Helper function for getting the internal representation of a bfloat16.
 inline Bfloat16StorageT bfloat16ToBits(const bfloat16 &Value) {
   return Value.value;
@@ -250,6 +326,315 @@ inline bfloat16 bitsToBfloat16(const Bfloat16StorageT Value) {
   return res;
 }
 
+// Class to convert different data types to Bfloat16
+// with different rounding modes.
+class ConvertToBfloat16 {
+
+  // The automatic rounding mode is RTE.
+  enum SYCLRoundingMode { automatic = 0, rte = 1, rtz = 2, rtp = 3, rtn = 4 };
+
+  // Function to get the most significant bit position of a number.
+  template <typename Ty> static size_t get_msb_pos(const Ty &x) {
+    assert(x != 0);
+    size_t idx = 0;
+    Ty mask = ((Ty)1 << (sizeof(Ty) * 8 - 1));
+    for (idx = 0; idx < (sizeof(Ty) * 8); ++idx) {
+      if ((x & mask) == mask)
+        break;
+      mask >>= 1;
+    }
+
+    return (sizeof(Ty) * 8 - 1 - idx);
+  }
+
+  // Helper function to get BF16 from float with different rounding modes.
+  // Reference:
+  // https://github.com/intel/llvm/blob/sycl/libdevice/imf_bf16.hpp#L30
+  static bfloat16
+  getBFloat16FromFloatWithRoundingMode(const float &f,
+                                       SYCLRoundingMode roundingMode) {
+    // automatic/rte use bfloat16's float constructor; rtz/rtp/rtn work on bits.
+    if (roundingMode == SYCLRoundingMode::automatic ||
+        roundingMode == SYCLRoundingMode::rte) {
+      // Use the default rounding mode.
+      return bfloat16{f};
+    } else {
+      uint32_t u32_val = sycl::bit_cast<uint32_t>(f);
+      uint16_t bf16_sign = static_cast<uint16_t>((u32_val >> 31) & 0x1);
+      uint16_t bf16_exp = static_cast<uint16_t>((u32_val >> 23) & 0xFF);
+      uint32_t f_mant = u32_val & 0x7F'FFFF;
+      uint16_t bf16_mant = static_cast<uint16_t>(f_mant >> 16);
+      // +/-infinity and NaN (bf16_exp holds only the 8 exponent bits).
+      if (bf16_exp == 0xFF) {
+        if (!f_mant)
+          return bitsToBfloat16(bf16_sign ? 0xFF80 : 0x7F80);
+        else
+          return bitsToBfloat16((bf16_sign << 15) | (bf16_exp << 7) |
+                                bf16_mant);
+      }
+
+      // +/-0
+      if (!bf16_exp && !f_mant) {
+        return bitsToBfloat16(bf16_sign ? 0x8000 : 0x0);
+      }
+
+      uint16_t mant_discard = static_cast<uint16_t>(f_mant & 0xFFFF);
+      switch (roundingMode) {
+      case SYCLRoundingMode::rtn:
+        if (bf16_sign && mant_discard)
+          bf16_mant++;
+        break;
+      case SYCLRoundingMode::rtz:
+        break;
+      case SYCLRoundingMode::rtp:
+        if (!bf16_sign && mant_discard)
+          bf16_mant++;
+        break;
+
+      // Should not reach here. Adding these just to suppress the warning.
+      case SYCLRoundingMode::automatic:
+      case SYCLRoundingMode::rte:
+        break;
+      }
+
+      // if overflow happens, bf16_exp will be 0xFF and bf16_mant will be 0,
+      // infinity will be returned.
+      if (bf16_mant == 0x80) {
+        bf16_mant = 0;
+        bf16_exp++;
+      }
+
+      return bitsToBfloat16((bf16_sign << 15) | (bf16_exp << 7) | bf16_mant);
+    }
+  }
+
+  // Helper function to get BF16 from unsigned integral data types
+  // with different rounding modes.
+  // Reference:
+  // https://github.com/intel/llvm/blob/sycl/libdevice/imf_bf16.hpp#L302
+  template <typename T>
+  static bfloat16
+  getBFloat16FromUIntegralWithRoundingMode(const T &u,
+                                           SYCLRoundingMode roundingMode) {
+    // u must be non-zero: get_msb_pos asserts on a zero input.
+    size_t msb_pos = get_msb_pos(u);
+    // return bfloat16 representation for 1
+    if (msb_pos == 0)
+      return bitsToBfloat16(0x3F80);
+
+    T mant = u & ((static_cast<T>(1) << msb_pos) - 1);
+    // Unsigned integral value can be represented by 1.mant * (2^msb_pos),
+    // msb_pos is also the bit number of mantissa, 0 < msb_pos < sizeof(Ty) * 8,
+    // exponent of bfloat16 precision value range is [-126, 127].
+
+    uint16_t b_exp = msb_pos;
+    uint16_t b_mant;
+
+    if (msb_pos <= 7) {
+      // No need to round off if we can losslessly fit the input value in
+      // mantissa of bfloat16.
+      mant <<= (7 - msb_pos);
+      b_mant = static_cast<uint16_t>(mant);
+    } else {
+      b_mant = static_cast<uint16_t>(mant >> (msb_pos - 7));
+      T mant_discard = mant & ((static_cast<T>(1) << (msb_pos - 7)) - 1);
+      T mid = static_cast<T>(1) << (msb_pos - 8);
+      switch (roundingMode) {
+      case SYCLRoundingMode::automatic:
+      case SYCLRoundingMode::rte:
+        if ((mant_discard > mid) ||
+            ((mant_discard == mid) && ((b_mant & 0x1) == 0x1)))
+          b_mant++;
+        break;
+      case SYCLRoundingMode::rtp:
+        if (mant_discard)
+          b_mant++;
+        break;
+      case SYCLRoundingMode::rtn:
+      case SYCLRoundingMode::rtz:
+        break;
+      }
+    }
+    if (b_mant == 0x80) {
+      b_exp++;
+      b_mant = 0;
+    }
+
+    b_exp += 127;
+    return bitsToBfloat16((b_exp << 7) | b_mant);
+  }
+
+  // Helper function to get BF16 from signed integral data types.
+  // Reference:
+  // https://github.com/intel/llvm/blob/sycl/libdevice/imf_bf16.hpp#L353
+  template <typename T>
+  static bfloat16
+  getBFloat16FromSIntegralWithRoundingMode(const T &i,
+                                           SYCLRoundingMode roundingMode) {
+    // Take the magnitude in the unsigned type: -i overflows for the minimum T.
+    using UTy = std::make_unsigned_t<T>;
+    uint16_t b_sign = (i >= 0) ? 0 : 0x8000;
+    const UTy u_bits = static_cast<UTy>(i);
+    UTy ui = (i > 0) ? u_bits : static_cast<UTy>(0 - u_bits);
+    size_t msb_pos = get_msb_pos<UTy>(ui);
+    if (msb_pos == 0)
+      return bitsToBfloat16(b_sign ? 0xBF80 : 0x3F80);
+    UTy mant = ui & ((static_cast<UTy>(1) << msb_pos) - 1);
+
+    uint16_t b_exp = msb_pos;
+    uint16_t b_mant;
+    if (msb_pos <= 7) {
+      mant <<= (7 - msb_pos);
+      b_mant = static_cast<uint16_t>(mant);
+    } else {
+      b_mant = static_cast<uint16_t>(mant >> (msb_pos - 7));
+      UTy mant_discard = mant & ((static_cast<UTy>(1) << (msb_pos - 7)) - 1);
+      UTy mid = static_cast<UTy>(1) << (msb_pos - 8);
+      switch (roundingMode) {
+      case SYCLRoundingMode::automatic:
+      case SYCLRoundingMode::rte:
+        if ((mant_discard > mid) ||
+            ((mant_discard == mid) && ((b_mant & 0x1) == 0x1)))
+          b_mant++;
+        break;
+      case SYCLRoundingMode::rtp:
+        if (mant_discard && !b_sign)
+          b_mant++;
+        break;
+      case SYCLRoundingMode::rtn:
+        b_mant += (mant_discard && b_sign) ? 1 : 0;
+        break;
+      case SYCLRoundingMode::rtz:
+        break;
+      }
+    }
+
+    if (b_mant == 0x80) {
+      b_exp++;
+      b_mant = 0;
+    }
+    b_exp += 127;
+    return bitsToBfloat16(b_sign | (b_exp << 7) | b_mant);
+  }
+
+  // Helper function to get BF16 from double with RTE rounding mode.
+  // Reference:
+  // https://github.com/intel/llvm/blob/sycl/libdevice/imf_bf16.hpp#L79
+  static bfloat16 getBFloat16FromDoubleWithRTE(const double &d) {
+    // All results are raw bf16 bit patterns, so wrap them in bitsToBfloat16.
+    uint64_t u64_val = sycl::bit_cast<uint64_t>(d);
+    int16_t bf16_sign = (u64_val >> 63) & 0x1;
+    uint16_t fp64_exp = static_cast<uint16_t>((u64_val >> 52) & 0x7FF);
+    uint64_t fp64_mant = (u64_val & 0xF'FFFF'FFFF'FFFF);
+    uint16_t bf16_mant;
+    // handling +/-infinity and NAN for double input
+    if (fp64_exp == 0x7FF) {
+      if (!fp64_mant) {
+        return bitsToBfloat16(bf16_sign ? 0xFF80 : 0x7F80);
+      } else {
+        // returns a quiet NaN
+        return bitsToBfloat16(0x7FC0);
+      }
+    }
+
+    // Subnormal double precision is converted to 0
+    if (fp64_exp == 0) {
+      return bitsToBfloat16(bf16_sign ? 0x8000 : 0x0);
+    }
+
+    fp64_exp -= 1023;
+    // handling overflow, convert to +/-infinity
+    if (static_cast<int16_t>(fp64_exp) > 127) {
+      return bitsToBfloat16(bf16_sign ? 0xFF80 : 0x7F80);
+    }
+
+    // handling underflow
+    if (static_cast<int16_t>(fp64_exp) < -133) {
+      return bitsToBfloat16(bf16_sign ? 0x8000 : 0x0);
+    }
+
+    //-133 <= fp64_exp <= 127, 1.significand * 2^fp64_exp
+    // For these numbers, they are NOT subnormal double-precision numbers but
+    // will turn into subnormal when converting to bfloat16
+    uint64_t discard_bits;
+    if (static_cast<int16_t>(fp64_exp) < -126) {
+      fp64_mant |= 0x10'0000'0000'0000;
+      fp64_mant >>= -126 - static_cast<int16_t>(fp64_exp) - 1;
+      discard_bits = fp64_mant & 0x3FFF'FFFF'FFFF;
+      bf16_mant = static_cast<uint16_t>(fp64_mant >> 46);
+      if (discard_bits > 0x2000'0000'0000 ||
+          ((discard_bits == 0x2000'0000'0000) && ((bf16_mant & 0x1) == 0x1)))
+        bf16_mant += 1;
+      fp64_exp = 0;
+      if (bf16_mant == 0x80) {
+        bf16_mant = 0;
+        fp64_exp = 1;
+      }
+      return bitsToBfloat16((bf16_sign << 15) | (fp64_exp << 7) | bf16_mant);
+    }
+
+    // For normal value, discard 45 bits from mantissa
+    discard_bits = fp64_mant & 0x1FFF'FFFF'FFFF;
+    bf16_mant = static_cast<uint16_t>(fp64_mant >> 45);
+    if (discard_bits > 0x1000'0000'0000 ||
+        ((discard_bits == 0x1000'0000'0000) && ((bf16_mant & 0x1) == 0x1)))
+      bf16_mant += 1;
+
+    if (bf16_mant == 0x80) {
+      if (fp64_exp != 127) {
+        bf16_mant = 0;
+        fp64_exp++;
+      } else {
+        return bitsToBfloat16(bf16_sign ? 0xFF80 : 0x7F80);
+      }
+    }
+    fp64_exp += 127;
+
+    return bitsToBfloat16((bf16_sign << 15) | (fp64_exp << 7) | bf16_mant);
+  }
+
+public:
+  template <typename Ty, int rm>
+  static bfloat16 getBfloat16WithRoundingMode(const Ty &a) {
+
+    if (!a)
+      return bfloat16{0.0f};
+
+    constexpr SYCLRoundingMode roundingMode = static_cast<SYCLRoundingMode>(rm);
+
+    // Float.
+    if constexpr (std::is_same_v<Ty, float>) {
+      return getBFloat16FromFloatWithRoundingMode(a, roundingMode);
+    }
+    // Double.
+    else if constexpr (std::is_same_v<Ty, double>) {
+      static_assert(
+          roundingMode == SYCLRoundingMode::automatic ||
+              roundingMode == SYCLRoundingMode::rte,
+          "Only automatic/RTE rounding mode is supported for double type.");
+      return getBFloat16FromDoubleWithRTE(a);
+    }
+    // Half
+    else if constexpr (std::is_same_v<Ty, sycl::half>) {
+      // Convert half to float and then convert to bfloat16.
+      // Conversion of half to float is lossless as the latter
+      // have a wider dynamic range.
+      return getBFloat16FromFloatWithRoundingMode(static_cast<float>(a),
+                                                  roundingMode);
+    }
+    // Unsigned integral types.
+    else if constexpr (std::is_integral_v<Ty> && std::is_unsigned_v<Ty>) {
+      return getBFloat16FromUIntegralWithRoundingMode<Ty>(a, roundingMode);
+    }
+    // Signed integral types.
+    else if constexpr (std::is_integral_v<Ty> && std::is_signed_v<Ty>) {
+      return getBFloat16FromSIntegralWithRoundingMode<Ty>(a, roundingMode);
+    } else {
+      static_assert(std::is_integral_v<Ty> || std::is_floating_point_v<Ty>,
+                    "Only integral and floating point types are supported.");
+    }
+  }
+}; // class ConvertToBfloat16.
 } // namespace detail
 
 } // namespace ext::oneapi
diff --git a/sycl/include/sycl/ext/oneapi/bindless_images.hpp b/sycl/include/sycl/ext/oneapi/bindless_images.hpp
index 696301e5c3098..4a8f618a78959 100644
--- a/sycl/include/sycl/ext/oneapi/bindless_images.hpp
+++ b/sycl/include/sycl/ext/oneapi/bindless_images.hpp
@@ -181,7 +181,7 @@ void free_mipmap_mem(image_mem_handle handle, const sycl::queue &syclQueue);
  *  @return  Memory handle to the individual mipmap image
  */
 __SYCL_EXPORT image_mem_handle get_mip_level_mem_handle(
-    const image_mem_handle mipMem, const unsigned int level,
+    const image_mem_handle mipMem, unsigned int level,
     const sycl::device &syclDevice, const sycl::context &syclContext);
 
 /**
@@ -192,9 +192,9 @@ __SYCL_EXPORT image_mem_handle get_mip_level_mem_handle(
  *  @param   syclQueue The queue in which we created our memory handle
  *  @return  Memory handle to the individual mipmap image
  */
-__SYCL_EXPORT image_mem_handle get_mip_level_mem_handle(
-    const image_mem_handle mipMem, const unsigned int level,
-    const sycl::queue &syclQueue);
+__SYCL_EXPORT image_mem_handle
+get_mip_level_mem_handle(const image_mem_handle mipMem, unsigned int level,
+                         const sycl::queue &syclQueue);
 
 /**
  *  @brief   Import external memory taking an external memory handle (the type
@@ -1299,7 +1299,7 @@ template <typename DataT, typename HintT = DataT, typename CoordT>
 DataT fetch_image_array(const unsampled_image_handle &imageHandle
                         [[maybe_unused]],
                         const CoordT &coords [[maybe_unused]],
-                        const int arrayLayer [[maybe_unused]]) {
+                        int arrayLayer [[maybe_unused]]) {
   detail::assert_unsampled_coords<CoordT>();
   constexpr size_t coordSize = detail::coord_size<CoordT>();
   static_assert(coordSize == 1 || coordSize == 2,
@@ -1347,7 +1347,7 @@ DataT fetch_image_array(const unsampled_image_handle &imageHandle
  */
 template <typename DataT, typename HintT = DataT>
 DataT fetch_cubemap(const unsampled_image_handle &imageHandle,
-                    const int2 &coords, const unsigned int face) {
+                    const int2 &coords, unsigned int face) {
   return fetch_image_array<DataT, HintT>(imageHandle, coords, face);
 }
 
@@ -1442,7 +1442,7 @@ void write_image(unsampled_image_handle imageHandle [[maybe_unused]],
 template <typename DataT, typename CoordT>
 void write_image_array(unsampled_image_handle imageHandle [[maybe_unused]],
                        const CoordT &coords [[maybe_unused]],
-                       const int arrayLayer [[maybe_unused]],
+                       int arrayLayer [[maybe_unused]],
                        const DataT &color [[maybe_unused]]) {
   detail::assert_unsampled_coords<CoordT>();
   constexpr size_t coordSize = detail::coord_size<CoordT>();
@@ -1482,7 +1482,7 @@ void write_image_array(unsampled_image_handle imageHandle [[maybe_unused]],
  */
 template <typename DataT>
 void write_cubemap(unsampled_image_handle imageHandle, const sycl::int2 &coords,
-                   const int face, const DataT &color) {
+                   int face, const DataT &color) {
   return write_image_array(imageHandle, coords, face, color);
 }
 
@@ -1774,5 +1774,138 @@ inline event queue::ext_oneapi_copy(
       },
       CodeLoc);
 }
+
+inline event queue::ext_oneapi_wait_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    event DepEvent, const detail::code_location &CodeLoc) {
+  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
+  return submit(
+      [&](handler &CGH) {
+        CGH.depends_on(DepEvent);
+        CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle);
+      },
+      CodeLoc);
+}
+
+inline event queue::ext_oneapi_wait_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    const std::vector<event> &DepEvents, const detail::code_location &CodeLoc) {
+  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
+  return submit(
+      [&](handler &CGH) {
+        CGH.depends_on(DepEvents);
+        CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle);
+      },
+      CodeLoc);
+}
+
+inline event queue::ext_oneapi_wait_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    uint64_t WaitValue, const detail::code_location &CodeLoc) {
+  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
+  return submit(
+      [&](handler &CGH) {
+        CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle, WaitValue);
+      },
+      CodeLoc);
+}
+
+inline event queue::ext_oneapi_wait_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    uint64_t WaitValue, event DepEvent, const detail::code_location &CodeLoc) {
+  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
+  return submit(
+      [&](handler &CGH) {
+        CGH.depends_on(DepEvent);
+        CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle, WaitValue);
+      },
+      CodeLoc);
+}
+
+inline event queue::ext_oneapi_wait_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    uint64_t WaitValue, const std::vector<event> &DepEvents,
+    const detail::code_location &CodeLoc) {
+  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
+  return submit(
+      [&](handler &CGH) {
+        CGH.depends_on(DepEvents);
+        CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle, WaitValue);
+      },
+      CodeLoc);
+}
+
+inline event queue::ext_oneapi_signal_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    const detail::code_location &CodeLoc) {
+  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
+  return submit(
+      [&](handler &CGH) {
+        CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle);
+      },
+      CodeLoc);
+}
+
+inline event queue::ext_oneapi_signal_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    event DepEvent, const detail::code_location &CodeLoc) {
+  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
+  return submit(
+      [&](handler &CGH) {
+        CGH.depends_on(DepEvent);
+        CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle);
+      },
+      CodeLoc);
+}
+
+inline event queue::ext_oneapi_signal_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    const std::vector<event> &DepEvents, const detail::code_location &CodeLoc) {
+  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
+  return submit(
+      [&](handler &CGH) {
+        CGH.depends_on(DepEvents);
+        CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle);
+      },
+      CodeLoc);
+}
+
+inline event queue::ext_oneapi_signal_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    uint64_t SignalValue, const detail::code_location &CodeLoc) {
+  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
+  return submit(
+      [&](handler &CGH) {
+        CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle, SignalValue);
+      },
+      CodeLoc);
+}
+
+inline event queue::ext_oneapi_signal_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    uint64_t SignalValue, event DepEvent,
+    const detail::code_location &CodeLoc) {
+  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
+  return submit(
+      [&](handler &CGH) {
+        CGH.depends_on(DepEvent);
+        CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle, SignalValue);
+      },
+      CodeLoc);
+}
+
+inline event queue::ext_oneapi_signal_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    uint64_t SignalValue, const std::vector<event> &DepEvents,
+    const detail::code_location &CodeLoc) {
+  detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
+  return submit(
+      [&](handler &CGH) {
+        CGH.depends_on(DepEvents);
+        CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle, SignalValue);
+      },
+      CodeLoc);
+}
+
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/bindless_images_interop.hpp b/sycl/include/sycl/ext/oneapi/bindless_images_interop.hpp
index f7caddc1b5bf7..3992f5d93075d 100644
--- a/sycl/include/sycl/ext/oneapi/bindless_images_interop.hpp
+++ b/sycl/include/sycl/ext/oneapi/bindless_images_interop.hpp
@@ -16,6 +16,20 @@ namespace sycl {
 inline namespace _V1 {
 namespace ext::oneapi::experimental {
 
+// Types of external memory handles
+enum class external_mem_handle_type {
+  opaque_fd = 0,
+  win32_nt_handle = 1,
+  win32_nt_dx12_resource = 2,
+};
+
+// Types of external semaphore handles
+enum class external_semaphore_handle_type {
+  opaque_fd = 0,
+  win32_nt_handle = 1,
+  win32_nt_dx12_fence = 2,
+};
+
 /// Opaque interop memory handle type
 struct interop_mem_handle {
   using raw_handle_type = pi_uint64;
@@ -26,6 +40,7 @@ struct interop_mem_handle {
 struct interop_semaphore_handle {
   using raw_handle_type = pi_uint64;
   raw_handle_type raw_handle;
+  external_semaphore_handle_type handle_type;
 };
 
 // External resource file descriptor type
@@ -46,12 +61,14 @@ struct resource_win32_name {
 /// Opaque external memory descriptor type
 template <typename ResourceType> struct external_mem_descriptor {
   ResourceType external_resource;
+  external_mem_handle_type handle_type;
   size_t size_in_bytes;
 };
 
 // Opaque external semaphore descriptor type
 template <typename ResourceType> struct external_semaphore_descriptor {
   ResourceType external_resource;
+  external_semaphore_handle_type handle_type;
 };
 
 /// EVERYTHING BELOW IS DEPRECATED
diff --git a/sycl/include/sycl/ext/oneapi/experimental/architectures.def b/sycl/include/sycl/ext/oneapi/experimental/architectures.def
new file mode 100644
index 0000000000000..47741b0ba3778
--- /dev/null
+++ b/sycl/include/sycl/ext/oneapi/experimental/architectures.def
@@ -0,0 +1,179 @@
+
+// If a new element is added to this list of architectures:
+//
+// Update
+//   - "detail::min_<category>_architecture" (device_architecture.hpp) if needed
+//   - "detail::max_<category>_architecture" (device_architecture.hpp) if needed
+//   - sycl_ext_oneapi_device_architecture specification doc
+//   - "-fsycl-targets" description in sycl/doc/UsersManual.md
+//
+// Add
+//   - new value for -fsycl-targets option to the compiler driver in
+//     accordance with changes from sycl/doc/UsersManual.md and update the
+//     compiler driver tests
+//   - ___SYCL_TARGET_<ARCH>__ to the compiler driver and to all places below
+//   - the unique ID of the new architecture to the SYCL RT source code to
+//     support querying the device architecture through
+//     device::get_info<ext::oneapi::experimental::info::device::architecture>
+//   - alias of architecture if this is Intel GPU architecture in format
+//     intel_gpu_<intel_gpu_arch_version>
+//
+// Important note about keeping architecture IDs below unique:
+//   - the architecture ID must be a hex number with 16 digits
+//   - the architecture ID must suit the following template:
+//     0x AA BBBB CCCCCCCC DD (without spaces), where
+//       - AA is 2-digit ID of the architecture family which must be unique
+//       - BBBB is 4-digit number reserved for future modifications
+//         to keep uniqueness. It should be always 0000 for now
+//       - CCCCCCCC is 8-digit number of architecture itself. It must be
+//         unique for all architectures inside the family
+//       - DD is 2-digit number reserved for future unexpected modifications
+//         to keep uniqueness. It should be always 00 for now
+//
+__SYCL_ARCHITECTURE(unknown, 0x9900000000000000)
+//
+// Intel CPU architectures
+//
+// AA is 03,
+// CCCCCCCC is the architecture ID from the DEVICE_IP_VERSION extension of
+// the underlying backend
+// Note: CCCCCCCC for x86_64 consists of all zeros
+__SYCL_ARCHITECTURE(x86_64, 0x0300000000000000)
+__SYCL_ARCHITECTURE(intel_cpu_spr, 0x0300000000000800)
+__SYCL_ARCHITECTURE(intel_cpu_gnr, 0x0300000000000900)
+//
+// Intel GPU architectures
+//
+// AA is 00,
+// CCCCCCCC is GMDID of that architecture
+__SYCL_ARCHITECTURE(intel_gpu_bdw, 0x0000000200000000)           // Intel(R) microarchitecture code name Broadwell
+__SYCL_ARCHITECTURE(intel_gpu_skl, 0x0000000240000900)           // Intel(R) microarchitecture code name Skylake
+__SYCL_ARCHITECTURE(intel_gpu_kbl, 0x0000000240400900)           // Kaby Lake
+__SYCL_ARCHITECTURE(intel_gpu_cfl, 0x0000000240800900)           // Coffee Lake
+__SYCL_ARCHITECTURE(intel_gpu_apl, 0x0000000240c00000)           // Apollo Lake
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_bxt, intel_gpu_apl)          // Broxton
+__SYCL_ARCHITECTURE(intel_gpu_glk, 0x0000000241000000)           // Gemini Lake
+__SYCL_ARCHITECTURE(intel_gpu_whl, 0x0000000241400000)           // Whiskey Lake
+__SYCL_ARCHITECTURE(intel_gpu_aml, 0x0000000241800000)           // Amber Lake
+__SYCL_ARCHITECTURE(intel_gpu_cml, 0x0000000241c00000)           // Comet Lake
+__SYCL_ARCHITECTURE(intel_gpu_icllp, 0x00000002c0000000)         // Ice Lake
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_icl, intel_gpu_icllp)        // Ice Lake
+__SYCL_ARCHITECTURE(intel_gpu_ehl, 0x00000002c0800000)           // Elkhart Lake
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_jsl, intel_gpu_ehl)          // Jasper Lake
+__SYCL_ARCHITECTURE(intel_gpu_tgllp, 0x0000000300000000)         // Tiger Lake
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_tgl, intel_gpu_tgllp)        // Tiger Lake
+__SYCL_ARCHITECTURE(intel_gpu_rkl, 0x0000000300400000)           // Rocket Lake
+__SYCL_ARCHITECTURE(intel_gpu_adl_s, 0x0000000300800000)         // Alder Lake S
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_rpl_s, intel_gpu_adl_s)      // Raptor Lake
+__SYCL_ARCHITECTURE(intel_gpu_adl_p, 0x0000000300c00000)         // Alder Lake P
+__SYCL_ARCHITECTURE(intel_gpu_adl_n, 0x0000000301000000)         // Alder Lake N
+__SYCL_ARCHITECTURE(intel_gpu_dg1, 0x0000000302800000)           // DG1
+__SYCL_ARCHITECTURE(intel_gpu_acm_g10, 0x000000030dc00800)       // Alchemist G10
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_dg2_g10, intel_gpu_acm_g10)  // Alchemist G10
+__SYCL_ARCHITECTURE(intel_gpu_acm_g11, 0x000000030e000500)       // Alchemist G11
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_dg2_g11, intel_gpu_acm_g11)  // Alchemist G11
+__SYCL_ARCHITECTURE(intel_gpu_acm_g12, 0x000000030e400000)       // Alchemist G12
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_dg2_g12, intel_gpu_acm_g12)  // Alchemist G12
+__SYCL_ARCHITECTURE(intel_gpu_pvc, 0x000000030f000700)           // Ponte Vecchio
+__SYCL_ARCHITECTURE(intel_gpu_pvc_vg, 0x000000030f400700)        // Ponte Vecchio VG
+__SYCL_ARCHITECTURE(intel_gpu_mtl_u, 0x0000000311800400)         // Meteor Lake U
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_mtl_s, intel_gpu_mtl_u)      // Meteor Lake S
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_arl_u, intel_gpu_mtl_u)      // Arrow Lake U
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_arl_s, intel_gpu_mtl_u)      // Arrow Lake S
+__SYCL_ARCHITECTURE(intel_gpu_mtl_h, 0x0000000311c00400)         // Meteor Lake H
+__SYCL_ARCHITECTURE(intel_gpu_arl_h, 0x0000000312800400)         // Arrow Lake H
+__SYCL_ARCHITECTURE(intel_gpu_bmg_g21, 0x0000000500400400)       // Battlemage G21
+__SYCL_ARCHITECTURE(intel_gpu_lnl_m, 0x0000000501000400)         // Lunar Lake
+//
+// NVIDIA architectures
+//
+// AA is 01,
+// CCCCCCCC is the SM version ID of that architecture
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_50, 0x0100000000005000)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_52, 0x0100000000005200)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_53, 0x0100000000005300)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_60, 0x0100000000006000)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_61, 0x0100000000006100)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_62, 0x0100000000006200)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_70, 0x0100000000007000)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_72, 0x0100000000007200)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_75, 0x0100000000007500)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_80, 0x0100000000008000)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_86, 0x0100000000008600)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_87, 0x0100000000008700)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_89, 0x0100000000008900)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_90, 0x0100000000009000)
+__SYCL_ARCHITECTURE(nvidia_gpu_sm_90a, 0x01000000000090a0)
+//
+// AMD architectures
+//
+// AA is 02,
+// CCCCCCCC is the GFX version ID of that architecture
+__SYCL_ARCHITECTURE(amd_gpu_gfx700, 0x0200000000070000)
+__SYCL_ARCHITECTURE(amd_gpu_gfx701, 0x0200000000070100)
+__SYCL_ARCHITECTURE(amd_gpu_gfx702, 0x0200000000070200)
+__SYCL_ARCHITECTURE(amd_gpu_gfx801, 0x0200000000080100)
+__SYCL_ARCHITECTURE(amd_gpu_gfx802, 0x0200000000080200)
+__SYCL_ARCHITECTURE(amd_gpu_gfx803, 0x0200000000080300)
+__SYCL_ARCHITECTURE(amd_gpu_gfx805, 0x0200000000080500)
+__SYCL_ARCHITECTURE(amd_gpu_gfx810, 0x0200000000081000)
+__SYCL_ARCHITECTURE(amd_gpu_gfx900, 0x0200000000090000)
+__SYCL_ARCHITECTURE(amd_gpu_gfx902, 0x0200000000090200)
+__SYCL_ARCHITECTURE(amd_gpu_gfx904, 0x0200000000090400)
+__SYCL_ARCHITECTURE(amd_gpu_gfx906, 0x0200000000090600)
+__SYCL_ARCHITECTURE(amd_gpu_gfx908, 0x0200000000090800)
+__SYCL_ARCHITECTURE(amd_gpu_gfx909, 0x0200000000090900)
+__SYCL_ARCHITECTURE(amd_gpu_gfx90a, 0x0200000000090a00)
+__SYCL_ARCHITECTURE(amd_gpu_gfx90c, 0x0200000000090c00)
+__SYCL_ARCHITECTURE(amd_gpu_gfx940, 0x0200000000094000)
+__SYCL_ARCHITECTURE(amd_gpu_gfx941, 0x0200000000094100)
+__SYCL_ARCHITECTURE(amd_gpu_gfx942, 0x0200000000094200)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1010, 0x0200000000101000)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1011, 0x0200000000101100)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1012, 0x0200000000101200)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1013, 0x0200000000101300)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1030, 0x0200000000103000)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1031, 0x0200000000103100)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1032, 0x0200000000103200)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1033, 0x0200000000103300)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1034, 0x0200000000103400)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1035, 0x0200000000103500)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1036, 0x0200000000103600)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1100, 0x0200000000110000)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1101, 0x0200000000110100)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1102, 0x0200000000110200)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1103, 0x0200000000110300)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1150, 0x0200000000115000)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1151, 0x0200000000115100)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1200, 0x0200000000120000)
+__SYCL_ARCHITECTURE(amd_gpu_gfx1201, 0x0200000000120100)
+//
+// Aliases for Intel graphics architectures
+//
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_8_0_0, intel_gpu_bdw)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_0_9, intel_gpu_skl)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_1_9, intel_gpu_kbl)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_2_9, intel_gpu_cfl)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_3_0, intel_gpu_apl)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_4_0, intel_gpu_glk)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_5_0, intel_gpu_whl)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_6_0, intel_gpu_aml)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_9_7_0, intel_gpu_cml)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_11_0_0, intel_gpu_icllp)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_11_2_0, intel_gpu_ehl)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_0_0, intel_gpu_tgllp)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_1_0, intel_gpu_rkl)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_2_0, intel_gpu_adl_s)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_3_0, intel_gpu_adl_p)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_4_0, intel_gpu_adl_n)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_10_0, intel_gpu_dg1)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_55_8, intel_gpu_acm_g10)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_56_5, intel_gpu_acm_g11)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_57_0, intel_gpu_acm_g12)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_60_7, intel_gpu_pvc)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_61_7, intel_gpu_pvc_vg)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_70_4, intel_gpu_mtl_u)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_71_4, intel_gpu_mtl_h)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_12_74_4, intel_gpu_arl_h)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_20_1_4, intel_gpu_bmg_g21)
+__SYCL_ARCHITECTURE_ALIAS(intel_gpu_20_4_4, intel_gpu_lnl_m)
\ No newline at end of file
diff --git a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp
index 2b611f46ddadd..fb4b49a44d4d3 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/bfloat16_math.hpp
@@ -9,6 +9,7 @@
 #pragma once
 
 #include <sycl/builtins.hpp>            // for ceil, cos, exp, exp10, exp2
+#include <sycl/builtins_utils_vec.hpp>  // For simplify_if_swizzle, is_swizzle
 #include <sycl/detail/memcpy.hpp>       // sycl::detail::memcpy
 #include <sycl/ext/oneapi/bfloat16.hpp> // for bfloat16, bfloat16ToBits
 #include <sycl/marray.hpp>              // for marray
@@ -30,6 +31,17 @@ uint32_t to_uint32_t(sycl::marray<bfloat16, N> x, size_t start) {
 }
 } // namespace detail
 
+// Trait to check if the type is a vector or swizzle of bfloat16.
+template <typename T>
+constexpr bool is_vec_or_swizzle_bf16_v =
+    sycl::detail::is_vec_or_swizzle_v<T> &&
+    sycl::detail::is_valid_elem_type_v<T, bfloat16>;
+
+template <typename T>
+constexpr int num_elements_v = sycl::detail::num_elements<T>::value;
+
+/******************* isnan ********************/
+
 // According to bfloat16 format, NAN value's exponent field is 0xFF and
 // significand has non-zero bits.
 template <typename T>
@@ -46,6 +58,21 @@ template <size_t N> sycl::marray<bool, N> isnan(sycl::marray<bfloat16, N> x) {
   return res;
 }
 
+// Element-wise isnan for a vec or swizzle of bfloat16 (N elements).
+template <typename T, int N = num_elements_v<T>>
+std::enable_if_t<is_vec_or_swizzle_bf16_v<T>, sycl::vec<int16_t, N>>
+isnan(T x) {
+  sycl::vec<int16_t, N> res;
+  for (int i = 0; i < N; i++) {
+    // The per-element isnan result is 0 or 1, but the spec requires
+    // isnan() of a vec/swizzle to return -1 (NaN) or 0 for each element.
+    res[i] = isnan(x[i]) ? -1 : 0;
+  }
+  return res;
+}
+
+/******************* fabs ********************/
+
 template <typename T>
 std::enable_if_t<std::is_same_v<T, bfloat16>, T> fabs(T x) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
@@ -89,6 +116,19 @@ sycl::marray<bfloat16, N> fabs(sycl::marray<bfloat16, N> x) {
   return res;
 }
 
+// Element-wise fabs for a vec or swizzle of bfloat16; returns a vec.
+template <typename T, int N = num_elements_v<T>>
+std::enable_if_t<is_vec_or_swizzle_bf16_v<T>, sycl::vec<bfloat16, N>>
+fabs(T x) {
+  sycl::vec<bfloat16, N> res;
+  for (int i = 0; i < N; i++) {
+    res[i] = fabs(x[i]);
+  }
+  return res;
+}
+
+/******************* fmin ********************/
+
 template <typename T>
 std::enable_if_t<std::is_same_v<T, bfloat16>, T> fmin(T x, T y) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
@@ -146,6 +186,22 @@ sycl::marray<bfloat16, N> fmin(sycl::marray<bfloat16, N> x,
   return res;
 }
 
+// Element-wise fmin over any mix of bfloat16 vecs/swizzles of equal size.
+template <typename T1, typename T2, int N1 = num_elements_v<T1>,
+          int N2 = num_elements_v<T2>>
+std::enable_if_t<is_vec_or_swizzle_bf16_v<T1> && is_vec_or_swizzle_bf16_v<T2> &&
+                     N1 == N2,
+                 sycl::vec<bfloat16, N1>>
+fmin(T1 x, T2 y) {
+  sycl::vec<bfloat16, N1> res;
+  for (int i = 0; i < N1; i++) {
+    res[i] = fmin(x[i], y[i]);
+  }
+  return res;
+}
+
+/******************* fmax ********************/
+
 template <typename T>
 std::enable_if_t<std::is_same_v<T, bfloat16>, T> fmax(T x, T y) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
@@ -202,6 +258,22 @@ sycl::marray<bfloat16, N> fmax(sycl::marray<bfloat16, N> x,
   return res;
 }
 
+// Element-wise fmax over any mix of bfloat16 vecs/swizzles of equal size.
+template <typename T1, typename T2, int N1 = num_elements_v<T1>,
+          int N2 = num_elements_v<T2>>
+std::enable_if_t<is_vec_or_swizzle_bf16_v<T1> && is_vec_or_swizzle_bf16_v<T2> &&
+                     N1 == N2,
+                 sycl::vec<bfloat16, N1>>
+fmax(T1 x, T2 y) {
+  sycl::vec<bfloat16, N1> res;
+  for (int i = 0; i < N1; i++) {
+    res[i] = fmax(x[i], y[i]);
+  }
+  return res;
+}
+
+/******************* fma *********************/
+
 template <typename T>
 std::enable_if_t<std::is_same_v<T, bfloat16>, T> fma(T x, T y, T z) {
 #if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
@@ -248,6 +320,22 @@ sycl::marray<bfloat16, N> fma(sycl::marray<bfloat16, N> x,
   return res;
 }
 
+// Element-wise fma over any mix of bfloat16 vecs/swizzles of equal size.
+template <typename T1, typename T2, typename T3, int N1 = num_elements_v<T1>,
+          int N2 = num_elements_v<T2>, int N3 = num_elements_v<T3>>
+std::enable_if_t<is_vec_or_swizzle_bf16_v<T1> && is_vec_or_swizzle_bf16_v<T2> &&
+                     is_vec_or_swizzle_bf16_v<T3> && N1 == N2 && N2 == N3,
+                 sycl::vec<bfloat16, N1>>
+fma(T1 x, T2 y, T3 z) {
+  sycl::vec<bfloat16, N1> res;
+  for (int i = 0; i < N1; i++) {
+    res[i] = fma(x[i], y[i], z[i]);
+  }
+  return res;
+}
+
+/******************* unary math operations ********************/
+
 #define BFLOAT16_MATH_FP32_WRAPPERS(op)                                        \
   template <typename T>                                                        \
   std::enable_if_t<std::is_same<T, bfloat16>::value, T> op(T x) {              \
@@ -264,37 +352,77 @@ sycl::marray<bfloat16, N> fma(sycl::marray<bfloat16, N> x,
     return res;                                                                \
   }
 
+#define BFLOAT16_MATH_FP32_WRAPPERS_VEC(op)                                    \
+  /* Element-wise op for a vec or swizzle of bfloat16; returns a vec. */       \
+  template <typename T, int N = num_elements_v<T>>                             \
+  std::enable_if_t<is_vec_or_swizzle_bf16_v<T>, sycl::vec<bfloat16, N>> op(    \
+      T x) {                                                                   \
+    sycl::vec<bfloat16, N> res;                                                \
+    for (int i = 0; i < N; i++) {                                              \
+      res[i] = op(x[i]);                                                       \
+    }                                                                          \
+    return res;                                                                \
+  }
+
 BFLOAT16_MATH_FP32_WRAPPERS(ceil)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(ceil)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(ceil)
+
 BFLOAT16_MATH_FP32_WRAPPERS(cos)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(cos)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(cos)
+
 BFLOAT16_MATH_FP32_WRAPPERS(exp)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp)
+
 BFLOAT16_MATH_FP32_WRAPPERS(exp10)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp10)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp10)
+
 BFLOAT16_MATH_FP32_WRAPPERS(exp2)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(exp2)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(exp2)
+
 BFLOAT16_MATH_FP32_WRAPPERS(floor)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(floor)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(floor)
+
 BFLOAT16_MATH_FP32_WRAPPERS(log)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(log)
+
 BFLOAT16_MATH_FP32_WRAPPERS(log2)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log2)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(log2)
+
 BFLOAT16_MATH_FP32_WRAPPERS(log10)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(log10)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(log10)
+
 BFLOAT16_MATH_FP32_WRAPPERS(rint)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rint)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(rint)
+
 BFLOAT16_MATH_FP32_WRAPPERS(rsqrt)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(rsqrt)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(rsqrt)
+
 BFLOAT16_MATH_FP32_WRAPPERS(sin)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sin)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(sin)
+
 BFLOAT16_MATH_FP32_WRAPPERS(sqrt)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(sqrt)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(sqrt)
+
 BFLOAT16_MATH_FP32_WRAPPERS(trunc)
 BFLOAT16_MATH_FP32_WRAPPERS_MARRAY(trunc)
+BFLOAT16_MATH_FP32_WRAPPERS_VEC(trunc)
 
 #undef BFLOAT16_MATH_FP32_WRAPPERS
 #undef BFLOAT16_MATH_FP32_WRAPPERS_MARRAY
+#undef BFLOAT16_MATH_FP32_WRAPPERS_VEC
 } // namespace ext::oneapi::experimental
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp
index c6d367dbda959..0d83f9f84f790 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/device_architecture.hpp
@@ -17,183 +17,11 @@ inline namespace _V1 {
 namespace ext::oneapi::experimental {
 
 enum class architecture : uint64_t {
-  // If new element is added to this enum:
-  //
-  // Update
-  //   - "detail::min_<category>_architecture" below if needed
-  //   - "detail::max_<category>_architecture" below if needed
-  //   - sycl_ext_oneapi_device_architecture specification doc
-  //   - "-fsycl-targets" description in sycl/doc/UsersManual.md
-  //
-  // Add
-  //   - new value for -fsycl-targets option to the compiler driver in
-  //     accordance with changes from sycl/doc/UsersManual.md and update the
-  //     compiler driver tests
-  //   - ___SYCL_TARGET_<ARCH>__ to the compiler driver and to all places below
-  //   - the unique ID of the new architecture to the SYCL RT source code to
-  //     support querying the device architecture through
-  //     device::get_info<ext::oneapi::experimental::info::device::architecture>
-  //   - alias of architecture if this is Intel GPU architecture in format
-  //     intel_gpu_<intel_gpu_arch_version>
-  //
-  // Important note about keeping architecture IDs below unique:
-  //   - the architecture ID must be a hex number with 16 digits
-  //   - the architecture ID must suit the following template:
-  //     0x AA BBBB CCCCCCCC DD (without spaces), where
-  //       - AA is 2-digit ID of the architecture family which must be unique
-  //       - BBBB is 4-digit number reserved for future modifications
-  //         to keep uniqueness. It should be always 0000 for now
-  //       - CCCCCCCC is 8-digit number of architecture itself. It must be
-  //         unique for all architectures inside the family
-  //       - DD is 2-digit number reserved for future unexpected modifications
-  //         to keep uniqueness. It should be always 00 for now
-  //
-  x86_64 = 0x9900000000000000,
-  //
-  // Intel CPU architectures
-  //
-  // AA is 03,
-  // CCCCCCCC is the architecture ID from the DEVICE_IP_VERSION extension of
-  // underlied backend
-  intel_cpu_spr = 0x0300000000000800,
-  intel_cpu_gnr = 0x0300000000000900,
-  //
-  // Intel GPU architectures
-  //
-  // AA is 00,
-  // CCCCCCCC is GMDID of that architecture
-  intel_gpu_bdw =
-      0x0000000200000000, // Intel(R) microarchitecture code name Broadwell
-  intel_gpu_skl =
-      0x0000000240000900, // Intel(R) microarchitecture code name Skylake
-  intel_gpu_kbl = 0x0000000240400900,     // Kaby Lake
-  intel_gpu_cfl = 0x0000000240800900,     // Coffee Lake
-  intel_gpu_apl = 0x0000000240c00000,     // Apollo Lake
-  intel_gpu_bxt = intel_gpu_apl,          // Broxton
-  intel_gpu_glk = 0x0000000241000000,     // Gemini Lake
-  intel_gpu_whl = 0x0000000241400000,     // Whiskey Lake
-  intel_gpu_aml = 0x0000000241800000,     // Amber Lake
-  intel_gpu_cml = 0x0000000241c00000,     // Comet Lake
-  intel_gpu_icllp = 0x00000002c0000000,   // Ice Lake
-  intel_gpu_icl = intel_gpu_icllp,        // Ice Lake
-  intel_gpu_ehl = 0x00000002c0800000,     // Elkhart Lake
-  intel_gpu_jsl = intel_gpu_ehl,          // Jasper Lake
-  intel_gpu_tgllp = 0x0000000300000000,   // Tiger Lake
-  intel_gpu_tgl = intel_gpu_tgllp,        // Tiger Lake
-  intel_gpu_rkl = 0x0000000300400000,     // Rocket Lake
-  intel_gpu_adl_s = 0x0000000300800000,   // Alder Lake S
-  intel_gpu_rpl_s = intel_gpu_adl_s,      // Raptor Lake
-  intel_gpu_adl_p = 0x0000000300c00000,   // Alder Lake P
-  intel_gpu_adl_n = 0x0000000301000000,   // Alder Lake N
-  intel_gpu_dg1 = 0x0000000302800000,     // DG1
-  intel_gpu_acm_g10 = 0x000000030dc00800, // Alchemist G10
-  intel_gpu_dg2_g10 = intel_gpu_acm_g10,  // Alchemist G10
-  intel_gpu_acm_g11 = 0x000000030e000500, // Alchemist G11
-  intel_gpu_dg2_g11 = intel_gpu_acm_g11,  // Alchemist G11
-  intel_gpu_acm_g12 = 0x000000030e400000, // Alchemist G12
-  intel_gpu_dg2_g12 = intel_gpu_acm_g12,  // Alchemist G12
-  intel_gpu_pvc = 0x000000030f000700,     // Ponte Vecchio
-  intel_gpu_pvc_vg = 0x000000030f400700,  // Ponte Vecchio VG
-  intel_gpu_mtl_u = 0x0000000311800400,   // Meteor Lake U
-  intel_gpu_mtl_s = intel_gpu_mtl_u,      // Meteor Lake S
-  intel_gpu_arl_u = intel_gpu_mtl_u,      // Arrow Lake U
-  intel_gpu_arl_s = intel_gpu_mtl_u,      // Arrow Lake S
-  intel_gpu_mtl_h = 0x0000000311c00400,   // Meteor Lake H
-  intel_gpu_arl_h = 0x0000000312800400,   // Arrow Lake H
-  intel_gpu_bmg_g21 = 0x0000000500400400, // Battlemage G21
-  intel_gpu_lnl_m = 0x0000000501000400,   // Lunar Lake
-  //
-  // NVIDIA architectures
-  //
-  // AA is 01,
-  // CCCCCCCC is the SM version ID of that architecture
-  nvidia_gpu_sm_50 = 0x0100000000005000,
-  nvidia_gpu_sm_52 = 0x0100000000005200,
-  nvidia_gpu_sm_53 = 0x0100000000005300,
-  nvidia_gpu_sm_60 = 0x0100000000006000,
-  nvidia_gpu_sm_61 = 0x0100000000006100,
-  nvidia_gpu_sm_62 = 0x0100000000006200,
-  nvidia_gpu_sm_70 = 0x0100000000007000,
-  nvidia_gpu_sm_72 = 0x0100000000007200,
-  nvidia_gpu_sm_75 = 0x0100000000007500,
-  nvidia_gpu_sm_80 = 0x0100000000008000,
-  nvidia_gpu_sm_86 = 0x0100000000008600,
-  nvidia_gpu_sm_87 = 0x0100000000008700,
-  nvidia_gpu_sm_89 = 0x0100000000008900,
-  nvidia_gpu_sm_90 = 0x0100000000009000,
-  //
-  // AMD architectures
-  //
-  // AA is 02,
-  // CCCCCCCC is the GFX version ID of that architecture
-  amd_gpu_gfx700 = 0x0200000000070000,
-  amd_gpu_gfx701 = 0x0200000000070100,
-  amd_gpu_gfx702 = 0x0200000000070200,
-  amd_gpu_gfx801 = 0x0200000000080100,
-  amd_gpu_gfx802 = 0x0200000000080200,
-  amd_gpu_gfx803 = 0x0200000000080300,
-  amd_gpu_gfx805 = 0x0200000000080500,
-  amd_gpu_gfx810 = 0x0200000000081000,
-  amd_gpu_gfx900 = 0x0200000000090000,
-  amd_gpu_gfx902 = 0x0200000000090200,
-  amd_gpu_gfx904 = 0x0200000000090400,
-  amd_gpu_gfx906 = 0x0200000000090600,
-  amd_gpu_gfx908 = 0x0200000000090800,
-  amd_gpu_gfx909 = 0x0200000000090900,
-  amd_gpu_gfx90a = 0x0200000000090a00,
-  amd_gpu_gfx90c = 0x0200000000090c00,
-  amd_gpu_gfx940 = 0x0200000000094000,
-  amd_gpu_gfx941 = 0x0200000000094100,
-  amd_gpu_gfx942 = 0x0200000000094200,
-  amd_gpu_gfx1010 = 0x0200000000101000,
-  amd_gpu_gfx1011 = 0x0200000000101100,
-  amd_gpu_gfx1012 = 0x0200000000101200,
-  amd_gpu_gfx1013 = 0x0200000000101300,
-  amd_gpu_gfx1030 = 0x0200000000103000,
-  amd_gpu_gfx1031 = 0x0200000000103100,
-  amd_gpu_gfx1032 = 0x0200000000103200,
-  amd_gpu_gfx1033 = 0x0200000000103300,
-  amd_gpu_gfx1034 = 0x0200000000103400,
-  amd_gpu_gfx1035 = 0x0200000000103500,
-  amd_gpu_gfx1036 = 0x0200000000103600,
-  amd_gpu_gfx1100 = 0x0200000000110000,
-  amd_gpu_gfx1101 = 0x0200000000110100,
-  amd_gpu_gfx1102 = 0x0200000000110200,
-  amd_gpu_gfx1103 = 0x0200000000110300,
-  amd_gpu_gfx1150 = 0x0200000000115000,
-  amd_gpu_gfx1151 = 0x0200000000115100,
-  amd_gpu_gfx1200 = 0x0200000000120000,
-  amd_gpu_gfx1201 = 0x0200000000120100,
-  //
-  // Aliases for Intel graphics architectures
-  //
-  intel_gpu_8_0_0 = intel_gpu_bdw,
-  intel_gpu_9_0_9 = intel_gpu_skl,
-  intel_gpu_9_1_9 = intel_gpu_kbl,
-  intel_gpu_9_2_9 = intel_gpu_cfl,
-  intel_gpu_9_3_0 = intel_gpu_apl,
-  intel_gpu_9_4_0 = intel_gpu_glk,
-  intel_gpu_9_5_0 = intel_gpu_whl,
-  intel_gpu_9_6_0 = intel_gpu_aml,
-  intel_gpu_9_7_0 = intel_gpu_cml,
-  intel_gpu_11_0_0 = intel_gpu_icllp,
-  intel_gpu_11_2_0 = intel_gpu_ehl,
-  intel_gpu_12_0_0 = intel_gpu_tgllp,
-  intel_gpu_12_1_0 = intel_gpu_rkl,
-  intel_gpu_12_2_0 = intel_gpu_adl_s,
-  intel_gpu_12_3_0 = intel_gpu_adl_p,
-  intel_gpu_12_4_0 = intel_gpu_adl_n,
-  intel_gpu_12_10_0 = intel_gpu_dg1,
-  intel_gpu_12_55_8 = intel_gpu_acm_g10,
-  intel_gpu_12_56_5 = intel_gpu_acm_g11,
-  intel_gpu_12_57_0 = intel_gpu_acm_g12,
-  intel_gpu_12_60_7 = intel_gpu_pvc,
-  intel_gpu_12_61_7 = intel_gpu_pvc_vg,
-  intel_gpu_12_70_4 = intel_gpu_mtl_u,
-  intel_gpu_12_71_4 = intel_gpu_mtl_h,
-  intel_gpu_12_74_4 = intel_gpu_arl_h,
-  intel_gpu_20_1_4 = intel_gpu_bmg_g21,
-  intel_gpu_20_4_4 = intel_gpu_lnl_m,
+#define __SYCL_ARCHITECTURE(NAME, VAL) NAME = VAL,
+#define __SYCL_ARCHITECTURE_ALIAS(NAME, VAL) NAME = VAL,
+#include <sycl/ext/oneapi/experimental/architectures.def>
+#undef __SYCL_ARCHITECTURE
+#undef __SYCL_ARCHITECTURE_ALIAS
 };
 
 enum class arch_category {
@@ -231,7 +59,7 @@ static constexpr ext::oneapi::experimental::architecture
         ext::oneapi::experimental::architecture::nvidia_gpu_sm_50;
 static constexpr ext::oneapi::experimental::architecture
     max_nvidia_gpu_architecture =
-        ext::oneapi::experimental::architecture::nvidia_gpu_sm_90;
+        ext::oneapi::experimental::architecture::nvidia_gpu_sm_90a;
 
 static constexpr ext::oneapi::experimental::architecture
     min_amd_gpu_architecture =
diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp
index d33502b7e3f24..2885a7673795b 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/group_helpers_sorters.hpp
@@ -15,9 +15,10 @@
 #include <sycl/detail/pi.h>             // for PI_ERROR_INVALID_DEVICE
 #include <sycl/exception.hpp>           // for sycl_category, exception
 #include <sycl/ext/oneapi/bfloat16.hpp> // for bfloat16
-#include <sycl/memory_enums.hpp>        // for memory_scope
-#include <sycl/range.hpp>               // for range
-#include <sycl/sycl_span.hpp>           // for span
+#include <sycl/ext/oneapi/properties/properties.hpp>
+#include <sycl/memory_enums.hpp> // for memory_scope
+#include <sycl/range.hpp>        // for range
+#include <sycl/sycl_span.hpp>    // for span
 
 #ifdef __SYCL_DEVICE_ONLY__
 #include <sycl/detail/group_sort_impl.hpp>
@@ -36,6 +37,54 @@ namespace sycl {
 inline namespace _V1 {
 namespace ext::oneapi::experimental {
 
+enum class group_algorithm_data_placement { blocked, striped };
+
+struct input_data_placement_key
+    : detail::compile_time_property_key<detail::PropKind::InputDataPlacement> {
+  template <group_algorithm_data_placement Placement>
+  using value_t =
+      property_value<input_data_placement_key,
+                     std::integral_constant<int, static_cast<int>(Placement)>>;
+};
+
+struct output_data_placement_key
+    : detail::compile_time_property_key<detail::PropKind::OutputDataPlacement> {
+  template <group_algorithm_data_placement Placement>
+  using value_t =
+      property_value<output_data_placement_key,
+                     std::integral_constant<int, static_cast<int>(Placement)>>;
+};
+
+template <group_algorithm_data_placement Placement>
+inline constexpr input_data_placement_key::value_t<Placement>
+    input_data_placement;
+
+template <group_algorithm_data_placement Placement>
+inline constexpr output_data_placement_key::value_t<Placement>
+    output_data_placement;
+
+namespace detail {
+
+template <typename Properties>
+constexpr bool isInputBlocked(Properties properties) {
+  if constexpr (properties.template has_property<input_data_placement_key>())
+    return properties.template get_property<input_data_placement_key>() ==
+           input_data_placement<group_algorithm_data_placement::blocked>;
+  else
+    return true;
+}
+
+template <typename Properties>
+constexpr bool isOutputBlocked(Properties properties) {
+  if constexpr (properties.template has_property<output_data_placement_key>())
+    return properties.template get_property<output_data_placement_key>() ==
+           output_data_placement<group_algorithm_data_placement::blocked>;
+  else
+    return true;
+}
+
+} // namespace detail
+
 // ---- group helpers
 template <typename Group, size_t Extent> class group_with_scratchpad {
   Group g;
@@ -48,7 +97,7 @@ template <typename Group, size_t Extent> class group_with_scratchpad {
   sycl::span<std::byte, Extent> get_memory() const { return scratch; }
 };
 
-// ---- sorters
+// Default sorter provided by the first version of the extension specification.
 template <typename Compare = std::less<>> class default_sorter {
   Compare comp;
   sycl::span<std::byte> scratch;
@@ -63,10 +112,10 @@ template <typename Compare = std::less<>> class default_sorter {
   void operator()([[maybe_unused]] Group g, [[maybe_unused]] Ptr first,
                   [[maybe_unused]] Ptr last) {
 #ifdef __SYCL_DEVICE_ONLY__
-    // Per extension specification if scratch size is less than the value
-    // returned by memory_required then behavior is undefined, so we don't check
-    // that the scratch size statisfies the requirement.
-    sycl::detail::merge_sort(g, first, last - first, comp, scratch.data());
+    using T = typename sycl::detail::GetValueType<Ptr>::type;
+    size_t n = last - first;
+    T *scratch_begin = sycl::detail::align_scratch<T>(scratch, g, n);
+    sycl::detail::merge_sort(g, first, n, comp, scratch_begin);
 #else
     throw sycl::exception(
         std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
@@ -77,16 +126,14 @@ template <typename Compare = std::less<>> class default_sorter {
   template <typename Group, typename T>
   T operator()([[maybe_unused]] Group g, T val) {
 #ifdef __SYCL_DEVICE_ONLY__
-    // Per extension specification if scratch size is less than the value
-    // returned by memory_required then behavior is undefined, so we don't check
-    // that the scratch size statisfies the requirement.
+    std::size_t local_id = g.get_local_linear_id();
     auto range_size = g.get_local_range().size();
-    size_t local_id = g.get_local_linear_id();
-    T *temp = reinterpret_cast<T *>(scratch.data());
-    ::new (temp + local_id) T(val);
-    sycl::detail::merge_sort(g, temp, range_size, comp,
-                             scratch.data() + range_size * sizeof(T));
-    val = temp[local_id];
+    T *scratch_begin = sycl::detail::align_scratch<T>(
+        scratch, g, /* output storage and temporary storage */ 2 * range_size);
+    scratch_begin[local_id] = val;
+    sycl::detail::merge_sort(g, scratch_begin, range_size, comp,
+                             scratch_begin + range_size);
+    val = scratch_begin[local_id];
 #else
     throw sycl::exception(
         std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
@@ -122,6 +169,7 @@ template <typename T> struct ConvertToComp<T, sorting_order::descending> {
 };
 } // namespace detail
 
+// Radix sorter provided by the first version of the extension specification.
 template <typename ValT, sorting_order OrderT = sorting_order::ascending,
           unsigned int BitsPerPass = 4>
 class radix_sorter {
@@ -199,6 +247,318 @@ class radix_sorter {
   }
 };
 
+// Default sorters provided by the second version of the extension
+// specification.
+namespace default_sorters {
+
+template <typename CompareT = std::less<>> class joint_sorter {
+  CompareT comp;
+  sycl::span<std::byte> scratch;
+
+public:
+  template <size_t Extent>
+  joint_sorter(sycl::span<std::byte, Extent> scratch_,
+               CompareT comp_ = CompareT())
+      : comp(comp_), scratch(scratch_) {}
+
+  template <typename Group, typename Ptr>
+  void operator()([[maybe_unused]] Group g, [[maybe_unused]] Ptr first,
+                  [[maybe_unused]] Ptr last) {
+#ifdef __SYCL_DEVICE_ONLY__
+    using T = typename sycl::detail::GetValueType<Ptr>::type;
+    size_t n = last - first;
+    T *scratch_begin = sycl::detail::align_scratch<T>(scratch, g, n);
+    sycl::detail::merge_sort(g, first, n, comp, scratch_begin);
+#else
+    throw sycl::exception(
+        std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
+        "default_sorter constructor is not supported on host device.");
+#endif
+  }
+
+  template <typename T>
+  static size_t memory_required(sycl::memory_scope, size_t range_size) {
+    return range_size * sizeof(T) + alignof(T);
+  }
+};
+
+template <typename T, typename CompareT = std::less<>,
+          std::size_t ElementsPerWorkItem = 1>
+class group_sorter {
+  CompareT comp;
+  sycl::span<std::byte> scratch;
+
+public:
+  template <std::size_t Extent>
+  group_sorter(sycl::span<std::byte, Extent> scratch_,
+               CompareT comp_ = CompareT{})
+      : comp(comp_), scratch(scratch_) {}
+
+  template <typename Group> T operator()([[maybe_unused]] Group g, T val) {
+#ifdef __SYCL_DEVICE_ONLY__
+    std::size_t local_id = g.get_local_linear_id();
+    auto range_size = g.get_local_range().size();
+    T *scratch_begin = sycl::detail::align_scratch<T>(
+        scratch, g, /* output storage and temporary storage */ 2 * range_size);
+    scratch_begin[local_id] = val;
+    sycl::detail::merge_sort(g, scratch_begin, range_size, comp,
+                             scratch_begin + range_size);
+    val = scratch_begin[local_id];
+#else
+    throw sycl::exception(
+        std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
+        "default_sorter operator() is not supported on host device.");
+#endif
+    return val;
+  }
+
+  template <typename Group, typename Properties>
+  void operator()([[maybe_unused]] Group g,
+                  [[maybe_unused]] sycl::span<T, ElementsPerWorkItem> values,
+                  [[maybe_unused]] Properties properties) {
+#ifdef __SYCL_DEVICE_ONLY__
+    std::size_t local_id = g.get_local_linear_id();
+    auto wg_size = g.get_local_range().size();
+    auto number_of_elements = wg_size * ElementsPerWorkItem;
+    T *scratch_begin = sycl::detail::align_scratch<T>(
+        scratch, g,
+        /* output storage and temporary storage */ 2 * number_of_elements);
+    for (std::uint32_t i = 0; i < ElementsPerWorkItem; ++i)
+      scratch_begin[local_id * ElementsPerWorkItem + i] = values[i];
+    sycl::detail::merge_sort(g, scratch_begin, number_of_elements, comp,
+                             scratch_begin + number_of_elements);
+
+    std::size_t shift{};
+    for (std::uint32_t i = 0; i < ElementsPerWorkItem; ++i) {
+      if constexpr (detail::isOutputBlocked(properties)) {
+        shift = local_id * ElementsPerWorkItem + i;
+      } else {
+        shift = i * wg_size + local_id;
+      }
+      values[i] = scratch_begin[shift];
+    }
+#endif
+  }
+
+  static std::size_t memory_required(sycl::memory_scope scope,
+                                     size_t range_size) {
+    return 2 * joint_sorter<>::template memory_required<T>(
+                   scope, range_size * ElementsPerWorkItem);
+  }
+};
+
+template <typename KeyTy, typename ValueTy, typename CompareT = std::less<>,
+          std::size_t ElementsPerWorkItem = 1>
+class group_key_value_sorter {
+  CompareT comp;
+  sycl::span<std::byte> scratch;
+
+public:
+  template <std::size_t Extent>
+  group_key_value_sorter(sycl::span<std::byte, Extent> scratch_,
+                         CompareT comp_ = {})
+      : comp(comp_), scratch(scratch_) {}
+
+  template <typename Group>
+  std::tuple<KeyTy, ValueTy> operator()(Group g, KeyTy key, ValueTy value) {
+    static_assert(ElementsPerWorkItem == 1,
+                  "ElementsPerWorkItem must be equal 1");
+
+    using KeyValue = std::tuple<KeyTy, ValueTy>;
+    auto comp_key_value = [this_comp = this->comp](const KeyValue &lhs,
+                                                   const KeyValue &rhs) {
+      return this_comp(std::get<0>(lhs), std::get<0>(rhs));
+    };
+    return group_sorter<KeyValue, decltype(comp_key_value),
+                        ElementsPerWorkItem>(scratch, comp_key_value)(
+        g, KeyValue(key, value));
+  }
+
+  static std::size_t memory_required(sycl::memory_scope scope,
+                                     std::size_t range_size) {
+    return group_sorter<std::tuple<KeyTy, ValueTy>, CompareT,
+                        ElementsPerWorkItem>::memory_required(scope,
+                                                              range_size);
+  }
+};
+} // namespace default_sorters
+
+// Radix sorters provided by the second version of the extension specification.
+namespace radix_sorters {
+
+template <typename ValT, sorting_order OrderT = sorting_order::ascending,
+          unsigned int BitsPerPass = 4>
+class joint_sorter {
+
+  sycl::span<std::byte> scratch;
+  uint32_t first_bit = 0;
+  uint32_t last_bit = 0;
+
+  static constexpr uint32_t bits = BitsPerPass;
+  using bitset_t = std::bitset<sizeof(ValT) * CHAR_BIT>;
+
+public:
+  template <std::size_t Extent>
+  joint_sorter(sycl::span<std::byte, Extent> scratch_,
+               const bitset_t mask = bitset_t{}.set())
+      : scratch(scratch_) {
+    static_assert((std::is_arithmetic<ValT>::value ||
+                   std::is_same<ValT, sycl::half>::value ||
+                   std::is_same<ValT, sycl::ext::oneapi::bfloat16>::value),
+                  "radix sort is not supported for the given type");
+
+    for (first_bit = 0; first_bit < mask.size() && !mask[first_bit];
+         ++first_bit)
+      ;
+    for (last_bit = first_bit; last_bit < mask.size() && mask[last_bit];
+         ++last_bit)
+      ;
+  }
+
+  template <typename GroupT, typename PtrT>
+  void operator()([[maybe_unused]] GroupT g, [[maybe_unused]] PtrT first,
+                  [[maybe_unused]] PtrT last) {
+#ifdef __SYCL_DEVICE_ONLY__
+    sycl::detail::privateDynamicSort</*is_key_value=*/false,
+                                     OrderT == sorting_order::ascending,
+                                     /*empty*/ 1, BitsPerPass>(
+        g, first, /*empty*/ first, last - first, scratch.data(), first_bit,
+        last_bit);
+#else
+    throw sycl::exception(
+        std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
+        "radix_sorter is not supported on host device.");
+#endif
+  }
+
+  static constexpr std::size_t
+  memory_required([[maybe_unused]] sycl::memory_scope scope,
+                  std::size_t range_size) {
+    return range_size * sizeof(ValT) +
+           (1 << bits) * range_size * sizeof(uint32_t) + alignof(uint32_t);
+  }
+};
+
+template <typename ValT, sorting_order OrderT = sorting_order::ascending,
+          size_t ElementsPerWorkItem = 1, unsigned int BitsPerPass = 4>
+class group_sorter {
+
+  sycl::span<std::byte> scratch;
+  uint32_t first_bit = 0;
+  uint32_t last_bit = 0;
+
+  static constexpr uint32_t bits = BitsPerPass;
+  using bitset_t = std::bitset<sizeof(ValT) * CHAR_BIT>;
+
+public:
+  template <std::size_t Extent>
+  group_sorter(sycl::span<std::byte, Extent> scratch_,
+               const bitset_t mask = bitset_t{}.set())
+      : scratch(scratch_) {
+    static_assert((std::is_arithmetic<ValT>::value ||
+                   std::is_same<ValT, sycl::half>::value ||
+                   std::is_same<ValT, sycl::ext::oneapi::bfloat16>::value),
+                  "radix sort is not usable");
+
+    for (first_bit = 0; first_bit < mask.size() && !mask[first_bit];
+         ++first_bit)
+      ;
+    for (last_bit = first_bit; last_bit < mask.size() && mask[last_bit];
+         ++last_bit)
+      ;
+  }
+
+  template <typename GroupT>
+  ValT operator()([[maybe_unused]] GroupT g, [[maybe_unused]] ValT val) {
+#ifdef __SYCL_DEVICE_ONLY__
+    ValT result[]{val};
+    sycl::detail::privateStaticSort</*is_key_value=*/false,
+                                    /*is_blocked=*/true,
+                                    OrderT == sorting_order::ascending,
+                                    /*items_per_work_item=*/1, bits>(
+        g, result, /*empty*/ result, scratch.data(), first_bit, last_bit);
+    return result[0];
+#else
+    throw sycl::exception(
+        std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
+        "radix_sorter is not supported on host device.");
+#endif
+  }
+
+  template <typename Group, typename Properties>
+  void operator()([[maybe_unused]] Group g,
+                  [[maybe_unused]] sycl::span<ValT, ElementsPerWorkItem> values,
+                  [[maybe_unused]] Properties properties) {
+#ifdef __SYCL_DEVICE_ONLY__
+    sycl::detail::privateStaticSort<
+        /*is_key_value=*/false, detail::isOutputBlocked(properties),
+        OrderT == sorting_order::ascending, ElementsPerWorkItem, bits>(
+        g, values.data(), /*empty*/ values.data(), scratch.data(), first_bit,
+        last_bit);
+#endif
+  }
+
+  static constexpr size_t
+  memory_required([[maybe_unused]] sycl::memory_scope scope,
+                  size_t range_size) {
+    return (std::max)(range_size * sizeof(ValT),
+                      range_size * (1 << bits) * sizeof(uint32_t));
+  }
+};
+
+template <typename KeyTy, typename ValueTy,
+          sorting_order Order = sorting_order::ascending,
+          size_t ElementsPerWorkItem = 1, unsigned int BitsPerPass = 4>
+class group_key_value_sorter {
+  sycl::span<std::byte> scratch;
+  uint32_t first_bit;
+  uint32_t last_bit;
+
+  static constexpr uint32_t bits = BitsPerPass;
+  using bitset_t = std::bitset<sizeof(KeyTy) * CHAR_BIT>;
+
+public:
+  template <std::size_t Extent>
+  group_key_value_sorter(sycl::span<std::byte, Extent> scratch_,
+                         const bitset_t mask = bitset_t{}.set())
+      : scratch(scratch_) {
+    static_assert((std::is_arithmetic<KeyTy>::value ||
+                   std::is_same<KeyTy, sycl::half>::value),
+                  "radix sort is not usable");
+    for (first_bit = 0; first_bit < mask.size() && !mask[first_bit];
+         ++first_bit)
+      ;
+    for (last_bit = first_bit; last_bit < mask.size() && mask[last_bit];
+         ++last_bit)
+      ;
+  }
+
+  template <typename Group>
+  std::tuple<KeyTy, ValueTy> operator()([[maybe_unused]] Group g, KeyTy key,
+                                        ValueTy val) {
+    static_assert(ElementsPerWorkItem == 1, "ElementsPerWorkItem must be 1");
+    KeyTy key_result[]{key};
+    ValueTy val_result[]{val};
+#ifdef __SYCL_DEVICE_ONLY__
+    sycl::detail::privateStaticSort<
+        /*is_key_value=*/true,
+        /*is_blocked=*/true, Order == sorting_order::ascending, 1, bits>(
+        g, key_result, val_result, scratch.data(), first_bit, last_bit);
+#endif
+    key = key_result[0];
+    val = val_result[0];
+    return {key, val};
+  }
+
+  static constexpr std::size_t memory_required(sycl::memory_scope,
+                                               std::size_t range_size) {
+    return (std::max)(range_size * ElementsPerWorkItem *
+                          (sizeof(KeyTy) + sizeof(ValueTy)),
+                      range_size * (1 << bits) * sizeof(uint32_t));
+  }
+};
+} // namespace radix_sorters
+
 } // namespace ext::oneapi::experimental
 } // namespace _V1
 } // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp b/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp
index 092fec5c7da0c..5dece1c54f7c4 100644
--- a/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp
+++ b/sycl/include/sycl/ext/oneapi/experimental/group_sort.hpp
@@ -68,6 +68,19 @@ struct is_sorter_impl<Sorter, Group, Ptr,
 template <typename Sorter, typename Group, typename ValOrPtr>
 struct is_sorter : decltype(is_sorter_impl<Sorter, Group, ValOrPtr>::test(0)) {
 };
+
+template <typename Sorter, typename Group, typename Key, typename Value,
+          typename = void>
+struct is_key_value_sorter : std::false_type {};
+
+template <typename Sorter, typename Group, typename Key, typename Value>
+struct is_key_value_sorter<
+    Sorter, Group, Key, Value,
+    std::enable_if_t<
+        std::is_same_v<std::invoke_result_t<Sorter, Group, Key, Value>,
+                       std::tuple<Key, Value>> &&
+        sycl::is_group_v<Group>>> : std::true_type {};
+
 } // namespace detail
 
 // ---- sort_over_group
@@ -90,7 +103,7 @@ sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
                 T value, Compare comp) {
   return sort_over_group(
       exec.get_group(), value,
-      experimental::default_sorter<Compare>(exec.get_memory(), comp));
+      default_sorters::group_sorter<T, Compare, 1>(exec.get_memory(), comp));
 }
 
 template <typename Group, typename T, size_t Extent>
@@ -98,7 +111,60 @@ std::enable_if_t<sycl::is_group_v<std::decay_t<Group>>, T>
 sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
                 T value) {
   return sort_over_group(exec.get_group(), value,
-                         experimental::default_sorter<>(exec.get_memory()));
+                         default_sorters::group_sorter<T>(exec.get_memory()));
+}
+
+template <typename Group, typename T, std::size_t ElementsPerWorkItem,
+          typename Sorter,
+          typename Properties = ext::oneapi::experimental::empty_properties_t>
+std::enable_if_t<sycl::ext::oneapi::experimental::is_property_list_v<
+                     std::decay_t<Properties>>,
+                 void>
+sort_over_group([[maybe_unused]] Group g,
+                [[maybe_unused]] sycl::span<T, ElementsPerWorkItem> values,
+                [[maybe_unused]] Sorter sorter,
+                [[maybe_unused]] Properties properties = {}) {
+#ifdef __SYCL_DEVICE_ONLY__
+  return sorter(g, values, properties);
+#else
+  throw sycl::exception(
+      std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
+      "Group algorithms are not supported on host device.");
+#endif
+}
+
+template <typename Group, typename T, std::size_t Extent,
+          std::size_t ElementsPerWorkItem,
+          typename Properties = ext::oneapi::experimental::empty_properties_t>
+std::enable_if_t<sycl::ext::oneapi::experimental::is_property_list_v<
+                     std::decay_t<Properties>>,
+                 void>
+sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
+                sycl::span<T, ElementsPerWorkItem> values,
+                Properties properties = {}) {
+  return sort_over_group(
+      exec.get_group(), values,
+      default_sorters::group_sorter<T, std::less<T>, ElementsPerWorkItem>(
+          exec.get_memory()),
+      properties);
+}
+
+template <typename Group, typename T, std::size_t Extent,
+          std::size_t ElementsPerWorkItem, typename Compare,
+          typename Properties = ext::oneapi::experimental::empty_properties_t>
+std::enable_if_t<!sycl::ext::oneapi::experimental::is_property_list_v<
+                     std::decay_t<Compare>> &&
+                     sycl::ext::oneapi::experimental::is_property_list_v<
+                         std::decay_t<Properties>>,
+                 void>
+sort_over_group(experimental::group_with_scratchpad<Group, Extent> exec,
+                sycl::span<T, ElementsPerWorkItem> values, Compare comp,
+                Properties properties = {}) {
+  return sort_over_group(
+      exec.get_group(), values,
+      default_sorters::group_sorter<T, Compare, ElementsPerWorkItem>(
+          exec.get_memory(), comp),
+      properties);
 }
 
 // ---- joint_sort
@@ -120,7 +186,7 @@ std::enable_if_t<!detail::is_sorter<Compare, Group, Iter>::value, void>
 joint_sort(experimental::group_with_scratchpad<Group, Extent> exec, Iter first,
            Iter last, Compare comp) {
   joint_sort(exec.get_group(), first, last,
-             experimental::default_sorter<Compare>(exec.get_memory(), comp));
+             default_sorters::joint_sorter<Compare>(exec.get_memory(), comp));
 }
 
 template <typename Group, typename Iter, size_t Extent>
@@ -128,7 +194,49 @@ std::enable_if_t<sycl::is_group_v<std::decay_t<Group>>, void>
 joint_sort(experimental::group_with_scratchpad<Group, Extent> exec, Iter first,
            Iter last) {
   joint_sort(exec.get_group(), first, last,
-             experimental::default_sorter<>(exec.get_memory()));
+             default_sorters::joint_sorter<>(exec.get_memory()));
+}
+
+template <typename Group, typename KeyTy, typename ValueTy, typename Sorter>
+std::enable_if_t<
+    detail::is_key_value_sorter<Sorter, Group, KeyTy, ValueTy>::value,
+    std::tuple<KeyTy, ValueTy>>
+sort_key_value_over_group([[maybe_unused]] Group g, [[maybe_unused]] KeyTy key,
+                          [[maybe_unused]] ValueTy value,
+                          [[maybe_unused]] Sorter sorter) {
+#ifdef __SYCL_DEVICE_ONLY__
+  return sorter(g, key, value);
+#else
+  throw sycl::exception(
+      std::error_code(PI_ERROR_INVALID_DEVICE, sycl::sycl_category()),
+      "Group algorithms are not supported on host device.");
+#endif
+}
+
+template <typename Group, typename KeyTy, typename ValueTy, typename Compare,
+          std::size_t Extent>
+std::enable_if_t<
+    !detail::is_key_value_sorter<Compare, Group, KeyTy, ValueTy>::value,
+    std::tuple<KeyTy, ValueTy>>
+sort_key_value_over_group(
+    experimental::group_with_scratchpad<Group, Extent> exec, KeyTy key,
+    ValueTy value, Compare comp) {
+  return sort_key_value_over_group(
+      exec.get_group(), key, value,
+      default_sorters::group_key_value_sorter<KeyTy, ValueTy, Compare>(
+          exec.get_memory(), comp));
+}
+
+template <typename KeyTy, typename ValueTy, typename Group, std::size_t Extent>
+std::enable_if_t<sycl::is_group_v<std::decay_t<Group>>,
+                 std::tuple<KeyTy, ValueTy>>
+sort_key_value_over_group(
+    experimental::group_with_scratchpad<Group, Extent> exec, KeyTy key,
+    ValueTy value) {
+  return sort_key_value_over_group(
+      exec.get_group(), key, value,
+      default_sorters::group_key_value_sorter<KeyTy, ValueTy>(
+          exec.get_memory()));
 }
 
 } // namespace ext::oneapi::experimental
diff --git a/sycl/include/sycl/ext/oneapi/experimental/virtual_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/virtual_functions.hpp
new file mode 100644
index 0000000000000..a173689cbc652
--- /dev/null
+++ b/sycl/include/sycl/ext/oneapi/experimental/virtual_functions.hpp
@@ -0,0 +1,74 @@
+#pragma once
+
+#include <sycl/ext/oneapi/properties/property.hpp>
+#include <sycl/ext/oneapi/properties/property_value.hpp>
+
+namespace sycl {
+inline namespace _V1 {
+namespace ext::oneapi::experimental {
+struct indirectly_callable_key {
+  template <typename Set>
+  using value_t =
+      sycl::ext::oneapi::experimental::property_value<indirectly_callable_key,
+                                                      Set>;
+};
+
+template <typename Set = void>
+inline constexpr indirectly_callable_key::value_t<Set> indirectly_callable;
+
+struct calls_indirectly_key {
+  template <typename First = void, typename... SetIds>
+  using value_t =
+      sycl::ext::oneapi::experimental::property_value<calls_indirectly_key,
+                                                      First, SetIds...>;
+};
+
+template <typename First = void, typename... Rest>
+inline constexpr calls_indirectly_key::value_t<First, Rest...> calls_indirectly;
+
+template <> struct is_property_key<indirectly_callable_key> : std::true_type {};
+template <> struct is_property_key<calls_indirectly_key> : std::true_type {};
+
+namespace detail {
+
+template <>
+struct IsCompileTimeProperty<indirectly_callable_key> : std::true_type {};
+template <>
+struct IsCompileTimeProperty<calls_indirectly_key> : std::true_type {};
+
+template <> struct PropertyToKind<indirectly_callable_key> {
+  static constexpr PropKind Kind = PropKind::IndirectlyCallable;
+};
+
+template <> struct PropertyToKind<calls_indirectly_key> {
+  static constexpr PropKind Kind = PropKind::CallsIndirectly;
+};
+
+template <typename Set>
+struct PropertyMetaInfo<indirectly_callable_key::value_t<Set>> {
+  static constexpr const char *name = "indirectly-callable";
+  static constexpr const char *value =
+#ifdef __SYCL_DEVICE_ONLY__
+      __builtin_sycl_unique_stable_name(Set);
+#else
+      "";
+#endif
+};
+
+template <typename First, typename... Rest>
+struct PropertyMetaInfo<calls_indirectly_key::value_t<First, Rest...>> {
+  static constexpr const char *name = "calls-indirectly";
+  static constexpr const char *value =
+#ifdef __SYCL_DEVICE_ONLY__
+      // FIXME: we should handle Rest... here as well
+      __builtin_sycl_unique_stable_name(First);
+#else
+      "";
+#endif
+};
+
+} // namespace detail
+
+} // namespace ext::oneapi::experimental
+} // namespace _V1
+} // namespace sycl
diff --git a/sycl/include/sycl/ext/oneapi/properties/property.hpp b/sycl/include/sycl/ext/oneapi/properties/property.hpp
index 89d7dd7852a8a..3f1bb28268d39 100644
--- a/sycl/include/sycl/ext/oneapi/properties/property.hpp
+++ b/sycl/include/sycl/ext/oneapi/properties/property.hpp
@@ -205,8 +205,12 @@ enum PropKind : uint32_t {
   WorkItemProgress = 64,
   NDRangeKernel = 65,
   SingleTaskKernel = 66,
+  IndirectlyCallable = 67,
+  CallsIndirectly = 68,
+  InputDataPlacement = 69,
+  OutputDataPlacement = 70,
   // PropKindSize must always be the last value.
-  PropKindSize = 67,
+  PropKindSize = 71,
 };
 
 struct property_key_base_tag {};
diff --git a/sycl/include/sycl/half_type.hpp b/sycl/include/sycl/half_type.hpp
index 951146f2cdfbb..799ff9fb186e9 100644
--- a/sycl/include/sycl/half_type.hpp
+++ b/sycl/include/sycl/half_type.hpp
@@ -249,11 +249,14 @@ using StorageT = _Float16;
 using BIsRepresentationT = _Float16;
 using VecElemT = _Float16;
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 using Vec2StorageT = VecElemT __attribute__((ext_vector_type(2)));
 using Vec3StorageT = VecElemT __attribute__((ext_vector_type(3)));
 using Vec4StorageT = VecElemT __attribute__((ext_vector_type(4)));
 using Vec8StorageT = VecElemT __attribute__((ext_vector_type(8)));
 using Vec16StorageT = VecElemT __attribute__((ext_vector_type(16)));
+#endif // __INTEL_PREVIEW_BREAKING_CHANGES
+
 #else // SYCL_DEVICE_ONLY
 using StorageT = detail::host_half_impl::half;
 // No need to extract underlying data type for built-in functions operating on
@@ -261,6 +264,7 @@ using StorageT = detail::host_half_impl::half;
 using BIsRepresentationT = half;
 using VecElemT = half;
 
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
 // On the host side we cannot use OpenCL cl_half# types as an underlying type
 // for vec because they are actually defined as an integer type under the
 // hood. As a result half values will be converted to the integer and passed
@@ -270,6 +274,8 @@ using Vec3StorageT = std::array<VecElemT, 3>;
 using Vec4StorageT = std::array<VecElemT, 4>;
 using Vec8StorageT = std::array<VecElemT, 8>;
 using Vec16StorageT = std::array<VecElemT, 16>;
+#endif // __INTEL_PREVIEW_BREAKING_CHANGES
+
 #endif // SYCL_DEVICE_ONLY
 
 #ifndef __SYCL_DEVICE_ONLY__
diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index 2c6df91a6589b..a71f5400a813d 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -34,6 +34,7 @@
 #include <sycl/ext/oneapi/device_global/properties.hpp>
 #include <sycl/ext/oneapi/experimental/graph.hpp>
 #include <sycl/ext/oneapi/experimental/use_root_sync_prop.hpp>
+#include <sycl/ext/oneapi/experimental/virtual_functions.hpp>
 #include <sycl/ext/oneapi/kernel_properties/properties.hpp>
 #include <sycl/ext/oneapi/properties/properties.hpp>
 #include <sycl/group.hpp>
@@ -962,6 +963,10 @@ class __SYCL_EXPORT handler {
                  sycl::ext::intel::experimental::fp_control_key>() &&
              KI::isESIMD()),
         "Floating point control property is supported for ESIMD kernels only.");
+    static_assert(
+        !PropertiesT::template has_property<
+            sycl::ext::oneapi::experimental::indirectly_callable_key>(),
+        "indirectly_callable property cannot be applied to SYCL kernels");
     if constexpr (PropertiesT::template has_property<
                       sycl::ext::intel::experimental::cache_config_key>()) {
       auto Config = Props.template get_property<
@@ -3291,22 +3296,48 @@ class __SYCL_EXPORT handler {
       size_t DeviceRowPitch, sycl::range<3> HostExtent,
       sycl::range<3> CopyExtent);
 
-  /// Instruct the queue with a non-blocking wait on an external semaphore.
-  /// An exception is thrown if \p SemaphoreHandle is incomplete.
+  /// Submit a non-blocking device-side wait on an external
+  /// semaphore to the queue.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore requires an explicit value to wait upon.
+  ///
+  /// \param SemaphoreHandle is an opaque external interop semaphore handle
+  void ext_oneapi_wait_external_semaphore(
+      ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle);
+
+  /// Submit a non-blocking device-side wait on an external
+  /// semaphore to the queue.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore does not support waiting on an explicitly passed value.
   ///
   /// \param SemaphoreHandle is an opaque external interop semaphore handle
+  /// \param WaitValue is the value that this semaphore will wait upon, until it
+  ///                  allows any further commands to execute on the queue.
   void ext_oneapi_wait_external_semaphore(
-      sycl::ext::oneapi::experimental::interop_semaphore_handle
-          SemaphoreHandle);
+      ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+      uint64_t WaitValue);
 
   /// Instruct the queue to signal the external semaphore once all previous
-  /// commands have completed execution.
-  /// An exception is thrown if \p SemaphoreHandle is incomplete.
+  /// commands submitted to the queue have completed execution.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore requires an explicit value to signal.
+  ///
+  /// \param SemaphoreHandle is an opaque external interop semaphore handle
+  void ext_oneapi_signal_external_semaphore(
+      ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle);
+
+  /// Instruct the queue to set the state of the external semaphore to
+  /// \p SignalValue once all previous commands submitted to the queue have
+  /// completed execution.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore does not support signalling an explicitly passed value.
   ///
   /// \param SemaphoreHandle is an opaque external interop semaphore handle
+  /// \param SignalValue is the value that this semaphore signals, once all
+  ///                    prior operations on the queue complete.
   void ext_oneapi_signal_external_semaphore(
-      sycl::ext::oneapi::experimental::interop_semaphore_handle
-          SemaphoreHandle);
+      ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+      uint64_t SignalValue);
 
 private:
   std::shared_ptr<detail::handler_impl> MImpl;
diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp
index 73a5ea8e7307a..4d32218ab09d4 100644
--- a/sycl/include/sycl/queue.hpp
+++ b/sycl/include/sycl/queue.hpp
@@ -1851,9 +1851,10 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
       const detail::code_location &CodeLoc = detail::code_location::current());
 
   /// Instruct the queue with a non-blocking wait on an external semaphore.
-  /// An exception is thrown if \p SemaphoreHandle is incomplete.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore requires an explicit value to wait upon.
   ///
-  /// \param SemaphoreHandle is an opaque external interop semaphore handle
+  /// \param SemaphoreHandle is an opaque external interop semaphore handle.
   /// \return an event representing the wait operation.
   event ext_oneapi_wait_external_semaphore(
       sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
@@ -1867,7 +1868,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
   }
 
   /// Instruct the queue with a non-blocking wait on an external semaphore.
-  /// An exception is thrown if \p SemaphoreHandle is incomplete.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore requires an explicit value to wait upon.
   ///
   /// \param SemaphoreHandle is an opaque external interop semaphore handle
   /// \param DepEvent is an event that specifies the kernel dependencies.
@@ -1875,56 +1877,78 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
   event ext_oneapi_wait_external_semaphore(
       sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
       event DepEvent,
-      const detail::code_location &CodeLoc = detail::code_location::current()) {
-    detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
-    return submit(
-        [&](handler &CGH) {
-          CGH.depends_on(DepEvent);
-          CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle);
-        },
-        CodeLoc);
-  }
+      const detail::code_location &CodeLoc = detail::code_location::current());
 
   /// Instruct the queue with a non-blocking wait on an external semaphore.
-  /// An exception is thrown if \p SemaphoreHandle is incomplete.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore requires an explicit value to wait upon.
   ///
-  /// \param SemaphoreHandle is an opaque external interop semaphore handle
+  /// \param SemaphoreHandle is an opaque external interop semaphore handle.
   /// \param DepEvents is a vector of events that specifies the kernel
   /// dependencies.
   /// \return an event representing the wait operation.
   event ext_oneapi_wait_external_semaphore(
       sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
       const std::vector<event> &DepEvents,
-      const detail::code_location &CodeLoc = detail::code_location::current()) {
-    detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
-    return submit(
-        [&](handler &CGH) {
-          CGH.depends_on(DepEvents);
-          CGH.ext_oneapi_wait_external_semaphore(SemaphoreHandle);
-        },
-        CodeLoc);
-  }
+      const detail::code_location &CodeLoc = detail::code_location::current());
+
+  /// Instruct the queue with a non-blocking wait on an external semaphore.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore does not support waiting on an explicitly passed value.
+  ///
+  /// \param SemaphoreHandle is an opaque external interop semaphore handle
+  /// \param WaitValue is the value that this semaphore will wait upon, until it
+  ///                  allows any further commands to execute on the queue.
+  /// \return an event representing the wait operation.
+  event ext_oneapi_wait_external_semaphore(
+      sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+      uint64_t WaitValue,
+      const detail::code_location &CodeLoc = detail::code_location::current());
+
+  /// Instruct the queue with a non-blocking wait on an external semaphore.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore does not support waiting on an explicitly passed value.
+  ///
+  /// \param SemaphoreHandle is an opaque external interop semaphore handle
+  /// \param WaitValue is the value that this semaphore will wait upon, until it
+  ///                  allows any further commands to execute on the queue.
+  /// \param DepEvent is an event that specifies the kernel dependencies.
+  /// \return an event representing the wait operation.
+  event ext_oneapi_wait_external_semaphore(
+      sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+      uint64_t WaitValue, event DepEvent,
+      const detail::code_location &CodeLoc = detail::code_location::current());
+
+  /// Instruct the queue with a non-blocking wait on an external semaphore.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore does not support waiting on an explicitly passed value.
+  ///
+  /// \param SemaphoreHandle is an opaque external interop semaphore handle
+  /// \param WaitValue is the value that this semaphore will wait upon, until it
+  ///                  allows any further commands to execute on the queue.
+  /// \param DepEvents is a vector of events that specifies the kernel
+  /// dependencies.
+  /// \return an event representing the wait operation.
+  event ext_oneapi_wait_external_semaphore(
+      sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+      uint64_t WaitValue, const std::vector<event> &DepEvents,
+      const detail::code_location &CodeLoc = detail::code_location::current());
 
   /// Instruct the queue to signal the external semaphore once all previous
   /// commands have completed execution.
-  /// An exception is thrown if \p SemaphoreHandle is incomplete.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore requires an explicit value to signal.
   ///
   /// \param SemaphoreHandle is an opaque external interop semaphore handle
   /// \return an event representing the signal operation.
   event ext_oneapi_signal_external_semaphore(
       sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
-      const detail::code_location &CodeLoc = detail::code_location::current()) {
-    detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
-    return submit(
-        [&](handler &CGH) {
-          CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle);
-        },
-        CodeLoc);
-  }
+      const detail::code_location &CodeLoc = detail::code_location::current());
 
   /// Instruct the queue to signal the external semaphore once all previous
   /// commands have completed execution.
-  /// An exception is thrown if \p SemaphoreHandle is incomplete.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore requires an explicit value to signal.
   ///
   /// \param SemaphoreHandle is an opaque external interop semaphore handle
   /// \param DepEvent is an event that specifies the kernel dependencies.
@@ -1932,19 +1956,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
   event ext_oneapi_signal_external_semaphore(
       sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
       event DepEvent,
-      const detail::code_location &CodeLoc = detail::code_location::current()) {
-    detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
-    return submit(
-        [&](handler &CGH) {
-          CGH.depends_on(DepEvent);
-          CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle);
-        },
-        CodeLoc);
-  }
+      const detail::code_location &CodeLoc = detail::code_location::current());
 
   /// Instruct the queue to signal the external semaphore once all previous
   /// commands have completed execution.
-  /// An exception is thrown if \p SemaphoreHandle is incomplete.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore requires an explicit value to signal.
   ///
   /// \param SemaphoreHandle is an opaque external interop semaphore handle
   /// \param DepEvents is a vector of events that specifies the kernel
@@ -1953,15 +1970,52 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
   event ext_oneapi_signal_external_semaphore(
       sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
       const std::vector<event> &DepEvents,
-      const detail::code_location &CodeLoc = detail::code_location::current()) {
-    detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
-    return submit(
-        [&](handler &CGH) {
-          CGH.depends_on(DepEvents);
-          CGH.ext_oneapi_signal_external_semaphore(SemaphoreHandle);
-        },
-        CodeLoc);
-  }
+      const detail::code_location &CodeLoc = detail::code_location::current());
+
+  /// Instruct the queue to signal the external semaphore once all previous
+  /// commands have completed execution.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore does not support signalling an explicitly passed value.
+  ///
+  /// \param SemaphoreHandle is an opaque external interop semaphore handle
+  /// \param SignalValue is the value that this semaphore signals, once all
+  ///                    prior operations on the queue complete.
+  /// \return an event representing the signal operation.
+  event ext_oneapi_signal_external_semaphore(
+      sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+      uint64_t SignalValue,
+      const detail::code_location &CodeLoc = detail::code_location::current());
+
+  /// Instruct the queue to signal the external semaphore once all previous
+  /// commands have completed execution.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore does not support signalling an explicitly passed value.
+  ///
+  /// \param SemaphoreHandle is an opaque external interop semaphore handle
+  /// \param SignalValue is the value that this semaphore signals, once all
+  ///                    prior operations on the queue complete.
+  /// \param DepEvent is an event that specifies the kernel dependencies.
+  /// \return an event representing the signal operation.
+  event ext_oneapi_signal_external_semaphore(
+      sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+      uint64_t SignalValue, event DepEvent,
+      const detail::code_location &CodeLoc = detail::code_location::current());
+
+  /// Instruct the queue to signal the external semaphore once all previous
+  /// commands have completed execution.
+  /// An exception is thrown if \p SemaphoreHandle is incomplete, or if the
+  /// type of semaphore does not support signalling an explicitly passed value.
+  ///
+  /// \param SemaphoreHandle is an opaque external interop semaphore handle
+  /// \param SignalValue is the value that this semaphore signals, once all
+  ///                    prior operations on the queue complete.
+  /// \param DepEvents is a vector of events that specifies the kernel
+  /// dependencies.
+  /// \return an event representing the signal operation.
+  event ext_oneapi_signal_external_semaphore(
+      sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+      uint64_t SignalValue, const std::vector<event> &DepEvents,
+      const detail::code_location &CodeLoc = detail::code_location::current());
 
   /// single_task version with a kernel represented as a lambda.
   ///
diff --git a/sycl/include/sycl/vector.hpp b/sycl/include/sycl/vector.hpp
index 200a77a9adf83..d5d193bfa7add 100644
--- a/sycl/include/sycl/vector.hpp
+++ b/sycl/include/sycl/vector.hpp
@@ -786,9 +786,27 @@ template <typename Type, int NumElements> class vec {
                           detail::ConvertToOpenCLType_t<vec_data_t<convertT>>>,
       vec<convertT, NumElements>>
   convert() const {
+    using bfloat16 = sycl::ext::oneapi::bfloat16;
     static_assert(std::is_integral_v<vec_data_t<convertT>> ||
-                      detail::is_floating_point<convertT>::value,
+                      detail::is_floating_point<convertT>::value ||
+                      // Conversion to BF16 available only for float.
+                      (std::is_same_v<convertT, bfloat16> &&
+                       std::is_same_v<DataT, float>),
                   "Unsupported convertT");
+
+    // Currently, for float ---> bfloat16 conversion, we only support
+    // Round-to-even rounding mode.
+    constexpr bool isFloatToBF16Conv =
+        std::is_same_v<convertT, bfloat16> && std::is_same_v<DataT, float>;
+    constexpr bool isBF16ToFloatConv =
+        std::is_same_v<DataT, bfloat16> && std::is_same_v<convertT, float>;
+    if constexpr (isFloatToBF16Conv) {
+      static_assert(roundingMode == rounding_mode::automatic ||
+                        roundingMode == rounding_mode::rte,
+                    "Currently, we only supoort round-to-even rounding mode \
+                      for float ---> bfloat16 conversion.");
+    }
+
     using T = vec_data_t<DataT>;
     using R = vec_data_t<convertT>;
     using OpenCLT = detail::ConvertToOpenCLType_t<T>;
@@ -828,10 +846,19 @@ template <typename Type, int NumElements> class vec {
     {
       // Otherwise, we fallback to per-element conversion:
       for (size_t I = 0; I < NumElements; ++I) {
-        Result.setValue(
-            I, vec_data<convertT>::get(
-                   detail::convertImpl<T, R, roundingMode, 1, OpenCLT, OpenCLR>(
-                       vec_data<DataT>::get(getValue(I)))));
+        // For float -> bf16.
+        if constexpr (isFloatToBF16Conv) {
+          Result[I] = bfloat16((*this)[I]);
+        } else
+          // For bf16 -> float.
+          if constexpr (isBF16ToFloatConv) {
+            Result[I] = (float)((*this)[I]);
+          } else {
+            Result.setValue(I, vec_data<convertT>::get(
+                                   detail::convertImpl<T, R, roundingMode, 1,
+                                                       OpenCLT, OpenCLR>(
+                                       vec_data<DataT>::get(getValue(I)))));
+          }
       }
     }
 
diff --git a/sycl/include/sycl/vector_preview.hpp b/sycl/include/sycl/vector_preview.hpp
index 7300bc0e088a0..c6993fd27c73f 100644
--- a/sycl/include/sycl/vector_preview.hpp
+++ b/sycl/include/sycl/vector_preview.hpp
@@ -26,10 +26,6 @@
 #error "SYCL device compiler is built without ext_vector_type support"
 #endif
 
-#if defined(__SYCL_DEVICE_ONLY__)
-#define __SYCL_USE_EXT_VECTOR_TYPE__
-#endif
-
 #include <sycl/access/access.hpp>              // for decorated, address_space
 #include <sycl/aliases.hpp>                    // for half, cl_char, cl_int
 #include <sycl/detail/common.hpp>              // for ArrayCreator, RepeatV...
@@ -39,14 +35,16 @@
 #include <sycl/detail/memcpy.hpp>              // for memcpy
 #include <sycl/detail/type_list.hpp>           // for is_contained
 #include <sycl/detail/type_traits.hpp>         // for is_floating_point
+#include <sycl/detail/vector_arith.hpp>
 #include <sycl/detail/vector_convert.hpp>      // for convertImpl
 #include <sycl/detail/vector_traits.hpp>       // for vector_alignment
 #include <sycl/half_type.hpp>                  // for StorageT, half, Vec16...
 
 #include <sycl/ext/oneapi/bfloat16.hpp> // bfloat16
 
+#include <algorithm>   // for std::min
 #include <array>       // for array
-#include <assert.h>    // for assert
+#include <cassert>     // for assert
 #include <cstddef>     // for size_t, NULL, byte
 #include <cstdint>     // for uint8_t, int16_t, int...
 #include <functional>  // for divides, multiplies
@@ -86,313 +84,75 @@ struct elem {
 };
 
 namespace detail {
-// select_apply_cl_t selects from T8/T16/T32/T64 basing on
-// sizeof(_IN).  expected to handle scalar types in _IN.
-template <typename _IN, typename T8, typename T16, typename T32, typename T64>
-using select_apply_cl_t = std::conditional_t<
-    sizeof(_IN) == 1, T8,
-    std::conditional_t<sizeof(_IN) == 2, T16,
-                       std::conditional_t<sizeof(_IN) == 4, T32, T64>>>;
-
-template <typename T> struct vec_helper {
-  using RetType = T;
-  static constexpr RetType get(T value) { return value; }
-  static constexpr RetType set(T value) { return value; }
-};
-template <> struct vec_helper<bool> {
-  using RetType = select_apply_cl_t<bool, std::int8_t, std::int16_t,
-                                    std::int32_t, std::int64_t>;
-  static constexpr RetType get(bool value) { return value; }
-  static constexpr RetType set(bool value) { return value; }
-};
-
-template <> struct vec_helper<sycl::ext::oneapi::bfloat16> {
-  using RetType = sycl::ext::oneapi::bfloat16;
-  using BFloat16StorageT = sycl::ext::oneapi::detail::Bfloat16StorageT;
-  static constexpr RetType get(BFloat16StorageT value) {
-#if defined(__SYCL_BITCAST_IS_CONSTEXPR)
-    return sycl::bit_cast<RetType>(value);
-#else
-    // awkward workaround. sycl::bit_cast isn't constexpr in older GCC
-    // C++20 will give us both std::bit_cast and constexpr reinterpet for void*
-    // but neither available yet.
-    union {
-      sycl::ext::oneapi::bfloat16 bf16;
-      sycl::ext::oneapi::detail::Bfloat16StorageT storage;
-    } result = {};
-    result.storage = value;
-    return result.bf16;
-#endif
-  }
-
-  static constexpr RetType get(RetType value) { return value; }
-
-  static constexpr BFloat16StorageT set(RetType value) {
-#if defined(__SYCL_BITCAST_IS_CONSTEXPR)
-    return sycl::bit_cast<BFloat16StorageT>(value);
-#else
-    union {
-      sycl::ext::oneapi::bfloat16 bf16;
-      sycl::ext::oneapi::detail::Bfloat16StorageT storage;
-    } result = {};
-    result.bf16 = value;
-    return result.storage;
-#endif
-  }
-};
-
-#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
-template <> struct vec_helper<std::byte> {
-  using RetType = std::uint8_t;
-  static constexpr RetType get(std::byte value) { return (RetType)value; }
-  static constexpr RetType set(std::byte value) { return (RetType)value; }
-  static constexpr std::byte get(std::uint8_t value) {
-    return (std::byte)value;
-  }
-  static constexpr std::byte set(std::uint8_t value) {
-    return (std::byte)value;
-  }
-};
-#endif
-
 template <typename VecT, typename OperationLeftT, typename OperationRightT,
           template <typename> class OperationCurrentT, int... Indexes>
 class SwizzleOp;
 
-template <typename T, int N, typename V = void> struct VecStorage;
-
-// Element type for relational operator return value.
-template <typename DataT>
-using rel_t = typename std::conditional_t<
-    sizeof(DataT) == sizeof(opencl::cl_char), opencl::cl_char,
-    typename std::conditional_t<
-        sizeof(DataT) == sizeof(opencl::cl_short), opencl::cl_short,
-        typename std::conditional_t<
-            sizeof(DataT) == sizeof(opencl::cl_int), opencl::cl_int,
-            typename std::conditional_t<sizeof(DataT) ==
-                                            sizeof(opencl::cl_long),
-                                        opencl::cl_long, bool>>>>;
-
 // Special type indicating that SwizzleOp should just read value from vector -
 // not trying to perform any operations. Should not be called.
 template <typename T> class GetOp {
 public:
   using DataT = T;
-  DataT getValue(size_t) const { return (DataT)0; }
-  DataT operator()(DataT, DataT) { return (DataT)0; }
-};
-
-// Forward declarations
-template <typename TransformedArgType, int Dims, typename KernelType>
-class RoundedRangeKernel;
-template <typename TransformedArgType, int Dims, typename KernelType>
-class RoundedRangeKernelWithKH;
-
-// Vectors of size 1 are handled separately and therefore 1 is not included in
-// the check below.
-constexpr bool isValidVectorSize(int N) {
-  return N == 2 || N == 3 || N == 4 || N == 8 || N == 16;
-}
-template <typename T, int N, typename V> struct VecStorage {
-  static_assert(
-      isValidVectorSize(N) || N == 1,
-      "Incorrect number of elements for sycl::vec: only 1, 2, 3, 4, 8 "
-      "or 16 are supported");
-  static_assert(!std::is_same_v<V, void>, "Incorrect data type for sycl::vec");
-};
-
-#ifdef __SYCL_DEVICE_ONLY__
-// device always has ext vector support, but for huge vectors
-// we switch to std::array, so that we can use a smaller alignment (64)
-// this is to support MSVC, which has a max of 64 for direct params.
-template <typename T, int N> struct VecStorageImpl {
-  static constexpr size_t Num = (N == 3) ? 4 : N;
-  static constexpr size_t Sz = Num * sizeof(T);
-  using DataType =
-      typename std::conditional<Sz <= 64, T __attribute__((ext_vector_type(N))),
-                                std::array<T, Num>>::type;
-  using VectorDataType = T __attribute__((ext_vector_type(N)));
-};
-#else  // __SYCL_DEVICE_ONLY__
-template <typename T, int N> struct VecStorageImpl {
-  using DataType = std::array<T, (N == 3) ? 4 : N>;
-};
-#endif // __SYCL_DEVICE_ONLY__
-
-// Single element bool
-template <> struct VecStorage<bool, 1, void> {
-  using DataType = bool;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType = bool;
-#endif // __SYCL_DEVICE_ONLY__
-};
-
-// Multiple element bool
-template <int N>
-struct VecStorage<bool, N, typename std::enable_if_t<isValidVectorSize(N)>> {
-  using DataType =
-      typename VecStorageImpl<select_apply_cl_t<bool, std::int8_t, std::int16_t,
-                                                std::int32_t, std::int64_t>,
-                              N>::DataType;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType =
-      typename VecStorageImpl<select_apply_cl_t<bool, std::int8_t, std::int16_t,
-                                                std::int32_t, std::int64_t>,
-                              N>::VectorDataType;
-#endif // __SYCL_DEVICE_ONLY__
-};
-
-#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
-// Single element byte. Multiple elements will propagate through a later
-// specialization.
-template <> struct VecStorage<std::byte, 1, void> {
-  using DataType = std::int8_t;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType = std::int8_t;
-#endif // __SYCL_DEVICE_ONLY__
-};
-#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
-
-// Single element signed integers
-template <typename T>
-struct VecStorage<T, 1, typename std::enable_if_t<is_sigeninteger_v<T>>> {
-  using DataType = T;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType = DataType;
-#endif // __SYCL_DEVICE_ONLY__
-};
-
-// Single element unsigned integers
-template <typename T>
-struct VecStorage<T, 1, typename std::enable_if_t<is_sugeninteger_v<T>>> {
-  using DataType = T;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType = DataType;
-#endif // __SYCL_DEVICE_ONLY__
-};
-
-// Single element floating-point (except half/bfloat16)
-template <typename T>
-struct VecStorage<
-    T, 1,
-    typename std::enable_if_t<!is_half_or_bf16_v<T> && is_sgenfloat_v<T>>> {
-  using DataType = T;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType = DataType;
-#endif // __SYCL_DEVICE_ONLY__
-};
-// Multiple elements signed/unsigned integers and floating-point (except
-// half/bfloat16)
-template <typename T, int N>
-struct VecStorage<
-    T, N,
-    typename std::enable_if_t<isValidVectorSize(N) &&
-                              (is_sgeninteger_v<T> ||
-                               (is_sgenfloat_v<T> && !is_half_or_bf16_v<T>))>> {
-  using DataType =
-      typename VecStorageImpl<typename VecStorage<T, 1>::DataType, N>::DataType;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType =
-      typename VecStorageImpl<typename VecStorage<T, 1>::DataType,
-                              N>::VectorDataType;
-#endif // __SYCL_DEVICE_ONLY__
-};
-
-// Single element half
-template <> struct VecStorage<half, 1, void> {
-  using DataType = sycl::detail::half_impl::StorageT;
-#ifdef __SYCL_DEVICE_ONLY__
-  using VectorDataType = sycl::detail::half_impl::StorageT;
-#endif // __SYCL_DEVICE_ONLY__
+  DataT getValue(size_t) const {
+    if constexpr (std::is_same_v<DataT, sycl::detail::host_half_impl::half>)
+      return DataT{0.0f};
+    else
+      return (DataT)0;
+  }
+  DataT operator()(DataT, DataT) {
+    if constexpr (std::is_same_v<DataT, sycl::detail::host_half_impl::half>)
+      return DataT{0.0f};
+    else
+      return (DataT)0;
+  }
 };
 
-// Multiple elements half
-#if defined(__SYCL_DEVICE_ONLY__)
-#define __SYCL_DEFINE_HALF_VECSTORAGE(Num)                                     \
-  template <> struct VecStorage<half, Num, void> {                             \
-    using DataType = sycl::detail::half_impl::Vec##Num##StorageT;              \
-    using VectorDataType = sycl::detail::half_impl::Vec##Num##StorageT;        \
-  };
-#else // defined(__SYCL_DEVICE_ONLY__)
-#define __SYCL_DEFINE_HALF_VECSTORAGE(Num)                                     \
-  template <> struct VecStorage<half, Num, void> {                             \
-    using DataType = sycl::detail::half_impl::Vec##Num##StorageT;              \
-  };
-#endif // defined(__SYCL_DEVICE_ONLY__)
-
-__SYCL_DEFINE_HALF_VECSTORAGE(2)
-__SYCL_DEFINE_HALF_VECSTORAGE(3)
-__SYCL_DEFINE_HALF_VECSTORAGE(4)
-__SYCL_DEFINE_HALF_VECSTORAGE(8)
-__SYCL_DEFINE_HALF_VECSTORAGE(16)
-#undef __SYCL_DEFINE_HALF_VECSTORAGE
-
-// Single element bfloat16
-template <> struct VecStorage<sycl::ext::oneapi::bfloat16, 1, void> {
-  using DataType = sycl::ext::oneapi::detail::Bfloat16StorageT;
-  // using VectorDataType = sycl::ext::oneapi::bfloat16;
-  using VectorDataType = sycl::ext::oneapi::detail::Bfloat16StorageT;
-};
-// Multiple elements bfloat16
-#define __SYCL_DEFINE_BF16_VECSTORAGE(Num)                                     \
-  template <> struct VecStorage<sycl::ext::oneapi::bfloat16, Num, void> {      \
-    using DataType = sycl::ext::oneapi::detail::bf16::Vec##Num##StorageT;      \
-    using VectorDataType =                                                     \
-        sycl::ext::oneapi::detail::bf16::Vec##Num##StorageT;                   \
-  };
-__SYCL_DEFINE_BF16_VECSTORAGE(2)
-__SYCL_DEFINE_BF16_VECSTORAGE(3)
-__SYCL_DEFINE_BF16_VECSTORAGE(4)
-__SYCL_DEFINE_BF16_VECSTORAGE(8)
-__SYCL_DEFINE_BF16_VECSTORAGE(16)
-#undef __SYCL_DEFINE_BF16_VECSTORAGE
 } // namespace detail
 
-template <typename T> using vec_data = detail::vec_helper<T>;
-
-template <typename T>
-using vec_data_t = typename detail::vec_helper<T>::RetType;
-
 ///////////////////////// class sycl::vec /////////////////////////
-/// Provides a cross-patform vector class template that works efficiently on
-/// SYCL devices as well as in host C++ code.
-///
-/// \ingroup sycl_api
-template <typename Type, int NumElements> class vec {
-  using DataT = Type;
+// Provides a cross-platform vector class template that works efficiently on
+// SYCL devices as well as in host C++ code.
+template <typename DataT, int NumElements>
+class vec : public detail::vec_arith<DataT, NumElements> {
+
+  static_assert(NumElements == 1 || NumElements == 2 || NumElements == 3 ||
+                    NumElements == 4 || NumElements == 8 || NumElements == 16,
+                "Invalid number of elements for sycl::vec: only 1, 2, 3, 4, 8 "
+                "or 16 are supported");
+  static_assert(sizeof(bool) == sizeof(int8_t), "bool size is not 1 byte");
+
+  // https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#memory-layout-and-alignment
+  // It is required by the SPEC to align vec<DataT, 3> with vec<DataT, 4>.
+  static constexpr size_t AdjustedNum = (NumElements == 3) ? 4 : NumElements;
 
   // This represent type of underlying value. There should be only one field
   // in the class, so vec<float, 16> should be equal to float16 in memory.
-  using DataType = typename detail::VecStorage<DataT, NumElements>::DataType;
+  using DataType = std::array<DataT, AdjustedNum>;
 
-  static constexpr bool IsHostHalf =
-      std::is_same_v<DataT, sycl::detail::half_impl::half> &&
-      std::is_same_v<sycl::detail::half_impl::StorageT,
-                     sycl::detail::host_half_impl::half>;
+#ifdef __SYCL_DEVICE_ONLY__
+  using element_type_for_vector_t = typename detail::map_type<
+      DataT,
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+      std::byte, /*->*/ std::uint8_t, //
+#endif
+      bool, /*->*/ std::int8_t,                             //
+      sycl::half, /*->*/ sycl::detail::half_impl::StorageT, //
+      sycl::ext::oneapi::bfloat16,
+      /*->*/ sycl::ext::oneapi::detail::Bfloat16StorageT, //
+      DataT, /*->*/ DataT                                 //
+      >::type;
 
-  static constexpr bool IsBfloat16 =
-      std::is_same_v<DataT, sycl::ext::oneapi::bfloat16>;
+public:
+  // Type used for passing sycl::vec to SPIRV builtins.
+  // We can not use ext_vector_type(1) as it's not supported by SPIRV
+  // plugins (CTS fails).
+  using vector_t =
+      typename std::conditional_t<NumElements == 1, element_type_for_vector_t,
+                                  element_type_for_vector_t __attribute__((
+                                      ext_vector_type(NumElements)))>;
 
-  static constexpr size_t AdjustedNum = (NumElements == 3) ? 4 : NumElements;
-  static constexpr size_t Sz = sizeof(DataT) * AdjustedNum;
-  static constexpr bool IsSizeGreaterThanMaxAlign =
-      (Sz > detail::MaxVecAlignment);
-
-  // TODO: There is no support for vector half type on host yet.
-  // Also, when Sz is greater than alignment, we use std::array instead of
-  // vector extension. This is for MSVC compatibility, which has a max alignment
-  // of 64 for direct params. If we drop MSVC, we can have alignment the same as
-  // size and use vector extensions for all sizes.
-  static constexpr bool IsUsingArrayOnDevice =
-      (IsHostHalf || IsBfloat16 || IsSizeGreaterThanMaxAlign);
-
-#if defined(__SYCL_DEVICE_ONLY__)
-  static constexpr bool NativeVec = NumElements > 1 && !IsUsingArrayOnDevice;
-  static constexpr bool IsUsingArrayOnHost = false; // not compiling for host.
-#else
-  static constexpr bool NativeVec = false;
-  static constexpr bool IsUsingArrayOnHost = true; // host always std::array.
-#endif
+private:
+#endif // __SYCL_DEVICE_ONLY__
 
   static constexpr int getNumElements() { return NumElements; }
 
@@ -411,7 +171,7 @@ template <typename Type, int NumElements> class vec {
   template <typename DataT_, typename T, std::size_t... Is>
   static constexpr std::array<DataT_, sizeof...(Is)>
   VecToArray(const vec<T, sizeof...(Is)> &V, std::index_sequence<Is...>) {
-    return {static_cast<DataT_>(V.getValue(Is))...};
+    return {static_cast<DataT_>(V[Is])...};
   }
   template <typename DataT_, typename T, int N, typename T2, typename T3,
             template <typename> class T4, int... T5, std::size_t... Is>
@@ -446,7 +206,9 @@ template <typename Type, int NumElements> class vec {
   }
   template <typename DataT_, typename T>
   static constexpr auto FlattenVecArgHelper(const T &A) {
-    return std::array<DataT_, 1>{vec_data<DataT_>::get(static_cast<DataT_>(A))};
+    // static_cast required to avoid narrowing conversion warning
+    // when T = unsigned long int and DataT_ = int.
+    return std::array<DataT_, 1>{static_cast<DataT_>(A)};
   }
   template <typename DataT_, typename T> struct FlattenVecArg {
     constexpr auto operator()(const T &A) const {
@@ -541,205 +303,89 @@ template <typename Type, int NumElements> class vec {
   using EnableIfSuitableNumElements =
       typename std::enable_if_t<SizeChecker<0, NumElements, argTN...>::value>;
 
-  template <size_t... Is>
-  constexpr vec(const std::array<vec_data_t<DataT>, NumElements> &Arr,
-                std::index_sequence<Is...>)
-      : m_Data{([&](vec_data_t<DataT> v) constexpr {
-          if constexpr (std::is_same_v<sycl::ext::oneapi::bfloat16, DataT>)
-            return v.value;
-          else
-            return vec_data_t<DataT>(static_cast<DataT>(v));
-        })(Arr[Is])...} {}
+  // Element type for relational operator return value.
+  using rel_t = detail::select_cl_scalar_integral_signed_t<DataT>;
 
 public:
+  // Aliases required by SYCL 2020 to make sycl::vec consistent
+  // with marray and buffer.
   using element_type = DataT;
   using value_type = DataT;
-  using rel_t = detail::rel_t<DataT>;
-#ifdef __SYCL_DEVICE_ONLY__
-  using vector_t =
-      typename detail::VecStorage<DataT, NumElements>::VectorDataType;
-#endif // __SYCL_DEVICE_ONLY__
 
+  /****************** Constructors **************/
   vec() = default;
-
   constexpr vec(const vec &Rhs) = default;
   constexpr vec(vec &&Rhs) = default;
 
-  constexpr vec &operator=(const vec &Rhs) = default;
-
-  // W/o this, things like "vec<char,*> = vec<signed char, *>" doesn't work.
-  template <typename Ty = DataT>
-  typename std::enable_if_t<!std::is_same_v<Ty, rel_t> &&
-                                std::is_convertible_v<vec_data_t<Ty>, rel_t>,
-                            vec &>
-  operator=(const vec<rel_t, NumElements> &Rhs) {
-    *this = Rhs.template as<vec>();
-    return *this;
-  }
-
-#ifdef __SYCL_USE_EXT_VECTOR_TYPE__
-  template <typename T = void>
-  using EnableIfNotHostHalf = typename std::enable_if_t<!IsHostHalf, T>;
-
-  template <typename T = void>
-  using EnableIfHostHalf = typename std::enable_if_t<IsHostHalf, T>;
-
-  template <typename T = void>
-  using EnableIfUsingArrayOnDevice =
-      typename std::enable_if_t<IsUsingArrayOnDevice, T>;
-
-  template <typename T = void>
-  using EnableIfNotUsingArrayOnDevice =
-      typename std::enable_if_t<!IsUsingArrayOnDevice, T>;
-#endif // __SYCL_USE_EXT_VECTOR_TYPE__
-
-  template <typename T = void>
-  using EnableIfUsingArray =
-      typename std::enable_if_t<IsUsingArrayOnDevice || IsUsingArrayOnHost, T>;
-
-  template <typename T = void>
-  using EnableIfNotUsingArray =
-      typename std::enable_if_t<!IsUsingArrayOnDevice && !IsUsingArrayOnHost,
-                                T>;
-
-#ifdef __SYCL_USE_EXT_VECTOR_TYPE__
-
-  template <typename Ty = DataT>
-  explicit constexpr vec(const EnableIfNotUsingArrayOnDevice<Ty> &arg)
-      : m_Data{DataType(vec_data<Ty>::get(arg))} {}
+private:
+  // Implementation detail for the next public ctor.
+  template <size_t... Is>
+  constexpr vec(const std::array<DataT, NumElements> &Arr,
+                std::index_sequence<Is...>)
+      : m_Data{Arr[Is]...} {}
 
-  template <typename Ty = DataT>
-  typename std::enable_if_t<
-      std::is_fundamental_v<vec_data_t<Ty>> ||
-          detail::is_half_or_bf16_v<typename std::remove_const_t<Ty>>,
-      vec &>
-  operator=(const EnableIfNotUsingArrayOnDevice<Ty> &Rhs) {
-    m_Data = (DataType)vec_data<Ty>::get(Rhs);
-    return *this;
-  }
+public:
+  explicit constexpr vec(const DataT &arg)
+      : vec{detail::RepeatValue<NumElements>(arg),
+            std::make_index_sequence<NumElements>()} {}
 
-  template <typename Ty = DataT>
-  explicit constexpr vec(const EnableIfUsingArrayOnDevice<Ty> &arg)
-      : vec{detail::RepeatValue<NumElements>(
-                static_cast<vec_data_t<DataT>>(arg)),
+  // Constructor from values of base type or vec of base type. Checks that
+  // base types match and that NumElements == sum of lengths of args.
+  template <typename... argTN, typename = EnableIfSuitableTypes<argTN...>,
+            typename = EnableIfSuitableNumElements<argTN...>>
+  constexpr vec(const argTN &...args)
+      : vec{VecArgArrayCreator<DataT, argTN...>::Create(args...),
             std::make_index_sequence<NumElements>()} {}
 
+  /****************** Assignment Operators **************/
+  constexpr vec &operator=(const vec &Rhs) = default;
+
+  // Template required to prevent ambiguous overload with the copy assignment
+  // when NumElements == 1. The template prevents implicit conversion from
+  // vec<_, 1> to DataT.
   template <typename Ty = DataT>
   typename std::enable_if_t<
-      std::is_fundamental_v<vec_data_t<Ty>> ||
+      std::is_fundamental_v<Ty> ||
           detail::is_half_or_bf16_v<typename std::remove_const_t<Ty>>,
       vec &>
-  operator=(const EnableIfUsingArrayOnDevice<Ty> &Rhs) {
-    for (int i = 0; i < NumElements; ++i) {
-      setValue(i, Rhs);
-    }
+  operator=(const DataT &Rhs) {
+    *this = vec{Rhs};
     return *this;
   }
-#else  // __SYCL_USE_EXT_VECTOR_TYPE__
-  explicit constexpr vec(const DataT &arg)
-      : vec{detail::RepeatValue<NumElements>(
-                static_cast<vec_data_t<DataT>>(arg)),
-            std::make_index_sequence<NumElements>()} {}
 
+  // W/o this, things like "vec<char,*> = vec<signed char, *>" don't work.
   template <typename Ty = DataT>
   typename std::enable_if_t<
-      std::is_fundamental_v<vec_data_t<Ty>> ||
-          detail::is_half_or_bf16_v<typename std::remove_const_t<Ty>>,
-      vec &>
-  operator=(const DataT &Rhs) {
-    for (int i = 0; i < NumElements; ++i) {
-      setValue(i, Rhs);
-    }
+      !std::is_same_v<Ty, rel_t> && std::is_convertible_v<Ty, rel_t>, vec &>
+  operator=(const vec<rel_t, NumElements> &Rhs) {
+    *this = Rhs.template as<vec>();
     return *this;
   }
-#endif // __SYCL_USE_EXT_VECTOR_TYPE__
-
-#ifdef __SYCL_USE_EXT_VECTOR_TYPE__
-  // Optimized naive constructors with NumElements of DataT values.
-  // We don't expect compilers to optimize vararg recursive functions well.
-
-  // Helper type to make specific constructors available only for specific
-  // number of elements.
-  template <int IdxNum, typename T = void>
-  using EnableIfMultipleElems = typename std::enable_if_t<
-      std::is_convertible_v<T, DataT> && NumElements == IdxNum, DataT>;
-  template <typename Ty = DataT>
-  constexpr vec(const EnableIfMultipleElems<2, Ty> Arg0,
-                const EnableIfNotUsingArrayOnDevice<Ty> Arg1)
-      : m_Data{vec_data<Ty>::get(Arg0), vec_data<Ty>::get(Arg1)} {}
-  template <typename Ty = DataT>
-  constexpr vec(const EnableIfMultipleElems<3, Ty> Arg0,
-                const EnableIfNotUsingArrayOnDevice<Ty> Arg1, const DataT Arg2)
-      : m_Data{vec_data<Ty>::get(Arg0), vec_data<Ty>::get(Arg1),
-               vec_data<Ty>::get(Arg2)} {}
-  template <typename Ty = DataT>
-  constexpr vec(const EnableIfMultipleElems<4, Ty> Arg0,
-                const EnableIfNotUsingArrayOnDevice<Ty> Arg1, const DataT Arg2,
-                const Ty Arg3)
-      : m_Data{vec_data<Ty>::get(Arg0), vec_data<Ty>::get(Arg1),
-               vec_data<Ty>::get(Arg2), vec_data<Ty>::get(Arg3)} {}
-  template <typename Ty = DataT>
-  constexpr vec(const EnableIfMultipleElems<8, Ty> Arg0,
-                const EnableIfNotUsingArrayOnDevice<Ty> Arg1, const DataT Arg2,
-                const DataT Arg3, const DataT Arg4, const DataT Arg5,
-                const DataT Arg6, const DataT Arg7)
-      : m_Data{vec_data<Ty>::get(Arg0), vec_data<Ty>::get(Arg1),
-               vec_data<Ty>::get(Arg2), vec_data<Ty>::get(Arg3),
-               vec_data<Ty>::get(Arg4), vec_data<Ty>::get(Arg5),
-               vec_data<Ty>::get(Arg6), vec_data<Ty>::get(Arg7)} {}
-  template <typename Ty = DataT>
-  constexpr vec(const EnableIfMultipleElems<16, Ty> Arg0,
-                const EnableIfNotUsingArrayOnDevice<Ty> Arg1, const DataT Arg2,
-                const DataT Arg3, const DataT Arg4, const DataT Arg5,
-                const DataT Arg6, const DataT Arg7, const DataT Arg8,
-                const DataT Arg9, const DataT ArgA, const DataT ArgB,
-                const DataT ArgC, const DataT ArgD, const DataT ArgE,
-                const DataT ArgF)
-      : m_Data{vec_data<Ty>::get(Arg0), vec_data<Ty>::get(Arg1),
-               vec_data<Ty>::get(Arg2), vec_data<Ty>::get(Arg3),
-               vec_data<Ty>::get(Arg4), vec_data<Ty>::get(Arg5),
-               vec_data<Ty>::get(Arg6), vec_data<Ty>::get(Arg7),
-               vec_data<Ty>::get(Arg8), vec_data<Ty>::get(Arg9),
-               vec_data<Ty>::get(ArgA), vec_data<Ty>::get(ArgB),
-               vec_data<Ty>::get(ArgC), vec_data<Ty>::get(ArgD),
-               vec_data<Ty>::get(ArgE), vec_data<Ty>::get(ArgF)} {}
-#endif // __SYCL_USE_EXT_VECTOR_TYPE__
-
-  // Constructor from values of base type or vec of base type. Checks that
-  // base types are match and that the NumElements == sum of lengths of args.
-  template <typename... argTN, typename = EnableIfSuitableTypes<argTN...>,
-            typename = EnableIfSuitableNumElements<argTN...>>
-  constexpr vec(const argTN &...args)
-      : vec{VecArgArrayCreator<vec_data_t<DataT>, argTN...>::Create(args...),
-            std::make_index_sequence<NumElements>()} {}
 
 #ifdef __SYCL_DEVICE_ONLY__
-  template <typename vector_t_ = vector_t,
-            typename =
-                typename std::enable_if_t<std::is_same_v<vector_t_, vector_t> &&
-                                          !std::is_same_v<vector_t_, DataT>>>
-  constexpr vec(vector_t openclVector) {
-    if constexpr (!IsUsingArrayOnDevice) {
-      m_Data = openclVector;
-    } else {
-      m_Data = bit_cast<DataType>(openclVector);
-    }
-  }
-
-  operator vector_t() const {
-    if constexpr (!IsUsingArrayOnDevice) {
-      return m_Data;
-    } else {
-      auto ptr = bit_cast<const vector_t *>((&m_Data)->data());
-      return *ptr;
-    }
-  }
+  // Make it a template to avoid ambiguity with `vec(const DataT &)` when
+  // `vector_t` is the same as `DataT`. Note that the other ctor isn't a
+  // template, so we don't even need a smart `enable_if` condition here: the
+  // mere fact of this being a template makes the other ctor preferred.
+  template <
+      typename vector_t_ = vector_t,
+      typename = typename std::enable_if_t<std::is_same_v<vector_t_, vector_t>>>
+  constexpr vec(vector_t_ openclVector) {
+    m_Data = sycl::bit_cast<DataType>(openclVector);
+  }
+
+  /* @SYCL2020
+   * Available only when: compiled for the device.
+   * Converts this SYCL vec instance to the underlying backend-native vector
+   * type defined by vector_t.
+   */
+  operator vector_t() const { return sycl::bit_cast<vector_t>(m_Data); }
 #endif // __SYCL_DEVICE_ONLY__
 
   // Available only when: NumElements == 1
   template <int N = NumElements>
   operator typename std::enable_if_t<N == 1, DataT>() const {
-    return vec_data<DataT>::get(m_Data);
+    return m_Data[0];
   }
 
   __SYCL2020_DEPRECATED("get_count() is deprecated, please use size() instead")
@@ -750,105 +396,124 @@ template <typename Type, int NumElements> class vec {
   static constexpr size_t get_size() { return byte_size(); }
   static constexpr size_t byte_size() noexcept { return sizeof(m_Data); }
 
-  // convertImpl can't be called with the same From and To types and therefore
-  // we need this version of convert which is mostly no-op.
-  template <typename convertT,
-            rounding_mode roundingMode = rounding_mode::automatic>
-  std::enable_if_t<
-      std::is_same_v<vec_data_t<DataT>, vec_data_t<convertT>> ||
-          std::is_same_v<detail::ConvertToOpenCLType_t<vec_data_t<DataT>>,
-                         detail::ConvertToOpenCLType_t<vec_data_t<convertT>>>,
-      vec<convertT, NumElements>>
-  convert() const {
-    static_assert(std::is_integral_v<vec_data_t<convertT>> ||
-                      detail::is_floating_point<convertT>::value,
-                  "Unsupported convertT");
-    if constexpr (!std::is_same_v<DataT, convertT>) {
-      // Dummy conversion for cases like vec<signed char> -> vec<char>
-      vec<convertT, NumElements> Result;
-      for (size_t I = 0; I < NumElements; ++I)
-        Result.setValue(I, static_cast<convertT>(getValue(I)));
+private:
+  // We interpret bool as int8_t, std::byte as uint8_t for conversion to other
+  // types.
+  template <typename T>
+  using ConvertBoolAndByteT =
+      typename detail::map_type<T,
+#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
+                                std::byte, /*->*/ std::uint8_t, //
+#endif
+                                bool, /*->*/ std::int8_t, //
+                                T, /*->*/ T               //
+                                >::type;
 
-      return Result;
-    } else {
-      // No conversion necessary
-      return *this;
-    }
+  // getValue should be able to operate on different underlying
+  // types: enum cl_float#N, builtin vector float#N, builtin type float.
+  constexpr auto getValue(int Index) const {
+    using RetType =
+        typename std::conditional_t<detail::is_byte_v<DataT>, int8_t,
+#ifdef __SYCL_DEVICE_ONLY__
+                                    element_type_for_vector_t
+#else
+                                    DataT
+#endif
+                                    >;
+
+#ifdef __SYCL_DEVICE_ONLY__
+    if constexpr (std::is_same_v<DataT, sycl::ext::oneapi::bfloat16>)
+      return sycl::bit_cast<RetType>(m_Data[Index]);
+    else
+#endif
+      return static_cast<RetType>(m_Data[Index]);
   }
 
+public:
   template <typename convertT,
             rounding_mode roundingMode = rounding_mode::automatic>
-  std::enable_if_t<
-      !std::is_same_v<vec_data_t<DataT>, vec_data_t<convertT>> &&
-          !std::is_same_v<detail::ConvertToOpenCLType_t<vec_data_t<DataT>>,
-                          detail::ConvertToOpenCLType_t<vec_data_t<convertT>>>,
-      vec<convertT, NumElements>>
-  convert() const {
-    static_assert(std::is_integral_v<vec_data_t<convertT>> ||
-                      detail::is_floating_point<convertT>::value,
+  vec<convertT, NumElements> convert() const {
+
+    using T = ConvertBoolAndByteT<DataT>;
+    using R = ConvertBoolAndByteT<convertT>;
+    using bfloat16 = sycl::ext::oneapi::bfloat16;
+    static_assert(std::is_integral_v<R> ||
+                      detail::is_floating_point<R>::value ||
+                      std::is_same_v<R, bfloat16>,
                   "Unsupported convertT");
-    using T = vec_data_t<DataT>;
-    using R = vec_data_t<convertT>;
+
     using OpenCLT = detail::ConvertToOpenCLType_t<T>;
     using OpenCLR = detail::ConvertToOpenCLType_t<R>;
     vec<convertT, NumElements> Result;
 
-#if defined(__SYCL_DEVICE_ONLY__)
-    using OpenCLVecT = OpenCLT __attribute__((ext_vector_type(NumElements)));
-    using OpenCLVecR = OpenCLR __attribute__((ext_vector_type(NumElements)));
-    // Whole vector conversion can only be done, if:
-    constexpr bool canUseNativeVectorConvert =
+    // convertImpl can't be called with the same From and To types and therefore
+    // we need some special processing in a few cases.
+    if constexpr (std::is_same_v<DataT, convertT>) {
+      return *this;
+    } else if constexpr (std::is_same_v<OpenCLT, OpenCLR> ||
+                         std::is_same_v<T, R>) {
+      for (size_t I = 0; I < NumElements; ++I)
+        Result[I] = static_cast<convertT>(getValue(I));
+      return Result;
+    } else {
+
+#ifdef __SYCL_DEVICE_ONLY__
+      using OpenCLVecT = OpenCLT __attribute__((ext_vector_type(NumElements)));
+      using OpenCLVecR = OpenCLR __attribute__((ext_vector_type(NumElements)));
+
+      auto NativeVector = sycl::bit_cast<vector_t>(*this);
+      using ConvertTVecType = typename vec<convertT, NumElements>::vector_t;
+
+      // Whole vector conversion can only be done, if:
+      constexpr bool canUseNativeVectorConvert =
 #ifdef __NVPTX__
-        // - we are not on CUDA, see intel/llvm#11840
-        false &&
+          //  TODO: Likely unnecessary as
+          //  https://github.com/intel/llvm/issues/11840 has been closed
+          //  already.
+          false &&
 #endif
-        // - both vectors are represented using native vector types;
-        NativeVec && vec<convertT, NumElements>::NativeVec &&
-        // - vec storage has an equivalent OpenCL native vector it is implicitly
-        //   convertible to. There are some corner cases where it is not the
-        //   case with char, long and long long types.
-        std::is_convertible_v<decltype(m_Data), OpenCLVecT> &&
-        std::is_convertible_v<decltype(Result.m_Data), OpenCLVecR> &&
-        // - it is not a signed to unsigned (or vice versa) conversion
-        //   see comments within 'convertImpl' for more details;
-        !detail::is_sint_to_from_uint<T, R>::value &&
-        // - destination type is not bool. bool is stored as integer under the
-        //   hood and therefore conversion to bool looks like conversion between
-        //   two integer types. Since bit pattern for true and false is not
-        //   defined, there is no guarantee that integer conversion yields
-        //   right results here;
-        !std::is_same_v<convertT, bool>;
-    if constexpr (canUseNativeVectorConvert) {
-      Result.m_Data = detail::convertImpl<T, R, roundingMode, NumElements,
-                                          OpenCLVecT, OpenCLVecR>(m_Data);
-    } else
-#endif // defined(__SYCL_DEVICE_ONLY__)
-    {
-      // Otherwise, we fallback to per-element conversion:
-      for (size_t I = 0; I < NumElements; ++I) {
-        Result.setValue(
-            I, vec_data<convertT>::get(
-                   detail::convertImpl<T, R, roundingMode, 1, OpenCLT, OpenCLR>(
-                       vec_data<DataT>::get(getValue(I)))));
+          NumElements > 1 &&
+          // - vec storage has an equivalent OpenCL native vector it is
+          //   implicitly convertible to. There are some corner cases where it
+          //   is not the case with char, long and long long types.
+          std::is_convertible_v<vector_t, OpenCLVecT> &&
+          std::is_convertible_v<ConvertTVecType, OpenCLVecR> &&
+          // - it is not a signed to unsigned (or vice versa) conversion
+          //   see comments within 'convertImpl' for more details;
+          !detail::is_sint_to_from_uint<T, R>::value &&
+          // - destination type is not bool. bool is stored as integer under the
+          //   hood and therefore conversion to bool looks like conversion
+          //   between two integer types. Since bit pattern for true and false
+          //   is not defined, there is no guarantee that integer conversion
+          //   yields right results here;
+          !std::is_same_v<convertT, bool>;
+
+      if constexpr (canUseNativeVectorConvert) {
+        Result.m_Data = sycl::bit_cast<decltype(Result.m_Data)>(
+            detail::convertImpl<T, R, roundingMode, NumElements, OpenCLVecT,
+                                OpenCLVecR>(NativeVector));
+      } else
+#endif // __SYCL_DEVICE_ONLY__
+      {
+        // Otherwise, we fall back to per-element conversion:
+        for (size_t I = 0; I < NumElements; ++I) {
+          auto val =
+              detail::convertImpl<T, R, roundingMode, 1, OpenCLT, OpenCLR>(
+                  getValue(I));
+#ifdef __SYCL_DEVICE_ONLY__
+          // On device, we interpret BF16 as uint16.
+          if constexpr (std::is_same_v<convertT, bfloat16>)
+            Result[I] = sycl::bit_cast<convertT>(val);
+          else
+#endif
+            Result[I] = static_cast<convertT>(val);
+        }
       }
     }
-
     return Result;
   }
 
-  template <typename asT> asT as() const {
-    static_assert((sizeof(*this) == sizeof(asT)),
-                  "The new SYCL vec type must have the same storage size in "
-                  "bytes as this SYCL vec");
-    static_assert(
-        detail::is_contained<asT, detail::gtl::vector_basic_list>::value ||
-            detail::is_contained<asT, detail::gtl::vector_bool_list>::value,
-        "asT must be SYCL vec of a different element type and "
-        "number of elements specified by asT");
-    asT Result;
-    detail::memcpy(&Result.m_Data, &m_Data, sizeof(decltype(Result.m_Data)));
-    return Result;
-  }
+  template <typename asT> asT as() const { return sycl::bit_cast<asT>(*this); }
 
   template <int... SwizzleIndexes> Swizzle<SwizzleIndexes...> swizzle() {
     return this;
@@ -859,60 +524,11 @@ template <typename Type, int NumElements> class vec {
     return this;
   }
 
-  // ext_vector_type is used as an underlying type for sycl::vec on device.
-  // The problem is that for clang vector types the return of operator[] is a
-  // temporary and not a reference to the element in the vector. In practice
-  // reinterpret_cast<DataT *>(&m_Data)[i]; is working. According to
-  // http://llvm.org/docs/GetElementPtr.html#can-gep-index-into-vector-elements
-  // this is not disallowed now. But could probably be disallowed in the future.
-  // That is why tests are added to check that behavior of the compiler has
-  // not changed.
-  //
-  // Implement operator [] in the same way for host and device.
-  // TODO: change host side implementation when underlying type for host side
-  // will be changed to std::array.
-  // NOTE: aliasing the incompatible types of bfloat16 may lead to problems if
-  // aggressively optimized. Specializing with noinline to avoid as workaround.
+  const DataT &operator[](int i) const { return m_Data[i]; }
 
-  template <typename T = DataT>
-  typename std::enable_if_t<!std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
-                            const DataT &>
-  operator[](int i) const {
-    return reinterpret_cast<const DataT *>(&m_Data)[i];
-  }
+  DataT &operator[](int i) { return m_Data[i]; }
 
-  template <typename T = DataT>
-  typename std::enable_if_t<!std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
-                            DataT &>
-  operator[](int i) {
-    return reinterpret_cast<DataT *>(&m_Data)[i];
-  }
-
-#ifdef _MSC_VER
-#define __SYCL_NOINLINE_BF16 __declspec(noinline)
-#else
-#define __SYCL_NOINLINE_BF16 __attribute__((noinline))
-#endif
-
-  template <typename T = DataT>
-  __SYCL_NOINLINE_BF16
-      typename std::enable_if_t<std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
-                                const DataT &>
-      operator[](int i) const {
-    return reinterpret_cast<const DataT *>(&m_Data)[i];
-  }
-
-  template <typename T = DataT>
-  __SYCL_NOINLINE_BF16
-      typename std::enable_if_t<std::is_same_v<T, sycl::ext::oneapi::bfloat16>,
-                                DataT &>
-      operator[](int i) {
-    return reinterpret_cast<DataT *>(&m_Data)[i];
-  }
-
-#undef __SYCL_NOINLINE_BF16
-
-  // Begin hi/lo, even/odd, xyzw, and rgba swizzles.
+  // Begin hi/lo, even/odd, xyzw, and rgba swizzles. @{
 private:
   // Indexer used in the swizzles.def
   // Currently it is defined as a template struct. Replacing it with a constexpr
@@ -930,13 +546,13 @@ template <typename Type, int NumElements> class vec {
 #define __SYCL_ACCESS_RETURN this
 #include "swizzles.def"
 #undef __SYCL_ACCESS_RETURN
-  // End of hi/lo, even/odd, xyzw, and rgba swizzles.
+  // @} End of hi/lo, even/odd, xyzw, and rgba swizzles.
 
   template <access::address_space Space, access::decorated DecorateAddress>
   void load(size_t Offset, multi_ptr<const DataT, Space, DecorateAddress> Ptr) {
     for (int I = 0; I < NumElements; I++) {
-      setValue(I, *multi_ptr<const DataT, Space, DecorateAddress>(
-                      Ptr + Offset * NumElements + I));
+      m_Data[I] = *multi_ptr<const DataT, Space, DecorateAddress>(
+          Ptr + Offset * NumElements + I);
     }
   }
   template <access::address_space Space, access::decorated DecorateAddress>
@@ -961,7 +577,7 @@ template <typename Type, int NumElements> class vec {
              multi_ptr<DataT, Space, DecorateAddress> Ptr) const {
     for (int I = 0; I < NumElements; I++) {
       *multi_ptr<DataT, Space, DecorateAddress>(Ptr + Offset * NumElements +
-                                                I) = getValue(I);
+                                                I) = m_Data[I];
     }
   }
   template <int Dimensions, access::mode Mode,
@@ -976,404 +592,23 @@ template <typename Type, int NumElements> class vec {
     store(Offset, MultiPtr);
   }
 
-  void ConvertToDataT() {
-    for (size_t i = 0; i < NumElements; ++i) {
-      DataT tmp = getValue(i);
-      setValue(i, tmp);
-    }
-  }
-
-#ifdef __SYCL_BINOP
-#error "Undefine __SYCL_BINOP macro"
-#endif
-
-#ifdef __SYCL_USE_EXT_VECTOR_TYPE__
-#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT)                                 \
-  friend vec operator BINOP(const vec &Lhs, const vec &Rhs) {                  \
-    vec Ret;                                                                   \
-    if constexpr (IsUsingArrayOnDevice) {                                      \
-      for (size_t I = 0; I < NumElements; ++I) {                               \
-        Ret.setValue(I, (Lhs.getValue(I) BINOP Rhs.getValue(I)));              \
-      }                                                                        \
-    } else {                                                                   \
-      Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data;                                \
-      if constexpr (std::is_same_v<Type, bool> && CONVERT) {                   \
-        Ret.ConvertToDataT();                                                  \
-      }                                                                        \
-    }                                                                          \
-    return Ret;                                                                \
-  }                                                                            \
-  friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) {                \
-    return Lhs BINOP vec(Rhs);                                                 \
-  }                                                                            \
-  friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) {                \
-    return vec(Lhs) BINOP Rhs;                                                 \
-  }                                                                            \
-  friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) {                  \
-    Lhs = Lhs BINOP Rhs;                                                       \
-    return Lhs;                                                                \
-  }                                                                            \
-  template <int Num = NumElements>                                             \
-  friend typename std::enable_if_t<Num != 1, vec &> operator OPASSIGN(         \
-      vec & Lhs, const DataT & Rhs) {                                          \
-    Lhs = Lhs BINOP vec(Rhs);                                                  \
-    return Lhs;                                                                \
-  }
-
-#else // __SYCL_USE_EXT_VECTOR_TYPE__
-
-#define __SYCL_BINOP(BINOP, OPASSIGN, CONVERT)                                 \
-  friend vec operator BINOP(const vec &Lhs, const vec &Rhs) {                  \
-    vec Ret{};                                                                 \
-    if constexpr (NativeVec)                                                   \
-      Ret.m_Data = Lhs.m_Data BINOP Rhs.m_Data;                                \
-    else                                                                       \
-      for (size_t I = 0; I < NumElements; ++I)                                 \
-        Ret.setValue(I, (DataT)(vec_data<DataT>::get(Lhs.getValue(             \
-                            I)) BINOP vec_data<DataT>::get(Rhs.getValue(I)))); \
-    return Ret;                                                                \
-  }                                                                            \
-  friend vec operator BINOP(const vec &Lhs, const DataT &Rhs) {                \
-    return Lhs BINOP vec(Rhs);                                                 \
-  }                                                                            \
-  friend vec operator BINOP(const DataT &Lhs, const vec &Rhs) {                \
-    return vec(Lhs) BINOP Rhs;                                                 \
-  }                                                                            \
-  friend vec &operator OPASSIGN(vec & Lhs, const vec & Rhs) {                  \
-    Lhs = Lhs BINOP Rhs;                                                       \
-    return Lhs;                                                                \
-  }                                                                            \
-  template <int Num = NumElements>                                             \
-  friend typename std::enable_if_t<Num != 1, vec &> operator OPASSIGN(         \
-      vec & Lhs, const DataT & Rhs) {                                          \
-    Lhs = Lhs BINOP vec(Rhs);                                                  \
-    return Lhs;                                                                \
-  }
-
-#endif // __SYCL_USE_EXT_VECTOR_TYPE__
-
-  __SYCL_BINOP(+, +=, true)
-  __SYCL_BINOP(-, -=, true)
-  __SYCL_BINOP(*, *=, false)
-  __SYCL_BINOP(/, /=, false)
-
-  // TODO: The following OPs are available only when: DataT != cl_float &&
-  // DataT != cl_double && DataT != cl_half
-  __SYCL_BINOP(%, %=, false)
-  __SYCL_BINOP(|, |=, false)
-  __SYCL_BINOP(&, &=, false)
-  __SYCL_BINOP(^, ^=, false)
-  __SYCL_BINOP(>>, >>=, false)
-  __SYCL_BINOP(<<, <<=, true)
-#undef __SYCL_BINOP
-#undef __SYCL_BINOP_HELP
-
-  // Note: vec<>/SwizzleOp logical value is 0/-1 logic, as opposed to 0/1 logic.
-  // As far as CTS validation is concerned, 0/-1 logic also applies when
-  // NumElements is equal to one, which is somewhat inconsistent with being
-  // transparent with scalar data.
-  // TODO: Determine if vec<, NumElements=1> is needed at all, remove this
-  // inconsistency if not by disallowing one-element vectors (as in OpenCL)
-
-#ifdef __SYCL_RELLOGOP
-#error "Undefine __SYCL_RELLOGOP macro"
-#endif
-// Use __SYCL_DEVICE_ONLY__ macro because cast to OpenCL vector type is defined
-// by SYCL device compiler only.
-#ifdef __SYCL_DEVICE_ONLY__
-#define __SYCL_RELLOGOP(RELLOGOP)                                              \
-  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
-                                                   const vec & Rhs) {          \
-    vec<rel_t, NumElements> Ret{};                                             \
-    /* This special case is needed since there are no standard operator||   */ \
-    /* or operator&& functions for std::array.                              */ \
-    if constexpr (IsUsingArrayOnDevice &&                                      \
-                  (std::string_view(#RELLOGOP) == "||" ||                      \
-                   std::string_view(#RELLOGOP) == "&&")) {                     \
-      for (size_t I = 0; I < NumElements; ++I) {                               \
-        /* We cannot use SetValue here as the operator is not a friend of*/    \
-        /* Ret on Windows. */                                                  \
-        Ret[I] = static_cast<rel_t>(-(vec_data<DataT>::get(                    \
-            Lhs.getValue(I)) RELLOGOP vec_data<DataT>::get(Rhs.getValue(I)))); \
-      }                                                                        \
-    } else {                                                                   \
-      Ret = vec<rel_t, NumElements>(                                           \
-          (typename vec<rel_t, NumElements>::vector_t)(                        \
-              Lhs.m_Data RELLOGOP Rhs.m_Data));                                \
-      if (NumElements == 1) /*Scalar 0/1 logic was applied, invert*/           \
-        Ret *= -1;                                                             \
-    }                                                                          \
-    return Ret;                                                                \
-  }                                                                            \
-  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
-                                                   const DataT & Rhs) {        \
-    return Lhs RELLOGOP vec(Rhs);                                              \
-  }                                                                            \
-  friend vec<rel_t, NumElements> operator RELLOGOP(const DataT & Lhs,          \
-                                                   const vec & Rhs) {          \
-    return vec(Lhs) RELLOGOP Rhs;                                              \
-  }
-
-#else
-#define __SYCL_RELLOGOP(RELLOGOP)                                              \
-  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
-                                                   const vec & Rhs) {          \
-    vec<rel_t, NumElements> Ret{};                                             \
-    for (size_t I = 0; I < NumElements; ++I) {                                 \
-      /* We cannot use SetValue here as the operator is not a friend of*/      \
-      /* Ret on Windows. */                                                    \
-      Ret[I] = static_cast<rel_t>(-(vec_data<DataT>::get(                      \
-          Lhs.getValue(I)) RELLOGOP vec_data<DataT>::get(Rhs.getValue(I))));   \
-    }                                                                          \
-    return Ret;                                                                \
-  }                                                                            \
-  friend vec<rel_t, NumElements> operator RELLOGOP(const vec & Lhs,            \
-                                                   const DataT & Rhs) {        \
-    return Lhs RELLOGOP vec(Rhs);                                              \
-  }                                                                            \
-  friend vec<rel_t, NumElements> operator RELLOGOP(const DataT & Lhs,          \
-                                                   const vec & Rhs) {          \
-    return vec(Lhs) RELLOGOP Rhs;                                              \
-  }
-#endif
-
-  __SYCL_RELLOGOP(==)
-  __SYCL_RELLOGOP(!=)
-  __SYCL_RELLOGOP(>)
-  __SYCL_RELLOGOP(<)
-  __SYCL_RELLOGOP(>=)
-  __SYCL_RELLOGOP(<=)
-  // TODO: limit to integral types.
-  __SYCL_RELLOGOP(&&)
-  __SYCL_RELLOGOP(||)
-#undef __SYCL_RELLOGOP
-
-#ifdef __SYCL_UOP
-#error "Undefine __SYCL_UOP macro"
-#endif
-#define __SYCL_UOP(UOP, OPASSIGN)                                              \
-  friend vec &operator UOP(vec & Rhs) {                                        \
-    Rhs OPASSIGN vec_data<DataT>::get(1);                                      \
-    return Rhs;                                                                \
-  }                                                                            \
-  friend vec operator UOP(vec &Lhs, int) {                                     \
-    vec Ret(Lhs);                                                              \
-    Lhs OPASSIGN vec_data<DataT>::get(1);                                      \
-    return Ret;                                                                \
-  }
-
-  __SYCL_UOP(++, +=)
-  __SYCL_UOP(--, -=)
-#undef __SYCL_UOP
-
-  // operator~() available only when: dataT != float && dataT != double
-  // && dataT != half
-  friend vec operator~(const vec &Rhs) {
-    if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
-      vec Ret{};
-      for (size_t I = 0; I < NumElements; ++I) {
-        Ret.setValue(I, ~Rhs.getValue(I));
-      }
-      return Ret;
-    } else {
-      vec Ret{(typename vec::DataType) ~Rhs.m_Data};
-      if constexpr (std::is_same_v<Type, bool>) {
-        Ret.ConvertToDataT();
-      }
-      return Ret;
-    }
-  }
-
-  // operator!
-  friend vec<detail::rel_t<DataT>, NumElements> operator!(const vec &Rhs) {
-    if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
-      vec Ret{};
-      for (size_t I = 0; I < NumElements; ++I) {
-#if (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
-        // std::byte neither supports ! unary op or casting, so special handling
-        // is needed. And, worse, Windows has a conflict with 'byte'.
-        if constexpr (std::is_same_v<std::byte, DataT>) {
-          Ret.setValue(I, std::byte{!vec_data<DataT>::get(Rhs.getValue(I))});
-        } else
-#endif // (!defined(_HAS_STD_BYTE) || _HAS_STD_BYTE != 0)
-        {
-          Ret.setValue(I, !vec_data<DataT>::get(Rhs.getValue(I)));
-        }
-      }
-      return Ret.template as<vec<detail::rel_t<DataT>, NumElements>>();
-    } else {
-      return vec{(typename vec<DataT, NumElements>::DataType) !Rhs.m_Data}
-          .template as<vec<detail::rel_t<DataT>, NumElements>>();
-    }
-  }
-
-  // operator +
-  friend vec operator+(const vec &Lhs) {
-    if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
-      vec Ret{};
-      for (size_t I = 0; I < NumElements; ++I)
-        Ret.setValue(
-            I, vec_data<DataT>::get(+vec_data<DataT>::get(Lhs.getValue(I))));
-      return Ret;
-    } else {
-      return vec{+Lhs.m_Data};
-    }
-  }
-
-  // operator -
-  friend vec operator-(const vec &Lhs) {
-    namespace oneapi = sycl::ext::oneapi;
-    vec Ret{};
-    if constexpr (IsBfloat16 && NumElements == 1) {
-      oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data);
-      oneapi::bfloat16 w = -v;
-      Ret.m_Data = oneapi::detail::bfloat16ToBits(w);
-    } else if constexpr (IsBfloat16) {
-      for (size_t I = 0; I < NumElements; I++) {
-        oneapi::bfloat16 v = oneapi::detail::bitsToBfloat16(Lhs.m_Data[I]);
-        oneapi::bfloat16 w = -v;
-        Ret.m_Data[I] = oneapi::detail::bfloat16ToBits(w);
-      }
-    } else if constexpr (IsUsingArrayOnDevice || IsUsingArrayOnHost) {
-      for (size_t I = 0; I < NumElements; ++I)
-        Ret.setValue(
-            I, vec_data<DataT>::get(-vec_data<DataT>::get(Lhs.getValue(I))));
-      return Ret;
-    } else {
-      Ret = vec{-Lhs.m_Data};
-      if constexpr (std::is_same_v<Type, bool>) {
-        Ret.ConvertToDataT();
-      }
-      return Ret;
-    }
-  }
-
-  // OP is: &&, ||
-  // vec<RET, NumElements> operatorOP(const vec<DataT, NumElements> &Rhs) const;
-  // vec<RET, NumElements> operatorOP(const DataT &Rhs) const;
-
-  // OP is: ==, !=, <, >, <=, >=
-  // vec<RET, NumElements> operatorOP(const vec<DataT, NumElements> &Rhs) const;
-  // vec<RET, NumElements> operatorOP(const DataT &Rhs) const;
 private:
-  // Generic method that execute "Operation" on underlying values.
-
-#ifdef __SYCL_USE_EXT_VECTOR_TYPE__
-  template <template <typename> class Operation,
-            typename Ty = vec<DataT, NumElements>>
-  vec<DataT, NumElements>
-  operatorHelper(const EnableIfNotUsingArrayOnDevice<Ty> &Rhs) const {
-    vec<DataT, NumElements> Result;
-    Operation<DataType> Op;
-    Result.m_Data = Op(m_Data, Rhs.m_Data);
-    return Result;
-  }
-
-  template <template <typename> class Operation,
-            typename Ty = vec<DataT, NumElements>>
-  vec<DataT, NumElements>
-  operatorHelper(const EnableIfUsingArrayOnDevice<Ty> &Rhs) const {
-    vec<DataT, NumElements> Result;
-    Operation<DataT> Op;
-    for (size_t I = 0; I < NumElements; ++I) {
-      Result.setValue(I, Op(Rhs.getValue(I), getValue(I)));
-    }
-    return Result;
-  }
-#else  // __SYCL_USE_EXT_VECTOR_TYPE__
-  template <template <typename> class Operation>
-  vec<DataT, NumElements>
-  operatorHelper(const vec<DataT, NumElements> &Rhs) const {
-    vec<DataT, NumElements> Result;
-    Operation<DataT> Op;
-    for (size_t I = 0; I < NumElements; ++I) {
-      Result.setValue(I, Op(Rhs.getValue(I), getValue(I)));
-    }
-    return Result;
-  }
-#endif // __SYCL_USE_EXT_VECTOR_TYPE__
-
-  // setValue and getValue should be able to operate on different underlying
-  // types: enum cl_float#N , builtin vector float#N, builtin type float.
-  // These versions are for N > 1.
-#ifdef __SYCL_USE_EXT_VECTOR_TYPE__
-  template <int Num = NumElements, typename Ty = int,
-            typename = typename std::enable_if_t<1 != Num>>
-  constexpr void setValue(EnableIfNotHostHalf<Ty> Index, const DataT &Value,
-                          int) {
-    m_Data[Index] = vec_data<DataT>::set(Value);
-  }
-
-  template <int Num = NumElements, typename Ty = int,
-            typename = typename std::enable_if_t<1 != Num>>
-  constexpr DataT getValue(EnableIfNotHostHalf<Ty> Index, int) const {
-    return vec_data<DataT>::get(m_Data[Index]);
-  }
-
-  template <int Num = NumElements, typename Ty = int,
-            typename = typename std::enable_if_t<1 != Num>>
-  constexpr void setValue(EnableIfHostHalf<Ty> Index, const DataT &Value, int) {
-    m_Data.s[Index] = vec_data<DataT>::set(Value);
-  }
-
-  template <int Num = NumElements, typename Ty = int,
-            typename = typename std::enable_if_t<1 != Num>>
-  constexpr DataT getValue(EnableIfHostHalf<Ty> Index, int) const {
-    return vec_data<DataT>::get(m_Data.s[Index]);
-  }
-#else  // __SYCL_USE_EXT_VECTOR_TYPE__
-  template <int Num = NumElements,
-            typename = typename std::enable_if_t<1 != Num>>
-  constexpr void setValue(int Index, const DataT &Value, int) {
-    m_Data[Index] = vec_data<DataT>::set(Value);
-  }
-
-  template <int Num = NumElements,
-            typename = typename std::enable_if_t<1 != Num>>
-  constexpr DataT getValue(int Index, int) const {
-    return vec_data<DataT>::get(m_Data[Index]);
-  }
-#endif // __SYCL_USE_EXT_VECTOR_TYPE__
-
-  // N==1 versions, used by host and device. Shouldn't trailing type be int?
-  template <int Num = NumElements,
-            typename = typename std::enable_if_t<1 == Num>>
-  constexpr void setValue(int, const DataT &Value, float) {
-    m_Data = vec_data<DataT>::set(Value);
-  }
-
-  template <int Num = NumElements,
-            typename = typename std::enable_if_t<1 == Num>>
-  DataT getValue(int, float) const {
-    return vec_data<DataT>::get(m_Data);
-  }
-
-  // setValue and getValue.
-  // The "api" functions used by BINOP etc.  These versions just dispatch
-  // using additional int or float arg to disambiguate vec<1> vs. vec<N>
-  // Special proxies as specialization is not allowed in class scope.
-  constexpr void setValue(int Index, const DataT &Value) {
-    if (NumElements == 1)
-      setValue(Index, Value, 0);
-    else
-      setValue(Index, Value, 0.f);
-  }
-
-  DataT getValue(int Index) const {
-    return (NumElements == 1) ? getValue(Index, 0) : getValue(Index, 0.f);
-  }
-
   // fields
-
-  // Alignment is the same as size, to a maximum size of 64.
-  // detail::vector_alignment will return that value.
-  alignas(detail::vector_alignment<DataT, NumElements>::value) DataType m_Data;
+  // Alignment is the same as size, to a maximum size of 64. SPEC requires
+  // "The elements of an instance of the SYCL vec class template are stored
+  // in memory sequentially and contiguously and are aligned to the size of
+  // the element type in bytes multiplied by the number of elements."
+  static constexpr int alignment = (std::min)((size_t)64, sizeof(DataType));
+  alignas(alignment) DataType m_Data;
 
   // friends
   template <typename T1, typename T2, typename T3, template <typename> class T4,
             int... T5>
   friend class detail::SwizzleOp;
   template <typename T1, int T2> friend class vec;
+  // To allow arithmetic operators to access private members of vec.
+  template <typename T1, int T2> friend class detail::vec_arith;
+  template <typename T1, int T2> friend class detail::vec_arith_common;
 };
 ///////////////////////// class sycl::vec /////////////////////////
 
@@ -1398,6 +633,8 @@ template <typename T> class GetScalarOp {
 private:
   DataT m_Data;
 };
+template <typename T>
+using rel_t = detail::select_cl_scalar_integral_signed_t<T>;
 
 template <typename T> struct EqualTo {
   constexpr rel_t<T> operator()(const T &Lhs, const T &Rhs) const {
@@ -1535,13 +772,13 @@ class SwizzleOp {
   template <typename T>
   using EnableIfScalarType = typename std::enable_if_t<
       std::is_convertible_v<DataT, T> &&
-      (std::is_fundamental_v<vec_data_t<T>> ||
+      (std::is_fundamental_v<T> ||
        detail::is_half_or_bf16_v<typename std::remove_const_t<T>>)>;
 
   template <typename T>
   using EnableIfNoScalarType = typename std::enable_if_t<
       !std::is_convertible_v<DataT, T> ||
-      !(std::is_fundamental_v<vec_data_t<T>> ||
+      !(std::is_fundamental_v<T> ||
         detail::is_half_or_bf16_v<typename std::remove_const_t<T>>)>;
 
   template <int... Indices>
@@ -1661,7 +898,7 @@ class SwizzleOp {
 
   template <typename T = DataT>
   friend typename std::enable_if_t<
-      std::is_same_v<T, DataT> && std::is_integral_v<vec_data_t<T>>, vec_t>
+      std::is_same_v<T, DataT> && !detail::is_vgenfloat_v<T>, vec_t>
   operator~(const SwizzleOp &Rhs) {
     vec_t Tmp = Rhs;
     return ~Tmp;
@@ -1688,34 +925,57 @@ class SwizzleOp {
 #ifdef __SYCL_BINOP
 #error "Undefine __SYCL_BINOP macro"
 #endif
-#define __SYCL_BINOP(BINOP)                                                    \
-  friend vec_t operator BINOP(const DataT &Lhs, const SwizzleOp &Rhs) {        \
+#define __SYCL_BINOP(BINOP, COND)                                              \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_t> operator BINOP(                       \
+      const DataT & Lhs, const SwizzleOp & Rhs) {                              \
     vec_t Tmp = Rhs;                                                           \
     return Lhs BINOP Tmp;                                                      \
   }                                                                            \
-  friend vec_t operator BINOP(const SwizzleOp &Lhs, const DataT &Rhs) {        \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_t> operator BINOP(const SwizzleOp & Lhs, \
+                                                        const DataT & Rhs) {   \
     vec_t Tmp = Lhs;                                                           \
     return Tmp BINOP Rhs;                                                      \
   }                                                                            \
-  friend vec_t operator BINOP(const vec_t &Lhs, const SwizzleOp &Rhs) {        \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_t> operator BINOP(                       \
+      const vec_t & Lhs, const SwizzleOp & Rhs) {                              \
     vec_t Tmp = Rhs;                                                           \
     return Lhs BINOP Tmp;                                                      \
   }                                                                            \
-  friend vec_t operator BINOP(const SwizzleOp &Lhs, const vec_t &Rhs) {        \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_t> operator BINOP(const SwizzleOp & Lhs, \
+                                                        const vec_t & Rhs) {   \
     vec_t Tmp = Lhs;                                                           \
     return Tmp BINOP Rhs;                                                      \
   }
 
-  __SYCL_BINOP(+)
-  __SYCL_BINOP(-)
-  __SYCL_BINOP(*)
-  __SYCL_BINOP(/)
-  __SYCL_BINOP(%)
-  __SYCL_BINOP(&)
-  __SYCL_BINOP(|)
-  __SYCL_BINOP(^)
-  __SYCL_BINOP(>>)
-  __SYCL_BINOP(<<)
+  __SYCL_BINOP(+, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(-, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(*, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(/, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(%, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(&, true)
+  __SYCL_BINOP(|, true)
+  __SYCL_BINOP(^, true)
+  // We have special <<, >> operators for std::byte.
+  __SYCL_BINOP(>>, (!detail::is_byte_v<T>))
+  __SYCL_BINOP(<<, (!detail::is_byte_v<T>))
+
+  template <typename T = DataT>
+  friend std::enable_if_t<detail::is_byte_v<T>, vec_t>
+  operator>>(const SwizzleOp &Lhs, const int shift) {
+    vec_t Tmp = Lhs;
+    return Tmp >> shift;
+  }
+
+  template <typename T = DataT>
+  friend std::enable_if_t<detail::is_byte_v<T>, vec_t>
+  operator<<(const SwizzleOp &Lhs, const int shift) {
+    vec_t Tmp = Lhs;
+    return Tmp << shift;
+  }
 #undef __SYCL_BINOP
 
 // scalar RELLOGOP vec<>
@@ -1724,33 +984,40 @@ class SwizzleOp {
 #ifdef __SYCL_RELLOGOP
 #error "Undefine __SYCL_RELLOGOP macro"
 #endif
-#define __SYCL_RELLOGOP(RELLOGOP)                                              \
-  friend vec_rel_t operator RELLOGOP(const DataT &Lhs, const SwizzleOp &Rhs) { \
+#define __SYCL_RELLOGOP(RELLOGOP, COND)                                        \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_rel_t> operator RELLOGOP(                \
+      const DataT & Lhs, const SwizzleOp & Rhs) {                              \
     vec_t Tmp = Rhs;                                                           \
     return Lhs RELLOGOP Tmp;                                                   \
   }                                                                            \
-  friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const DataT &Rhs) { \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_rel_t> operator RELLOGOP(                \
+      const SwizzleOp & Lhs, const DataT & Rhs) {                              \
     vec_t Tmp = Lhs;                                                           \
     return Tmp RELLOGOP Rhs;                                                   \
   }                                                                            \
-  friend vec_rel_t operator RELLOGOP(const vec_t &Lhs, const SwizzleOp &Rhs) { \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_rel_t> operator RELLOGOP(                \
+      const vec_t & Lhs, const SwizzleOp & Rhs) {                              \
     vec_t Tmp = Rhs;                                                           \
     return Lhs RELLOGOP Tmp;                                                   \
   }                                                                            \
-  friend vec_rel_t operator RELLOGOP(const SwizzleOp &Lhs, const vec_t &Rhs) { \
+  template <typename T = DataT>                                                \
+  friend std::enable_if_t<(COND), vec_rel_t> operator RELLOGOP(                \
+      const SwizzleOp & Lhs, const vec_t & Rhs) {                              \
     vec_t Tmp = Lhs;                                                           \
     return Tmp RELLOGOP Rhs;                                                   \
   }
 
-  __SYCL_RELLOGOP(==)
-  __SYCL_RELLOGOP(!=)
-  __SYCL_RELLOGOP(>)
-  __SYCL_RELLOGOP(<)
-  __SYCL_RELLOGOP(>=)
-  __SYCL_RELLOGOP(<=)
-  // TODO: limit to integral types.
-  __SYCL_RELLOGOP(&&)
-  __SYCL_RELLOGOP(||)
+  __SYCL_RELLOGOP(==, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(!=, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(>, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(<, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(>=, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(<=, (!detail::is_byte_v<T>))
+  __SYCL_RELLOGOP(&&, (!detail::is_byte_v<T> && !detail::is_vgenfloat_v<T>))
+  __SYCL_RELLOGOP(||, (!detail::is_byte_v<T> && !detail::is_vgenfloat_v<T>))
 #undef __SYCL_RELLOGOP
 
   template <int IdxNum = getNumElements(),
@@ -1758,7 +1025,7 @@ class SwizzleOp {
   SwizzleOp &operator=(const vec<DataT, IdxNum> &Rhs) {
     std::array<int, IdxNum> Idxs{Indexes...};
     for (size_t I = 0; I < Idxs.size(); ++I) {
-      m_Vector->setValue(Idxs[I], Rhs.getValue(I));
+      (*m_Vector)[Idxs[I]] = Rhs.getValue(I);
     }
     return *this;
   }
@@ -1766,7 +1033,7 @@ class SwizzleOp {
   template <int IdxNum = getNumElements(), typename = EnableIfOneIndex<IdxNum>>
   SwizzleOp &operator=(const DataT &Rhs) {
     std::array<int, IdxNum> Idxs{Indexes...};
-    m_Vector->setValue(Idxs[0], Rhs);
+    (*m_Vector)[Idxs[0]] = Rhs;
     return *this;
   }
 
@@ -1775,7 +1042,7 @@ class SwizzleOp {
   SwizzleOp &operator=(const DataT &Rhs) {
     std::array<int, IdxNum> Idxs{Indexes...};
     for (auto Idx : Idxs) {
-      m_Vector->setValue(Idx, Rhs);
+      (*m_Vector)[Idx] = Rhs;
     }
     return *this;
   }
@@ -1783,7 +1050,7 @@ class SwizzleOp {
   template <int IdxNum = getNumElements(), typename = EnableIfOneIndex<IdxNum>>
   SwizzleOp &operator=(DataT &&Rhs) {
     std::array<int, IdxNum> Idxs{Indexes...};
-    m_Vector->setValue(Idxs[0], Rhs);
+    (*m_Vector)[Idxs[0]] = Rhs;
     return *this;
   }
 
@@ -1936,7 +1203,7 @@ class SwizzleOp {
   SwizzleOp &operator=(const SwizzleOp<T1, T2, T3, T4, T5...> &Rhs) {
     std::array<int, getNumElements()> Idxs{Indexes...};
     for (size_t I = 0; I < Idxs.size(); ++I) {
-      m_Vector->setValue(Idxs[I], Rhs.getValue(I));
+      (*m_Vector)[Idxs[I]] = Rhs.getValue(I);
     }
     return *this;
   }
@@ -1948,7 +1215,7 @@ class SwizzleOp {
   SwizzleOp &operator=(SwizzleOp<T1, T2, T3, T4, T5...> &&Rhs) {
     std::array<int, getNumElements()> Idxs{Indexes...};
     for (size_t I = 0; I < Idxs.size(); ++I) {
-      m_Vector->setValue(Idxs[I], Rhs.getValue(I));
+      (*m_Vector)[Idxs[I]] = Rhs.getValue(I);
     }
     return *this;
   }
@@ -2140,33 +1407,30 @@ class SwizzleOp {
       std::array<int, getNumElements()> Idxs{Indexes...};
       return m_Vector->getValue(Idxs[Index]);
     }
-    auto Op = OperationCurrentT<vec_data_t<CommonDataT>>();
-    return vec_data<CommonDataT>::get(
-        Op(vec_data<CommonDataT>::get(m_LeftOperation.getValue(Index)),
-           vec_data<CommonDataT>::get(m_RightOperation.getValue(Index))));
+    auto Op = OperationCurrentT<CommonDataT>();
+    return Op(m_LeftOperation.getValue(Index),
+              m_RightOperation.getValue(Index));
   }
 
   template <int IdxNum = getNumElements()>
   DataT getValue(EnableIfMultipleIndexes<IdxNum, size_t> Index) const {
     if (std::is_same_v<OperationCurrentT<DataT>, GetOp<DataT>>) {
       std::array<int, getNumElements()> Idxs{Indexes...};
-      return m_Vector->getValue(Idxs[Index]);
+      // Cast required for int8_t -> std::byte
+      return static_cast<DataT>(m_Vector->getValue(Idxs[Index]));
     }
-    auto Op = OperationCurrentT<vec_data_t<DataT>>();
-    return vec_data<DataT>::get(
-        Op(vec_data<DataT>::get(m_LeftOperation.getValue(Index)),
-           vec_data<DataT>::get(m_RightOperation.getValue(Index))));
+    auto Op = OperationCurrentT<DataT>();
+    return Op(m_LeftOperation.getValue(Index),
+              m_RightOperation.getValue(Index));
   }
 
   template <template <typename> class Operation, typename RhsOperation>
   void operatorHelper(const RhsOperation &Rhs) {
-    Operation<vec_data_t<DataT>> Op;
+    Operation<DataT> Op;
     std::array<int, getNumElements()> Idxs{Indexes...};
     for (size_t I = 0; I < Idxs.size(); ++I) {
-      DataT Res = vec_data<DataT>::get(
-          Op(vec_data<DataT>::get(m_Vector->getValue(Idxs[I])),
-             vec_data<DataT>::get(Rhs.getValue(I))));
-      m_Vector->setValue(Idxs[I], Res);
+      DataT Res = Op(m_Vector->getValue(Idxs[I]), Rhs.getValue(I));
+      (*m_Vector)[Idxs[I]] = Res;
     }
   }
 
diff --git a/sycl/include/syclcompat/device.hpp b/sycl/include/syclcompat/device.hpp
index 399efbd8b8933..ed16a9b32bfa4 100644
--- a/sycl/include/syclcompat/device.hpp
+++ b/sycl/include/syclcompat/device.hpp
@@ -195,7 +195,9 @@ class device_info {
   }
   int get_image1d_max() const { return _image1d_max; }
   auto get_image2d_max() const { return _image2d_max; }
+  auto get_image2d_max() { return _image2d_max; }
   auto get_image3d_max() const { return _image3d_max; }
+  auto get_image3d_max() { return _image3d_max; }
 
   // set interface
   void set_name(const char *name) {
@@ -664,8 +666,9 @@ Use 64 bits as memory_bus_width default value."
     std::lock_guard<std::mutex> lock(m_mutex);
     _events.push_back(event);
   }
-  friend sycl::event free_async(const std::vector<void *> &,
-                                const std::vector<sycl::event> &, sycl::queue);
+  friend sycl::event enqueue_free(const std::vector<void *> &,
+                                  const std::vector<sycl::event> &,
+                                  sycl::queue);
   queue_ptr _default_queue;
   queue_ptr _saved_queue;
   sycl::context _ctx;
@@ -726,14 +729,64 @@ class dev_mgr {
   unsigned int device_count() { return _devs.size(); }
 
   unsigned int get_device_id(const sycl::device &dev) {
+    if (!_devs.size()) {
+      throw std::runtime_error(
+          "[SYCLcompat] No SYCL devices found in the device list. Device list "
+          "may have been filtered by syclcompat::filter_device");
+    }
     unsigned int id = 0;
     for (auto dev_item : _devs) {
       if (*dev_item == dev) {
-        break;
+        return id;
       }
       id++;
     }
-    return id;
+    throw std::runtime_error("[SYCLcompat] The device[" +
+                             dev.get_info<sycl::info::device::name>() +
+                             "] is filtered out by syclcompat::filter_device "
+                             "in current device list!");
+  }
+
+  /// List all the devices with their ids in dev_mgr.
+  void list_devices() const {
+    for (size_t i = 0; i < _devs.size(); ++i) {
+      std::cout << "Device " << i << ": "
+                << _devs[i]->get_info<sycl::info::device::name>() << std::endl;
+    }
+  }
+
+  /// Filter out devices; only keep the devices whose name contains one of the
+  /// substrings in \p dev_subnames.
+  /// May break the device id mapping and change the current device, so it is
+  /// best called before any other SYCLcompat/SYCL APIs.
+  void filter(const std::vector<std::string> &dev_subnames) {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    auto iter = _devs.begin();
+    while (iter != _devs.end()) {
+      std::string dev_name = (*iter)->get_info<sycl::info::device::name>();
+      bool matched = false;
+      for (const auto &name : dev_subnames) {
+        if (dev_name.find(name) != std::string::npos) {
+          matched = true;
+          break;
+        }
+      }
+      if (matched)
+        ++iter;
+      else
+        iter = _devs.erase(iter);
+    }
+    _cpu_device = -1;
+    for (unsigned i = 0; i < _devs.size(); ++i) {
+      if (_devs[i]->is_cpu()) {
+        _cpu_device = i;
+        break;
+      }
+    }
+    _thread2dev_map.clear();
+#ifdef SYCLCOMPAT_VERBOSE
+    list_devices();
+#endif
   }
 
   /// Select device with a Device Selector
@@ -779,6 +832,9 @@ class dev_mgr {
         _cpu_device = _devs.size() - 1;
       }
     }
+#ifdef SYCLCOMPAT_VERBOSE
+    list_devices();
+#endif
   }
   void check_id(unsigned int id) const {
     if (id >= _devs.size()) {
@@ -853,6 +909,19 @@ static inline device_ext &cpu_device() {
   return detail::dev_mgr::instance().cpu_device();
 }
 
+/// Filter out devices; only keep the devices whose name contains one of the
+/// substrings in \p dev_subnames.
+/// May break the device id mapping and change the current device, so it is
+/// best called before any other SYCLcompat or SYCL APIs.
+static inline void filter_device(const std::vector<std::string> &dev_subnames) {
+  detail::dev_mgr::instance().filter(dev_subnames);
+}
+
+/// List all the devices with their ids in dev_mgr.
+static inline void list_devices() {
+  detail::dev_mgr::instance().list_devices();
+}
+
 static inline unsigned int select_device(unsigned int id) {
   detail::dev_mgr::instance().select_device(id);
   return id;
@@ -869,4 +938,7 @@ static inline unsigned int get_device_id(const sycl::device &dev) {
   return detail::dev_mgr::instance().get_device_id(dev);
 }
 
+static inline unsigned int device_count() {
+  return detail::dev_mgr::instance().device_count();
+}
 } // namespace syclcompat
diff --git a/sycl/include/syclcompat/math.hpp b/sycl/include/syclcompat/math.hpp
index 5b742573b8db0..2f9ac8c0f1b4c 100644
--- a/sycl/include/syclcompat/math.hpp
+++ b/sycl/include/syclcompat/math.hpp
@@ -51,6 +51,10 @@ inline ValueT clamp(ValueT val, ValueT min_val, ValueT max_val) {
   return sycl::clamp(val, min_val, max_val);
 }
 
+template <typename T>
+constexpr bool is_int32_type = std::is_same_v<std::decay_t<T>, int32_t> ||
+                               std::is_same_v<std::decay_t<T>, uint32_t>;
+
 #ifdef SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS
 // TODO: Follow the process to add this to the extension. If added,
 // remove this functionality from the header.
@@ -79,20 +83,43 @@ class vectorized_binary {
 };
 
 /// Extend the 'val' to 'bit' size, zero extend for unsigned int and signed
-/// extend for signed int.
+/// extend for signed int. Returns a signed integer type.
 template <typename ValueT>
-inline int64_t zero_or_signed_extent(ValueT val, unsigned bit) {
-  if constexpr (std::is_signed_v<ValueT>) {
-    return int64_t(val) << (64 - bit) >> (64 - bit);
+inline auto zero_or_signed_extend(ValueT val, unsigned bit) {
+  static_assert(std::is_integral_v<ValueT>);
+  if constexpr (sizeof(ValueT) == 4) {
+    assert(bit < 64 &&
+           "When extending int32 value, bit must be smaller than 64.");
+    if constexpr (std::is_signed_v<ValueT>)
+      return int64_t(val) << (64 - bit) >> (64 - bit);
+    else
+      return int64_t(val);
+  } else if constexpr (sizeof(ValueT) == 2) {
+    assert(bit < 32 &&
+           "When extending int16 value, bit must be smaller than 32.");
+    if constexpr (std::is_signed_v<ValueT>)
+      return int32_t(val) << (32 - bit) >> (32 - bit);
+    else
+      return int32_t(val);
+  } else if constexpr (sizeof(ValueT) == 1) {
+    assert(bit < 16 &&
+           "When extending int8 value, bit must be smaller than 16.");
+    if constexpr (std::is_signed_v<ValueT>)
+      return int16_t(val) << (16 - bit) >> (16 - bit);
+    else
+      return int16_t(val);
+  } else {
+    static_assert(sizeof(ValueT) == 8);
+    assert(bit < 64 && "Cannot extend int64 value.");
+    return static_cast<int64_t>(val);
   }
-  return val;
 }
 
 template <typename RetT, bool needSat, typename AT, typename BT,
           typename BinaryOperation>
 inline constexpr RetT extend_binary(AT a, BT b, BinaryOperation binary_op) {
-  const int64_t extend_a = zero_or_signed_extent(a, 33);
-  const int64_t extend_b = zero_or_signed_extent(b, 33);
+  const int64_t extend_a = zero_or_signed_extend(a, 33);
+  const int64_t extend_b = zero_or_signed_extend(b, 33);
   const int64_t ret = binary_op(extend_a, extend_b);
   if constexpr (needSat)
     return detail::clamp<int64_t>(ret, std::numeric_limits<RetT>::min(),
@@ -105,18 +132,91 @@ template <typename RetT, bool needSat, typename AT, typename BT, typename CT,
 inline constexpr RetT extend_binary(AT a, BT b, CT c,
                                     BinaryOperation1 binary_op,
                                     BinaryOperation2 second_op) {
-  const int64_t extend_a = zero_or_signed_extent(a, 33);
-  const int64_t extend_b = zero_or_signed_extent(b, 33);
+  const int64_t extend_a = zero_or_signed_extend(a, 33);
+  const int64_t extend_b = zero_or_signed_extend(b, 33);
   int64_t extend_temp =
-      zero_or_signed_extent(binary_op(extend_a, extend_b), 34);
+      zero_or_signed_extend(binary_op(extend_a, extend_b), 34);
   if constexpr (needSat)
     extend_temp =
         detail::clamp<int64_t>(extend_temp, std::numeric_limits<RetT>::min(),
                                std::numeric_limits<RetT>::max());
-  const int64_t extend_c = zero_or_signed_extent(c, 33);
+  const int64_t extend_c = zero_or_signed_extend(c, 33);
   return second_op(extend_temp, extend_c);
 }
 
+template <typename T> sycl::vec<int32_t, 2> extract_and_extend2(T a) {
+  sycl::vec<int32_t, 2> ret;
+  sycl::vec<T, 1> va{a};
+  using IntT = std::conditional_t<std::is_signed_v<T>, int16_t, uint16_t>;
+  auto v = va.template as<sycl::vec<IntT, 2>>();
+  ret[0] = zero_or_signed_extend(v[0], 17);
+  ret[1] = zero_or_signed_extend(v[1], 17);
+  return ret;
+}
+
+template <typename T> sycl::vec<int16_t, 4> extract_and_extend4(T a) {
+  sycl::vec<int16_t, 4> ret;
+  sycl::vec<T, 1> va{a};
+  using IntT = std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>;
+  auto v = va.template as<sycl::vec<IntT, 4>>();
+  ret[0] = zero_or_signed_extend(v[0], 9);
+  ret[1] = zero_or_signed_extend(v[1], 9);
+  ret[2] = zero_or_signed_extend(v[2], 9);
+  ret[3] = zero_or_signed_extend(v[3], 9);
+  return ret;
+}
+
+template <typename RetT, bool NeedSat, bool NeedAdd, typename AT, typename BT,
+          typename BinaryOperation>
+inline constexpr RetT extend_vbinary2(AT a, BT b, RetT c,
+                                      BinaryOperation binary_op) {
+  static_assert(is_int32_type<AT> && is_int32_type<BT> && is_int32_type<RetT>);
+  sycl::vec<int32_t, 2> extend_a = extract_and_extend2(a);
+  sycl::vec<int32_t, 2> extend_b = extract_and_extend2(b);
+  sycl::vec<int32_t, 2> temp{binary_op(extend_a[0], extend_b[0]),
+                             binary_op(extend_a[1], extend_b[1])};
+  using IntT = std::conditional_t<std::is_signed_v<RetT>, int16_t, uint16_t>;
+
+  if constexpr (NeedSat) {
+    int32_t min_val = 0, max_val = 0;
+    min_val = std::numeric_limits<IntT>::min();
+    max_val = std::numeric_limits<IntT>::max();
+    temp = detail::clamp(temp, sycl::vec<int32_t, 2>(min_val),
+                         sycl::vec<int32_t, 2>(max_val));
+  }
+  if constexpr (NeedAdd) {
+    return temp[0] + temp[1] + c;
+  }
+  return sycl::vec<IntT, 2>{temp[0], temp[1]}.template as<sycl::vec<RetT, 1>>();
+}
+
+template <typename RetT, bool NeedSat, bool NeedAdd, typename AT, typename BT,
+          typename BinaryOperation>
+inline constexpr RetT extend_vbinary4(AT a, BT b, RetT c,
+                                      BinaryOperation binary_op) {
+  static_assert(is_int32_type<AT> && is_int32_type<BT> && is_int32_type<RetT>);
+  sycl::vec<int16_t, 4> extend_a = extract_and_extend4(a);
+  sycl::vec<int16_t, 4> extend_b = extract_and_extend4(b);
+  sycl::vec<int16_t, 4> temp{
+      binary_op(extend_a[0], extend_b[0]), binary_op(extend_a[1], extend_b[1]),
+      binary_op(extend_a[2], extend_b[2]), binary_op(extend_a[3], extend_b[3])};
+  using IntT = std::conditional_t<std::is_signed_v<RetT>, int8_t, uint8_t>;
+
+  if constexpr (NeedSat) {
+    int16_t min_val = 0, max_val = 0;
+    min_val = std::numeric_limits<IntT>::min();
+    max_val = std::numeric_limits<IntT>::max();
+    temp = detail::clamp(temp, sycl::vec<int16_t, 4>(min_val),
+                         sycl::vec<int16_t, 4>(max_val));
+  }
+  if constexpr (NeedAdd) {
+    return temp[0] + temp[1] + temp[2] + temp[3] + c;
+  }
+
+  return sycl::vec<IntT, 4>{temp[0], temp[1], temp[2], temp[3]}
+      .template as<sycl::vec<RetT, 1>>();
+}
+
 template <typename ValueT> inline bool isnan(const ValueT a) {
   return sycl::isnan(a);
 }
@@ -126,8 +226,160 @@ inline bool isnan(const sycl::ext::oneapi::bfloat16 a) {
 }
 #endif
 
+// FIXME(syclcompat-lib-reviewers): move bfe outside detail once perf is
+// improved & semantics understood
+/// Bitfield-extract.
+///
+/// \tparam T The type of \param source value, must be an unsigned integer.
+/// \param source The source value to extract from.
+/// \param bit_start The bit position at which extraction starts.
+/// \param num_bits The number of bits to extract.
+template <typename T>
+inline T bfe(const T source, const uint32_t bit_start,
+             const uint32_t num_bits) {
+  static_assert(std::is_unsigned_v<T>);
+  // FIXME(syclcompat-lib-reviewers): This ternary was added to catch a case
+  // which may be undefined anyway. Consider that we are losing perf here.
+  const T mask =
+      num_bits >= CHAR_BIT * sizeof(T) ? T{-1} : ((T{1} << num_bits) - 1);
+  return (source >> bit_start) & mask;
+}
+
+} // namespace detail
+
+/// Bitfield-extract with boundary checking.
+///
+/// Extract bit field from \param source and return the zero or sign-extended
+/// result. Source \param bit_start gives the bit field starting bit position,
+/// and source \param num_bits gives the bit field length in bits.
+///
+/// The result is padded with the sign bit of the extracted field. If `num_bits`
+/// is zero, the result is zero. If the start position is beyond the msb of the
+/// input, the result is filled with the replicated sign bit of the extracted
+/// field.
+///
+/// \tparam T The type of \param source value, must be an integer.
+/// \param source The source value to extract from.
+/// \param bit_start The bit position at which extraction starts.
+/// \param num_bits The number of bits to extract.
+template <typename T>
+inline T bfe_safe(const T source, const uint32_t bit_start,
+                  const uint32_t num_bits) {
+  static_assert(std::is_integral_v<T>);
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  if constexpr (std::is_same_v<T, int8_t> || std::is_same_v<T, int16_t> ||
+                std::is_same_v<T, int32_t>) {
+    int32_t res{};
+    asm volatile("bfe.s32 %0, %1, %2, %3;"
+                 : "=r"(res)
+                 : "r"((int32_t)source), "r"(bit_start), "r"(num_bits));
+    return res;
+  } else if constexpr (std::is_same_v<T, uint8_t> ||
+                       std::is_same_v<T, uint16_t> ||
+                       std::is_same_v<T, uint32_t>) {
+    uint32_t res{};
+    asm volatile("bfe.u32 %0, %1, %2, %3;"
+                 : "=r"(res)
+                 : "r"((uint32_t)source), "r"(bit_start), "r"(num_bits));
+    return res;
+  } else if constexpr (std::is_same_v<T, int64_t>) {
+    T res{};
+    asm volatile("bfe.s64 %0, %1, %2, %3;"
+                 : "=l"(res)
+                 : "l"(source), "r"(bit_start), "r"(num_bits));
+    return res;
+  } else if constexpr (std::is_same_v<T, uint64_t>) {
+    T res{};
+    asm volatile("bfe.u64 %0, %1, %2, %3;"
+                 : "=l"(res)
+                 : "l"(source), "r"(bit_start), "r"(num_bits));
+    return res;
+  }
+#endif
+  const uint32_t bit_width = CHAR_BIT * sizeof(T);
+  const uint32_t pos = std::min(bit_start, bit_width);
+  const uint32_t len = std::min(pos + num_bits, bit_width) - pos;
+  if constexpr (std::is_signed_v<T>) {
+    // FIXME(syclcompat-lib-reviewers): As above, catching a case whose result
+    // is undefined and likely losing perf.
+    const T mask = len >= bit_width ? T{-1} : static_cast<T>((T{1} << len) - 1);
+
+    // Find the sign-bit, the result is padded with the sign bit of the
+    // extracted field.
+    // Note if requested num_bits==0, we return zero via sign_bit=0
+    const uint32_t sign_bit_pos = std::min(pos + len - 1, bit_width - 1);
+    const T sign_bit = num_bits != 0 && ((source >> sign_bit_pos) & 1);
+    const T sign_bit_padding = (-sign_bit & ~mask);
+    return ((source >> pos) & mask) | sign_bit_padding;
+  } else {
+    return syclcompat::detail::bfe(source, pos, len);
+  }
+}
+
+namespace detail {
+// FIXME(syclcompat-lib-reviewers): move bfi outside detail once perf is
+// improved & semantics understood
+/// Bitfield-insert.
+///
+/// \tparam T The type of \param x and \param y , must be an unsigned integer.
+/// \param x The source of the bitfield.
+/// \param y The value into which the bitfield is inserted.
+/// \param bit_start The bit position at which insertion starts.
+/// \param num_bits The number of bits to insert.
+template <typename T>
+inline T bfi(const T x, const T y, const uint32_t bit_start,
+             const uint32_t num_bits) {
+  static_assert(std::is_unsigned_v<T>);
+  constexpr unsigned bit_width = CHAR_BIT * sizeof(T);
+
+  // If bit_start > bit_width || num_bits == 0, the result is y unchanged.
+  const T ignore_bfi = static_cast<T>(bit_start > bit_width || num_bits == 0);
+  T extract_bitfield_mask = (static_cast<T>(~T{0}) >> (bit_width - num_bits))
+                            << bit_start;
+  T clean_bitfield_mask = ~extract_bitfield_mask;
+  return (y & (-ignore_bfi | clean_bitfield_mask)) |
+         (~-ignore_bfi & ((x << bit_start) & extract_bitfield_mask));
+}
 } // namespace detail
 
+/// Bitfield-insert with boundary checking.
+///
+/// Align and insert a bit field from \param x into \param y . Source \param
+/// bit_start gives the starting bit position for the insertion, and source
+/// \param num_bits gives the bit field length in bits.
+///
+/// \tparam T The type of \param x and \param y , must be an unsigned integer.
+/// \param x The source of the bitfield.
+/// \param y The value into which the bitfield is inserted.
+/// \param bit_start The bit position at which insertion starts.
+/// \param num_bits The number of bits to insert.
+template <typename T>
+inline T bfi_safe(const T x, const T y, const uint32_t bit_start,
+                  const uint32_t num_bits) {
+  static_assert(std::is_unsigned_v<T>);
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__)
+  if constexpr (std::is_same_v<T, uint8_t> || std::is_same_v<T, uint16_t> ||
+                std::is_same_v<T, uint32_t>) {
+    uint32_t res{};
+    asm volatile("bfi.b32 %0, %1, %2, %3, %4;"
+                 : "=r"(res)
+                 : "r"((uint32_t)x), "r"((uint32_t)y), "r"(bit_start),
+                   "r"(num_bits));
+    return res;
+  } else if constexpr (std::is_same_v<T, uint64_t>) {
+    uint64_t res{};
+    asm volatile("bfi.b64 %0, %1, %2, %3, %4;"
+                 : "=l"(res)
+                 : "l"(x), "l"(y), "r"(bit_start), "r"(num_bits));
+    return res;
+  }
+#endif
+  constexpr unsigned bit_width = CHAR_BIT * sizeof(T);
+  const uint32_t pos = std::min(bit_start, bit_width);
+  const uint32_t len = std::min(pos + num_bits, bit_width) - pos;
+  return syclcompat::detail::bfi(x, y, pos, len);
+}
+
 /// Emulated function for __funnelshift_l
 inline unsigned int funnelshift_l(unsigned int low, unsigned int high,
                                   unsigned int shift) {
@@ -712,6 +964,13 @@ struct shift_right {
     return x >> offset;
   }
 };
+
+struct average {
+  template <typename T> auto operator()(const T x, const T y) const {
+    return (x + y + (x + y >= 0)) >> 1;
+  }
+};
+
 } // namespace detail
 
 /// Compute vectorized binary operation value for two values, with each value
@@ -733,6 +992,116 @@ inline unsigned vectorized_binary(unsigned a, unsigned b,
   return v0;
 }
 
+template <typename T1, typename T2>
+using dot_product_acc_t =
+    std::conditional_t<std::is_unsigned_v<T1> && std::is_unsigned_v<T2>,
+                       uint32_t, int32_t>;
+
+namespace detail {
+
+template <typename T> sycl::vec<T, 4> extract_and_sign_or_zero_extend4(T val) {
+  return sycl::vec<T, 1>(val)
+      .template as<sycl::vec<
+          std::conditional_t<std::is_signed_v<T>, int8_t, uint8_t>, 4>>()
+      .template convert<T>();
+}
+
+template <typename T> sycl::vec<T, 2> extract_and_sign_or_zero_extend2(T val) {
+  return sycl::vec<T, 1>(val)
+      .template as<sycl::vec<
+          std::conditional_t<std::is_signed_v<T>, int16_t, uint16_t>, 2>>()
+      .template convert<T>();
+}
+
+} // namespace detail
+
+/// Two-way dot product-accumulate. Calculate and return integer_vector2(
+/// \param a) dot product integer_vector2(low16_bit( \param b)) + \param c
+///
+/// \tparam [in] T1 The type of first value.
+/// \tparam [in] T2 The type of second value.
+/// \param [in] a The first value.
+/// \param [in] b The second value.
+/// \param [in] c The third value. It has type uint32_t if both T1 and T2 are
+/// uint32_t, otherwise int32_t.
+/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit
+/// result.
+template <typename T1, typename T2>
+inline dot_product_acc_t<T1, T2> dp2a_lo(T1 a, T2 b,
+                                         dot_product_acc_t<T1, T2> c) {
+  static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
+                "[SYCLcompat] dp2a_lo expects 32-bit integers as operands.");
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
+    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
+  return __dp2a_lo(a, b, c);
+#else
+  dot_product_acc_t<T1, T2> res = c;
+  auto va = detail::extract_and_sign_or_zero_extend2(a);
+  auto vb = detail::extract_and_sign_or_zero_extend4(b);
+  res += va[0] * vb[0];
+  res += va[1] * vb[1];
+  return res;
+#endif
+}
+
+/// Two-way dot product-accumulate. Calculate and return integer_vector2(
+/// \param a) dot product integer_vector2(high_16bit( \param b)) + \param c
+///
+/// \tparam [in] T1 The type of first value.
+/// \tparam [in] T2 The type of second value.
+/// \param [in] a The first value.
+/// \param [in] b The second value.
+/// \param [in] c The third value. It has type uint32_t if both T1 and T2 are
+/// uint32_t, otherwise int32_t.
+/// \return Two-way 16-bit to 8-bit dot product which is accumulated in 32-bit
+/// result.
+template <typename T1, typename T2>
+inline dot_product_acc_t<T1, T2> dp2a_hi(T1 a, T2 b,
+                                         dot_product_acc_t<T1, T2> c) {
+  static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
+                "[SYCLcompat] dp2a_hi expects 32-bit integers as operands.");
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
+    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
+  return __dp2a_hi(a, b, c);
+#else
+  dot_product_acc_t<T1, T2> res = c;
+  auto va = detail::extract_and_sign_or_zero_extend2(a);
+  auto vb = detail::extract_and_sign_or_zero_extend4(b);
+  res += va[0] * vb[2];
+  res += va[1] * vb[3];
+  return res;
+#endif
+}
+
+/// Four-way byte dot product-accumulate. Calculate and return integer_vector4(
+/// \param a) dot product integer_vector4( \param b)  + \param c
+///
+/// \tparam [in] T1 The type of first value.
+/// \tparam [in] T2 The type of second value.
+/// \param [in] a The first value.
+/// \param [in] b The second value.
+/// \param [in] c The third value. It has type uint32_t if both T1 and T2 are
+/// uint32_t, otherwise int32_t.
+/// \return Four-way byte dot product which is accumulated in 32-bit result.
+template <typename T1, typename T2>
+inline dot_product_acc_t<T1, T2> dp4a(T1 a, T2 b, dot_product_acc_t<T1, T2> c) {
+  static_assert(detail::is_int32_type<T1> && detail::is_int32_type<T2>,
+                "[SYCLcompat] dp4a expects 32-bit integers as operands.");
+#if defined(__SYCL_DEVICE_ONLY__) && defined(__NVPTX__) &&                     \
+    defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 610
+  return __dp4a(a, b, c);
+#else
+  dot_product_acc_t<T1, T2> res = c;
+  auto va = detail::extract_and_sign_or_zero_extend4(a);
+  auto vb = detail::extract_and_sign_or_zero_extend4(b);
+  res += va[0] * vb[0];
+  res += va[1] * vb[1];
+  res += va[2] * vb[2];
+  res += va[3] * vb[3];
+  return res;
+#endif
+}
+
 /// Extend \p a and \p b to 33 bit and add them.
 /// \tparam [in] RetT The type of the return value
 /// \tparam [in] AT The type of the first value
@@ -1222,4 +1591,600 @@ inline constexpr RetT extend_shr_sat_wrap(T a, uint32_t b, uint32_t c,
                                            detail::shift_right(), second_op);
 }
 
+/// Compute vectorized addition of \p a and \p b, with each value treated as a
+/// 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized addition of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd2(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, false>(a, b, c, std::plus());
+}
+
+/// Compute vectorized addition of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized addition of the two
+/// values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd2_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, true>(a, b, c, std::plus());
+}
+
+/// Compute vectorized addition of \p a and \p b with saturation, with each
+/// value treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized addition of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, true, false>(a, b, c, std::plus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized subtraction of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub2(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, false>(a, b, c, std::minus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 2 elements vector type and extend each element to 17 bit. Then add each
+/// half of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized subtraction of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub2_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, true>(a, b, c, std::minus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b with saturation, with each
+/// value treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized subtraction of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, true, false>(a, b, c, std::minus());
+}
+
+/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized abs_diff of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff2(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, false>(a, b, c, abs_diff());
+}
+
+/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of extend vectorized abs_diff of the
+/// two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff2_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, true>(a, b, c, abs_diff());
+}
+
+/// Compute vectorized abs_diff of \p a and \p b with saturation, with each
+/// value treated as a 2 elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized abs_diff of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, true, false>(a, b, c, abs_diff());
+}
+
+/// Compute vectorized minimum of \p a and \p b, with each value treated as a 2
+/// elements vector type and extend each element to 17 bit.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extend vectorized minimum of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin2(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, false>(a, b, c, minimum());
+}
+
+/// Compute vectorized minimum of \p a and \p b, with each value treated as a
+/// 2-element vector type and extend each element to 17 bits. Then add each
+/// half of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of the extended vectorized minimum of
+/// the two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin2_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, true>(a, b, c, minimum());
+}
+
+/// Compute vectorized minimum of \p a and \p b with saturation, with each value
+/// treated as a 2-element vector type and extend each element to 17 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized minimum of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, true, false>(a, b, c, minimum());
+}
+
+/// Compute vectorized maximum of \p a and \p b, with each value treated as a
+/// 2-element vector type and extend each element to 17 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized maximum of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax2(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, false>(a, b, c, maximum());
+}
+
+/// Compute vectorized maximum of \p a and \p b, with each value treated as a
+/// 2-element vector type and extend each element to 17 bits. Then add each
+/// half of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of the extended vectorized maximum of
+/// the two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax2_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, true>(a, b, c, maximum());
+}
+
+/// Compute vectorized maximum of \p a and \p b with saturation, with each value
+/// treated as a 2-element vector type and extend each element to 17 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized maximum of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, true, false>(a, b, c, maximum());
+}
+
+/// Compute vectorized average of \p a and \p b, with each value treated as a
+/// 2-element vector type and extend each element to 17 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized average of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vavrg2(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, false>(a, b, c,
+                                                     detail::average());
+}
+
+/// Compute vectorized average of \p a and \p b, with each value treated as a
+/// 2-element vector type and extend each element to 17 bits. Then add each
+/// half of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of the extended vectorized average of
+/// the two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vavrg2_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, false, true>(a, b, c, detail::average());
+}
+
+/// Compute vectorized average of \p a and \p b with saturation, with each value
+/// treated as a 2-element vector type and extend each element to 17 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized average of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vavrg2_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary2<RetT, true, false>(a, b, c, detail::average());
+}
+
+/// Treat \p a and \p b as 2-element vectors, extend each element to 17 bits,
+/// and compare the elements using the specified comparison \p cmp .
+///
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the compare operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] cmp The comparison operator
+/// \returns The comparison result of the two extended values.
+template <typename AT, typename BT, typename BinaryOperation>
+inline constexpr unsigned extend_vcompare2(AT a, BT b, BinaryOperation cmp) {
+  return detail::extend_vbinary2<unsigned, false, false>(a, b, 0, cmp);
+}
+
+/// Treat the inputs as 2-element vectors, extend each element to 17 bits,
+/// compare them using comparison \p cmp , then add the result with \p c .
+///
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the compare operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] cmp The comparison operator
+/// \returns The comparison result of the two extended values, and add the
+/// result with \p c .
+template <typename AT, typename BT, typename BinaryOperation>
+inline constexpr unsigned extend_vcompare2_add(AT a, BT b, unsigned c,
+                                               BinaryOperation cmp) {
+  return detail::extend_vbinary2<unsigned, false, true>(a, b, c, cmp);
+}
+
+/// Compute vectorized addition of \p a and \p b, with each value treated as a
+/// 4-element vector type and extend each element to 9 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized addition of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd4(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, false>(a, b, c, std::plus());
+}
+
+/// Compute vectorized addition of \p a and \p b, with each value treated as a
+/// 4-element vector type and extend each element to 9 bits. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of the extended vectorized addition of
+/// the two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd4_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, true>(a, b, c, std::plus());
+}
+
+/// Compute vectorized addition of \p a and \p b with saturation, with each
+/// value treated as a 4-element vector type and extend each element to 9 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized addition of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vadd4_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, true, false>(a, b, c, std::plus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 4-element vector type and extend each element to 9 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized subtraction of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub4(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, false>(a, b, c, std::minus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b, with each value treated as
+/// a 4-element vector type and extend each element to 9 bits. Then add each
+/// half of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of the extended vectorized subtraction
+/// of the two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub4_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, true>(a, b, c, std::minus());
+}
+
+/// Compute vectorized subtraction of \p a and \p b with saturation, with each
+/// value treated as a 4-element vector type and extend each element to 9 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The saturated extended vectorized subtraction of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vsub4_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, true, false>(a, b, c, std::minus());
+}
+
+/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a
+/// 4-element vector type and extend each element to 9 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized abs_diff of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff4(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, false>(a, b, c, abs_diff());
+}
+
+/// Compute vectorized abs_diff of \p a and \p b, with each value treated as a
+/// 4-element vector type and extend each element to 9 bits. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of the extended vectorized abs_diff of
+/// the two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff4_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, true>(a, b, c, abs_diff());
+}
+
+/// Compute vectorized abs_diff of \p a and \p b with saturation, with each
+/// value treated as a 4-element vector type and extend each element to 9 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized abs_diff of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vabsdiff4_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, true, false>(a, b, c, abs_diff());
+}
+
+/// Compute vectorized minimum of \p a and \p b, with each value treated as a
+/// 4-element vector type and extend each element to 9 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized minimum of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin4(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, false>(a, b, c, minimum());
+}
+
+/// Compute vectorized minimum of \p a and \p b, with each value treated as a
+/// 4-element vector type and extend each element to 9 bits. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of the extended vectorized minimum of
+/// the two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin4_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, true>(a, b, c, minimum());
+}
+
+/// Compute vectorized minimum of \p a and \p b with saturation, with each value
+/// treated as a 4-element vector type and extend each element to 9 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized minimum of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmin4_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, true, false>(a, b, c, minimum());
+}
+
+/// Compute vectorized maximum of \p a and \p b, with each value treated as a
+/// 4-element vector type and extend each element to 9 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized maximum of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax4(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, false>(a, b, c, maximum());
+}
+
+/// Compute vectorized maximum of \p a and \p b, with each value treated as a
+/// 4-element vector type and extend each element to 9 bits. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of the extended vectorized maximum of
+/// the two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax4_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, true>(a, b, c, maximum());
+}
+
+/// Compute vectorized maximum of \p a and \p b with saturation, with each value
+/// treated as a 4-element vector type and extend each element to 9 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized maximum of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vmax4_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, true, false>(a, b, c, maximum());
+}
+
+/// Compute vectorized average of \p a and \p b, with each value treated as a
+/// 4-element vector type and extend each element to 9 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized average of the two values
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vavrg4(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, false>(a, b, c,
+                                                     detail::average());
+}
+
+/// Compute vectorized average of \p a and \p b, with each value treated as a
+/// 4-element vector type and extend each element to 9 bits. Then add each half
+/// of the result and add with \p c.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The addition of each half of the extended vectorized average of
+/// the two values and the third value
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vavrg4_add(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, false, true>(a, b, c, detail::average());
+}
+
+/// Compute vectorized average of \p a and \p b with saturation, with each value
+/// treated as a 4-element vector type and extend each element to 9 bits.
+/// \tparam [in] RetT The type of the return value, can only be 32 bit integer
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \returns The extended vectorized average of the two values with saturation
+template <typename RetT, typename AT, typename BT>
+inline constexpr RetT extend_vavrg4_sat(AT a, BT b, RetT c) {
+  return detail::extend_vbinary4<RetT, true, false>(a, b, c, detail::average());
+}
+
+/// Treat \p a and \p b as 4-element vectors, extend each element to 9 bits,
+/// and compare the elements using the specified comparison \p cmp .
+///
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the compare operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] cmp The comparison operator
+/// \returns The comparison result of the two extended values.
+template <typename AT, typename BT, typename BinaryOperation>
+inline constexpr unsigned extend_vcompare4(AT a, BT b, BinaryOperation cmp) {
+  return detail::extend_vbinary4<unsigned, false, false>(a, b, 0, cmp);
+}
+
+/// Treat the inputs as 4-element vectors, extend each element to 9 bits,
+/// compare them using comparison \p cmp , then add the result with \p c .
+///
+/// \tparam [in] AT The type of the first value, can only be 32 bit integer
+/// \tparam [in] BT The type of the second value, can only be 32 bit integer
+/// \tparam [in] BinaryOperation The type of the compare operation
+/// \param [in] a The first value
+/// \param [in] b The second value
+/// \param [in] c The third value
+/// \param [in] cmp The comparison operator
+/// \returns The comparison result of the two extended values, and add the
+/// result with \p c .
+template <typename AT, typename BT, typename BinaryOperation>
+inline constexpr unsigned extend_vcompare4_add(AT a, BT b, unsigned c,
+                                               BinaryOperation cmp) {
+  return detail::extend_vbinary4<unsigned, false, true>(a, b, c, cmp);
+}
+
 } // namespace syclcompat
diff --git a/sycl/include/syclcompat/memory.hpp b/sycl/include/syclcompat/memory.hpp
index 5b578825b02ba..ad33ce9a9bdf8 100644
--- a/sycl/include/syclcompat/memory.hpp
+++ b/sycl/include/syclcompat/memory.hpp
@@ -77,7 +77,7 @@ template <typename AllocT> auto *local_mem() {
   return As;
 }
 
-namespace detail {
+namespace experimental {
 enum memcpy_direction {
   host_to_host,
   host_to_device,
@@ -85,7 +85,7 @@ enum memcpy_direction {
   device_to_device,
   automatic
 };
-} // namespace detail
+} // namespace experimental
 
 enum class memory_region {
   global = 0, // device global memory
@@ -122,6 +122,42 @@ class pitched_data {
   size_t _pitch, _x, _y;
 };
 
+namespace experimental {
+#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
+class image_mem_wrapper;
+namespace detail {
+static sycl::event memcpy(const image_mem_wrapper *src,
+                          const sycl::id<3> &src_id, pitched_data &dest,
+                          const sycl::id<3> &dest_id,
+                          const sycl::range<3> &copy_extend, sycl::queue q);
+static sycl::event memcpy(const pitched_data src, const sycl::id<3> &src_id,
+                          image_mem_wrapper *dest, const sycl::id<3> &dest_id,
+                          const sycl::range<3> &copy_extend, sycl::queue q);
+} // namespace detail
+#endif
+class image_matrix;
+namespace detail {
+static pitched_data to_pitched_data(image_matrix *image);
+} // namespace detail
+
+/// Memory copy parameters for 2D/3D memory data.
+struct memcpy_parameter {
+  struct data_wrapper {
+    pitched_data pitched{}; // pitched memory endpoint
+    sycl::id<3> pos{};      // position within the endpoint
+#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
+    experimental::image_mem_wrapper *image_bindless{nullptr};
+#endif
+    image_matrix *image{nullptr};
+  };
+  data_wrapper from{};   // copy source
+  data_wrapper to{};     // copy destination
+  sycl::range<3> size{}; // extent of the region to copy
+  syclcompat::experimental::memcpy_direction direction{
+      syclcompat::experimental::memcpy_direction::automatic};
+};
+} // namespace experimental
+
 namespace detail {
 
 template <class T, memory_region Memory, size_t Dimension> class accessor;
@@ -263,21 +299,16 @@ static pointer_access_attribute get_pointer_attribute(sycl::queue q,
   }
 }
 
-static memcpy_direction deduce_memcpy_direction(sycl::queue q, void *to_ptr,
-                                                const void *from_ptr) {
+static experimental::memcpy_direction
+deduce_memcpy_direction(sycl::queue q, void *to_ptr, const void *from_ptr) {
   // table[to_attribute][from_attribute]
+  using namespace experimental; // for memcpy_direction and its enumerators
   static const memcpy_direction
       direction_table[static_cast<unsigned>(pointer_access_attribute::end)]
                      [static_cast<unsigned>(pointer_access_attribute::end)] = {
-                         {memcpy_direction::host_to_host,
-                          memcpy_direction::device_to_host,
-                          memcpy_direction::host_to_host},
-                         {memcpy_direction::host_to_device,
-                          memcpy_direction::device_to_device,
-                          memcpy_direction::device_to_device},
-                         {memcpy_direction::host_to_host,
-                          memcpy_direction::device_to_device,
-                          memcpy_direction::device_to_device}};
+                         {host_to_host, device_to_host, host_to_host},
+                         {host_to_device, device_to_device, device_to_device},
+                         {host_to_host, device_to_device, device_to_device}};
   return direction_table[static_cast<unsigned>(get_pointer_attribute(
       q, to_ptr))][static_cast<unsigned>(get_pointer_attribute(q, from_ptr))];
 }
@@ -300,6 +331,28 @@ static inline size_t get_offset(sycl::id<3> id, size_t slice, size_t pitch) {
   return slice * id.get(2) + pitch * id.get(1) + id.get(0);
 }
 
+// RAII owner of a malloc'ed host buffer; the destructor enqueues its free.
+class host_buffer {
+  void *_buf;
+  size_t _size;
+  sycl::queue _q;
+  const std::vector<sycl::event> &_deps; // deps of the free; must outlive *this
+
+public:
+  host_buffer(size_t size, sycl::queue q, const std::vector<sycl::event> &deps)
+      : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
+  void *get_ptr() const { return _buf; }
+  size_t get_size() const { return _size; }
+  ~host_buffer() {
+    if (_buf) {
+      _q.submit([&](sycl::handler &cgh) {
+        cgh.depends_on(_deps);
+        cgh.host_task([buf = _buf] { std::free(buf); });
+      });
+    }
+  }
+};
+
 /// copy 3D matrix specified by \p size from 3D matrix specified by \p from_ptr
 /// and \p from_range to another specified by \p to_ptr and \p to_range.
 static inline std::vector<sycl::event>
@@ -307,28 +360,7 @@ memcpy(sycl::queue q, void *to_ptr, const void *from_ptr,
        sycl::range<3> to_range, sycl::range<3> from_range, sycl::id<3> to_id,
        sycl::id<3> from_id, sycl::range<3> size,
        const std::vector<sycl::event> &dep_events = {}) {
-  // RAII for host pointer
-  class host_buffer {
-    void *_buf;
-    size_t _size;
-    sycl::queue _q;
-    const std::vector<sycl::event> &_deps; // free operation depends
-
-  public:
-    host_buffer(size_t size, sycl::queue q,
-                const std::vector<sycl::event> &deps)
-        : _buf(std::malloc(size)), _size(size), _q(q), _deps(deps) {}
-    void *get_ptr() const { return _buf; }
-    size_t get_size() const { return _size; }
-    ~host_buffer() {
-      if (_buf) {
-        _q.submit([&](sycl::handler &cgh) {
-          cgh.depends_on(_deps);
-          cgh.host_task([buf = _buf] { std::free(buf); });
-        });
-      }
-    }
-  };
+
   std::vector<sycl::event> event_list;
 
   size_t to_slice = to_range.get(1) * to_range.get(0);
@@ -343,6 +375,7 @@ memcpy(sycl::queue q, void *to_ptr, const void *from_ptr,
     return {memcpy(q, to_surface, from_surface, to_slice * size.get(2),
                    dep_events)};
   }
+  using namespace experimental; // for memcpy_direction
   memcpy_direction direction = deduce_memcpy_direction(q, to_ptr, from_ptr);
   size_t size_slice = size.get(1) * size.get(0);
   switch (direction) {
@@ -448,6 +481,56 @@ static sycl::event combine_events(std::vector<sycl::event> &events,
 
 } // namespace detail
 
+namespace experimental {
+namespace detail {
+static inline std::vector<sycl::event>
+memcpy(sycl::queue q, const experimental::memcpy_parameter &param) {
+  auto to = param.to.pitched;
+  auto from = param.from.pitched;
+#ifdef SYCL_EXT_ONEAPI_BINDLESS_IMAGES
+  if (param.to.image_bindless != nullptr &&
+      param.from.image_bindless != nullptr) {
+    throw std::runtime_error(
+        "[SYCLcompat] memcpy: Unsupported bindless_image API.");
+    // TODO: Dead code; rework once SYCL supports image_mem to image_mem copy.
+    std::vector<sycl::event> event_list;
+    syclcompat::detail::host_buffer buf(param.size.size(), q, event_list);
+    to.set_data_ptr(buf.get_ptr());
+    experimental::detail::memcpy(param.from.image_bindless, param.from.pos, to,
+                                 sycl::id<3>(0, 0, 0), param.size, q);
+    from.set_data_ptr(buf.get_ptr());
+    event_list.push_back(experimental::detail::memcpy(
+        from, sycl::id<3>(0, 0, 0), param.to.image_bindless, param.to.pos,
+        param.size, q));
+    return event_list;
+  } else if (param.to.image_bindless != nullptr) {
+    throw std::runtime_error(
+        "[SYCLcompat] memcpy: Unsupported bindless_image API.");
+    return {experimental::detail::memcpy(from, param.from.pos,
+                                         param.to.image_bindless, param.to.pos,
+                                         param.size, q)};
+  } else if (param.from.image_bindless != nullptr) {
+    throw std::runtime_error(
+        "[SYCLcompat] memcpy: Unsupported bindless_image API.");
+    return {experimental::detail::memcpy(param.from.image_bindless,
+                                         param.from.pos, to, param.to.pos,
+                                         param.size, q)};
+  }
+#endif
+  if (param.to.image != nullptr) {
+    throw std::runtime_error("[SYCLcompat] memcpy: Unsupported image API.");
+    to = experimental::detail::to_pitched_data(param.to.image);
+  }
+  if (param.from.image != nullptr) {
+    throw std::runtime_error("[SYCLcompat] memcpy: Unsupported image API.");
+    from = experimental::detail::to_pitched_data(param.from.image);
+  }
+  return syclcompat::detail::memcpy(q, to, param.to.pos, from, param.from.pos,
+                                    param.size);
+}
+} // namespace detail
+} // namespace experimental
+
 /// Allocate memory block on the device.
 /// \param num_bytes Number of bytes to allocate.
 /// \param q Queue to execute the allocate task.
@@ -531,26 +614,40 @@ static inline void *malloc(size_t &pitch, size_t x, size_t y,
   return detail::malloc(pitch, x, y, 1, q);
 }
 
-/// free
+/// Wait on all queues of the current device and on \p q, then free \p ptr.
 /// \param ptr Point to free.
 /// \param q Queue to execute the free task.
 /// \returns no return value.
+static inline void wait_and_free(void *ptr,
+                                 sycl::queue q = get_default_queue()) {
+  get_current_device().queues_wait_and_throw();
+  q.wait();
+  if (ptr) {
+    sycl::free(ptr, q);
+  }
+}
+
+/// Free \p ptr via \p q (the default queue if omitted) without synchronizing.
+/// \param ptr Pointer to free.
+/// \returns no return value.
 static inline void free(void *ptr, sycl::queue q = get_default_queue()) {
   if (ptr) {
-    sycl::free(ptr, q.get_context());
+    sycl::free(ptr, q);
   }
 }
 
-/// Free the device memory pointed by a batch of pointers in \p pointers which
-/// are related to \p q after \p events completed.
+/// Enqueues the release of all pointers in \p pointers on the \p q.
+/// The command waits on all passed \p events and returns an event that
+/// tracks the command's execution on the queue.
 ///
 /// \param pointers The pointers point to the device memory requested to be
-/// freed. \param events The events to be waited. \param q The sycl::queue the
-/// memory relates to.
+/// freed.
+/// \param events The events to be waited on.
+/// \param q The sycl::queue the memory relates to.
 // Can't be static due to the friend declaration in the memory header.
-inline sycl::event free_async(const std::vector<void *> &pointers,
-                              const std::vector<sycl::event> &events,
-                              sycl::queue q = get_default_queue()) {
+inline sycl::event enqueue_free(const std::vector<void *> &pointers,
+                                const std::vector<sycl::event> &events,
+                                sycl::queue q = get_default_queue()) {
   auto event = q.submit(
       [&pointers, &events, ctxt = q.get_context()](sycl::handler &cgh) {
         cgh.depends_on(events);
@@ -743,6 +840,31 @@ static sycl::event inline fill_async(void *dev_ptr, const T &pattern,
   return detail::fill(q, dev_ptr, pattern, count);
 }
 
+namespace experimental {
+
+/// [UNSUPPORTED] Synchronously copies 2D/3D memory data specified by \p param .
+/// The function will return after the copy is completed.
+///
+/// \param param Memory copy parameters.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void memcpy(const memcpy_parameter &param,
+                          sycl::queue q = get_default_queue()) {
+  sycl::event::wait(syclcompat::experimental::detail::memcpy(q, param));
+}
+
+/// [UNSUPPORTED] Asynchronously copies 2D/3D memory data specified by \p param
+/// . The return of the function does NOT guarantee the copy is completed.
+///
+/// \param param Memory copy parameters.
+/// \param q Queue to execute the copy task.
+/// \returns no return value.
+static inline void memcpy_async(const memcpy_parameter &param,
+                                sycl::queue q = get_default_queue()) {
+  syclcompat::experimental::detail::memcpy(q, param);
+}
+} // namespace experimental
+
 /// Synchronously sets \p value to the first \p size bytes starting from \p
 /// dev_ptr. The function will return after the memset operation is completed.
 ///
diff --git a/sycl/plugins/cuda/pi_cuda.cpp b/sycl/plugins/cuda/pi_cuda.cpp
index 1fa177b296328..0077b245905db 100644
--- a/sycl/plugins/cuda/pi_cuda.cpp
+++ b/sycl/plugins/cuda/pi_cuda.cpp
@@ -454,13 +454,21 @@ __SYCL_EXPORT pi_result piextMemImageGetInfo(pi_image_mem_handle MemHandle,
                                      ParamValueSizeRet);
 }
 
-__SYCL_EXPORT pi_result
-piextMemImportOpaqueFD(pi_context Context, pi_device Device, size_t Size,
-                       int FileDescriptor, pi_interop_mem_handle *RetHandle) {
+__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of "
+                         "`piextImportExternalMemory`")
+pi_result piextMemImportOpaqueFD(pi_context Context, pi_device Device,
+                                 size_t Size, int FileDescriptor,
+                                 pi_interop_mem_handle *RetHandle) {
   return pi2ur::piextMemImportOpaqueFD(Context, Device, Size, FileDescriptor,
                                        RetHandle);
 }
 
+__SYCL_EXPORT pi_result piextImportExternalMemory(
+    pi_context Context, pi_device Device, pi_external_mem_descriptor *MemDesc,
+    pi_interop_mem_handle *RetHandle) {
+  return pi2ur::piextImportExternalMemory(Context, Device, MemDesc, RetHandle);
+}
+
 __SYCL_EXPORT pi_result piextMemMapExternalArray(
     pi_context Context, pi_device Device, pi_image_format *ImageFormat,
     pi_image_desc *ImageDesc, pi_interop_mem_handle MemHandle,
@@ -475,13 +483,24 @@ __SYCL_EXPORT pi_result piextMemReleaseInterop(pi_context Context,
   return pi2ur::piextMemReleaseInterop(Context, Device, ExtMem);
 }
 
-__SYCL_EXPORT pi_result piextImportExternalSemaphoreOpaqueFD(
-    pi_context Context, pi_device Device, int FileDescriptor,
-    pi_interop_semaphore_handle *RetHandle) {
+__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of "
+                         "`piextImportExternalSemaphore`")
+pi_result
+piextImportExternalSemaphoreOpaqueFD(pi_context Context, pi_device Device,
+                                     int FileDescriptor,
+                                     pi_interop_semaphore_handle *RetHandle) {
   return pi2ur::piextImportExternalSemaphoreOpaqueFD(Context, Device,
                                                      FileDescriptor, RetHandle);
 }
 
+__SYCL_EXPORT pi_result
+piextImportExternalSemaphore(pi_context Context, pi_device Device,
+                             pi_external_semaphore_descriptor *SemDesc,
+                             pi_interop_semaphore_handle *RetHandle) {
+  return pi2ur::piextImportExternalSemaphore(Context, Device, SemDesc,
+                                             RetHandle);
+}
+
 __SYCL_EXPORT pi_result
 piextDestroyExternalSemaphore(pi_context Context, pi_device Device,
                               pi_interop_semaphore_handle SemHandle) {
@@ -489,19 +508,21 @@ piextDestroyExternalSemaphore(pi_context Context, pi_device Device,
 }
 
 __SYCL_EXPORT pi_result piextWaitExternalSemaphore(
-    pi_queue Queue, pi_interop_semaphore_handle SemHandle,
-    pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList,
-    pi_event *Event) {
-  return pi2ur::piextWaitExternalSemaphore(
-      Queue, SemHandle, NumEventsInWaitList, EventWaitList, Event);
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasWaitValue,
+    pi_uint64 WaitValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
+  return pi2ur::piextWaitExternalSemaphore(Queue, SemHandle, HasWaitValue,
+                                           WaitValue, NumEventsInWaitList,
+                                           EventWaitList, Event);
 }
 
 __SYCL_EXPORT pi_result piextSignalExternalSemaphore(
-    pi_queue Queue, pi_interop_semaphore_handle SemHandle,
-    pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList,
-    pi_event *Event) {
-  return pi2ur::piextSignalExternalSemaphore(
-      Queue, SemHandle, NumEventsInWaitList, EventWaitList, Event);
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasSignalValue,
+    pi_uint64 SignalValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
+  return pi2ur::piextSignalExternalSemaphore(Queue, SemHandle, HasSignalValue,
+                                             SignalValue, NumEventsInWaitList,
+                                             EventWaitList, Event);
 }
 
 pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device,
diff --git a/sycl/plugins/hip/pi_hip.cpp b/sycl/plugins/hip/pi_hip.cpp
index 5d9481ff6e8fb..33b7388f9c884 100644
--- a/sycl/plugins/hip/pi_hip.cpp
+++ b/sycl/plugins/hip/pi_hip.cpp
@@ -457,13 +457,21 @@ __SYCL_EXPORT pi_result piextMemImageGetInfo(pi_image_mem_handle MemHandle,
                                      ParamValueSizeRet);
 }
 
-__SYCL_EXPORT pi_result
-piextMemImportOpaqueFD(pi_context Context, pi_device Device, size_t Size,
-                       int FileDescriptor, pi_interop_mem_handle *RetHandle) {
+__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of "
+                         "`piextImportExternalMemory`")
+pi_result piextMemImportOpaqueFD(pi_context Context, pi_device Device,
+                                 size_t Size, int FileDescriptor,
+                                 pi_interop_mem_handle *RetHandle) {
   return pi2ur::piextMemImportOpaqueFD(Context, Device, Size, FileDescriptor,
                                        RetHandle);
 }
 
+__SYCL_EXPORT pi_result piextImportExternalMemory(
+    pi_context Context, pi_device Device, pi_external_mem_descriptor *MemDesc,
+    pi_interop_mem_handle *RetHandle) {
+  return pi2ur::piextImportExternalMemory(Context, Device, MemDesc, RetHandle);
+}
+
 __SYCL_EXPORT pi_result piextMemMapExternalArray(
     pi_context Context, pi_device Device, pi_image_format *ImageFormat,
     pi_image_desc *ImageDesc, pi_interop_mem_handle MemHandle,
@@ -478,13 +486,24 @@ __SYCL_EXPORT pi_result piextMemReleaseInterop(pi_context Context,
   return pi2ur::piextMemReleaseInterop(Context, Device, ExtMem);
 }
 
-__SYCL_EXPORT pi_result piextImportExternalSemaphoreOpaqueFD(
-    pi_context Context, pi_device Device, int FileDescriptor,
-    pi_interop_semaphore_handle *RetHandle) {
+__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of "
+                         "`piextImportExternalSemaphore`")
+pi_result
+piextImportExternalSemaphoreOpaqueFD(pi_context Context, pi_device Device,
+                                     int FileDescriptor,
+                                     pi_interop_semaphore_handle *RetHandle) {
   return pi2ur::piextImportExternalSemaphoreOpaqueFD(Context, Device,
                                                      FileDescriptor, RetHandle);
 }
 
+__SYCL_EXPORT pi_result
+piextImportExternalSemaphore(pi_context Context, pi_device Device,
+                             pi_external_semaphore_descriptor *SemDesc,
+                             pi_interop_semaphore_handle *RetHandle) {
+  return pi2ur::piextImportExternalSemaphore(Context, Device, SemDesc,
+                                             RetHandle);
+}
+
 __SYCL_EXPORT pi_result
 piextDestroyExternalSemaphore(pi_context Context, pi_device Device,
                               pi_interop_semaphore_handle SemHandle) {
@@ -492,19 +511,21 @@ piextDestroyExternalSemaphore(pi_context Context, pi_device Device,
 }
 
 __SYCL_EXPORT pi_result piextWaitExternalSemaphore(
-    pi_queue Queue, pi_interop_semaphore_handle SemHandle,
-    pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList,
-    pi_event *Event) {
-  return pi2ur::piextWaitExternalSemaphore(
-      Queue, SemHandle, NumEventsInWaitList, EventWaitList, Event);
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasWaitValue,
+    pi_uint64 WaitValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
+  return pi2ur::piextWaitExternalSemaphore(Queue, SemHandle, HasWaitValue,
+                                           WaitValue, NumEventsInWaitList,
+                                           EventWaitList, Event);
 }
 
 __SYCL_EXPORT pi_result piextSignalExternalSemaphore(
-    pi_queue Queue, pi_interop_semaphore_handle SemHandle,
-    pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList,
-    pi_event *Event) {
-  return pi2ur::piextSignalExternalSemaphore(
-      Queue, SemHandle, NumEventsInWaitList, EventWaitList, Event);
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasSignalValue,
+    pi_uint64 SignalValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
+  return pi2ur::piextSignalExternalSemaphore(Queue, SemHandle, HasSignalValue,
+                                             SignalValue, NumEventsInWaitList,
+                                             EventWaitList, Event);
 }
 
 pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device,
diff --git a/sycl/plugins/level_zero/CMakeLists.txt b/sycl/plugins/level_zero/CMakeLists.txt
index 0b522ee5afdf5..dc4c659f9e44c 100644
--- a/sycl/plugins/level_zero/CMakeLists.txt
+++ b/sycl/plugins/level_zero/CMakeLists.txt
@@ -1,49 +1,5 @@
 # PI Level Zero plugin library
 
-if (NOT DEFINED LEVEL_ZERO_LIBRARY OR NOT DEFINED LEVEL_ZERO_INCLUDE_DIR)
-  message(STATUS "Download Level Zero loader and headers from github.com")
-
-  set(LEVEL_ZERO_LOADER_REPO "https://github.com/oneapi-src/level-zero.git")
-  set(LEVEL_ZERO_LOADER_TAG v1.16.1)
-
-  # Disable due to a bug https://github.com/oneapi-src/level-zero/issues/104
-  set(CMAKE_INCLUDE_CURRENT_DIR OFF)
-
-  message(STATUS "Will fetch Level Zero Loader from ${LEVEL_ZERO_LOADER_REPO}")
-  include(FetchContent)
-  FetchContent_Declare(level-zero-loader
-    GIT_REPOSITORY    ${LEVEL_ZERO_LOADER_REPO}
-    GIT_TAG           ${LEVEL_ZERO_LOADER_TAG}
-  )
-
-  # Workaround warnings/errors for Level Zero build
-  set(CMAKE_CXX_FLAGS_BAK "${CMAKE_CXX_FLAGS}")
-  if (WIN32)
-    # FIXME: Level Zero build fails with /DUNICODE
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /UUNICODE")
-    # USE_Z7 forces use of /Z7 instead of /Zi which is broken with sccache
-    set(USE_Z7 ON)
-  else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-but-set-variable")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-pedantic")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-stringop-truncation")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unused-parameter")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-c++98-compat-extra-semi")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-unknown-warning-option")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-non-virtual-dtor")
-  endif()
-
-  FetchContent_MakeAvailable(level-zero-loader)
-  FetchContent_GetProperties(level-zero-loader)
-
-  # Restore original flags
-  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS_BAK}")
-
-  set(LEVEL_ZERO_LIBRARY ze_loader)
-  set(LEVEL_ZERO_INCLUDE_DIR
-    ${level-zero-loader_SOURCE_DIR}/include CACHE PATH "Path to Level Zero Headers")
-endif()
-
 if (SYCL_ENABLE_XPTI_TRACING)
   set(XPTI_PROXY_SRC "${CMAKE_SOURCE_DIR}/../xpti/src/xpti_proxy.cpp")
   set(XPTI_INCLUDE "${CMAKE_SOURCE_DIR}/../xpti/include")
@@ -52,6 +8,8 @@ endif()
 
 find_package(Python3 REQUIRED)
 
+get_target_property(LEVEL_ZERO_INCLUDE_DIR LevelZeroLoader-Headers INTERFACE_INCLUDE_DIRECTORIES)
+
 add_custom_target(ze-api DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/ze_api.def)
 add_custom_command(
   OUTPUT
diff --git a/sycl/plugins/level_zero/pi_level_zero.cpp b/sycl/plugins/level_zero/pi_level_zero.cpp
index fb8fb8f3df306..f88e8c1ed3cd3 100644
--- a/sycl/plugins/level_zero/pi_level_zero.cpp
+++ b/sycl/plugins/level_zero/pi_level_zero.cpp
@@ -466,13 +466,21 @@ __SYCL_EXPORT pi_result piextMemImageGetInfo(pi_image_mem_handle MemHandle,
                                      ParamValueSizeRet);
 }
 
-__SYCL_EXPORT pi_result
-piextMemImportOpaqueFD(pi_context Context, pi_device Device, size_t Size,
-                       int FileDescriptor, pi_interop_mem_handle *RetHandle) {
+__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of "
+                         "`piextImportExternalMemory`")
+pi_result piextMemImportOpaqueFD(pi_context Context, pi_device Device,
+                                 size_t Size, int FileDescriptor,
+                                 pi_interop_mem_handle *RetHandle) {
   return pi2ur::piextMemImportOpaqueFD(Context, Device, Size, FileDescriptor,
                                        RetHandle);
 }
 
+__SYCL_EXPORT pi_result piextImportExternalMemory(
+    pi_context Context, pi_device Device, pi_external_mem_descriptor *MemDesc,
+    pi_interop_mem_handle *RetHandle) {
+  return pi2ur::piextImportExternalMemory(Context, Device, MemDesc, RetHandle);
+}
+
 __SYCL_EXPORT pi_result piextMemMapExternalArray(
     pi_context Context, pi_device Device, pi_image_format *ImageFormat,
     pi_image_desc *ImageDesc, pi_interop_mem_handle MemHandle,
@@ -487,13 +495,24 @@ __SYCL_EXPORT pi_result piextMemReleaseInterop(pi_context Context,
   return pi2ur::piextMemReleaseInterop(Context, Device, ExtMem);
 }
 
-__SYCL_EXPORT pi_result piextImportExternalSemaphoreOpaqueFD(
-    pi_context Context, pi_device Device, int FileDescriptor,
-    pi_interop_semaphore_handle *RetHandle) {
+__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of "
+                         "`piextImportExternalSemaphore`")
+pi_result
+piextImportExternalSemaphoreOpaqueFD(pi_context Context, pi_device Device,
+                                     int FileDescriptor,
+                                     pi_interop_semaphore_handle *RetHandle) {
   return pi2ur::piextImportExternalSemaphoreOpaqueFD(Context, Device,
                                                      FileDescriptor, RetHandle);
 }
 
+__SYCL_EXPORT pi_result
+piextImportExternalSemaphore(pi_context Context, pi_device Device,
+                             pi_external_semaphore_descriptor *SemDesc,
+                             pi_interop_semaphore_handle *RetHandle) {
+  return pi2ur::piextImportExternalSemaphore(Context, Device, SemDesc,
+                                             RetHandle);
+}
+
 __SYCL_EXPORT pi_result
 piextDestroyExternalSemaphore(pi_context Context, pi_device Device,
                               pi_interop_semaphore_handle SemHandle) {
@@ -501,19 +520,21 @@ piextDestroyExternalSemaphore(pi_context Context, pi_device Device,
 }
 
 __SYCL_EXPORT pi_result piextWaitExternalSemaphore(
-    pi_queue Queue, pi_interop_semaphore_handle SemHandle,
-    pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList,
-    pi_event *Event) {
-  return pi2ur::piextWaitExternalSemaphore(
-      Queue, SemHandle, NumEventsInWaitList, EventWaitList, Event);
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasWaitValue,
+    pi_uint64 WaitValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
+  return pi2ur::piextWaitExternalSemaphore(Queue, SemHandle, HasWaitValue,
+                                           WaitValue, NumEventsInWaitList,
+                                           EventWaitList, Event);
 }
 
 __SYCL_EXPORT pi_result piextSignalExternalSemaphore(
-    pi_queue Queue, pi_interop_semaphore_handle SemHandle,
-    pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList,
-    pi_event *Event) {
-  return pi2ur::piextSignalExternalSemaphore(
-      Queue, SemHandle, NumEventsInWaitList, EventWaitList, Event);
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasSignalValue,
+    pi_uint64 SignalValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
+  return pi2ur::piextSignalExternalSemaphore(Queue, SemHandle, HasSignalValue,
+                                             SignalValue, NumEventsInWaitList,
+                                             EventWaitList, Event);
 }
 
 pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device,
diff --git a/sycl/plugins/native_cpu/pi_native_cpu.cpp b/sycl/plugins/native_cpu/pi_native_cpu.cpp
index df4ac7dae4ec3..d867caea5e23d 100644
--- a/sycl/plugins/native_cpu/pi_native_cpu.cpp
+++ b/sycl/plugins/native_cpu/pi_native_cpu.cpp
@@ -461,13 +461,21 @@ __SYCL_EXPORT pi_result piextMemImageGetInfo(pi_image_mem_handle MemHandle,
                                      ParamValueSizeRet);
 }
 
-__SYCL_EXPORT pi_result
-piextMemImportOpaqueFD(pi_context Context, pi_device Device, size_t Size,
-                       int FileDescriptor, pi_interop_mem_handle *RetHandle) {
+__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of "
+                         "`piextImportExternalMemory`")
+pi_result piextMemImportOpaqueFD(pi_context Context, pi_device Device,
+                                 size_t Size, int FileDescriptor,
+                                 pi_interop_mem_handle *RetHandle) {
   return pi2ur::piextMemImportOpaqueFD(Context, Device, Size, FileDescriptor,
                                        RetHandle);
 }
 
+__SYCL_EXPORT pi_result piextImportExternalMemory(
+    pi_context Context, pi_device Device, pi_external_mem_descriptor *MemDesc,
+    pi_interop_mem_handle *RetHandle) {
+  return pi2ur::piextImportExternalMemory(Context, Device, MemDesc, RetHandle);
+}
+
 __SYCL_EXPORT pi_result piextMemMapExternalArray(
     pi_context Context, pi_device Device, pi_image_format *ImageFormat,
     pi_image_desc *ImageDesc, pi_interop_mem_handle MemHandle,
@@ -482,13 +490,24 @@ __SYCL_EXPORT pi_result piextMemReleaseInterop(pi_context Context,
   return pi2ur::piextMemReleaseInterop(Context, Device, ExtMem);
 }
 
-__SYCL_EXPORT pi_result piextImportExternalSemaphoreOpaqueFD(
-    pi_context Context, pi_device Device, int FileDescriptor,
-    pi_interop_semaphore_handle *RetHandle) {
+__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of "
+                         "`piextImportExternalSemaphore`")
+pi_result
+piextImportExternalSemaphoreOpaqueFD(pi_context Context, pi_device Device,
+                                     int FileDescriptor,
+                                     pi_interop_semaphore_handle *RetHandle) {
   return pi2ur::piextImportExternalSemaphoreOpaqueFD(Context, Device,
                                                      FileDescriptor, RetHandle);
 }
 
+__SYCL_EXPORT pi_result
+piextImportExternalSemaphore(pi_context Context, pi_device Device,
+                             pi_external_semaphore_descriptor *SemDesc,
+                             pi_interop_semaphore_handle *RetHandle) {
+  return pi2ur::piextImportExternalSemaphore(Context, Device, SemDesc,
+                                             RetHandle);
+}
+
 __SYCL_EXPORT pi_result
 piextDestroyExternalSemaphore(pi_context Context, pi_device Device,
                               pi_interop_semaphore_handle SemHandle) {
@@ -496,19 +515,21 @@ piextDestroyExternalSemaphore(pi_context Context, pi_device Device,
 }
 
 __SYCL_EXPORT pi_result piextWaitExternalSemaphore(
-    pi_queue Queue, pi_interop_semaphore_handle SemHandle,
-    pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList,
-    pi_event *Event) {
-  return pi2ur::piextWaitExternalSemaphore(
-      Queue, SemHandle, NumEventsInWaitList, EventWaitList, Event);
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasWaitValue,
+    pi_uint64 WaitValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
+  return pi2ur::piextWaitExternalSemaphore(Queue, SemHandle, HasWaitValue,
+                                           WaitValue, NumEventsInWaitList,
+                                           EventWaitList, Event);
 }
 
 __SYCL_EXPORT pi_result piextSignalExternalSemaphore(
-    pi_queue Queue, pi_interop_semaphore_handle SemHandle,
-    pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList,
-    pi_event *Event) {
-  return pi2ur::piextSignalExternalSemaphore(
-      Queue, SemHandle, NumEventsInWaitList, EventWaitList, Event);
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasSignalValue,
+    pi_uint64 SignalValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
+  return pi2ur::piextSignalExternalSemaphore(Queue, SemHandle, HasSignalValue,
+                                             SignalValue, NumEventsInWaitList,
+                                             EventWaitList, Event);
 }
 
 pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device,
diff --git a/sycl/plugins/opencl/pi_opencl.cpp b/sycl/plugins/opencl/pi_opencl.cpp
index 570f069520fc4..1fef329d179af 100644
--- a/sycl/plugins/opencl/pi_opencl.cpp
+++ b/sycl/plugins/opencl/pi_opencl.cpp
@@ -437,6 +437,8 @@ pi_result piextMemImageGetInfo(pi_image_mem_handle MemHandle,
                                      ParamValueSizeRet);
 }
 
+[[deprecated("This function has been deprecated in favor of "
+             "`piextImportExternalMemory`")]]
 pi_result piextMemImportOpaqueFD(pi_context Context, pi_device Device,
                                  size_t Size, int FileDescriptor,
                                  pi_interop_mem_handle *RetHandle) {
@@ -444,6 +446,12 @@ pi_result piextMemImportOpaqueFD(pi_context Context, pi_device Device,
                                        RetHandle);
 }
 
+pi_result piextImportExternalMemory(pi_context Context, pi_device Device,
+                                    pi_external_mem_descriptor *MemDesc,
+                                    pi_interop_mem_handle *RetHandle) {
+  return pi2ur::piextImportExternalMemory(Context, Device, MemDesc, RetHandle);
+}
+
 pi_result piextMemMapExternalArray(pi_context Context, pi_device Device,
                                    pi_image_format *ImageFormat,
                                    pi_image_desc *ImageDesc,
@@ -458,6 +466,8 @@ pi_result piextMemReleaseInterop(pi_context Context, pi_device Device,
   return pi2ur::piextMemReleaseInterop(Context, Device, ExtMem);
 }
 
+[[deprecated("This function has been deprecated in favor of "
+             "`piextImportExternalSemaphore`")]]
 pi_result
 piextImportExternalSemaphoreOpaqueFD(pi_context Context, pi_device Device,
                                      int FileDescriptor,
@@ -466,27 +476,35 @@ piextImportExternalSemaphoreOpaqueFD(pi_context Context, pi_device Device,
                                                      FileDescriptor, RetHandle);
 }
 
+pi_result
+piextImportExternalSemaphore(pi_context Context, pi_device Device,
+                             pi_external_semaphore_descriptor *SemDesc,
+                             pi_interop_semaphore_handle *RetHandle) {
+  return pi2ur::piextImportExternalSemaphore(Context, Device, SemDesc,
+                                             RetHandle);
+}
+
 pi_result piextDestroyExternalSemaphore(pi_context Context, pi_device Device,
                                         pi_interop_semaphore_handle SemHandle) {
   return pi2ur::piextDestroyExternalSemaphore(Context, Device, SemHandle);
 }
 
-pi_result piextWaitExternalSemaphore(pi_queue Queue,
-                                     pi_interop_semaphore_handle SemHandle,
-                                     pi_uint32 NumEventsInWaitList,
-                                     const pi_event *EventWaitList,
-                                     pi_event *Event) {
-  return pi2ur::piextWaitExternalSemaphore(
-      Queue, SemHandle, NumEventsInWaitList, EventWaitList, Event);
+__SYCL_EXPORT pi_result piextWaitExternalSemaphore(
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasWaitValue,
+    pi_uint64 WaitValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
+  return pi2ur::piextWaitExternalSemaphore(Queue, SemHandle, HasWaitValue,
+                                           WaitValue, NumEventsInWaitList,
+                                           EventWaitList, Event);
 }
 
-pi_result piextSignalExternalSemaphore(pi_queue Queue,
-                                       pi_interop_semaphore_handle SemHandle,
-                                       pi_uint32 NumEventsInWaitList,
-                                       const pi_event *EventWaitList,
-                                       pi_event *Event) {
-  return pi2ur::piextSignalExternalSemaphore(
-      Queue, SemHandle, NumEventsInWaitList, EventWaitList, Event);
+__SYCL_EXPORT pi_result piextSignalExternalSemaphore(
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasSignalValue,
+    pi_uint64 SignalValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
+  return pi2ur::piextSignalExternalSemaphore(Queue, SemHandle, HasSignalValue,
+                                             SignalValue, NumEventsInWaitList,
+                                             EventWaitList, Event);
 }
 
 pi_result piKernelGetGroupInfo(pi_kernel Kernel, pi_device Device,
diff --git a/sycl/plugins/unified_runtime/CMakeLists.txt b/sycl/plugins/unified_runtime/CMakeLists.txt
index 80eb80062aa9a..b1ef3f71ae170 100644
--- a/sycl/plugins/unified_runtime/CMakeLists.txt
+++ b/sycl/plugins/unified_runtime/CMakeLists.txt
@@ -100,13 +100,13 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
   endfunction()
 
   set(UNIFIED_RUNTIME_REPO "https://github.com/oneapi-src/unified-runtime.git")
-  # commit 755a1e75e24ed55070a0f457d2d8a676521ad4f7
-  # Merge: 6469b890 5593d84c
+  # commit 1e9b1b493fe30e6236bf611ae6d82366c9376f6c
+  # Merge: a011f092 d8500a36
   # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
-  # Date:   Tue Jun 4 10:34:57 2024 +0100
-  #     Merge pull request #1385 from yingcong-wu/yc/new-api-suggestgroupsize
-  #     Implement urKernelGetSuggestedLocalWorkSize
-  set(UNIFIED_RUNTIME_TAG 755a1e75e24ed55070a0f457d2d8a676521ad4f7)
+  # Date:   Fri Jun 21 10:22:52 2024 +0100
+  #     Merge pull request #805 from aarongreig/aaron/kernelSetArgIndirectionFix
+  #     Correct level of indirection used in KernelSetArgPointer calls.
+  set(UNIFIED_RUNTIME_TAG 1e9b1b493fe30e6236bf611ae6d82366c9376f6c)
 
   fetch_adapter_source(level_zero
     ${UNIFIED_RUNTIME_REPO}
@@ -120,13 +120,7 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
 
   fetch_adapter_source(cuda
     ${UNIFIED_RUNTIME_REPO}
-    # commit 0e38fda02ca00aaab28018240eb526da7dd08f56
-    # Merge: f4968809 b7c8ac18
-    # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
-    # Date:   Tue Jun 4 14:18:15 2024 +0100
-    #     Merge pull request #1552 from mmoadeli/atomic-access-on-host-malloc
-    #     [CUDA] Remove the support of concurrent atomic access to host allocated pinned memory.
-    0e38fda02ca00aaab28018240eb526da7dd08f56
+    ${UNIFIED_RUNTIME_TAG}
   )
 
   fetch_adapter_source(hip
@@ -136,13 +130,7 @@ if(SYCL_PI_UR_USE_FETCH_CONTENT)
 
   fetch_adapter_source(native_cpu
     ${UNIFIED_RUNTIME_REPO}
-    # commit 31ee5d536130a6d2ce09661db1bef3bd1cd4d705
-    # Merge: ab151e98 53e43466
-    # Author: Kenneth Benzie (Benie) <k.benzie@codeplay.com>
-    # Date:   Tue Jun 4 14:10:39 2024 +0100
-    #     Merge pull request #1527 from PietroGhg/pietro/report_atomics
-    #     [NATIVECPU] Report correct memory order capabilities for Native CPU
-    31ee5d536130a6d2ce09661db1bef3bd1cd4d705
+    ${UNIFIED_RUNTIME_TAG}
   )
 
   if(SYCL_PI_UR_OVERRIDE_FETCH_CONTENT_REPO)
diff --git a/sycl/plugins/unified_runtime/pi2ur.hpp b/sycl/plugins/unified_runtime/pi2ur.hpp
index 5940e1173cec0..b71f77c062c0e 100644
--- a/sycl/plugins/unified_runtime/pi2ur.hpp
+++ b/sycl/plugins/unified_runtime/pi2ur.hpp
@@ -83,8 +83,6 @@ static pi_result ur2piResult(ur_result_t urResult) {
     return PI_ERROR_INVALID_IMAGE_SIZE;
   case UR_RESULT_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR:
     return PI_ERROR_INVALID_IMAGE_FORMAT_DESCRIPTOR;
-  case UR_RESULT_ERROR_IMAGE_FORMAT_NOT_SUPPORTED:
-    return PI_ERROR_IMAGE_FORMAT_NOT_SUPPORTED;
   case UR_RESULT_ERROR_MEM_OBJECT_ALLOCATION_FAILURE:
     return PI_ERROR_MEM_OBJECT_ALLOCATION_FAILURE;
   case UR_RESULT_ERROR_INVALID_PROGRAM_EXECUTABLE:
@@ -862,13 +860,13 @@ piextPlatformCreateWithNativeHandle(pi_native_handle NativeHandle,
   if (auto res = PiGetAdapter(adapter); res != PI_SUCCESS) {
     return res;
   }
-  (void)adapter;
 
   ur_platform_handle_t UrPlatform{};
   ur_native_handle_t UrNativeHandle =
       reinterpret_cast<ur_native_handle_t>(NativeHandle);
   ur_platform_native_properties_t UrProperties{};
-  urPlatformCreateWithNativeHandle(UrNativeHandle, &UrProperties, &UrPlatform);
+  urPlatformCreateWithNativeHandle(UrNativeHandle, adapter, &UrProperties,
+                                   &UrPlatform);
 
   *Platform = reinterpret_cast<pi_platform>(UrPlatform);
 
@@ -2290,7 +2288,10 @@ inline pi_result piKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex,
                                        size_t ArgSize, const void *ArgValue) {
   std::ignore = ArgSize;
   ur_kernel_handle_t UrKernel = reinterpret_cast<ur_kernel_handle_t>(Kernel);
-  HANDLE_ERRORS(urKernelSetArgPointer(UrKernel, ArgIndex, nullptr, ArgValue));
+  // The old PI interface expected a pointer to the pointer obtained via
+  // USM/virtual memory; UR now consumes that pointer directly.
+  const void *Arg = *static_cast<const void *const *>(ArgValue);
+  HANDLE_ERRORS(urKernelSetArgPointer(UrKernel, ArgIndex, nullptr, Arg));
 
   return PI_SUCCESS;
 }
@@ -2554,7 +2555,10 @@ inline pi_result piProgramRelease(pi_program Program) {
 inline pi_result piextKernelSetArgPointer(pi_kernel Kernel, pi_uint32 ArgIndex,
                                           size_t, const void *ArgValue) {
   ur_kernel_handle_t UrKernel = reinterpret_cast<ur_kernel_handle_t>(Kernel);
-  HANDLE_ERRORS(urKernelSetArgPointer(UrKernel, ArgIndex, nullptr, ArgValue));
+  // The old PI interface expected a pointer to the pointer obtained via
+  // USM/virtual memory; UR now consumes that pointer directly.
+  const void *Arg = *static_cast<const void *const *>(ArgValue);
+  HANDLE_ERRORS(urKernelSetArgPointer(UrKernel, ArgIndex, nullptr, Arg));
 
   return PI_SUCCESS;
 }
@@ -2968,7 +2972,6 @@ static void pi2urImageDesc(const pi_image_format *ImageFormat,
     UrDesc->type = TO;                                                         \
     break;                                                                     \
   }
-    PI_TO_UR_MAP_IMAGE_TYPE(PI_MEM_TYPE_BUFFER, UR_MEM_TYPE_BUFFER)
     PI_TO_UR_MAP_IMAGE_TYPE(PI_MEM_TYPE_IMAGE2D, UR_MEM_TYPE_IMAGE2D)
     PI_TO_UR_MAP_IMAGE_TYPE(PI_MEM_TYPE_IMAGE3D, UR_MEM_TYPE_IMAGE3D)
     PI_TO_UR_MAP_IMAGE_TYPE(PI_MEM_TYPE_IMAGE2D_ARRAY,
@@ -2976,8 +2979,6 @@ static void pi2urImageDesc(const pi_image_format *ImageFormat,
     PI_TO_UR_MAP_IMAGE_TYPE(PI_MEM_TYPE_IMAGE1D, UR_MEM_TYPE_IMAGE1D)
     PI_TO_UR_MAP_IMAGE_TYPE(PI_MEM_TYPE_IMAGE1D_ARRAY,
                             UR_MEM_TYPE_IMAGE1D_ARRAY)
-    PI_TO_UR_MAP_IMAGE_TYPE(PI_MEM_TYPE_IMAGE1D_BUFFER,
-                            UR_MEM_TYPE_IMAGE1D_BUFFER)
     PI_TO_UR_MAP_IMAGE_TYPE(PI_MEM_TYPE_IMAGE_CUBEMAP,
                             UR_MEM_TYPE_IMAGE_CUBEMAP_EXP)
 #undef PI_TO_UR_MAP_IMAGE_TYPE
@@ -5374,6 +5375,8 @@ inline pi_result piextMemImageGetInfo(pi_image_mem_handle MemHandle,
   return PI_SUCCESS;
 }
 
+[[deprecated("This function has been deprecated in favor of "
+             "`piextImportExternalMemory`")]]
 inline pi_result piextMemImportOpaqueFD(pi_context Context, pi_device Device,
                                         size_t Size, int FileDescriptor,
                                         pi_interop_mem_handle *RetHandle) {
@@ -5382,7 +5385,7 @@ inline pi_result piextMemImportOpaqueFD(pi_context Context, pi_device Device,
 
   auto UrContext = reinterpret_cast<ur_context_handle_t>(Context);
   auto UrDevice = reinterpret_cast<ur_device_handle_t>(Device);
-  ur_exp_interop_mem_handle_t *UrRetHandle =
+  auto *UrRetHandle =
       reinterpret_cast<ur_exp_interop_mem_handle_t *>(RetHandle);
 
   ur_exp_file_descriptor_t PosixFD{};
@@ -5393,8 +5396,66 @@ inline pi_result piextMemImportOpaqueFD(pi_context Context, pi_device Device,
   InteropMemDesc.stype = UR_STRUCTURE_TYPE_EXP_INTEROP_MEM_DESC;
   InteropMemDesc.pNext = &PosixFD;
 
-  HANDLE_ERRORS(urBindlessImagesImportOpaqueFDExp(
-      UrContext, UrDevice, Size, &InteropMemDesc, UrRetHandle));
+  HANDLE_ERRORS(urBindlessImagesImportExternalMemoryExp(
+      UrContext, UrDevice, Size, UR_EXP_EXTERNAL_MEM_TYPE_OPAQUE_FD,
+      &InteropMemDesc, UrRetHandle));
+
+  return PI_SUCCESS;
+}
+
+// Imports the external memory described by `MemDescriptor` through UR;
+// `RetHandle` is forwarded to UR to receive the interop memory handle.
+inline pi_result
+piextImportExternalMemory(pi_context Context, pi_device Device,
+                          pi_external_mem_descriptor *MemDescriptor,
+                          pi_interop_mem_handle *RetHandle) {
+  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
+  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
+
+  auto UrContext = reinterpret_cast<ur_context_handle_t>(Context);
+  auto UrDevice = reinterpret_cast<ur_device_handle_t>(Device);
+  auto *UrRetHandle =
+      reinterpret_cast<ur_exp_interop_mem_handle_t *>(RetHandle);
+
+  ur_exp_interop_mem_desc_t InteropMemDesc{};
+  InteropMemDesc.stype = UR_STRUCTURE_TYPE_EXP_INTEROP_MEM_DESC;
+
+  // The handle descriptors are chained to `InteropMemDesc` through `pNext`,
+  // so they must outlive the UR call at the end of this function. Declare
+  // them at function scope: a local declared inside a `case` block would be
+  // destroyed at the end of that block, leaving `pNext` dangling.
+  ur_exp_file_descriptor_t OpaqueFD{};
+  OpaqueFD.stype = UR_STRUCTURE_TYPE_EXP_FILE_DESCRIPTOR;
+  ur_exp_win32_handle_t Win32Handle{};
+  Win32Handle.stype = UR_STRUCTURE_TYPE_EXP_WIN32_HANDLE;
+
+  // Translate the PI handle type to its UR counterpart and chain the matching
+  // handle descriptor.
+  ur_exp_external_mem_type_t UrExternalMemHandleType;
+  switch (MemDescriptor->handleType) {
+  case pi_external_mem_handle_type::opaque_fd:
+    UrExternalMemHandleType = UR_EXP_EXTERNAL_MEM_TYPE_OPAQUE_FD;
+    OpaqueFD.fd = MemDescriptor->handle.file_descriptor;
+    InteropMemDesc.pNext = &OpaqueFD;
+    break;
+  case pi_external_mem_handle_type::win32_nt_handle:
+    UrExternalMemHandleType = UR_EXP_EXTERNAL_MEM_TYPE_WIN32_NT;
+    Win32Handle.handle = MemDescriptor->handle.win32_handle;
+    InteropMemDesc.pNext = &Win32Handle;
+    break;
+  case pi_external_mem_handle_type::win32_nt_dx12_resource:
+    UrExternalMemHandleType = UR_EXP_EXTERNAL_MEM_TYPE_WIN32_NT_DX12_RESOURCE;
+    Win32Handle.handle = MemDescriptor->handle.win32_handle;
+    InteropMemDesc.pNext = &Win32Handle;
+    break;
+  default:
+    // Unrecognized handle type: there is nothing valid to import.
+    return PI_ERROR_INVALID_VALUE;
+  }
+
+  HANDLE_ERRORS(urBindlessImagesImportExternalMemoryExp(
+      UrContext, UrDevice, MemDescriptor->memorySizeBytes,
+      UrExternalMemHandleType, &InteropMemDesc, UrRetHandle));
 
   return PI_SUCCESS;
 }
@@ -5439,6 +5500,8 @@ inline pi_result piextMemReleaseInterop(pi_context Context, pi_device Device,
   return PI_SUCCESS;
 }
 
+[[deprecated("This function has been deprecated in favor of "
+             "`piextImportExternalSemaphore`")]]
 inline pi_result
 piextImportExternalSemaphoreOpaqueFD(pi_context Context, pi_device Device,
                                      int FileDescriptor,
@@ -5448,7 +5511,7 @@ piextImportExternalSemaphoreOpaqueFD(pi_context Context, pi_device Device,
 
   auto UrContext = reinterpret_cast<ur_context_handle_t>(Context);
   auto UrDevice = reinterpret_cast<ur_device_handle_t>(Device);
-  ur_exp_interop_semaphore_handle_t *UrRetHandle =
+  auto *UrRetHandle =
       reinterpret_cast<ur_exp_interop_semaphore_handle_t *>(RetHandle);
 
   ur_exp_file_descriptor_t PosixFD{};
@@ -5459,8 +5522,67 @@ piextImportExternalSemaphoreOpaqueFD(pi_context Context, pi_device Device,
   InteropSemDesc.stype = UR_STRUCTURE_TYPE_EXP_INTEROP_SEMAPHORE_DESC;
   InteropSemDesc.pNext = &PosixFD;
 
-  HANDLE_ERRORS(urBindlessImagesImportExternalSemaphoreOpaqueFDExp(
-      UrContext, UrDevice, &InteropSemDesc, UrRetHandle));
+  HANDLE_ERRORS(urBindlessImagesImportExternalSemaphoreExp(
+      UrContext, UrDevice, UR_EXP_EXTERNAL_SEMAPHORE_TYPE_OPAQUE_FD,
+      &InteropSemDesc, UrRetHandle));
+
+  return PI_SUCCESS;
+}
+
+// Imports the external semaphore described by `SemDescriptor` through UR;
+// `RetHandle` is forwarded to UR to receive the interop semaphore handle.
+inline pi_result
+piextImportExternalSemaphore(pi_context Context, pi_device Device,
+                             pi_external_semaphore_descriptor *SemDescriptor,
+                             pi_interop_semaphore_handle *RetHandle) {
+  PI_ASSERT(Context, PI_ERROR_INVALID_CONTEXT);
+  PI_ASSERT(Device, PI_ERROR_INVALID_DEVICE);
+
+  auto UrContext = reinterpret_cast<ur_context_handle_t>(Context);
+  auto UrDevice = reinterpret_cast<ur_device_handle_t>(Device);
+  auto *UrRetHandle =
+      reinterpret_cast<ur_exp_interop_semaphore_handle_t *>(RetHandle);
+
+  ur_exp_interop_semaphore_desc_t InteropSemDesc{};
+  InteropSemDesc.stype = UR_STRUCTURE_TYPE_EXP_INTEROP_SEMAPHORE_DESC;
+
+  // The handle descriptors are chained to `InteropSemDesc` through `pNext`,
+  // so they must outlive the UR call at the end of this function. Declare
+  // them at function scope: a local declared inside a `case` block would be
+  // destroyed at the end of that block, leaving `pNext` dangling.
+  ur_exp_file_descriptor_t OpaqueFD{};
+  OpaqueFD.stype = UR_STRUCTURE_TYPE_EXP_FILE_DESCRIPTOR;
+  ur_exp_win32_handle_t Win32Handle{};
+  Win32Handle.stype = UR_STRUCTURE_TYPE_EXP_WIN32_HANDLE;
+
+  // Translate the PI handle type to its UR counterpart and chain the matching
+  // handle descriptor.
+  ur_exp_external_semaphore_type_t UrExternalSemHandleType;
+  switch (SemDescriptor->handleType) {
+  case pi_external_semaphore_handle_type::opaque_fd:
+    UrExternalSemHandleType = UR_EXP_EXTERNAL_SEMAPHORE_TYPE_OPAQUE_FD;
+    OpaqueFD.fd = SemDescriptor->handle.file_descriptor;
+    InteropSemDesc.pNext = &OpaqueFD;
+    break;
+  case pi_external_semaphore_handle_type::win32_nt_handle:
+    UrExternalSemHandleType = UR_EXP_EXTERNAL_SEMAPHORE_TYPE_WIN32_NT;
+    Win32Handle.handle = SemDescriptor->handle.win32_handle;
+    InteropSemDesc.pNext = &Win32Handle;
+    break;
+  case pi_external_semaphore_handle_type::win32_nt_dx12_fence:
+    UrExternalSemHandleType =
+        UR_EXP_EXTERNAL_SEMAPHORE_TYPE_WIN32_NT_DX12_FENCE;
+    Win32Handle.handle = SemDescriptor->handle.win32_handle;
+    InteropSemDesc.pNext = &Win32Handle;
+    break;
+  default:
+    // Unrecognized handle type: there is nothing valid to import.
+    return PI_ERROR_INVALID_VALUE;
+  }
+
+  HANDLE_ERRORS(urBindlessImagesImportExternalSemaphoreExp(
+      UrContext, UrDevice, UrExternalSemHandleType, &InteropSemDesc,
+      UrRetHandle));
 
   return PI_SUCCESS;
 }
@@ -5482,11 +5604,10 @@ piextDestroyExternalSemaphore(pi_context Context, pi_device Device,
   return PI_SUCCESS;
 }
 
-inline pi_result
-piextWaitExternalSemaphore(pi_queue Queue,
-                           pi_interop_semaphore_handle SemHandle,
-                           pi_uint32 NumEventsInWaitList,
-                           const pi_event *EventWaitList, pi_event *Event) {
+inline pi_result piextWaitExternalSemaphore(
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasWaitValue,
+    pi_uint64 WaitValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
   PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
 
   auto UrQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
@@ -5497,16 +5618,16 @@ piextWaitExternalSemaphore(pi_queue Queue,
   ur_event_handle_t *UREvent = reinterpret_cast<ur_event_handle_t *>(Event);
 
   HANDLE_ERRORS(urBindlessImagesWaitExternalSemaphoreExp(
-      UrQueue, UrSemHandle, NumEventsInWaitList, UrEventWaitList, UREvent));
+      UrQueue, UrSemHandle, HasWaitValue, WaitValue, NumEventsInWaitList,
+      UrEventWaitList, UREvent));
 
   return PI_SUCCESS;
 }
 
-inline pi_result
-piextSignalExternalSemaphore(pi_queue Queue,
-                             pi_interop_semaphore_handle SemHandle,
-                             pi_uint32 NumEventsInWaitList,
-                             const pi_event *EventWaitList, pi_event *Event) {
+inline pi_result piextSignalExternalSemaphore(
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasSignalValue,
+    pi_uint64 SignalValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
   PI_ASSERT(Queue, PI_ERROR_INVALID_QUEUE);
 
   auto UrQueue = reinterpret_cast<ur_queue_handle_t>(Queue);
@@ -5517,7 +5638,8 @@ piextSignalExternalSemaphore(pi_queue Queue,
   ur_event_handle_t *UREvent = reinterpret_cast<ur_event_handle_t *>(Event);
 
   HANDLE_ERRORS(urBindlessImagesSignalExternalSemaphoreExp(
-      UrQueue, UrSemHandle, NumEventsInWaitList, UrEventWaitList, UREvent));
+      UrQueue, UrSemHandle, HasSignalValue, SignalValue, NumEventsInWaitList,
+      UrEventWaitList, UREvent));
 
   return PI_SUCCESS;
 }
diff --git a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp
index 8054a77e5d90c..30ba9a7afc8b1 100644
--- a/sycl/plugins/unified_runtime/pi_unified_runtime.cpp
+++ b/sycl/plugins/unified_runtime/pi_unified_runtime.cpp
@@ -1310,13 +1310,21 @@ __SYCL_EXPORT pi_result piextMemImageGetInfo(pi_image_mem_handle MemHandle,
                                      ParamValueSizeRet);
 }
 
-__SYCL_EXPORT pi_result
-piextMemImportOpaqueFD(pi_context Context, pi_device Device, size_t Size,
-                       int FileDescriptor, pi_interop_mem_handle *RetHandle) {
+__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of "
+                         "`piextImportExternalMemory`")
+pi_result piextMemImportOpaqueFD(pi_context Context, pi_device Device,
+                                 size_t Size, int FileDescriptor,
+                                 pi_interop_mem_handle *RetHandle) {
   return pi2ur::piextMemImportOpaqueFD(Context, Device, Size, FileDescriptor,
                                        RetHandle);
 }
 
+__SYCL_EXPORT pi_result piextImportExternalMemory(
+    pi_context Context, pi_device Device, pi_external_mem_descriptor *MemDesc,
+    pi_interop_mem_handle *RetHandle) {
+  return pi2ur::piextImportExternalMemory(Context, Device, MemDesc, RetHandle);
+}
+
 __SYCL_EXPORT pi_result piextMemMapExternalArray(
     pi_context Context, pi_device Device, pi_image_format *ImageFormat,
     pi_image_desc *ImageDesc, pi_interop_mem_handle MemHandle,
@@ -1331,13 +1339,24 @@ __SYCL_EXPORT pi_result piextMemReleaseInterop(pi_context Context,
   return pi2ur::piextMemReleaseInterop(Context, Device, ExtMem);
 }
 
-__SYCL_EXPORT pi_result piextImportExternalSemaphoreOpaqueFD(
-    pi_context Context, pi_device Device, int FileDescriptor,
-    pi_interop_semaphore_handle *RetHandle) {
+__SYCL_EXPORT_DEPRECATED("This function has been deprecated in favor of "
+                         "`piextImportExternalSemaphore`")
+pi_result
+piextImportExternalSemaphoreOpaqueFD(pi_context Context, pi_device Device,
+                                     int FileDescriptor,
+                                     pi_interop_semaphore_handle *RetHandle) {
   return pi2ur::piextImportExternalSemaphoreOpaqueFD(Context, Device,
                                                      FileDescriptor, RetHandle);
 }
 
+__SYCL_EXPORT pi_result
+piextImportExternalSemaphore(pi_context Context, pi_device Device,
+                             pi_external_semaphore_descriptor *SemDesc,
+                             pi_interop_semaphore_handle *RetHandle) {
+  return pi2ur::piextImportExternalSemaphore(Context, Device, SemDesc,
+                                             RetHandle);
+}
+
 __SYCL_EXPORT pi_result
 piextDestroyExternalSemaphore(pi_context Context, pi_device Device,
                               pi_interop_semaphore_handle SemHandle) {
@@ -1345,19 +1364,21 @@ piextDestroyExternalSemaphore(pi_context Context, pi_device Device,
 }
 
 __SYCL_EXPORT pi_result piextWaitExternalSemaphore(
-    pi_queue Queue, pi_interop_semaphore_handle SemHandle,
-    pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList,
-    pi_event *Event) {
-  return pi2ur::piextWaitExternalSemaphore(
-      Queue, SemHandle, NumEventsInWaitList, EventWaitList, Event);
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasWaitValue,
+    pi_uint64 WaitValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
+  return pi2ur::piextWaitExternalSemaphore(Queue, SemHandle, HasWaitValue,
+                                           WaitValue, NumEventsInWaitList,
+                                           EventWaitList, Event);
 }
 
 __SYCL_EXPORT pi_result piextSignalExternalSemaphore(
-    pi_queue Queue, pi_interop_semaphore_handle SemHandle,
-    pi_uint32 NumEventsInWaitList, const pi_event *EventWaitList,
-    pi_event *Event) {
-  return pi2ur::piextSignalExternalSemaphore(
-      Queue, SemHandle, NumEventsInWaitList, EventWaitList, Event);
+    pi_queue Queue, pi_interop_semaphore_handle SemHandle, bool HasSignalValue,
+    pi_uint64 SignalValue, pi_uint32 NumEventsInWaitList,
+    const pi_event *EventWaitList, pi_event *Event) {
+  return pi2ur::piextSignalExternalSemaphore(Queue, SemHandle, HasSignalValue,
+                                             SignalValue, NumEventsInWaitList,
+                                             EventWaitList, Event);
 }
 
 // This interface is not in Unified Runtime currently
diff --git a/sycl/source/detail/bindless_images.cpp b/sycl/source/detail/bindless_images.cpp
index a6b2ba9704f16..933b93bf0a025 100644
--- a/sycl/source/detail/bindless_images.cpp
+++ b/sycl/source/detail/bindless_images.cpp
@@ -478,10 +478,19 @@ __SYCL_EXPORT interop_mem_handle import_external_memory<resource_fd>(
   const sycl::detail::PluginPtr &Plugin = CtxImpl->getPlugin();
 
   pi_interop_mem_handle piInteropMem;
+  pi_external_mem_descriptor piExternalMemDescriptor;
+
+  piExternalMemDescriptor.memorySizeBytes = externalMem.size_in_bytes;
+  piExternalMemDescriptor.handle.file_descriptor =
+      externalMem.external_resource.file_descriptor;
+  // For `resource_fd` external memory type, the handle type is always
+  // `opaque_fd`. No need for a switch statement like we have for win32
+  // resources.
+  piExternalMemDescriptor.handleType = pi_external_mem_handle_type::opaque_fd;
+
   Plugin->call<sycl::errc::invalid,
-               sycl::detail::PiApiKind::piextMemImportOpaqueFD>(
-      C, Device, externalMem.size_in_bytes,
-      externalMem.external_resource.file_descriptor, &piInteropMem);
+               sycl::detail::PiApiKind::piextImportExternalMemory>(
+      C, Device, &piExternalMemDescriptor, &piInteropMem);
 
   return interop_mem_handle{piInteropMem};
 }
@@ -494,6 +503,47 @@ __SYCL_EXPORT interop_mem_handle import_external_memory<resource_fd>(
       externalMem, syclQueue.get_device(), syclQueue.get_context());
 }
 
+template <>
+__SYCL_EXPORT interop_mem_handle import_external_memory<resource_win32_handle>(
+    external_mem_descriptor<resource_win32_handle> externalMem,
+    const sycl::device &syclDevice, const sycl::context &syclContext) {
+  std::shared_ptr<sycl::detail::context_impl> CtxImpl =
+      sycl::detail::getSyclObjImpl(syclContext);
+  pi_context C = CtxImpl->getHandleRef();
+  std::shared_ptr<sycl::detail::device_impl> DevImpl =
+      sycl::detail::getSyclObjImpl(syclDevice);
+  pi_device Device = DevImpl->getHandleRef();
+  const sycl::detail::PluginPtr &Plugin = CtxImpl->getPlugin();
+
+  pi_interop_mem_handle piInteropMem;
+  pi_external_mem_descriptor piExternalMemDescriptor;
+
+  piExternalMemDescriptor.memorySizeBytes = externalMem.size_in_bytes;
+  piExternalMemDescriptor.handle.win32_handle =
+      externalMem.external_resource.handle;
+
+  // Select appropriate memory handle type.
+  switch (externalMem.handle_type) {
+  case external_mem_handle_type::win32_nt_handle:
+    piExternalMemDescriptor.handleType =
+        pi_external_mem_handle_type::win32_nt_handle;
+    break;
+  case external_mem_handle_type::win32_nt_dx12_resource:
+    piExternalMemDescriptor.handleType =
+        pi_external_mem_handle_type::win32_nt_dx12_resource;
+    break;
+  default:
+    throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                          "Invalid memory handle type");
+  }
+
+  Plugin->call<sycl::errc::invalid,
+               sycl::detail::PiApiKind::piextImportExternalMemory>(
+      C, Device, &piExternalMemDescriptor, &piInteropMem);
+
+  return interop_mem_handle{piInteropMem};
+}
+
 template <>
 __SYCL_EXPORT_DEPRECATED(
     "import_external_memory templated by external_mem_fd is deprecated."
@@ -520,6 +570,14 @@ interop_mem_handle import_external_memory<external_mem_fd>(
       externalMem, syclQueue.get_device(), syclQueue.get_context());
 }
 
+template <>
+__SYCL_EXPORT interop_mem_handle import_external_memory<resource_win32_handle>(
+    external_mem_descriptor<resource_win32_handle> externalMem,
+    const sycl::queue &syclQueue) {
+  return import_external_memory<resource_win32_handle>(
+      externalMem, syclQueue.get_device(), syclQueue.get_context());
+}
+
 __SYCL_EXPORT
 image_mem_handle map_external_image_memory(interop_mem_handle memHandle,
                                            const image_descriptor &desc,
@@ -610,13 +668,20 @@ __SYCL_EXPORT interop_semaphore_handle import_external_semaphore(
   pi_device Device = DevImpl->getHandleRef();
 
   pi_interop_semaphore_handle piInteropSemaphore;
+  pi_external_semaphore_descriptor piInteropSemDesc;
+
+  // For this specialization of `import_external_semaphore` the handleType is
+  // always `opaque_fd`.
+  piInteropSemDesc.handleType = pi_external_semaphore_handle_type::opaque_fd;
+  piInteropSemDesc.handle.file_descriptor =
+      externalSemaphoreDesc.external_resource.file_descriptor;
 
   Plugin->call<sycl::errc::invalid,
-               sycl::detail::PiApiKind::piextImportExternalSemaphoreOpaqueFD>(
-      C, Device, externalSemaphoreDesc.external_resource.file_descriptor,
-      &piInteropSemaphore);
+               sycl::detail::PiApiKind::piextImportExternalSemaphore>(
+      C, Device, &piInteropSemDesc, &piInteropSemaphore);
 
-  return interop_semaphore_handle{piInteropSemaphore};
+  return interop_semaphore_handle{piInteropSemaphore,
+                                  external_semaphore_handle_type::opaque_fd};
 }
 
 template <>
@@ -627,6 +692,55 @@ __SYCL_EXPORT interop_semaphore_handle import_external_semaphore(
       externalSemaphoreDesc, syclQueue.get_device(), syclQueue.get_context());
 }
 
+template <>
+__SYCL_EXPORT interop_semaphore_handle import_external_semaphore(
+    external_semaphore_descriptor<resource_win32_handle> externalSemaphoreDesc,
+    const sycl::device &syclDevice, const sycl::context &syclContext) {
+  std::shared_ptr<sycl::detail::context_impl> CtxImpl =
+      sycl::detail::getSyclObjImpl(syclContext);
+  const sycl::detail::PluginPtr &Plugin = CtxImpl->getPlugin();
+  pi_context C = CtxImpl->getHandleRef();
+  std::shared_ptr<sycl::detail::device_impl> DevImpl =
+      sycl::detail::getSyclObjImpl(syclDevice);
+  pi_device Device = DevImpl->getHandleRef();
+
+  pi_interop_semaphore_handle piInteropSemaphore;
+  pi_external_semaphore_descriptor piInteropSemDesc;
+
+  // Select appropriate semaphore handle type.
+  switch (externalSemaphoreDesc.handle_type) {
+  case external_semaphore_handle_type::win32_nt_handle:
+    piInteropSemDesc.handleType =
+        pi_external_semaphore_handle_type::win32_nt_handle;
+    break;
+  case external_semaphore_handle_type::win32_nt_dx12_fence:
+    piInteropSemDesc.handleType =
+        pi_external_semaphore_handle_type::win32_nt_dx12_fence;
+    break;
+  default:
+    throw sycl::exception(sycl::make_error_code(sycl::errc::invalid),
+                          "Invalid semaphore handle type");
+  }
+
+  piInteropSemDesc.handle.win32_handle =
+      externalSemaphoreDesc.external_resource.handle;
+
+  Plugin->call<sycl::errc::invalid,
+               sycl::detail::PiApiKind::piextImportExternalSemaphore>(
+      C, Device, &piInteropSemDesc, &piInteropSemaphore);
+
+  return interop_semaphore_handle{piInteropSemaphore,
+                                  externalSemaphoreDesc.handle_type};
+}
+
+template <>
+__SYCL_EXPORT interop_semaphore_handle import_external_semaphore(
+    external_semaphore_descriptor<resource_win32_handle> externalSemaphoreDesc,
+    const sycl::queue &syclQueue) {
+  return import_external_semaphore(
+      externalSemaphoreDesc, syclQueue.get_device(), syclQueue.get_context());
+}
+
 template <>
 __SYCL_EXPORT_DEPRECATED("import_external_semaphore templated by "
                          "external_semaphore_fd is deprecated."
@@ -634,7 +748,6 @@ __SYCL_EXPORT_DEPRECATED("import_external_semaphore templated by "
 interop_semaphore_handle import_external_semaphore(
     external_semaphore_descriptor<external_semaphore_fd> externalSemaphoreDesc,
     const sycl::device &syclDevice, const sycl::context &syclContext) {
-
   external_semaphore_descriptor<resource_fd> extSem;
   extSem.external_resource.file_descriptor =
       externalSemaphoreDesc.external_resource.file_descriptor;
diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp
index 6d23432271c54..ba90e6eb829cb 100644
--- a/sycl/source/detail/device_info.hpp
+++ b/sycl/source/detail/device_info.hpp
@@ -501,10 +501,16 @@ struct get_device_info_impl<std::vector<size_t>,
         Dev->getHandleRef(), PiInfoCode<info::device::sub_group_sizes>::value,
         0, nullptr, &resultSize);
 
-    std::vector<size_t> result(resultSize / sizeof(size_t));
+    std::vector<uint32_t> result32(resultSize / sizeof(uint32_t));
     Dev->getPlugin()->call<PiApiKind::piDeviceGetInfo>(
         Dev->getHandleRef(), PiInfoCode<info::device::sub_group_sizes>::value,
-        resultSize, result.data(), nullptr);
+        resultSize, result32.data(), nullptr);
+
+    std::vector<size_t> result;
+    result.reserve(result32.size());
+    for (uint32_t value : result32) {
+      result.push_back(value);
+    }
     return result;
   }
 };
@@ -668,10 +674,7 @@ struct get_device_info_impl<
           if (Item.first == arch)
             return Item.second;
         }
-        throw sycl::exception(
-            make_error_code(errc::runtime),
-            "The current device architecture is not supported by "
-            "sycl_ext_oneapi_device_architecture.");
+        return ext::oneapi::experimental::architecture::unknown;
       };
       uint32_t DeviceIp;
       Dev->getPlugin()->call<PiApiKind::piDeviceGetInfo>(
@@ -687,10 +690,7 @@ struct get_device_info_impl<
           if (std::string_view(Item.first) == arch)
             return Item.second;
         }
-        throw sycl::exception(
-            make_error_code(errc::runtime),
-            "The current device architecture is not supported by "
-            "sycl_ext_oneapi_device_architecture.");
+        return ext::oneapi::experimental::architecture::unknown;
       };
       size_t ResultSize = 0;
       Dev->getPlugin()->call<PiApiKind::piDeviceGetInfo>(
@@ -721,21 +721,7 @@ struct get_device_info_impl<
       return MapArchIDToArchName(DeviceIp);
     } // else is not needed
     // TODO: add support of other architectures by extending with else if
-    // Generating a user-friendly error message
-    std::string DeviceStr;
-    if (Dev->is_gpu())
-      DeviceStr = "GPU";
-    else if (Dev->is_cpu())
-      DeviceStr = "CPU";
-    else if (Dev->is_accelerator())
-      DeviceStr = "accelerator";
-    // else if not needed
-    std::stringstream ErrorMessage;
-    ErrorMessage
-        << "sycl_ext_oneapi_device_architecture feature is not supported on "
-        << DeviceStr << " device with sycl::backend::" << CurrentBackend
-        << " backend.";
-    throw sycl::exception(make_error_code(errc::runtime), ErrorMessage.str());
+    return ext::oneapi::experimental::architecture::unknown;
   }
 };
 
diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp
index 19558851798ad..c7d245e5e91c0 100644
--- a/sycl/source/detail/event_impl.cpp
+++ b/sycl/source/detail/event_impl.cpp
@@ -167,15 +167,11 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event,
   }
 }
 
-event_impl::event_impl(const QueueImplPtr &Queue) {
+event_impl::event_impl(const QueueImplPtr &Queue)
+    : MQueue{Queue},
+      MIsProfilingEnabled{Queue->is_host() || Queue->MIsProfilingEnabled},
+      MFallbackProfiling{MIsProfilingEnabled && Queue->isProfilingFallback()} {
   this->setContextImpl(Queue->getContextImplPtr());
-  this->associateWithQueue(Queue);
-}
-
-void event_impl::associateWithQueue(const QueueImplPtr &Queue) {
-  MQueue = Queue;
-  MIsProfilingEnabled = Queue->is_host() || Queue->MIsProfilingEnabled;
-  MFallbackProfiling = MIsProfilingEnabled && Queue->isProfilingFallback();
   if (Queue->is_host()) {
     MState.store(HES_NotComplete);
     if (Queue->has_property<property::queue::enable_profiling>()) {
@@ -337,11 +333,6 @@ template <>
 uint64_t
 event_impl::get_profiling_info<info::event_profiling::command_start>() {
   checkProfilingPreconditions();
-
-  // For nop command start time is equal to submission time.
-  if (isNOP() && MSubmitTime)
-    return MSubmitTime;
-
   if (!MHostEvent) {
     if (MEvent) {
       auto StartTime =
@@ -369,11 +360,6 @@ event_impl::get_profiling_info<info::event_profiling::command_start>() {
 template <>
 uint64_t event_impl::get_profiling_info<info::event_profiling::command_end>() {
   checkProfilingPreconditions();
-
-  // For nop command end time is equal to submission time.
-  if (isNOP() && MSubmitTime)
-    return MSubmitTime;
-
   if (!MHostEvent) {
     if (MEvent) {
       auto EndTime =
diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp
index 56827e3373249..91bef738450d3 100644
--- a/sycl/source/detail/event_impl.hpp
+++ b/sycl/source/detail/event_impl.hpp
@@ -244,11 +244,6 @@ class event_impl {
     MSubmittedQueue = SubmittedQueue;
   };
 
-  /// Associate event with provided queue.
-  ///
-  /// @return
-  void associateWithQueue(const QueueImplPtr &Queue);
-
   /// Indicates if this event is not associated with any command and doesn't
   /// have native handle.
   ///
diff --git a/sycl/source/detail/global_handler.cpp b/sycl/source/detail/global_handler.cpp
index bfdee9e7f72c5..072a9628d6a6b 100644
--- a/sycl/source/detail/global_handler.cpp
+++ b/sycl/source/detail/global_handler.cpp
@@ -36,6 +36,11 @@ namespace detail {
 using LockGuard = std::lock_guard<SpinLock>;
 SpinLock GlobalHandler::MSyclGlobalHandlerProtector{};
 
+// Forward declarations of the shutdown routines defined later in this file.
+void shutdown_win(); // TODO: win variant will go away soon
+void shutdown_early();
+void shutdown_late();
+
 // Utility class to track references on object.
 // Used for GlobalHandler now and created as thread_local object on the first
 // Scheduler usage. Origin idea is to track usage of Scheduler from main and
@@ -227,16 +232,25 @@ void GlobalHandler::releaseDefaultContexts() {
   MPlatformToDefaultContextCache.Inst.reset(nullptr);
 }
 
-struct DefaultContextReleaseHandler {
-  ~DefaultContextReleaseHandler() {
+struct EarlyShutdownHandler {
+  ~EarlyShutdownHandler() {
+#ifdef _WIN32
+    // on Windows we keep to the existing shutdown procedure
     GlobalHandler::instance().releaseDefaultContexts();
+#else
+    shutdown_early();
+#endif
   }
 };
 
-void GlobalHandler::registerDefaultContextReleaseHandler() {
-  static DefaultContextReleaseHandler handler{};
+void GlobalHandler::registerEarlyShutdownHandler() {
+  static EarlyShutdownHandler handler{};
 }
 
+bool GlobalHandler::isOkToDefer() const { return OkToDefer; }
+
+void GlobalHandler::endDeferredRelease() { OkToDefer = false; }
+
 // Note: Split from shutdown so it is available to the unittests for ensuring
 //       that the mock plugin is the lone plugin.
 void GlobalHandler::unloadPlugins() {
@@ -279,17 +293,20 @@ void GlobalHandler::drainThreadPool() {
 // itself is very aggressive about reclaiming memory. Thus,
 // we focus solely on unloading the plugins, so as to not
 // accidentally retain device handles. etc
-void shutdown() {
+void shutdown_win() {
   GlobalHandler *&Handler = GlobalHandler::getInstancePtr();
   Handler->unloadPlugins();
 }
 #else
-void shutdown() {
+void shutdown_early() {
   const LockGuard Lock{GlobalHandler::MSyclGlobalHandlerProtector};
   GlobalHandler *&Handler = GlobalHandler::getInstancePtr();
   if (!Handler)
     return;
 
+  // Now that we are shutting down, we will no longer defer MemObj releases.
+  Handler->endDeferredRelease();
+
   // Ensure neither host task is working so that no default context is accessed
   // upon its release
   Handler->prepareSchedulerToRelease(true);
@@ -297,12 +314,16 @@ void shutdown() {
   if (Handler->MHostTaskThreadPool.Inst)
     Handler->MHostTaskThreadPool.Inst->finishAndWait();
 
-  // If default contexts are requested after the first default contexts have
-  // been released there may be a new default context. These must be released
-  // prior to closing the plugins.
-  // Note: Releasing a default context here may cause failures in plugins with
-  // global state as the global state may have been released.
+  // This releases OUR reference to the default context, but
+  // others may still hold references to it.
   Handler->releaseDefaultContexts();
+}
+
+void shutdown_late() {
+  const LockGuard Lock{GlobalHandler::MSyclGlobalHandlerProtector};
+  GlobalHandler *&Handler = GlobalHandler::getInstancePtr();
+  if (!Handler)
+    return;
 
   // First, release resources, that may access plugins.
   Handler->MPlatformCache.Inst.reset(nullptr);
@@ -345,7 +366,7 @@ extern "C" __SYCL_EXPORT BOOL WINAPI DllMain(HINSTANCE hinstDLL,
                    // TODO: figure out what XPTI is doing that prevents release.
 #endif
 
-    shutdown();
+    shutdown_win();
     break;
   case DLL_PROCESS_ATTACH:
     if (PrintPiTrace)
@@ -363,7 +384,7 @@ extern "C" __SYCL_EXPORT BOOL WINAPI DllMain(HINSTANCE hinstDLL,
 // destructors. Priorities 0-100 are reserved by the compiler. The priority
 // value 110 allows SYCL users to run their destructors after runtime library
 // deinitialization.
-__attribute__((destructor(110))) static void syclUnload() { shutdown(); }
+__attribute__((destructor(110))) static void syclUnload() { shutdown_late(); }
 #endif
 } // namespace detail
 } // namespace _V1
diff --git a/sycl/source/detail/global_handler.hpp b/sycl/source/detail/global_handler.hpp
index 605be19fb7ab6..069fff3dbcdd5 100644
--- a/sycl/source/detail/global_handler.hpp
+++ b/sycl/source/detail/global_handler.hpp
@@ -73,8 +73,10 @@ class GlobalHandler {
   XPTIRegistry &getXPTIRegistry();
   ThreadPool &getHostTaskThreadPool();
 
-  static void registerDefaultContextReleaseHandler();
+  static void registerEarlyShutdownHandler();
 
+  bool isOkToDefer() const;
+  void endDeferredRelease();
   void unloadPlugins();
   void releaseDefaultContexts();
   void drainThreadPool();
@@ -91,7 +93,11 @@ class GlobalHandler {
   void *GSYCLCallEvent = nullptr;
 #endif
 
-  friend void shutdown();
+  bool OkToDefer = true;
+
+  friend void shutdown_win();
+  friend void shutdown_early();
+  friend void shutdown_late();
   friend class ObjectUsageCounter;
   static GlobalHandler *&getInstancePtr();
   static SpinLock MSyclGlobalHandlerProtector;
diff --git a/sycl/source/detail/graph_impl.cpp b/sycl/source/detail/graph_impl.cpp
index 5071c3d982066..329eab2aaf832 100644
--- a/sycl/source/detail/graph_impl.cpp
+++ b/sycl/source/detail/graph_impl.cpp
@@ -1303,6 +1303,7 @@ void exec_graph_impl::updateImpl(std::shared_ptr<node_impl> Node) {
   auto NDRDesc = ExecCG.MNDRDesc;
 
   pi_kernel PiKernel = nullptr;
+  pi_program PiProgram = nullptr;
   auto Kernel = ExecCG.MSyclKernel;
   auto KernelBundleImplPtr = ExecCG.MKernelBundle;
   std::shared_ptr<sycl::detail::kernel_impl> SyclKernelImpl = nullptr;
@@ -1326,7 +1327,7 @@ void exec_graph_impl::updateImpl(std::shared_ptr<node_impl> Node) {
     PiKernel = Kernel->getHandleRef();
     EliminatedArgMask = Kernel->getKernelArgMask();
   } else {
-    std::tie(PiKernel, std::ignore, EliminatedArgMask, std::ignore) =
+    std::tie(PiKernel, std::ignore, EliminatedArgMask, PiProgram) =
         sycl::detail::ProgramManager::getInstance().getOrCreateKernel(
             ContextImpl, DeviceImpl, ExecCG.MKernelName);
   }
@@ -1450,6 +1451,12 @@ void exec_graph_impl::updateImpl(std::shared_ptr<node_impl> Node) {
       sycl::detail::PiApiKind::piextCommandBufferUpdateKernelLaunch>(
       Command, &UpdateDesc);
 
+  if (PiProgram) {
+    // We retained these objects by calling getOrCreateKernel()
+    Plugin->call<sycl::detail::PiApiKind::piKernelRelease>(PiKernel);
+    Plugin->call<sycl::detail::PiApiKind::piProgramRelease>(PiProgram);
+  }
+
   if (Res != PI_SUCCESS) {
     throw sycl::exception(errc::invalid, "Error updating command_graph");
   }
diff --git a/sycl/source/detail/graph_impl.hpp b/sycl/source/detail/graph_impl.hpp
index 758d5903af311..80837181ec056 100644
--- a/sycl/source/detail/graph_impl.hpp
+++ b/sycl/source/detail/graph_impl.hpp
@@ -24,6 +24,7 @@
 #include <deque>
 #include <fstream>
 #include <functional>
+#include <iomanip>
 #include <list>
 #include <set>
 #include <shared_mutex>
@@ -618,6 +619,17 @@ class node_impl {
           } else if (Arg.MType ==
                      sycl::detail::kernel_param_kind_t::kind_pointer) {
             Type = "Pointer";
+            auto Fill = Stream.fill();
+            Stream << i << ") Type: " << Type << " Ptr: " << Arg.MPtr << "(0x"
+                   << std::hex << std::setfill('0');
+            for (int i = Arg.MSize - 1; i >= 0; --i) {
+              Stream << std::setw(2)
+                     << static_cast<int16_t>(
+                            (static_cast<unsigned char *>(Arg.MPtr))[i]);
+            }
+            Stream.fill(Fill);
+            Stream << std::dec << ")\\n";
+            continue;
           } else if (Arg.MType == sycl::detail::kernel_param_kind_t::
                                       kind_specialization_constants_buffer) {
             Type = "Specialization Constants Buffer";
diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp
index e268175781989..f50c5c94b78d4 100644
--- a/sycl/source/detail/handler_impl.hpp
+++ b/sycl/source/detail/handler_impl.hpp
@@ -123,6 +123,8 @@ class handler_impl {
 
   // Extra information for semaphore interoperability
   sycl::detail::pi::PiInteropSemaphoreHandle MInteropSemaphoreHandle;
+  std::optional<uint64_t> MWaitValue;
+  std::optional<uint64_t> MSignalValue;
 
   // The user facing node type, used for operations which are recorded to a
   // graph. Since some operations may actually be a different type than the user
diff --git a/sycl/source/detail/kernel_program_cache.hpp b/sycl/source/detail/kernel_program_cache.hpp
index 87a41d9fe1054..8a04e183a3122 100644
--- a/sycl/source/detail/kernel_program_cache.hpp
+++ b/sycl/source/detail/kernel_program_cache.hpp
@@ -290,7 +290,8 @@ class KernelProgramCache {
       } catch (const exception &Ex) {
         BuildResult->Error.Msg = Ex.what();
         BuildResult->Error.Code = Ex.get_cl_code();
-        if (BuildResult->Error.Code == PI_ERROR_OUT_OF_RESOURCES) {
+        if (BuildResult->Error.Code == PI_ERROR_OUT_OF_RESOURCES ||
+            BuildResult->Error.Code == PI_ERROR_OUT_OF_HOST_MEMORY) {
           reset();
           BuildResult->updateAndNotify(BuildState::BS_Initial);
           continue;
diff --git a/sycl/source/detail/platform_impl.cpp b/sycl/source/detail/platform_impl.cpp
index 2bdfab26676d9..00f66a28a5de8 100644
--- a/sycl/source/detail/platform_impl.cpp
+++ b/sycl/source/detail/platform_impl.cpp
@@ -193,15 +193,9 @@ std::vector<platform> platform_impl::get_platforms() {
     Platforms.push_back(Platform.first);
   }
 
-  // Register default context release handler after plugins have been loaded and
-  // after the first calls to each plugin. This initializes a function-local
-  // variable that should be destroyed before any global variables in the
-  // plugins are destroyed. This is done after the first call to the backends to
-  // ensure any lazy-loaded dependencies are loaded prior to the handler
-  // variable's initialization. Note: The default context release handler is not
-  // guaranteed to be destroyed before function-local static variables as they
-  // may be initialized after.
-  GlobalHandler::registerDefaultContextReleaseHandler();
+  // This initializes a function-local variable whose destructor is invoked as
+  // the SYCL shared library is first being unloaded.
+  GlobalHandler::registerEarlyShutdownHandler();
 
   return Platforms;
 }
diff --git a/sycl/source/detail/program_manager/program_manager.cpp b/sycl/source/detail/program_manager/program_manager.cpp
index c0b8ac875e67f..82246af25173d 100644
--- a/sycl/source/detail/program_manager/program_manager.cpp
+++ b/sycl/source/detail/program_manager/program_manager.cpp
@@ -1223,7 +1223,8 @@ ProgramManager::ProgramPtr ProgramManager::build(
         nullptr, &LinkedProg);
   };
   sycl::detail::pi::PiResult Error = doLink();
-  if (Error == PI_ERROR_OUT_OF_RESOURCES) {
+  if (Error == PI_ERROR_OUT_OF_RESOURCES ||
+      Error == PI_ERROR_OUT_OF_HOST_MEMORY) {
     Context->getKernelProgramCache().reset();
     Error = doLink();
   }
@@ -2118,7 +2119,8 @@ ProgramManager::link(const device_image_plain &DeviceImage,
         /*user_data=*/nullptr, &LinkedProg);
   };
   sycl::detail::pi::PiResult Error = doLink();
-  if (Error == PI_ERROR_OUT_OF_RESOURCES) {
+  if (Error == PI_ERROR_OUT_OF_RESOURCES ||
+      Error == PI_ERROR_OUT_OF_HOST_MEMORY) {
     ContextImpl->getKernelProgramCache().reset();
     Error = doLink();
   }
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 82334e6467dfd..52d01dc2923a6 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -732,7 +732,7 @@ class queue_impl {
       std::shared_ptr<ext::oneapi::experimental::detail::graph_impl> Graph) {
     std::lock_guard<std::mutex> Lock(MMutex);
     MGraph = Graph;
-    MExtGraphDeps.LastEventPtr = nullptr;
+    MExtGraphDeps.reset();
   }
 
   std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
@@ -938,6 +938,12 @@ class queue_impl {
     // ordering
     std::vector<EventImplPtr> UnenqueuedCmdEvents;
     EventImplPtr LastBarrier;
+
+    void reset() {
+      LastEventPtr = nullptr;
+      UnenqueuedCmdEvents.clear();
+      LastBarrier = nullptr;
+    }
   } MDefaultGraphDeps, MExtGraphDeps;
 
   const bool MIsInorder;
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index e8abd58fd1cfe..a164c455fed54 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -2303,7 +2303,18 @@ void SetArgBasedOnType(
         getMemAllocationFunc
             ? (sycl::detail::pi::PiMem)getMemAllocationFunc(Req)
             : nullptr;
-    if (Context.get_backend() == backend::opencl) {
+    // Only call piKernelSetArg for the opencl plugin. Although the opencl
+    // plugin is currently a thin wrapper around the UR plugin, the two still
+    // produce different MemArgs. For the opencl plugin, MemArg is a plain
+    // cl_mem, so piKernelSetArg works: it calls urKernelSetArgValue, which
+    // passes the cl_mem object directly to clSetKernelArg. With
+    // SYCL_PREFER_UR=1, however, MemArg is a cl_mem wrapped in a
+    // ur_mem_object_t, which must be unpacked by piextKernelSetArgMemObj (it
+    // calls urKernelSetArgMemObj). Calling piKernelSetArg in that case makes
+    // clSetKernelArg report CL_INVALID_MEM_OBJECT, because arg_value is a
+    // ur_mem_object_t rather than a valid cl_mem object.
+    if (Context.get_backend() == backend::opencl &&
+        !Plugin->hasBackend(backend::all)) {
       // clSetKernelArg (corresponding to piKernelSetArg) returns an error
       // when MemArg is null, which is the case when zero-sized buffers are
       // handled. Below assignment provides later call to clSetKernelArg with
@@ -3270,9 +3281,11 @@ pi_int32 ExecCGCommand::enqueueImpQueue() {
     }
 
     const detail::PluginPtr &Plugin = MQueue->getPlugin();
+    auto OptWaitValue = SemWait->getWaitValue();
+    uint64_t WaitValue = OptWaitValue.has_value() ? OptWaitValue.value() : 0;
     Plugin->call<PiApiKind::piextWaitExternalSemaphore>(
-        MQueue->getHandleRef(), SemWait->getInteropSemaphoreHandle(), 0,
-        nullptr, nullptr);
+        MQueue->getHandleRef(), SemWait->getInteropSemaphoreHandle(),
+        OptWaitValue.has_value(), WaitValue, 0, nullptr, nullptr);
 
     return PI_SUCCESS;
   }
@@ -3284,9 +3297,12 @@ pi_int32 ExecCGCommand::enqueueImpQueue() {
     }
 
     const detail::PluginPtr &Plugin = MQueue->getPlugin();
+    auto OptSignalValue = SemSignal->getSignalValue();
+    uint64_t SignalValue =
+        OptSignalValue.has_value() ? OptSignalValue.value() : 0;
     Plugin->call<PiApiKind::piextSignalExternalSemaphore>(
-        MQueue->getHandleRef(), SemSignal->getInteropSemaphoreHandle(), 0,
-        nullptr, nullptr);
+        MQueue->getHandleRef(), SemSignal->getInteropSemaphoreHandle(),
+        OptSignalValue.has_value(), SignalValue, 0, nullptr, nullptr);
 
     return PI_SUCCESS;
   }
@@ -3459,8 +3475,8 @@ UpdateCommandBufferCommand::UpdateCommandBufferCommand(
 pi_int32 UpdateCommandBufferCommand::enqueueImp() {
   waitForPreparedHostEvents();
   std::vector<EventImplPtr> EventImpls = MPreparedDepsEvents;
-  auto RawEvents = getPiEvents(EventImpls);
-  flushCrossQueueDeps(EventImpls, getWorkerQueue());
+  sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef();
+  Command::waitForEvents(MQueue, EventImpls, Event);
 
   for (auto &Node : MNodes) {
     auto CG = static_cast<CGExecKernel *>(Node->MCommandGroup.get());
diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp
index f0c5dc670aa05..b65a31d68660c 100644
--- a/sycl/source/detail/scheduler/graph_builder.cpp
+++ b/sycl/source/detail/scheduler/graph_builder.cpp
@@ -194,9 +194,9 @@ MemObjRecord *Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjI *MemObject) {
   return MemObject->MRecord.get();
 }
 
-MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord(
-    const QueueImplPtr &Queue, const Requirement *Req,
-    std::vector<Command *> &ToEnqueue) {
+MemObjRecord *
+Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue,
+                                                 const Requirement *Req) {
   SYCLMemObjI *MemObject = Req->MSYCLMemObj;
   MemObjRecord *Record = getMemObjRecord(MemObject);
 
@@ -242,8 +242,12 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord(
 
     MemObject->MRecord.reset(
         new MemObjRecord{InteropCtxPtr, LeafLimit, AllocateDependency});
+    std::vector<Command *> ToEnqueue;
     getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr,
                             ToEnqueue);
+    assert(ToEnqueue.empty() && "Creation of the first alloca for a record "
+                                "shouldn't lead to any enqueuing (no linked "
+                                "alloca or exceeding the leaf limit).");
   } else
     MemObject->MRecord.reset(new MemObjRecord{Queue->getContextImplPtr(),
                                               LeafLimit, AllocateDependency});
@@ -530,7 +534,7 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req,
 
   const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue();
 
-  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
+  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req);
   if (MPrintOptionsArray[BeforeAddHostAcc])
     printGraphAsDot("before_addHostAccessor");
   markModifiedIfWrite(Record, Req);
@@ -574,7 +578,7 @@ Command *Scheduler::GraphBuilder::addCGUpdateHost(
   auto UpdateHost = static_cast<CGUpdateHost *>(CommandGroup.get());
   Requirement *Req = UpdateHost->getReqToUpdate();
 
-  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue);
+  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req);
   return insertMemoryMove(Record, Req, HostQueue, ToEnqueue);
 }
 
@@ -880,7 +884,7 @@ EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd(
   EmptyCmd->MBlockReason = Reason;
 
   for (Requirement *Req : Reqs) {
-    MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
+    MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req);
     AllocaCommandBase *AllocaCmd =
         getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
     EmptyCmd->addRequirement(Cmd, AllocaCmd, Req);
@@ -1058,7 +1062,7 @@ void Scheduler::GraphBuilder::createGraphForCommand(
       const QueueImplPtr &QueueForAlloca =
           isInteropTask ? static_cast<detail::CGHostTask &>(CG).MQueue : Queue;
 
-      Record = getOrInsertMemObjRecord(QueueForAlloca, Req, ToEnqueue);
+      Record = getOrInsertMemObjRecord(QueueForAlloca, Req);
       markModifiedIfWrite(Record, Req);
 
       AllocaCmd =
@@ -1702,7 +1706,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate(
 
     {
 
-      Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
+      Record = getOrInsertMemObjRecord(Queue, Req);
       markModifiedIfWrite(Record, Req);
 
       AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue);
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp
index 09437928f1d32..9ce3d7d2a5f94 100644
--- a/sycl/source/detail/scheduler/scheduler.hpp
+++ b/sycl/source/detail/scheduler/scheduler.hpp
@@ -649,8 +649,7 @@ class Scheduler {
     /// \return a pointer to MemObjRecord for pointer to memory object. If the
     /// record is not found, nullptr is returned.
     MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue,
-                                          const Requirement *Req,
-                                          std::vector<Command *> &ToEnqueue);
+                                          const Requirement *Req);
 
     /// Decrements leaf counters for all leaves of the record.
     void decrementLeafCountersForRecord(MemObjRecord *Record);
diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp
index bb4c5f4e1441d..792d321b6334e 100644
--- a/sycl/source/detail/sycl_mem_obj_t.cpp
+++ b/sycl/source/detail/sycl_mem_obj_t.cpp
@@ -229,8 +229,11 @@ void SYCLMemObjT::detachMemoryObject(
       (MInteropContext && !MInteropContext->isOwnedByRuntime());
 
   if (MRecord && MRecord->MCurContext->isOwnedByRuntime() &&
-      !InteropObjectsUsed && (!MHostPtrProvided || MIsInternal))
-    Scheduler::getInstance().deferMemObjRelease(Self);
+      !InteropObjectsUsed && (!MHostPtrProvided || MIsInternal)) {
+    bool okToDefer = GlobalHandler::instance().isOkToDefer();
+    if (okToDefer)
+      Scheduler::getInstance().deferMemObjRelease(Self);
+  }
 }
 
 void SYCLMemObjT::handleWriteAccessorCreation() {
diff --git a/sycl/source/feature_test.hpp.in b/sycl/source/feature_test.hpp.in
index bf3e556a2f3bf..ce88520fe50dd 100644
--- a/sycl/source/feature_test.hpp.in
+++ b/sycl/source/feature_test.hpp.in
@@ -107,6 +107,7 @@ inline namespace _V1 {
 #define SYCL_EXT_ONEAPI_FORWARD_PROGRESS 1
 #define SYCL_EXT_ONEAPI_FREE_FUNCTION_KERNELS 1
 #define SYCL_EXT_ONEAPI_PROD 1
+#define SYCL_EXT_ONEAPI_ENQUEUE_FUNCTIONS 1
 
 #ifndef __has_include
 #define __has_include(x) 0
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index df115299f8fb5..7cef9cc6ddd93 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -518,11 +518,13 @@ event handler::finalize() {
     break;
   case detail::CG::SemaphoreWait:
     CommandGroup.reset(new detail::CGSemaphoreWait(
-        MImpl->MInteropSemaphoreHandle, std::move(CGData), MCodeLoc));
+        MImpl->MInteropSemaphoreHandle, MImpl->MWaitValue, std::move(CGData),
+        MCodeLoc));
     break;
   case detail::CG::SemaphoreSignal:
     CommandGroup.reset(new detail::CGSemaphoreSignal(
-        MImpl->MInteropSemaphoreHandle, std::move(CGData), MCodeLoc));
+        MImpl->MInteropSemaphoreHandle, MImpl->MSignalValue, std::move(CGData),
+        MCodeLoc));
     break;
   case detail::CG::None:
     if (detail::pi::trace(detail::pi::TraceLevel::PI_TRACE_ALL)) {
@@ -1404,8 +1406,40 @@ void handler::ext_oneapi_wait_external_semaphore(
   throwIfGraphAssociated<
       ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
           sycl_ext_oneapi_bindless_images>();
+  if (SemaphoreHandle.handle_type !=
+          sycl::ext::oneapi::experimental::external_semaphore_handle_type::
+              opaque_fd &&
+      SemaphoreHandle.handle_type !=
+          sycl::ext::oneapi::experimental::external_semaphore_handle_type::
+              win32_nt_handle) {
+    throw sycl::exception(
+        make_error_code(errc::invalid),
+        "Invalid type of semaphore for this operation. The "
+        "type of semaphore used needs a user passed wait value.");
+  }
+  MImpl->MInteropSemaphoreHandle =
+      (sycl::detail::pi::PiInteropSemaphoreHandle)SemaphoreHandle.raw_handle;
+  MImpl->MWaitValue = {};
+  setType(detail::CG::SemaphoreWait);
+}
+
+void handler::ext_oneapi_wait_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    uint64_t WaitValue) {
+  throwIfGraphAssociated<
+      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
+          sycl_ext_oneapi_bindless_images>();
+  if (SemaphoreHandle.handle_type !=
+      sycl::ext::oneapi::experimental::external_semaphore_handle_type::
+          win32_nt_dx12_fence) {
+    throw sycl::exception(
+        make_error_code(errc::invalid),
+        "Invalid type of semaphore for this operation. The "
+        "type of semaphore does not support user passed wait values.");
+  }
   MImpl->MInteropSemaphoreHandle =
       (sycl::detail::pi::PiInteropSemaphoreHandle)SemaphoreHandle.raw_handle;
+  MImpl->MWaitValue = WaitValue;
   setType(detail::CG::SemaphoreWait);
 }
 
@@ -1414,8 +1448,40 @@ void handler::ext_oneapi_signal_external_semaphore(
   throwIfGraphAssociated<
       ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
           sycl_ext_oneapi_bindless_images>();
+  if (SemaphoreHandle.handle_type !=
+          sycl::ext::oneapi::experimental::external_semaphore_handle_type::
+              opaque_fd &&
+      SemaphoreHandle.handle_type !=
+          sycl::ext::oneapi::experimental::external_semaphore_handle_type::
+              win32_nt_handle) {
+    throw sycl::exception(
+        make_error_code(errc::invalid),
+        "Invalid type of semaphore for this operation. The "
+        "type of semaphore used needs a user passed signal value.");
+  }
+  MImpl->MInteropSemaphoreHandle =
+      (sycl::detail::pi::PiInteropSemaphoreHandle)SemaphoreHandle.raw_handle;
+  MImpl->MSignalValue = {};
+  setType(detail::CG::SemaphoreSignal);
+}
+
+void handler::ext_oneapi_signal_external_semaphore(
+    sycl::ext::oneapi::experimental::interop_semaphore_handle SemaphoreHandle,
+    uint64_t SignalValue) {
+  throwIfGraphAssociated<
+      ext::oneapi::experimental::detail::UnsupportedGraphFeatures::
+          sycl_ext_oneapi_bindless_images>();
+  if (SemaphoreHandle.handle_type !=
+      sycl::ext::oneapi::experimental::external_semaphore_handle_type::
+          win32_nt_dx12_fence) {
+    throw sycl::exception(
+        make_error_code(errc::invalid),
+        "Invalid type of semaphore for this operation. The "
+        "type of semaphore does not support user passed signal values.");
+  }
   MImpl->MInteropSemaphoreHandle =
       (sycl::detail::pi::PiInteropSemaphoreHandle)SemaphoreHandle.raw_handle;
+  MImpl->MSignalValue = SignalValue;
   setType(detail::CG::SemaphoreSignal);
 }
 
diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp
index 15d7f11fcb42d..db3ce2f5cb1b3 100644
--- a/sycl/source/queue.cpp
+++ b/sycl/source/queue.cpp
@@ -214,22 +214,7 @@ getBarrierEventForInorderQueueHelper(const detail::QueueImplPtr QueueImpl) {
   assert(!QueueImpl->getCommandGraph() &&
          "Should not be called in on graph recording.");
 
-  auto LastEvent = QueueImpl->getLastEvent();
-  if (QueueImpl->MDiscardEvents) {
-    std::cout << "Discard event enabled" << std::endl;
-    return LastEvent;
-  }
-
-  auto LastEventImpl = detail::getSyclObjImpl(LastEvent);
-  // If last event is default constructed event then we want to associate it
-  // with the queue and record submission time if profiling is enabled. Such
-  // event corresponds to NOP and its submit time is same as start time and
-  // end time.
-  if (!LastEventImpl->isContextInitialized()) {
-    LastEventImpl->associateWithQueue(QueueImpl);
-    LastEventImpl->setSubmissionTime();
-  }
-  return detail::createSyclObjFromImpl<event>(LastEventImpl);
+  return QueueImpl->getLastEvent();
 }
 
 /// Prevents any commands submitted afterward to this queue from executing
@@ -240,7 +225,7 @@ getBarrierEventForInorderQueueHelper(const detail::QueueImplPtr QueueImpl) {
 /// \return a SYCL event object, which corresponds to the queue the command
 /// group is being enqueued on.
 event queue::ext_oneapi_submit_barrier(const detail::code_location &CodeLoc) {
-  if (is_in_order() && !impl->getCommandGraph())
+  if (is_in_order() && !impl->getCommandGraph() && !impl->MIsProfilingEnabled)
     return getBarrierEventForInorderQueueHelper(impl);
 
   return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(); }, CodeLoc);
@@ -262,7 +247,8 @@ event queue::ext_oneapi_submit_barrier(const std::vector<event> &WaitList,
         auto EventImpl = detail::getSyclObjImpl(Event);
         return !EventImpl->isContextInitialized() || EventImpl->isNOP();
       });
-  if (is_in_order() && !impl->getCommandGraph() && AllEventsEmptyOrNop)
+  if (is_in_order() && !impl->getCommandGraph() && !impl->MIsProfilingEnabled &&
+      AllEventsEmptyOrNop)
     return getBarrierEventForInorderQueueHelper(impl);
 
   return submit([=](handler &CGH) { CGH.ext_oneapi_barrier(WaitList); },
diff --git a/sycl/test-e2e/AOT/double.cpp b/sycl/test-e2e/AOT/double.cpp
new file mode 100644
index 0000000000000..813fb194e017b
--- /dev/null
+++ b/sycl/test-e2e/AOT/double.cpp
@@ -0,0 +1,26 @@
+// This test ensures that a program that has a kernel
+// using fp64 can be compiled AOT.
+
+// REQUIRES: ocloc
+// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -o %t.tgllp.out %s
+// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc -o %t.pvc.out %s
+// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_cfl -o %t.cfl.out %s
+
+#include <sycl/detail/core.hpp>
+
+using namespace sycl;
+
+int main() {
+  queue q;
+  if (q.get_device().has(aspect::fp64)) {
+    double d = 2.5;
+    {
+      buffer<double, 1> buf(&d, 1);
+      q.submit([&](handler &cgh) {
+        accessor acc{buf, cgh};
+        cgh.single_task([=] { acc[0] *= 2; });
+      });
+    }
+    std::cout << d << "\n";
+  }
+}
diff --git a/sycl/test-e2e/AOT/reqd-sg-size.cpp b/sycl/test-e2e/AOT/reqd-sg-size.cpp
new file mode 100644
index 0000000000000..5272f25e83017
--- /dev/null
+++ b/sycl/test-e2e/AOT/reqd-sg-size.cpp
@@ -0,0 +1,74 @@
+// This test ensures that a program that has a kernel
+// using various required sub-group sizes can be compiled AOT.
+
+// REQUIRES: ocloc
+// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_tgllp -o %t.tgllp.out %s
+// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_pvc -o %t.pvc.out %s
+// RUN: %clangxx -fsycl -fsycl-targets=intel_gpu_cfl -o %t.cfl.out %s
+
+#include <cstdio>
+#include <iostream>
+
+#include <sycl/detail/core.hpp>
+
+using namespace sycl;
+
+template <int N> class kernel_name;
+
+template <size_t... Ns> struct SubgroupDispatcher {
+  std::vector<std::pair<size_t, size_t>> fails;
+  SubgroupDispatcher(queue &q) : q(q) {}
+
+  void operator()(const std::vector<size_t> &v) {
+    for (auto i : v)
+      (*this)(i);
+  }
+
+  void operator()(size_t n) { (dispatch<Ns>(n), ...); }
+
+private:
+  queue &q;
+
+  template <size_t size> void dispatch(size_t n) {
+    if (n == size) {
+      size_t res = 0;
+      {
+        buffer<size_t, 1> buf(&res, 1);
+        q.submit([&](handler &cgh) {
+          accessor acc{buf, cgh};
+          cgh.parallel_for<kernel_name<size>>(
+              nd_range<1>(1, 1),
+              [=](auto item) [[intel::reqd_sub_group_size(size)]] {
+                acc[0] = item.get_sub_group().get_max_local_range()[0];
+              });
+        });
+      }
+      if (res != size)
+        fails.push_back({res, size});
+    }
+  }
+};
+
+int main() {
+  queue q;
+  auto ctx = q.get_context();
+  auto dev = q.get_device();
+  auto sizes = dev.get_info<sycl::info::device::sub_group_sizes>();
+  std::cout << "  sub-group sizes supported by the device: " << sizes[0];
+  for (int i = 1; i < sizes.size(); ++i) {
+    std::cout << ", " << sizes[i];
+  }
+  std::cout << '\n';
+
+  using dispatcher_t = SubgroupDispatcher<4, 8, 16, 32, 64, 128>;
+  dispatcher_t dispatcher(q);
+  dispatcher(sizes);
+  if (dispatcher.fails.size() > 0) {
+    for (auto [actual, expected] : dispatcher.fails) {
+      std::cout << "actual:   " << actual << "\n"
+                << "expected: " << expected << "\n";
+    }
+  } else {
+    std::cout << "pass\n";
+  }
+}
diff --git a/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/unaligned_shadow_memory.cpp b/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/unaligned_shadow_memory.cpp
new file mode 100644
index 0000000000000..8b61bff79a79d
--- /dev/null
+++ b/sycl/test-e2e/AddressSanitizer/out-of-bounds/USM/unaligned_shadow_memory.cpp
@@ -0,0 +1,38 @@
+// REQUIRES: linux, cpu
+// RUN: %{build} %device_asan_flags -DTEST1 -O0 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 %{run} not %t 2>&1 | FileCheck --check-prefixes CHECK,CHECK1 %s
+// RUN: %{build} %device_asan_flags -DTEST2 -O0 -g -o %t
+// RUN: env SYCL_PREFER_UR=1 %{run} not %t 2>&1 | FileCheck --check-prefixes CHECK,CHECK2 %s
+
+#include <sycl/detail/core.hpp>
+
+#include <sycl/usm.hpp>
+
+static constexpr std::size_t ASAN_SHADOW_SCALE = 4;
+static constexpr std::size_t ASAN_SHADOW_GRANULARITY = 1 << ASAN_SHADOW_SCALE;
+
+#ifdef TEST1
+typedef uint64_t TestType;
+#elif TEST2
+typedef unsigned _BitInt(128) TestType;
+#endif
+
+int main() {
+  sycl::queue Q;
+  constexpr std::size_t size = 128 + (ASAN_SHADOW_GRANULARITY - 1);
+  TestType *array = (TestType *)sycl::malloc_device<char>(size, Q);
+
+  Q.submit([&](sycl::handler &h) {
+    h.parallel_for<class MyKernelR_4>(
+        sycl::nd_range<1>(size / sizeof(TestType) + 1, 1),
+        [=](sycl::nd_item<1> item) { ++array[item.get_global_id(0)]; });
+  });
+  Q.wait();
+  // CHECK: ERROR: DeviceSanitizer: out-of-bounds-access on Device USM
+  // CHECK1: {{READ of size 8 at kernel <.*MyKernelR_4> LID\(0, 0, 0\) GID\(17, 0, 0\)}}
+  // CHECK1: {{  #0 .* .*unaligned_shadow_memory.cpp:}}[[@LINE-5]]
+  // CHECK2: {{READ of size 16 at kernel <.*MyKernelR_4> LID\(0, 0, 0\) GID\(8, 0, 0\)}}
+  // CHECK2: {{  #0 .* .*unaligned_shadow_memory.cpp:}}[[@LINE-7]]
+
+  return 0;
+}
diff --git a/sycl/test-e2e/AtomicRef/add_generic_local_native_fp.cpp b/sycl/test-e2e/AtomicRef/add_generic_local_native_fp.cpp
deleted file mode 100644
index 26fd1eef53ed3..0000000000000
--- a/sycl/test-e2e/AtomicRef/add_generic_local_native_fp.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-#define TEST_GENERIC_IN_LOCAL 1
-
-#include "add.h"
-
-int main() { add_test_all<access::address_space::generic_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/add_generic_native_fp.cpp b/sycl/test-e2e/AtomicRef/add_generic_native_fp.cpp
deleted file mode 100644
index 157b9be5d2a23..0000000000000
--- a/sycl/test-e2e/AtomicRef/add_generic_native_fp.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-
-#include "add.h"
-
-int main() { add_test_all<access::address_space::generic_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/add_local_native_fp.cpp b/sycl/test-e2e/AtomicRef/add_local_native_fp.cpp
deleted file mode 100644
index 19c3ef4d7819c..0000000000000
--- a/sycl/test-e2e/AtomicRef/add_local_native_fp.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-
-#include "add.h"
-
-int main() { add_test_all<access::address_space::local_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/add_native_fp.cpp b/sycl/test-e2e/AtomicRef/add_native_fp.cpp
deleted file mode 100644
index 21ac086f06cce..0000000000000
--- a/sycl/test-e2e/AtomicRef/add_native_fp.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-
-#include "add.h"
-
-int main() { add_test_all<access::address_space::global_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/atomic_memory_order_acq_rel.cpp b/sycl/test-e2e/AtomicRef/atomic_memory_order_acq_rel.cpp
index 007386cf6decc..2697855754f74 100644
--- a/sycl/test-e2e/AtomicRef/atomic_memory_order_acq_rel.cpp
+++ b/sycl/test-e2e/AtomicRef/atomic_memory_order_acq_rel.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -O3 -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70
+// RUN: %{build} -O3 -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
 // RUN: %{run} %t.out
 
 // NOTE: Tests fetch_add for acquire and release memory ordering.
@@ -25,7 +25,7 @@ template <memory_order order> void test_acquire_global() {
            error_buf.template get_access<access::mode::read_write>(cgh);
        auto val = val_buf.template get_access<access::mode::read_write>(cgh);
        cgh.parallel_for(range<1>(N_items), [=](item<1> it) {
-         volatile int *val_p = val.get_pointer();
+         volatile int *val_p = val.get_multi_ptr<access::decorated::no>().get();
          auto atm0 =
              atomic_ref<int, memory_order::relaxed, memory_scope::device,
                         access::address_space::global_space>(val[0]);
@@ -74,7 +74,8 @@ template <memory_order order> void test_acquire_local() {
              val[0] = 0;
              val[1] = 0;
              it.barrier(access::fence_space::local_space);
-             volatile int *val_p = val.get_pointer();
+             volatile int *val_p =
+                 val.get_multi_ptr<access::decorated::no>().get();
              auto atm0 =
                  atomic_ref<int, memory_order::relaxed, memory_scope::device,
                             access::address_space::local_space>(val[0]);
@@ -116,7 +117,7 @@ template <memory_order order> void test_release_global() {
            error_buf.template get_access<access::mode::read_write>(cgh);
        auto val = val_buf.template get_access<access::mode::read_write>(cgh);
        cgh.parallel_for(range<1>(N_items), [=](item<1> it) {
-         volatile int *val_p = val.get_pointer();
+         volatile int *val_p = val.get_multi_ptr<access::decorated::no>().get();
          auto atm0 =
              atomic_ref<int, memory_order::relaxed, memory_scope::device,
                         access::address_space::global_space>(val[0]);
@@ -165,7 +166,8 @@ template <memory_order order> void test_release_local() {
              val[0] = 0;
              val[1] = 0;
              it.barrier(access::fence_space::local_space);
-             volatile int *val_p = val.get_pointer();
+             volatile int *val_p =
+                 val.get_multi_ptr<access::decorated::no>().get();
              auto atm0 =
                  atomic_ref<int, memory_order::relaxed, memory_scope::device,
                             access::address_space::local_space>(val[0]);
diff --git a/sycl/test-e2e/AtomicRef/atomic_memory_order_seq_cst.cpp b/sycl/test-e2e/AtomicRef/atomic_memory_order_seq_cst.cpp
index 4385ab2f65b6e..f2cc1e148b1d7 100644
--- a/sycl/test-e2e/AtomicRef/atomic_memory_order_seq_cst.cpp
+++ b/sycl/test-e2e/AtomicRef/atomic_memory_order_seq_cst.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -O3 -o %t.out -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70
+// RUN: %{build} -O3 -o %t.out %if any-device-is-cuda %{ -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=sm_70 %}
 // RUN: %{run} %t.out
 
 #include "atomic_memory_order.h"
diff --git a/sycl/test-e2e/AtomicRef/max_generic_local_native_fp.cpp b/sycl/test-e2e/AtomicRef/max_generic_local_native_fp.cpp
deleted file mode 100644
index 5690856c00d2c..0000000000000
--- a/sycl/test-e2e/AtomicRef/max_generic_local_native_fp.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-#define TEST_GENERIC_IN_LOCAL 1
-
-#include "max.h"
-
-int main() { max_test_all<access::address_space::generic_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/max_generic_native_fp.cpp b/sycl/test-e2e/AtomicRef/max_generic_native_fp.cpp
deleted file mode 100644
index 93284a39ad2c6..0000000000000
--- a/sycl/test-e2e/AtomicRef/max_generic_native_fp.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-
-#include "max.h"
-
-int main() { max_test_all<access::address_space::generic_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/max_local_native_fp.cpp b/sycl/test-e2e/AtomicRef/max_local_native_fp.cpp
deleted file mode 100644
index b84afc81ce4e3..0000000000000
--- a/sycl/test-e2e/AtomicRef/max_local_native_fp.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-
-#include "max.h"
-
-int main() { max_test_all<access::address_space::local_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/max_native_fp.cpp b/sycl/test-e2e/AtomicRef/max_native_fp.cpp
deleted file mode 100644
index 2be38fb129fa4..0000000000000
--- a/sycl/test-e2e/AtomicRef/max_native_fp.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-
-#include "max.h"
-
-int main() { max_test_all<access::address_space::global_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/min_generic_local_native_fp.cpp b/sycl/test-e2e/AtomicRef/min_generic_local_native_fp.cpp
deleted file mode 100644
index 5fb32b2b58d76..0000000000000
--- a/sycl/test-e2e/AtomicRef/min_generic_local_native_fp.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-#define TEST_GENERIC_IN_LOCAL 1
-
-#include "min.h"
-
-int main() { min_test_all<access::address_space::generic_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/min_generic_native_fp.cpp b/sycl/test-e2e/AtomicRef/min_generic_native_fp.cpp
deleted file mode 100644
index f87c71a1f041b..0000000000000
--- a/sycl/test-e2e/AtomicRef/min_generic_native_fp.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-
-#include "min.h"
-
-int main() { min_test_all<access::address_space::generic_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/min_local_native_fp.cpp b/sycl/test-e2e/AtomicRef/min_local_native_fp.cpp
deleted file mode 100644
index 5302cf0f4277c..0000000000000
--- a/sycl/test-e2e/AtomicRef/min_local_native_fp.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-
-#include "min.h"
-
-int main() { min_test_all<access::address_space::local_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/min_native_fp.cpp b/sycl/test-e2e/AtomicRef/min_native_fp.cpp
deleted file mode 100644
index a3a9890131d92..0000000000000
--- a/sycl/test-e2e/AtomicRef/min_native_fp.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-
-#include "min.h"
-
-int main() { min_test_all<access::address_space::global_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/sub_generic_local_native_fp.cpp b/sycl/test-e2e/AtomicRef/sub_generic_local_native_fp.cpp
deleted file mode 100644
index d9400bbde71d6..0000000000000
--- a/sycl/test-e2e/AtomicRef/sub_generic_local_native_fp.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-#define TEST_GENERIC_IN_LOCAL 1
-
-#include "sub.h"
-
-int main() { sub_test_all<access::address_space::generic_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/sub_generic_native_fp.cpp b/sycl/test-e2e/AtomicRef/sub_generic_native_fp.cpp
deleted file mode 100644
index 8f9687adc7a87..0000000000000
--- a/sycl/test-e2e/AtomicRef/sub_generic_native_fp.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-
-#include "sub.h"
-
-int main() { sub_test_all<access::address_space::generic_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/sub_local_native_fp.cpp b/sycl/test-e2e/AtomicRef/sub_local_native_fp.cpp
deleted file mode 100644
index b9748a07b1448..0000000000000
--- a/sycl/test-e2e/AtomicRef/sub_local_native_fp.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-
-#include "sub.h"
-
-int main() { sub_test_all<access::address_space::local_space>(); }
diff --git a/sycl/test-e2e/AtomicRef/sub_native_fp.cpp b/sycl/test-e2e/AtomicRef/sub_native_fp.cpp
deleted file mode 100644
index e4f1d1961b84a..0000000000000
--- a/sycl/test-e2e/AtomicRef/sub_native_fp.cpp
+++ /dev/null
@@ -1,9 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-#define SYCL_USE_NATIVE_FP_ATOMICS
-#define FP_TESTS_ONLY
-
-#include "sub.h"
-
-int main() { sub_test_all<access::address_space::global_space>(); }
diff --git a/sycl/test-e2e/BFloat16/bfloat16_builtins.hpp b/sycl/test-e2e/BFloat16/bfloat16_builtins.hpp
index af08e8b5226e4..fa7a45fe2402c 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_builtins.hpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_builtins.hpp
@@ -16,8 +16,7 @@ constexpr float bf16_eps = 0.00390625;
 float make_fp32(uint16_t x) {
   uint32_t y = x;
   y = y << 16;
-  auto res = reinterpret_cast<float *>(&y);
-  return *res;
+  return sycl::bit_cast<float>(y);
 }
 
 bool check(float a, float b) {
@@ -26,217 +25,6 @@ bool check(float a, float b) {
 
 bool check(bool a, bool b) { return (a != b); }
 
-#define TEST_BUILTIN_1_SCAL_IMPL(NAME)                                         \
-  {                                                                            \
-    buffer<float> a_buf(&a[0], N);                                             \
-    buffer<int> err_buf(&err, 1);                                              \
-    q.submit([&](handler &cgh) {                                               \
-      accessor<float, 1, access::mode::read_write, target::device> A(a_buf,    \
-                                                                     cgh);     \
-      accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
-      cgh.parallel_for(N, [=](id<1> index) {                                   \
-        float ABF16 = float{bfloat16{A[index]}};                               \
-        if (check(sycl::ext::oneapi::experimental::NAME(bfloat16{A[index]}),   \
-                  sycl::NAME(ABF16))) {                                        \
-          ERR[0] = 1;                                                          \
-        }                                                                      \
-      });                                                                      \
-    });                                                                        \
-  }                                                                            \
-  assert(err == 0);
-
-#define TEST_BUILTIN_1_ARR_IMPL(NAME, SZ, RETTY)                               \
-  {                                                                            \
-    buffer<float, 2> a_buf{range<2>{N / SZ, SZ}};                              \
-    buffer<int> err_buf(&err, 1);                                              \
-    q.submit([&](handler &cgh) {                                               \
-      accessor<float, 2, access::mode::read_write, target::device> A(a_buf,    \
-                                                                     cgh);     \
-      accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
-      cgh.parallel_for(N / SZ, [=](id<1> index) {                              \
-        marray<bfloat16, SZ> arg;                                              \
-        for (int i = 0; i < SZ; i++) {                                         \
-          arg[i] = A[index][i];                                                \
-        }                                                                      \
-        marray<RETTY, SZ> res = NAME(arg);                                     \
-        for (int i = 0; i < SZ; i++) {                                         \
-          float ABF16 = float{bfloat16{A[index][i]}};                          \
-          if (check(res[i], sycl::NAME(ABF16))) {                              \
-            ERR[0] = 1;                                                        \
-          }                                                                    \
-        }                                                                      \
-      });                                                                      \
-    });                                                                        \
-  }                                                                            \
-  assert(err == 0);
-
-#define TEST_BUILTIN_1(NAME, RETTY)                                            \
-  TEST_BUILTIN_1_SCAL_IMPL(NAME)                                               \
-  TEST_BUILTIN_1_ARR_IMPL(NAME, 1, RETTY)                                      \
-  TEST_BUILTIN_1_ARR_IMPL(NAME, 2, RETTY)                                      \
-  TEST_BUILTIN_1_ARR_IMPL(NAME, 3, RETTY)                                      \
-  TEST_BUILTIN_1_ARR_IMPL(NAME, 4, RETTY)                                      \
-  TEST_BUILTIN_1_ARR_IMPL(NAME, 5, RETTY)
-
-#define TEST_BUILTIN_2_SCAL_IMPL(NAME)                                         \
-  {                                                                            \
-    buffer<float> a_buf(&a[0], N);                                             \
-    buffer<float> b_buf(&b[0], N);                                             \
-    buffer<int> err_buf(&err, 1);                                              \
-    q.submit([&](handler &cgh) {                                               \
-      accessor<float, 1, access::mode::read_write, target::device> A(a_buf,    \
-                                                                     cgh);     \
-      accessor<float, 1, access::mode::read_write, target::device> B(b_buf,    \
-                                                                     cgh);     \
-      accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
-      cgh.parallel_for(N, [=](id<1> index) {                                   \
-        float ABF16 = float{bfloat16{A[index]}};                               \
-        float BBF16 = float{bfloat16{B[index]}};                               \
-        if (check(sycl::ext::oneapi::experimental::NAME(bfloat16{A[index]},    \
-                                                        bfloat16{B[index]}),   \
-                  sycl::NAME(ABF16, BBF16))) {                                 \
-          ERR[0] = 1;                                                          \
-        }                                                                      \
-      });                                                                      \
-    });                                                                        \
-  }                                                                            \
-  assert(err == 0);
-
-#define TEST_BUILTIN_2_ARR_IMPL(NAME, SZ)                                      \
-  {                                                                            \
-    buffer<float, 2> a_buf{range<2>{N / SZ, SZ}};                              \
-    buffer<float, 2> b_buf{range<2>{N / SZ, SZ}};                              \
-    buffer<int> err_buf(&err, 1);                                              \
-    q.submit([&](handler &cgh) {                                               \
-      accessor<float, 2, access::mode::read_write, target::device> A(a_buf,    \
-                                                                     cgh);     \
-      accessor<float, 2, access::mode::read_write, target::device> B(b_buf,    \
-                                                                     cgh);     \
-      accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
-      cgh.parallel_for(N / SZ, [=](id<1> index) {                              \
-        marray<bfloat16, SZ> arg0, arg1;                                       \
-        for (int i = 0; i < SZ; i++) {                                         \
-          arg0[i] = A[index][i];                                               \
-          arg1[i] = B[index][i];                                               \
-        }                                                                      \
-        marray<bfloat16, SZ> res =                                             \
-            sycl::ext::oneapi::experimental::NAME(arg0, arg1);                 \
-        for (int i = 0; i < SZ; i++) {                                         \
-          float ABF16 = float{bfloat16{A[index][i]}};                          \
-          float BBF16 = float{bfloat16{B[index][i]}};                          \
-          if (check(res[i], sycl::NAME(ABF16, BBF16))) {                       \
-            ERR[0] = 1;                                                        \
-          }                                                                    \
-        }                                                                      \
-      });                                                                      \
-    });                                                                        \
-  }                                                                            \
-  assert(err == 0);
-
-#define TEST_BUILTIN_2(NAME)                                                   \
-  TEST_BUILTIN_2_SCAL_IMPL(NAME)                                               \
-  TEST_BUILTIN_2_ARR_IMPL(NAME, 1)                                             \
-  TEST_BUILTIN_2_ARR_IMPL(NAME, 2)                                             \
-  TEST_BUILTIN_2_ARR_IMPL(NAME, 3)                                             \
-  TEST_BUILTIN_2_ARR_IMPL(NAME, 4)                                             \
-  TEST_BUILTIN_2_ARR_IMPL(NAME, 5)
-
-#define TEST_BUILTIN_3_SCAL_IMPL(NAME)                                         \
-  {                                                                            \
-    buffer<float> a_buf(&a[0], N);                                             \
-    buffer<float> b_buf(&b[0], N);                                             \
-    buffer<float> c_buf(&c[0], N);                                             \
-    buffer<int> err_buf(&err, 1);                                              \
-    q.submit([&](handler &cgh) {                                               \
-      accessor<float, 1, access::mode::read_write, target::device> A(a_buf,    \
-                                                                     cgh);     \
-      accessor<float, 1, access::mode::read_write, target::device> B(b_buf,    \
-                                                                     cgh);     \
-      accessor<float, 1, access::mode::read_write, target::device> C(c_buf,    \
-                                                                     cgh);     \
-      accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
-      cgh.parallel_for(N, [=](id<1> index) {                                   \
-        float ABF16 = float{bfloat16{A[index]}};                               \
-        float BBF16 = float{bfloat16{B[index]}};                               \
-        float CBF16 = float{bfloat16{C[index]}};                               \
-        if (check(sycl::ext::oneapi::experimental::NAME(bfloat16{A[index]},    \
-                                                        bfloat16{B[index]},    \
-                                                        bfloat16{C[index]}),   \
-                  sycl::NAME(ABF16, BBF16, CBF16))) {                          \
-          ERR[0] = 1;                                                          \
-        }                                                                      \
-      });                                                                      \
-    });                                                                        \
-  }                                                                            \
-  assert(err == 0);
-
-#define TEST_BUILTIN_3_ARR_IMPL(NAME, SZ)                                      \
-  {                                                                            \
-    buffer<float, 2> a_buf{range<2>{N / SZ, SZ}};                              \
-    buffer<float, 2> b_buf{range<2>{N / SZ, SZ}};                              \
-    buffer<float, 2> c_buf{range<2>{N / SZ, SZ}};                              \
-    buffer<int> err_buf(&err, 1);                                              \
-    q.submit([&](handler &cgh) {                                               \
-      accessor<float, 2, access::mode::read_write, target::device> A(a_buf,    \
-                                                                     cgh);     \
-      accessor<float, 2, access::mode::read_write, target::device> B(b_buf,    \
-                                                                     cgh);     \
-      accessor<float, 2, access::mode::read_write, target::device> C(c_buf,    \
-                                                                     cgh);     \
-      accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
-      cgh.parallel_for(N / SZ, [=](id<1> index) {                              \
-        marray<bfloat16, SZ> arg0, arg1, arg2;                                 \
-        for (int i = 0; i < SZ; i++) {                                         \
-          arg0[i] = A[index][i];                                               \
-          arg1[i] = B[index][i];                                               \
-          arg2[i] = C[index][i];                                               \
-        }                                                                      \
-        marray<bfloat16, SZ> res =                                             \
-            sycl::ext::oneapi::experimental::NAME(arg0, arg1, arg2);           \
-        for (int i = 0; i < SZ; i++) {                                         \
-          float ABF16 = float{bfloat16{A[index][i]}};                          \
-          float BBF16 = float{bfloat16{B[index][i]}};                          \
-          float CBF16 = float{bfloat16{C[index][i]}};                          \
-          if (check(res[i], sycl::NAME(ABF16, BBF16, CBF16))) {                \
-            ERR[0] = 1;                                                        \
-          }                                                                    \
-        }                                                                      \
-      });                                                                      \
-    });                                                                        \
-  }                                                                            \
-  assert(err == 0);
-
-#define TEST_BUILTIN_3(NAME)                                                   \
-  TEST_BUILTIN_3_SCAL_IMPL(NAME)                                               \
-  TEST_BUILTIN_3_ARR_IMPL(NAME, 1)                                             \
-  TEST_BUILTIN_3_ARR_IMPL(NAME, 2)                                             \
-  TEST_BUILTIN_3_ARR_IMPL(NAME, 3)                                             \
-  TEST_BUILTIN_3_ARR_IMPL(NAME, 4)                                             \
-  TEST_BUILTIN_3_ARR_IMPL(NAME, 5)
-
-#define TEST_BUILTIN_2_NAN(NAME)                                               \
-  {                                                                            \
-    buffer<int> err_buf(&err, 1);                                              \
-    buffer<float> nan_buf(&check_nan, 1);                                      \
-    q.submit([&](handler &cgh) {                                               \
-      accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh); \
-      accessor<float, 1, access::mode::write, target::device> checkNAN(        \
-          nan_buf, cgh);                                                       \
-      cgh.single_task([=]() {                                                  \
-        checkNAN[0] = sycl::ext::oneapi::experimental::NAME(bfloat16{NAN},     \
-                                                            bfloat16{NAN});    \
-        if ((sycl::ext::oneapi::experimental::NAME(bfloat16{2},                \
-                                                   bfloat16{NAN}) != 2) ||     \
-            (sycl::ext::oneapi::experimental::NAME(bfloat16{NAN},              \
-                                                   bfloat16{2}) != 2)) {       \
-          ERR[0] = 1;                                                          \
-        }                                                                      \
-      });                                                                      \
-    });                                                                        \
-  }                                                                            \
-  assert(err == 0);                                                            \
-  assert(std::isnan(check_nan));
-
 void test() {
   queue q;
 
@@ -249,18 +37,136 @@ void test() {
     c[i] = (float)(3 * i);
   }
 
-  TEST_BUILTIN_1(fabs, bfloat16);
-  TEST_BUILTIN_2(fmin);
-  TEST_BUILTIN_2(fmax);
-  TEST_BUILTIN_3(fma);
-
-  float check_nan = 0;
-  TEST_BUILTIN_2_NAN(fmin);
-  TEST_BUILTIN_2_NAN(fmax);
+  auto test = [&](auto ExpFunc, auto RefFunc, auto NumOperands) {
+    static_assert(NumOperands >= 1 && NumOperands <= 3); // 1-3 operands only
+    {
+      buffer<float> a_buf(&a[0], N);
+      buffer<float> b_buf(&b[0], N);
+      buffer<float> c_buf(&c[0], N);
+      buffer<int> err_buf(&err, 1);
+      q.submit([&](handler &cgh) {
+        accessor A(a_buf, cgh);
+        accessor B(b_buf, cgh);
+        accessor C(c_buf, cgh);
+        accessor ERR(err_buf, cgh);
+        cgh.parallel_for(N, [=](id<1> index) {
+          auto ExpArg = [&](auto acc) { return bfloat16{acc[index]}; };
+          auto RefArg = [&](auto acc) { return float{bfloat16{acc[index]}}; };
+          // check() returning true is treated as a mismatch.
+          bool failure = false;
+          if constexpr (NumOperands == 1) {
+            failure |= check(ExpFunc(ExpArg(A)), RefFunc(RefArg(A)));
+          } else if constexpr (NumOperands == 2) {
+            failure |= check(ExpFunc(ExpArg(A), ExpArg(B)),
+                             RefFunc(RefArg(A), RefArg(B)));
+          } else if constexpr (NumOperands == 3) {
+            failure |= check(ExpFunc(ExpArg(A), ExpArg(B), ExpArg(C)),
+                             RefFunc(RefArg(A), RefArg(B), RefArg(C)));
+          }
+          // Report any mismatch through ERR[0], backed by the host-side err.
+          if (failure)
+            ERR[0] = 1;
+        });
+      });
+    } // leaving this scope destroys the buffers, syncing err to the host
+    assert(err == 0);
+    // marray<bfloat16, SZ> variant; loop<5> presumably yields SZ = 1..5.
+    sycl::detail::loop<5>([&](auto SZ_MINUS_ONE) {
+      constexpr int SZ = SZ_MINUS_ONE + 1;
+      {
+        buffer<float, 2> a_buf{&a[0], range<2>{N / SZ, SZ}};
+        buffer<float, 2> b_buf{&b[0], range<2>{N / SZ, SZ}};
+        buffer<float, 2> c_buf{&c[0], range<2>{N / SZ, SZ}};
+        buffer<int> err_buf(&err, 1);
+        q.submit([&](handler &cgh) {
+          accessor A(a_buf, cgh);
+          accessor B(b_buf, cgh);
+          accessor C(c_buf, cgh);
+          accessor ERR(err_buf, cgh);
+          cgh.parallel_for(N / SZ, [=](id<1> index) {
+            marray<bfloat16, SZ> arg0, arg1, arg2;
+            for (int i = 0; i < SZ; i++) {
+              arg0[i] = A[index][i];
+              arg1[i] = B[index][i];
+              arg2[i] = C[index][i];
+            }
+            auto res = [&]() {
+              if constexpr (NumOperands == 1) {
+                return ExpFunc(arg0);
+              } else if constexpr (NumOperands == 2) {
+                return ExpFunc(arg0, arg1);
+              } else if constexpr (NumOperands == 3) {
+                return ExpFunc(arg0, arg1, arg2);
+              }
+            }();
+            // Compare each element of res against the scalar float reference.
+            bool failure = false;
+            for (int i = 0; i < SZ; ++i) {
+              auto RefArg = [&](auto acc) {
+                return float{bfloat16{acc[index][i]}};
+              };
+              if constexpr (NumOperands == 1) {
+                failure |= check(res[i], RefFunc(RefArg(A)));
+              } else if constexpr (NumOperands == 2) {
+                failure |= check(res[i], RefFunc(RefArg(A), RefArg(B)));
+              } else if constexpr (NumOperands == 3) {
+                failure |=
+                    check(res[i], RefFunc(RefArg(A), RefArg(B), RefArg(C)));
+              }
+            }
+            if (failure)
+              ERR[0] = 1;
+          });
+        });
+      }
+      assert(err == 0);
+    });
+  };
+  // TEST(NAME, N): compare experimental::NAME against sycl::NAME, N operands.
+#define TEST(NAME, NUM_OPERANDS)                                               \
+  test(                                                                        \
+      [](auto... args) {                                                       \
+        return sycl::ext::oneapi::experimental::NAME(args...);                 \
+      },                                                                       \
+      [](auto... args) { return sycl::NAME(args...); },                        \
+      std::integral_constant<int, NUM_OPERANDS>{})
+
+  TEST(fabs, 1);
+
+  TEST(fmin, 2);
+  TEST(fmax, 2);
+  TEST(fma, 3);
+  // Check NaN handling of a two-operand builtin (used below for fmin/fmax).
+  auto test_nan = [&](auto ExpFunc) {
+    float check_nan = 0;
+    {
+      buffer<int> err_buf(&err, 1);
+      buffer<float> nan_buf(&check_nan, 1);
+      q.submit([&](handler &cgh) {
+        accessor ERR(err_buf, cgh);
+        accessor checkNAN(nan_buf, cgh);
+        cgh.single_task([=]() {
+          checkNAN[0] = ExpFunc(bfloat16{NAN}, bfloat16{NAN}); // must be NaN
+          if ((ExpFunc(bfloat16{2}, bfloat16{NAN}) != 2) || // NaN is ignored
+              (ExpFunc(bfloat16{NAN}, bfloat16{2}) != 2)) {
+            ERR[0] = 1;
+          }
+        });
+      });
+    } // end of buffer scope: data is written back to err / check_nan
+    assert(err == 0);
+    assert(std::isnan(check_nan));
+  };
+  test_nan([](auto... args) {
+    return sycl::ext::oneapi::experimental::fmin(args...);
+  });
+  test_nan([](auto... args) {
+    return sycl::ext::oneapi::experimental::fmax(args...);
+  });
 
   // Insert NAN value in a to test isnan
   a[0] = a[N - 1] = NAN;
-  TEST_BUILTIN_1(isnan, bool);
+  TEST(isnan, 1);
 
   // Orignal input 'a[0...N-1]' are in range [-0.5, 0.5),
   // need to update it for generic math testing.
@@ -270,25 +176,26 @@ void test() {
     if ((i & 0x1) == 0x1)
       a[i] = -a[i];
   }
-  TEST_BUILTIN_1(cos, sycl::ext::oneapi::bfloat16);
-  TEST_BUILTIN_1(sin, sycl::ext::oneapi::bfloat16);
+  TEST(cos, 1);
+  TEST(sin, 1);
 
   // ceil, floor, trunc, exp, exp2, exp10, rint testing
-  TEST_BUILTIN_1(ceil, sycl::ext::oneapi::bfloat16);
-  TEST_BUILTIN_1(floor, sycl::ext::oneapi::bfloat16);
-  TEST_BUILTIN_1(trunc, sycl::ext::oneapi::bfloat16);
-  TEST_BUILTIN_1(exp, sycl::ext::oneapi::bfloat16);
-  TEST_BUILTIN_1(exp10, sycl::ext::oneapi::bfloat16);
-  TEST_BUILTIN_1(exp2, sycl::ext::oneapi::bfloat16);
-  TEST_BUILTIN_1(rint, sycl::ext::oneapi::bfloat16);
+  TEST(ceil, 1);
+  TEST(floor, 1);
+  TEST(trunc, 1);
+  TEST(exp, 1);
+  TEST(exp10, 1);
+  TEST(exp2, 1);
+  TEST(rint, 1);
 
   // log, log2, log10, sqrt, rsqrt testing, the input
   // must be positive.
   for (int i = 0; i < N; ++i)
     a[i] = a[i] + 8.5;
-  TEST_BUILTIN_1(sqrt, sycl::ext::oneapi::bfloat16);
-  TEST_BUILTIN_1(rsqrt, sycl::ext::oneapi::bfloat16);
-  TEST_BUILTIN_1(log, sycl::ext::oneapi::bfloat16);
-  TEST_BUILTIN_1(log2, sycl::ext::oneapi::bfloat16);
-  TEST_BUILTIN_1(log10, sycl::ext::oneapi::bfloat16);
+
+  TEST(sqrt, 1);
+  TEST(rsqrt, 1);
+  TEST(log, 1);
+  TEST(log2, 1);
+  TEST(log10, 1);
 }
diff --git a/sycl/test-e2e/BFloat16/bfloat16_conversions.cpp b/sycl/test-e2e/BFloat16/bfloat16_conversions.cpp
index 907faf0b5292a..cb59576a2eeb7 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_conversions.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_conversions.cpp
@@ -19,7 +19,10 @@
 #include <iostream>
 #include <sycl/detail/core.hpp>
 
+#include <sycl/ext/oneapi/bfloat16.hpp>
+
 using namespace sycl;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
 
 template <typename T> T calculate(T a, T b) {
   sycl::ext::oneapi::bfloat16 x = -a;
@@ -55,6 +58,82 @@ template <typename T> int test_host() {
   return 1;
 }
 
+int test_host_vector_conversions() {
+  // Returns 0 when all four values survive the float -> bfloat16 -> float
+  // round trip unchanged, and 1 otherwise.
+  std::cout << "float[4] -> bfloat16[4] -> float[4] conversion on host..."
+            << std::flush;
+
+  float Input[4] = {1.0f, 2.0f, 3.0f, 4.0f};
+
+  // Narrow the floats to bfloat16.
+  bfloat16 Narrowed[4];
+  sycl::ext::oneapi::detail::FloatVecToBF16Vec<4>(Input, Narrowed);
+
+  // Widen the bfloat16 values back to float.
+  float RoundTrip[4];
+  sycl::ext::oneapi::detail::BF16VecToFloatVec<4>(Narrowed, RoundTrip);
+
+  // Count the lanes that differ from the input.
+  int Mismatches = 0;
+  for (int Lane = 0; Lane < 4; ++Lane)
+    if (Input[Lane] != RoundTrip[Lane])
+      ++Mismatches;
+
+  const bool Failed = Mismatches != 0;
+  std::cout << (Failed ? "failed\n" : "passed\n");
+  return Failed;
+}
+
+int test_device_vector_conversions(queue Q) {
+  // Runs both device conversions and returns non-zero if any lane mismatched.
+  int err = 0;
+  buffer<int> err_buf(&err, 1);
+  // `err` is owned by err_buf while the buffer is alive, so the flag must be
+  // read back through a host accessor rather than from `err` directly.
+  auto failed = [&] { return host_accessor(err_buf, read_only)[0] != 0; };
+
+  std::cout << "float[4] -> bfloat16[4] conversion on device..." << std::flush;
+  // Convert float array to bfloat16 array
+  Q.submit([&](handler &CGH) {
+     accessor<int, 1, access::mode::write, target::device> ERR(err_buf, CGH);
+     CGH.single_task([=]() {
+       float FloatArray[4] = {1.0f, -1.0f, 0.0f, 2.0f};
+       bfloat16 BF16Array[4];
+       sycl::ext::oneapi::detail::FloatVecToBF16Vec<4>(FloatArray, BF16Array);
+       for (int i = 0; i < 4; i++) {
+         if (FloatArray[i] != (float)BF16Array[i]) {
+           ERR[0] = 1;
+         }
+       }
+     });
+   }).wait();
+
+  std::cout << (failed() ? "failed\n" : "passed\n");
+
+  std::cout << "bfloat16[4] -> float[4] conversion on device..." << std::flush;
+  // Convert bfloat16 array back to float array. Both arrays hold four
+  // elements so that BF16VecToFloatVec<4> stays within bounds.
+  Q.submit([&](handler &CGH) {
+     accessor<int, 1, access::mode::write, target::device> ERR(err_buf, CGH);
+     CGH.single_task([=]() {
+       bfloat16 BF16Array[4] = {1.0f, 0.0f, -1.0f, 2.0f};
+       float FloatArray[4];
+       sycl::ext::oneapi::detail::BF16VecToFloatVec<4>(BF16Array, FloatArray);
+       for (int i = 0; i < 4; i++) {
+         if (FloatArray[i] != (float)BF16Array[i]) {
+           ERR[0] = 1;
+         }
+       }
+     });
+   }).wait();
+
+  std::cout << (failed() ? "failed\n" : "passed\n");
+
+  // The flag is never cleared, so this reports a failure in either kernel.
+  return failed();
+}
+
 int main() {
   queue Q;
   int result;
@@ -63,6 +142,11 @@ int main() {
   if (Q.get_device().has(aspect::fp16))
     result |= test_device<sycl::half>(Q);
   result |= test_device<float>(Q);
+
+  // Test vector BF16 -> float conversion and vice versa.
+  result |= test_host_vector_conversions();
+  result |= test_device_vector_conversions(Q);
+
   if (result)
     std::cout << "FAIL\n";
   else
diff --git a/sycl/test-e2e/BFloat16/bfloat16_vec.cpp b/sycl/test-e2e/BFloat16/bfloat16_vec.cpp
index 5aeb66a48ef4c..549dc13ed76c6 100644
--- a/sycl/test-e2e/BFloat16/bfloat16_vec.cpp
+++ b/sycl/test-e2e/BFloat16/bfloat16_vec.cpp
@@ -16,11 +16,12 @@
 // RUN: %if preview-breaking-changes-supported %{ %{run} %t2.out  %}
 
 #include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/bfloat16.hpp>
 #include <sycl/stream.hpp>
 
+#include <sycl/ext/oneapi/bfloat16.hpp>
+
 constexpr unsigned N =
-    10; // init plus arithmetic + - * /   for vec<1> and vec<2>
+    14; // init plus arithmetic + - * / plus convert for vec<1> and vec<2>
 
 int main() {
 
@@ -46,17 +47,26 @@ int main() {
     sycl::vec<T, 1>  simple_multiplication = oneA * oneB;
     sycl::vec<T, 1>  simple_division = oneA / oneB;
 
+    // Test bf16 to float vec conversion on host
+    sycl::vec<float, 1> fConv = init_float.template convert<float>();
+    // Test float to bf16 vec conversion on host
+    sycl::vec<T, 1> brev = fConv.template convert<T>();
+
     std::cout << "iniitialization     : " << oneA[0]             << " float: " << init_float[0] << std::endl;
     std::cout << "addition.        ref: " << addition_ref0       << " vec: " << simple_addition[0] << std::endl;
     std::cout << "subtraction.     ref: " << subtraction_ref0    << " vec: " << simple_subtraction[0] << std::endl;
     std::cout << "multiplication.  ref: " << multiplication_ref0 << " vec: " << simple_multiplication[0] << std::endl;
     std::cout << "division.        ref: " << division_ref0       << " vec: " << simple_division[0] << std::endl;
+    std::cout << "float conv.      ref: " << (float)init_float[0]<< " vec: " << fConv[0] << std::endl;
+    std::cout << "bf16 conv.       ref: " << init_float[0]       << " vec: " << brev[0] << std::endl;
 
     assert(oneA[0] == init_float[0]);
     assert(addition_ref0 == simple_addition[0]);
     assert(subtraction_ref0 == simple_subtraction[0]);
     assert(multiplication_ref0 == simple_multiplication[0]);
     assert(division_ref0 == simple_division[0]);
+    assert((float)init_float[0] == fConv[0]);
+    assert(brev[0] == init_float[0]);
 
     std::cout << " ---  ON DEVICE --- " << std::endl;
     sycl::range<1> r(N);
@@ -72,17 +82,26 @@ int main() {
             sycl::vec<T, 1>  device_multiplication = oneA * oneB;
             sycl::vec<T, 1>  device_division = oneA / oneB;
 
+            // Test bf16 to float vec conversion on device
+            sycl::vec<float, 1> fConv = dev_float.template convert<float>();
+            // Test float to bf16 vec conversion on device
+            sycl::vec<T, 1> brev = fConv.template convert<T>();
+
             out << "iniitialization     : " << oneA[0]             << " float: " << dev_float[0] << sycl::endl;
             out << "addition.        ref: " << addition_ref0       << " vec: " << device_addition[0] << sycl::endl;
             out << "subtraction.     ref: " << subtraction_ref0    << " vec: " << device_subtraction[0] << sycl::endl;
             out << "multiplication.  ref: " << multiplication_ref0 << " vec: " << device_multiplication[0] << sycl::endl;
             out << "division.        ref: " << division_ref0       << " vec: " << device_division[0] << sycl::endl;
+            out << "float conv.      ref: " << (float)dev_float[0] << " vec: " << fConv[0] << sycl::endl;
+            out << "bf16 conv.       ref: " << dev_float[0]        << " vec: " << brev[0] << sycl::endl;
 
             acc[0] = (oneA[0] == dev_float[0]);
             acc[1] = (addition_ref0 == device_addition[0]);
             acc[2] = (subtraction_ref0 == device_subtraction[0]);
             acc[3] = (multiplication_ref0 == device_multiplication[0]);
             acc[4] = (division_ref0 == device_division[0]);
+            acc[5] = ((float)dev_float[0] == fConv[0]);
+            acc[6] = (brev[0] == dev_float[0]);
             
         }); 
     }).wait();
@@ -105,6 +124,11 @@ int main() {
     sycl::vec<T, 2> double_multiplication = twoA * twoB;
     sycl::vec<T, 2> double_division = twoA / twoB;
 
+    // Test bf16 to float vec conversion on host
+    sycl::vec<float, 2> fConv2 = double_float.template convert<float>();
+    // Test float to bf16 vec conversion on host
+    sycl::vec<T, 2> brev2 = fConv2.template convert<T>();
+
     std::cout << "init ref: " << twoA[0]                << "    ref1: " << twoA[1] << std::endl;
     std::cout << "  float0: " << double_float[0]        << "  float1: " << double_float[1] << std::endl;
     std::cout << "+ ref0: " << addition_ref0            << "    ref1: " << addition_ref1 << std::endl;
@@ -115,13 +139,18 @@ int main() {
     std::cout << "mul[0]: " << double_multiplication[0] << "  mul[1]: " << double_multiplication[1] << std::endl;
     std::cout << "/ ref0: " << division_ref0            << "    ref1: " << division_ref1 << std::endl;
     std::cout << "div[0]: " << double_division[0]       << "  div[1]: " << double_division[1] << std::endl;
-    
+    std::cout << "Float convert ref0: " << double_float[0]    << "    ref1: " << double_float[1] << std::endl;
+    std::cout << "convert[0]: " << fConv2[0]            << "  convert[1]: " << fConv2[1] << std::endl;
+    std::cout << "bf16 convert[0]: " << brev2[0]        << "  bf16 convert[1]: " << brev2[1] << std::endl;
+
     assert(twoA[0] == double_float[0]);                      assert(twoA[1] == double_float[1]);
     assert(addition_ref0 == double_addition[0]);             assert(addition_ref1 == double_addition[1]);
     assert(subtraction_ref0 == double_subtraction[0]);       assert(subtraction_ref1 == double_subtraction[1]);
     assert(multiplication_ref0 == double_multiplication[0]); assert(multiplication_ref1 == double_multiplication[1]);
     assert(division_ref0 == double_division[0]);             assert(division_ref1 == double_division[1]);
-    
+    assert(fConv2[0] == (float)double_float[0]);             assert(fConv2[1] == (float)double_float[1]);
+    assert(brev2[0] == double_float[0]);                     assert(brev2[1] == double_float[1]);
+
     std::cout << " ---  ON DEVICE --- " << std::endl;
     q.submit([&](sycl::handler &cgh) {
         sycl::stream out(2024, 400, cgh);
@@ -133,6 +162,11 @@ int main() {
             sycl::vec<T, 2> device_multiplication = twoA * twoB;
             sycl::vec<T, 2> device_division = twoA / twoB;
 
+            // Test bf16 to float vec conversion on device
+            sycl::vec<float, 2> fConv2 = device_float.template convert<float>();
+            // Test float to bf16 vec conversion on device
+            sycl::vec<T, 2> brev2 = fConv2.template convert<T>();
+
             out << "init ref: " << twoA[0]                << "    ref1: " << twoA[1] << sycl::endl;
             out << "  float0: " << device_float[0]        << "  float1: " << device_float[1] << sycl::endl;
             out << "+ ref0: " << addition_ref0            << "    ref1: " << addition_ref1 << sycl::endl;
@@ -143,21 +177,26 @@ int main() {
             out << "mul[0]: " << device_multiplication[0] << "  mul[1]: " << device_multiplication[1] << sycl::endl;
             out << "/ ref0: " << division_ref0            << "    ref1: " << division_ref1 << sycl::endl;
             out << "div[0]: " << device_division[0]       << "  div[1]: " << device_division[1] << sycl::endl;
-
-            acc[5] = (twoA[0] == device_float[0]) && (twoA[1] == device_float[1]);
-            acc[6] = (addition_ref0 == device_addition[0]) && (addition_ref1 == device_addition[1]);
-            acc[7] = (subtraction_ref0 == device_subtraction[0]) && (subtraction_ref1 == device_subtraction[1]);
-            acc[8] = (multiplication_ref0 == device_multiplication[0]) && (multiplication_ref1 == device_multiplication[1]);
-            acc[9] = (division_ref0 == device_division[0]) && (division_ref1 == device_division[1]);
-
+            out << "Float convert ref0: " << device_float[0]    << "    ref1: " << device_float[1] << sycl::endl;
+            out << "convert[0]: " << fConv2[0]            << "  convert[1]: " << fConv2[1] << sycl::endl;
+            out << "bf16 convert[0]: " << brev2[0]        << "  bf16 convert[1]: " << brev2[1] << sycl::endl;
+
+            acc[7] = (twoA[0] == device_float[0]) && (twoA[1] == device_float[1]);
+            acc[8] = (addition_ref0 == device_addition[0]) && (addition_ref1 == device_addition[1]);
+            acc[9] = (subtraction_ref0 == device_subtraction[0]) && (subtraction_ref1 == device_subtraction[1]);
+            acc[10] = (multiplication_ref0 == device_multiplication[0]) && (multiplication_ref1 == device_multiplication[1]);
+            acc[11] = (division_ref0 == device_division[0]) && (division_ref1 == device_division[1]);
+            acc[12] = (fConv2[0] == (float)device_float[0]) && (fConv2[1] == (float)device_float[1]);
+            acc[13] = (brev2[0] == device_float[0]) && (brev2[1] == device_float[1]);
         }); 
     }).wait();
+    // clang-format on
 
     sycl::host_accessor h_acc(buf, sycl::read_only);
-    for(unsigned i = 0; i < N; i++){
-        assert(h_acc[i]);
+    for (unsigned i = 0; i < N; i++) {
+      assert(h_acc[i]);
     }
 
-  // clang-format on
-  return 0;
+    std::cout << "Test Passed." << std::endl;
+    return 0;
 }
diff --git a/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp
new file mode 100644
index 0000000000000..481aa35e3cedf
--- /dev/null
+++ b/sycl/test-e2e/BFloat16/bfloat16_vec_builtins.cpp
@@ -0,0 +1,278 @@
+// RUN: %{build} -fno-fast-math -o %t.out
+// RUN: %{run} %t.out
+
+// Test new, ABI-breaking for all platforms.
+// RUN:  %if preview-breaking-changes-supported %{  %{build} -fpreview-breaking-changes -o %t-pfrev.out %}
+// RUN:  %if preview-breaking-changes-supported %{  %{run} %t-pfrev.out  %}
+
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16_math.hpp>
+
+#include <cmath>
+#include <iostream>
+#include <vector>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi;
+using namespace sycl::ext::oneapi::experimental;
+
+constexpr float bf16_eps = 0.00390625;
+
+bool check(float a, float b) { // true => mismatch (rel. diff > 2 * bf16_eps)
+  return sycl::fabs(2 * (a - b) / (a + b)) > bf16_eps * 2;
+}
+
+bool check(bool a, bool b) { return (a != b); } // true => mismatch
+
+// Checks NAME on vec<bfloat16, SZ> and a swizzle; mismatches increment ERR[0].
+#define TEST_UNARY_OP(NAME, SZ, RETTY, INPVAL)                                 \
+  {                                                                            \
+    vec<bfloat16, SZ> arg;                                                     \
+    /* Initialize the vector with INPVAL */                                    \
+    for (int i = 0; i < SZ; i++) {                                             \
+      arg[i] = INPVAL;                                                         \
+    }                                                                          \
+    /* Apply NAME to the full vector and to a <0, 0> swizzle of it. */         \
+    vec<RETTY, SZ> res = sycl::ext::oneapi::experimental::NAME(arg);           \
+    vec<RETTY, 2> res2 =                                                       \
+        sycl::ext::oneapi::experimental::NAME(arg.template swizzle<0, 0>());   \
+    /* The swizzle lanes must equal lane 0 of the full result. */              \
+    if (res2[0] != res[0] || res2[1] != res[0]) {                              \
+      ERR[0] += 1;                                                             \
+    }                                                                          \
+    for (int i = 0; i < SZ; i++) {                                             \
+      if (check(res[i], sycl::NAME(INPVAL))) {                                 \
+        ERR[0] += 1;                                                           \
+      }                                                                        \
+    }                                                                          \
+  }
+
+// Checks NAME(INPVAL, 1.0f) on vec/swizzle mixes; mismatches bump ERR[0].
+#define TEST_BINARY_OP(NAME, SZ, RETTY, INPVAL)                                \
+  {                                                                            \
+    vec<bfloat16, SZ> arg, arg2;                                               \
+    bfloat16 inpVal2 = 1.0f;                                                   \
+    /* Initialize the vector with INPVAL */                                    \
+    for (int i = 0; i < SZ; i++) {                                             \
+      arg[i] = INPVAL;                                                         \
+      arg2[i] = inpVal2;                                                       \
+    }                                                                          \
+    /* Perform the operation. */                                               \
+    vec<RETTY, SZ> res = sycl::ext::oneapi::experimental::NAME(arg, arg2);     \
+    /* Repeat with different combinations of swizzle and vec operands. */      \
+    vec<RETTY, 2> res2 = sycl::ext::oneapi::experimental::NAME(                \
+        arg.template swizzle<0, 0>(), arg2.template swizzle<0, 0>());          \
+    vec<RETTY, 2> res3 = sycl::ext::oneapi::experimental::NAME(                \
+        vec<bfloat16, 2>(arg[0], arg[0]), arg2.template swizzle<0, 0>());      \
+    vec<RETTY, 2> res4 = sycl::ext::oneapi::experimental::NAME(                \
+        arg.template swizzle<0, 0>(), vec<bfloat16, 2>(arg2[0], arg2[0]));     \
+    /* Check the result. */                                                    \
+    if (res2[0] != res[0] || res2[1] != res[0] || res3[0] != res[0] ||         \
+        res3[1] != res[0] || res4[0] != res[0] || res4[1] != res[0]) {         \
+      ERR[0] += 1;                                                             \
+    }                                                                          \
+    for (int i = 0; i < SZ; i++) {                                             \
+      if (check(res[i], sycl::NAME(INPVAL, inpVal2))) {                        \
+        ERR[0] += 1;                                                           \
+      }                                                                        \
+    }                                                                          \
+  }
+
+#define TEST_BUILTIN_VEC(NAME, SZ, RETTY, INPVAL, OPTEST)                      \
+  { /* On Device */                                                            \
+    buffer<int> err_buf(&err, 1);                                              \
+    q.submit([&](handler &cgh) {                                               \
+       accessor<int, 1, access::mode::write, target::device> ERR(err_buf,      \
+                                                                 cgh);         \
+       cgh.single_task([=]() { OPTEST(NAME, SZ, RETTY, INPVAL) });             \
+     }).wait();                                                                \
+  }                                                                            \
+  assert(err == 0);                                                            \
+  { /* On Host */                                                              \
+    int ERR[1] = {0};                                                          \
+    OPTEST(NAME, SZ, RETTY, INPVAL)                                            \
+    assert(ERR[0] == 0);                                                       \
+  }
+
+#define TEST_BUILTIN_UNARY(NAME, RETTY, INPVAL)                                \
+  TEST_BUILTIN_VEC(NAME, 1, RETTY, INPVAL, TEST_UNARY_OP)                      \
+  TEST_BUILTIN_VEC(NAME, 2, RETTY, INPVAL, TEST_UNARY_OP)                      \
+  TEST_BUILTIN_VEC(NAME, 3, RETTY, INPVAL, TEST_UNARY_OP)                      \
+  TEST_BUILTIN_VEC(NAME, 4, RETTY, INPVAL, TEST_UNARY_OP)                      \
+  TEST_BUILTIN_VEC(NAME, 8, RETTY, INPVAL, TEST_UNARY_OP)                      \
+  TEST_BUILTIN_VEC(NAME, 16, RETTY, INPVAL, TEST_UNARY_OP)
+
+#define TEST_BUILTIN_BINARY(NAME, RETTY, INPVAL)                               \
+  TEST_BUILTIN_VEC(NAME, 1, RETTY, INPVAL, TEST_BINARY_OP)                     \
+  TEST_BUILTIN_VEC(NAME, 2, RETTY, INPVAL, TEST_BINARY_OP)                     \
+  TEST_BUILTIN_VEC(NAME, 3, RETTY, INPVAL, TEST_BINARY_OP)                     \
+  TEST_BUILTIN_VEC(NAME, 4, RETTY, INPVAL, TEST_BINARY_OP)                     \
+  TEST_BUILTIN_VEC(NAME, 8, RETTY, INPVAL, TEST_BINARY_OP)                     \
+  TEST_BUILTIN_VEC(NAME, 16, RETTY, INPVAL, TEST_BINARY_OP)
+
+void test() {
+  queue q;
+  int err = 0;
+  float nan = std::nanf("");
+
+  // Test isnan on host
+  {
+    vec<bfloat16, 3> arg{1.0f, nan, 2.0f};
+    vec<int16_t, 3> res = sycl::ext::oneapi::experimental::isnan(arg);
+    assert((res[0] == 0 && res[1] == -1 && res[2] == 0) &&
+           "isnan() failed on host for vec");
+
+    // Test for swizzles
+    vec<int16_t, 2> res2 = sycl::ext::oneapi::experimental::isnan(arg.lo());
+    assert((res2[0] == 0 && res2[1] == -1) &&
+           "isnan() failed on host for vec swizzles");
+  }
+
+  // Test isnan on device.
+  {
+    buffer<int> err_buf(&err, 1);
+    q.submit([&](handler &cgh) {
+       accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh);
+       cgh.single_task([=]() {
+         vec<bfloat16, 3> arg{1.0f, nan, 2.0f};
+         vec<int16_t, 3> res = sycl::ext::oneapi::experimental::isnan(arg);
+         if (res[0] != 0 || res[1] != -1 || res[2] != 0) {
+           ERR[0] += 1;
+         }
+       });
+     }).wait();
+    assert(err == 0 && "isnan failed on device for vec");
+  }
+
+  // Unary math builtins.
+  TEST_BUILTIN_UNARY(fabs, bfloat16, -1.0f);
+  TEST_BUILTIN_UNARY(fabs, bfloat16, 1.0f);
+
+  TEST_BUILTIN_UNARY(cos, bfloat16, 0.1f);
+  TEST_BUILTIN_UNARY(sin, bfloat16, 0.2f);
+
+  TEST_BUILTIN_UNARY(ceil, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(floor, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(trunc, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(exp, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(exp10, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(exp2, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(rint, bfloat16, 0.9f);
+
+  TEST_BUILTIN_UNARY(sqrt, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(rsqrt, bfloat16, 0.9f);
+  TEST_BUILTIN_UNARY(log, bfloat16, 20.0f);
+  TEST_BUILTIN_UNARY(log2, bfloat16, 2.0f);
+  TEST_BUILTIN_UNARY(log10, bfloat16, 2.0f);
+
+  TEST_BUILTIN_BINARY(fmin, bfloat16, 0.9f);
+  TEST_BUILTIN_BINARY(fmax, bfloat16, 0.9f);
+  TEST_BUILTIN_BINARY(fmin, bfloat16, nan);
+  TEST_BUILTIN_BINARY(fmax, bfloat16, nan);
+
+  // Test fma operation on host.
+  {
+    vec<bfloat16, 3> arg1, arg2, arg3;
+    bfloat16 inpVal1 = 1.0f;
+    bfloat16 inpVal2 = 2.0f;
+    bfloat16 inpVal3 = 3.0f;
+    /* Initialize the vector with INPVAL */
+    for (int i = 0; i < 3; i++) {
+      arg1[i] = inpVal1;
+      arg2[i] = inpVal2;
+      arg3[i] = inpVal3;
+    }
+    /* Perform the operation. */
+    auto res = sycl::ext::oneapi::experimental::fma(arg1, arg2, arg3);
+
+    // Test different combinations of vec and swizzle.
+    auto res1 = sycl::ext::oneapi::experimental::fma(
+        arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(),
+        arg3.template swizzle<0, 0>());
+
+    auto res2 = sycl::ext::oneapi::experimental::fma(
+        vec<bfloat16, 2>(arg1[0], arg1[0]), arg2.template swizzle<0, 0>(),
+        arg3.template swizzle<0, 0>());
+
+    auto res3 = sycl::ext::oneapi::experimental::fma(
+        arg1.template swizzle<0, 0>(), vec<bfloat16, 2>(arg2[0], arg2[0]),
+        arg3.template swizzle<0, 0>());
+
+    auto res4 = sycl::ext::oneapi::experimental::fma(
+        arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(),
+        vec<bfloat16, 2>(arg3[0], arg3[0]));
+
+    /* Check the result. */
+    if (res1[0] != res[0] || res1[1] != res[0] || res2[0] != res[0] ||
+        res2[1] != res[0] || res3[0] != res[0] || res3[1] != res[0] ||
+        res4[0] != res[0] || res4[1] != res[0]) {
+      err += 1;
+    }
+    for (int i = 0; i < 3; i++) {
+      if (check(res[i], sycl::ext::oneapi::experimental::fma(inpVal1, inpVal2,
+                                                             inpVal3))) {
+        err += 1;
+      }
+    }
+    assert(err == 0);
+  }
+
+  // Test fma on device.
+  {
+    buffer<int> err_buf(&err, 1);
+    q.submit([&](handler &cgh) {
+       accessor<int, 1, access::mode::write, target::device> ERR(err_buf, cgh);
+       cgh.single_task([=]() {
+         vec<bfloat16, 3> arg1, arg2, arg3;
+         bfloat16 inpVal1 = 1.0f;
+         bfloat16 inpVal2 = 2.0f;
+         bfloat16 inpVal3 = 3.0f;
+         /* Initialize the vector with INPVAL */
+         for (int i = 0; i < 3; i++) {
+           arg1[i] = inpVal1;
+           arg2[i] = inpVal2;
+           arg3[i] = inpVal3;
+         }
+         /* Perform the operation. */
+         auto res = sycl::ext::oneapi::experimental::fma(arg1, arg2, arg3);
+
+         // Test different combinations of vec and swizzle.
+         auto res1 = sycl::ext::oneapi::experimental::fma(
+             arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(),
+             arg3.template swizzle<0, 0>());
+
+         auto res2 = sycl::ext::oneapi::experimental::fma(
+             vec<bfloat16, 2>(arg1[0], arg1[0]), arg2.template swizzle<0, 0>(),
+             arg3.template swizzle<0, 0>());
+
+         auto res3 = sycl::ext::oneapi::experimental::fma(
+             arg1.template swizzle<0, 0>(), vec<bfloat16, 2>(arg2[0], arg2[0]),
+             arg3.template swizzle<0, 0>());
+
+         auto res4 = sycl::ext::oneapi::experimental::fma(
+             arg1.template swizzle<0, 0>(), arg2.template swizzle<0, 0>(),
+             vec<bfloat16, 2>(arg3[0], arg3[0]));
+
+         /* Check the result. */
+         if (res1[0] != res[0] || res1[1] != res[0] || res2[0] != res[0] ||
+             res2[1] != res[0] || res3[0] != res[0] || res3[1] != res[0] ||
+             res4[0] != res[0] || res4[1] != res[0]) {
+           ERR[0] += 1;
+         }
+         for (int i = 0; i < 3; i++) {
+           if (check(res[i], sycl::ext::oneapi::experimental::fma(
+                                 inpVal1, inpVal2, inpVal3))) {
+             ERR[0] += 1;
+           }
+         }
+       });
+     }).wait();
+    assert(err == 0);
+  }
+}
+
+int main() {
+
+  test();
+  return 0;
+}
diff --git a/sycl/test-e2e/Basic/barrier_order.cpp b/sycl/test-e2e/Basic/barrier_order.cpp
index 16af6eee3837b..be5c78b6c410c 100644
--- a/sycl/test-e2e/Basic/barrier_order.cpp
+++ b/sycl/test-e2e/Basic/barrier_order.cpp
@@ -2,10 +2,6 @@
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// Hangs on that platform (at least on gpu-intel-pvc),
-// https://github.com/intel/llvm/issues/7330.
-// UNSUPPORTED: opencl && gpu
-
 #include <iostream>
 #include <stdlib.h>
 
diff --git a/sycl/test-e2e/Basic/buffer/buffer_dev_to_dev.cpp b/sycl/test-e2e/Basic/buffer/buffer_dev_to_dev.cpp
index e44f2a628dbc0..ad155e2348416 100644
--- a/sycl/test-e2e/Basic/buffer/buffer_dev_to_dev.cpp
+++ b/sycl/test-e2e/Basic/buffer/buffer_dev_to_dev.cpp
@@ -1,5 +1,3 @@
-// FIXME flaky fail on HIP
-// UNSUPPORTED: hip
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -12,15 +10,16 @@
 //===----------------------------------------------------------------------===//
 
 #include <cassert>
-#include <memory>
 #include <sycl/detail/core.hpp>
 
 using namespace sycl;
 
+constexpr int size = 10;
+
 int main() {
-  int Data[10] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
+  int Data[size] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
   {
-    buffer<int, 1> Buffer(Data, range<1>(10),
+    buffer<int, 1> Buffer(Data, range<1>(size),
                           {property::buffer::use_host_ptr()});
 
     device Device(default_selector_v);
@@ -33,17 +32,17 @@ int main() {
 
     assert(FirstQueue.get_context() != SecondQueue.get_context());
     FirstQueue.submit([&](handler &Cgh) {
-      auto Accessor = Buffer.get_access<access::mode::read_write>(Cgh);
-      Cgh.parallel_for<class init_b>(range<1>{10},
+      accessor Accessor{Buffer, Cgh};
+      Cgh.parallel_for<class init_b>(range<1>{size},
                                      [=](id<1> Index) { Accessor[Index] = 0; });
     });
     SecondQueue.submit([&](handler &Cgh) {
-      auto Accessor = Buffer.get_access<access::mode::read_write>(Cgh);
+      accessor Accessor{Buffer, Cgh};
       Cgh.parallel_for<class increment_b>(
-          range<1>{10}, [=](id<1> Index) { Accessor[Index] += 1; });
+          range<1>{size}, [=](id<1> Index) { Accessor[Index] += 1; });
     });
   } // Data is copied back
-  for (int I = 0; I < 10; I++) {
+  for (int I = 0; I < size; I++) {
     assert(Data[I] == 1);
   }
 
diff --git a/sycl/test-e2e/Basic/built-ins/host_math.cpp b/sycl/test-e2e/Basic/built-ins/host_math.cpp
index 6057c9a5f2734..739bf79240e0d 100644
--- a/sycl/test-e2e/Basic/built-ins/host_math.cpp
+++ b/sycl/test-e2e/Basic/built-ins/host_math.cpp
@@ -44,7 +44,8 @@ void testRemquo() {
     int quo = 0;
     float rem = sycl::remquo(
         86.0f, 10.0f,
-        sycl::multi_ptr<int, sycl::access::address_space::global_space>{&quo});
+        sycl::address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::no, int>(&quo));
     assert(quo == 9);
     assert(rem == -4);
   }
@@ -53,7 +54,8 @@ void testRemquo() {
     int quo = 0;
     float rem = sycl::remquo(
         -10.0, 3.0,
-        sycl::multi_ptr<int, sycl::access::address_space::global_space>{&quo});
+        sycl::address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::no, int>(&quo));
     assert(quo == -3);
     assert(rem == -1);
   }
@@ -62,7 +64,8 @@ void testRemquo() {
     int quo = 0;
     float rem = sycl::remquo(
         0.552879f, 0.219282f,
-        sycl::multi_ptr<int, sycl::access::address_space::global_space>{&quo});
+        sycl::address_space_cast<sycl::access::address_space::global_space,
+                                 sycl::access::decorated::no, int>(&quo));
     assert(quo == 3);
     assert(rem == -0.10496702790260315f);
   }
diff --git a/sycl/test-e2e/Basic/device_event.cpp b/sycl/test-e2e/Basic/device_event.cpp
index b9c5354a74cdf..25631fac20843 100644
--- a/sycl/test-e2e/Basic/device_event.cpp
+++ b/sycl/test-e2e/Basic/device_event.cpp
@@ -57,8 +57,7 @@ int test_strideN(size_t stride) {
     nElemsToCopy++;
 
   try {
-    default_selector selector;
-    queue myQueue(selector, [](exception_list l) {
+    queue myQueue(default_selector_v, [](exception_list l) {
       for (auto ep : l) {
         try {
           std::rethrow_exception(ep);
@@ -88,7 +87,7 @@ int test_strideN(size_t stride) {
             local_acc.get_multi_ptr<access::decorated::yes>();
         decorated_global_ptr<int> gptr =
             out_ptr.get_multi_ptr<access::decorated::yes>() +
-            grp.get_id()[0] * 16;
+            grp.get_group_id()[0] * 16;
 
         // Write the values 700, 701, ..., 763 to global memory.
         // Why? Well, a) to ensure that something is written into that memory
diff --git a/sycl/test-e2e/Basic/event.cpp b/sycl/test-e2e/Basic/event.cpp
index aec4dbbedd99d..d5cba1063f074 100644
--- a/sycl/test-e2e/Basic/event.cpp
+++ b/sycl/test-e2e/Basic/event.cpp
@@ -52,15 +52,13 @@ int main() {
   }
 
   {
-    struct exception : public sycl::exception {};
-
     std::cout << "wait_and_throw() check" << std::endl;
     bool failed = true;
     auto handler = [&](sycl::exception_list l) { failed = false; };
 
     sycl::queue queue(handler);
     sycl::event e = queue.submit([&](sycl::handler &cgh) {
-      cgh.host_task([=]() { throw exception{}; });
+      cgh.host_task([=]() { throw sycl::exception{sycl::errc::runtime}; });
     });
     e.wait_and_throw();
     assert(failed == false);
diff --git a/sycl/test-e2e/Basic/host-task-dependency.cpp b/sycl/test-e2e/Basic/host-task-dependency.cpp
index 367f8def7fe8a..7f4f31320f1e7 100644
--- a/sycl/test-e2e/Basic/host-task-dependency.cpp
+++ b/sycl/test-e2e/Basic/host-task-dependency.cpp
@@ -32,10 +32,10 @@ struct Context {
 
 S::event HostTask_CopyBuf1ToBuf2(Context *Ctx) {
   S::event Event = Ctx->Queue.submit([&](S::handler &CGH) {
-    S::accessor<int, 1, S::access::mode::read, S::access::target::host_buffer>
-        CopierSrcAcc(Ctx->Buf1, CGH);
-    S::accessor<int, 1, S::access::mode::write, S::access::target::host_buffer>
-        CopierDstAcc(Ctx->Buf2, CGH);
+    S::host_accessor<int, 1, S::access::mode::read> CopierSrcAcc(Ctx->Buf1,
+                                                                 CGH);
+    S::host_accessor<int, 1, S::access::mode::write> CopierDstAcc(Ctx->Buf2,
+                                                                  CGH);
 
     auto CopierHostTask = [=] {
       for (size_t Idx = 0; Idx < CopierDstAcc.size(); ++Idx)
@@ -59,24 +59,21 @@ S::event HostTask_CopyBuf1ToBuf2(Context *Ctx) {
 void Thread1Fn(Context *Ctx) {
   // 0. initialize resulting buffer with apriori wrong result
   {
-    S::accessor<int, 1, S::access::mode::write, S::access::target::host_buffer>
-        Acc(Ctx->Buf1);
+    S::host_accessor<int, 1, S::access::mode::write> Acc(Ctx->Buf1);
 
     for (size_t Idx = 0; Idx < Acc.size(); ++Idx)
       Acc[Idx] = -1;
   }
 
   {
-    S::accessor<int, 1, S::access::mode::write, S::access::target::host_buffer>
-        Acc(Ctx->Buf2);
+    S::host_accessor<int, 1, S::access::mode::write> Acc(Ctx->Buf2);
 
     for (size_t Idx = 0; Idx < Acc.size(); ++Idx)
       Acc[Idx] = -2;
   }
 
   {
-    S::accessor<int, 1, S::access::mode::write, S::access::target::host_buffer>
-        Acc(Ctx->Buf3);
+    S::host_accessor<int, 1, S::access::mode::write> Acc(Ctx->Buf3);
 
     for (size_t Idx = 0; Idx < Acc.size(); ++Idx)
       Acc[Idx] = -3;
@@ -117,8 +114,7 @@ void Thread1Fn(Context *Ctx) {
 
   // 4. check data in buffer #3
   {
-    S::accessor<int, 1, S::access::mode::read, S::access::target::host_buffer>
-        Acc(Ctx->Buf3);
+    S::host_accessor<int, 1, S::access::mode::read> Acc(Ctx->Buf3);
 
     bool Failure = false;
 
@@ -163,8 +159,7 @@ void test() {
 
   // 3. check via host accessor that buf 2 contains valid data
   {
-    S::accessor<int, 1, S::access::mode::read, S::access::target::host_buffer>
-        ResultAcc(Ctx.Buf2);
+    S::host_accessor<int, 1, S::access::mode::read> ResultAcc(Ctx.Buf2);
 
     bool Failure = false;
     for (size_t Idx = 0; Idx < ResultAcc.size(); ++Idx) {
diff --git a/sycl/test-e2e/Basic/image/image.cpp b/sycl/test-e2e/Basic/image/image.cpp
index 9d7165150194a..3fdc767420ca7 100644
--- a/sycl/test-e2e/Basic/image/image.cpp
+++ b/sycl/test-e2e/Basic/image/image.cpp
@@ -91,7 +91,7 @@ int main() {
 
     constexpr int dims = 1;
 
-    using data_img = sycl::cl_float4;
+    using data_img = sycl::float4;
     constexpr auto mode_img = sycl::access::mode::read;
     constexpr auto target_img = sycl::target::image;
     const auto range_img = sycl::range<dims>(3);
diff --git a/sycl/test-e2e/Basic/image/image_accessor_readsampler.cpp b/sycl/test-e2e/Basic/image/image_accessor_readsampler.cpp
index 78cdc0bd1e41f..20c7ecb4fec8a 100644
--- a/sycl/test-e2e/Basic/image/image_accessor_readsampler.cpp
+++ b/sycl/test-e2e/Basic/image/image_accessor_readsampler.cpp
@@ -26,14 +26,13 @@ namespace s = sycl;
 
 template <int unique_number> class kernel_class;
 
-void validateReadData(s::cl_float4 ReadData, s::cl_float4 ExpectedColor,
-                      s::cl_int precision = 1) {
+void validateReadData(s::float4 ReadData, s::float4 ExpectedColor,
+                      int precision = 1) {
   // Maximum difference of 1.5 ULP is allowed when precision = 1.
-  s::cl_int4 PixelDataInt = ReadData.template as<s::cl_int4>();
-  s::cl_int4 ExpectedDataInt = ExpectedColor.template as<s::cl_int4>();
-  s::cl_int4 Diff = ExpectedDataInt - PixelDataInt;
-  s::cl_int DataIsCorrect =
-      s::all((Diff <= precision) && (Diff >= (-precision)));
+  s::int4 PixelDataInt = ReadData.template as<s::int4>();
+  s::int4 ExpectedDataInt = ExpectedColor.template as<s::int4>();
+  s::int4 Diff = ExpectedDataInt - PixelDataInt;
+  int DataIsCorrect = s::all((Diff <= precision) && (Diff >= (-precision)));
 #if DEBUG_OUTPUT
   {
     if (DataIsCorrect) {
@@ -49,28 +48,30 @@ void validateReadData(s::cl_float4 ReadData, s::cl_float4 ExpectedColor,
     Diff.dump();
   }
 #else
-  { assert(DataIsCorrect); }
+  {
+    assert(DataIsCorrect);
+  }
 #endif
 }
 
 template <int i>
-void checkReadSampler(char *host_ptr, s::sampler Sampler, s::cl_float4 Coord,
-                      s::cl_float4 ExpectedColor, s::cl_int precision = 1) {
+void checkReadSampler(char *host_ptr, s::sampler Sampler, s::float4 Coord,
+                      s::float4 ExpectedColor, int precision = 1) {
 
-  s::cl_float4 ReadData;
+  s::float4 ReadData;
   {
     // image with dim = 3
     s::image<3> Img(host_ptr, s::image_channel_order::rgba,
                     s::image_channel_type::snorm_int8, s::range<3>{2, 3, 4});
     s::queue myQueue;
-    s::buffer<s::cl_float4, 1> ReadDataBuf(&ReadData, s::range<1>(1));
+    s::buffer<s::float4, 1> ReadDataBuf(&ReadData, s::range<1>(1));
     myQueue.submit([&](s::handler &cgh) {
-      auto ReadAcc = Img.get_access<s::cl_float4, s::access::mode::read>(cgh);
-      s::accessor<s::cl_float4, 1, s::access::mode::write> ReadDataBufAcc(
+      auto ReadAcc = Img.get_access<s::float4, s::access::mode::read>(cgh);
+      s::accessor<s::float4, 1, s::access::mode::write> ReadDataBufAcc(
           ReadDataBuf, cgh);
 
       cgh.single_task<class kernel_class<i>>([=]() {
-        s::cl_float4 RetColor = ReadAcc.read(Coord, Sampler);
+        s::float4 RetColor = ReadAcc.read(Coord, Sampler);
         ReadDataBufAcc[0] = RetColor;
       });
     });
@@ -90,9 +91,8 @@ void checkSamplerNearest() {
   // addressing_mode::mirrored_repeat
   {
     // Out-of-range mirrored_repeat mode
-    s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f;
+    s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f;
     auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized,
                               s::addressing_mode::mirrored_repeat,
                               s::filtering_mode::nearest);
@@ -102,9 +102,8 @@ void checkSamplerNearest() {
   // addressing_mode::repeat
   {
     // Out-of-range repeat mode
-    s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f;
+    s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f;
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::normalized,
                    s::addressing_mode::repeat, s::filtering_mode::nearest);
@@ -114,9 +113,8 @@ void checkSamplerNearest() {
   // addressing_mode::clamp_to_edge
   {
     // Out-of-range Edge Color
-    s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(88.0f, 89.0f, 90.0f, 91.0f) / 127.0f;
+    s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(88.0f, 89.0f, 90.0f, 91.0f) / 127.0f;
     auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized,
                               s::addressing_mode::clamp_to_edge,
                               s::filtering_mode::nearest);
@@ -126,8 +124,8 @@ void checkSamplerNearest() {
   // addressing_mode::clamp
   {
     // Out-of-range Border Color
-    s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
-    s::cl_float4 ExpectedValue = s::cl_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(0.0f, 0.0f, 0.0f, 0.0f);
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::normalized,
                    s::addressing_mode::clamp, s::filtering_mode::nearest);
@@ -137,9 +135,8 @@ void checkSamplerNearest() {
   // addressing_mode::none
   {
     // In-range for consistent return value.
-    s::cl_float4 Coord(0.0f, 0.5f, 0.75f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(80.0f, 81.0f, 82.0f, 83.0f) / 127.0f;
+    s::float4 Coord(0.0f, 0.5f, 0.75f, 0.0f);
+    s::float4 ExpectedValue = s::float4(80.0f, 81.0f, 82.0f, 83.0f) / 127.0f;
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::normalized,
                    s::addressing_mode::none, s::filtering_mode::nearest);
@@ -149,9 +146,8 @@ void checkSamplerNearest() {
   // B. coordinate_normalization_mode::unnormalized
   // addressing_mode::clamp_to_edge
   {
-    s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f;
+    s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f;
     auto Sampler = s::sampler(s::coordinate_normalization_mode::unnormalized,
                               s::addressing_mode::clamp_to_edge,
                               s::filtering_mode::nearest);
@@ -160,9 +156,8 @@ void checkSamplerNearest() {
 
   // addressing_mode::clamp
   {
-    s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f;
+    s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f;
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::unnormalized,
                    s::addressing_mode::clamp, s::filtering_mode::nearest);
@@ -172,9 +167,8 @@ void checkSamplerNearest() {
   // addressing_mode::none
   {
     // In-range for consistent return value.
-    s::cl_float4 Coord(0.0f, 1.0f, 2.0f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f;
+    s::float4 Coord(0.0f, 1.0f, 2.0f, 0.0f);
+    s::float4 ExpectedValue = s::float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f;
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::unnormalized,
                    s::addressing_mode::none, s::filtering_mode::nearest);
@@ -190,7 +184,7 @@ void checkSamplerNearest() {
 // value of 15000 ULP is used.
 void checkSamplerLinear() {
 
-  const s::cl_int PrecisionInULP = 15000;
+  const int PrecisionInULP = 15000;
   // create image:
   char host_ptr[100];
   for (int i = 0; i < 100; i++)
@@ -201,9 +195,8 @@ void checkSamplerLinear() {
   // addressing_mode::mirrored_repeat
   {
     // Out-of-range mirrored_repeat mode
-    s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(44.0f, 45.0f, 46.0f, 47.0f) / 127.0f;
+    s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(44.0f, 45.0f, 46.0f, 47.0f) / 127.0f;
     auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized,
                               s::addressing_mode::mirrored_repeat,
                               s::filtering_mode::linear);
@@ -212,9 +205,8 @@ void checkSamplerLinear() {
   }
   {
     // In-range mirrored_repeat mode
-    s::cl_float4 Coord(0.0f, 0.25f, 0.55f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(42.8f, 43.8f, 44.8f, 45.8f) / 127.0f;
+    s::float4 Coord(0.0f, 0.25f, 0.55f, 0.0f);
+    s::float4 ExpectedValue = s::float4(42.8f, 43.8f, 44.8f, 45.8f) / 127.0f;
     auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized,
                               s::addressing_mode::mirrored_repeat,
                               s::filtering_mode::linear);
@@ -225,9 +217,8 @@ void checkSamplerLinear() {
   // addressing_mode::repeat
   {
     // Out-of-range repeat mode
-    s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(46.0f, 47.0f, 48.0f, 49.0f) / 127.0f;
+    s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(46.0f, 47.0f, 48.0f, 49.0f) / 127.0f;
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::normalized,
                    s::addressing_mode::repeat, s::filtering_mode::linear);
@@ -236,9 +227,8 @@ void checkSamplerLinear() {
   }
   {
     // In-range repeat mode
-    s::cl_float4 Coord(0.0f, 0.25f, 0.55f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(44.8f, 45.8f, 46.8f, 47.8f) / 127.0f;
+    s::float4 Coord(0.0f, 0.25f, 0.55f, 0.0f);
+    s::float4 ExpectedValue = s::float4(44.8f, 45.8f, 46.8f, 47.8f) / 127.0f;
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::normalized,
                    s::addressing_mode::repeat, s::filtering_mode::linear);
@@ -249,9 +239,8 @@ void checkSamplerLinear() {
   // addressing_mode::clamp_to_edge
   {
     // Out-of-range Edge Color
-    s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(88.0f, 89.0f, 90.0f, 91.0f) / 127.0f;
+    s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(88.0f, 89.0f, 90.0f, 91.0f) / 127.0f;
     auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized,
                               s::addressing_mode::clamp_to_edge,
                               s::filtering_mode::linear);
@@ -259,9 +248,8 @@ void checkSamplerLinear() {
                         PrecisionInULP);
   }
   {
-    s::cl_float4 Coord(0.0f, 0.2f, 0.5f, 0.0f); // In-range
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(36.8f, 37.8f, 38.8f, 39.8f) / 127.0f;
+    s::float4 Coord(0.0f, 0.2f, 0.5f, 0.0f); // In-range
+    s::float4 ExpectedValue = s::float4(36.8f, 37.8f, 38.8f, 39.8f) / 127.0f;
     auto Sampler = s::sampler(s::coordinate_normalization_mode::normalized,
                               s::addressing_mode::clamp_to_edge,
                               s::filtering_mode::linear);
@@ -272,8 +260,8 @@ void checkSamplerLinear() {
   // addressing_mode::clamp
   {
     // Out-of-range
-    s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
-    s::cl_float4 ExpectedValue = s::cl_float4(0.0f, 0.0f, 0.0f, 0.0f);
+    s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(0.0f, 0.0f, 0.0f, 0.0f);
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::normalized,
                    s::addressing_mode::clamp, s::filtering_mode::linear);
@@ -282,9 +270,8 @@ void checkSamplerLinear() {
   }
   {
     // In-range
-    s::cl_float4 Coord(0.0f, 0.2f, 0.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(18.4f, 18.9f, 19.4f, 19.9f) / 127.0f;
+    s::float4 Coord(0.0f, 0.2f, 0.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(18.4f, 18.9f, 19.4f, 19.9f) / 127.0f;
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::normalized,
                    s::addressing_mode::clamp, s::filtering_mode::linear);
@@ -295,9 +282,8 @@ void checkSamplerLinear() {
   // addressing_mode::none
   {
     // In-range for consistent return value.
-    s::cl_float4 Coord(0.5f, 0.5f, 0.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(46.0f, 47.0f, 48.0f, 49.0f) / 127.0f;
+    s::float4 Coord(0.5f, 0.5f, 0.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(46.0f, 47.0f, 48.0f, 49.0f) / 127.0f;
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::normalized,
                    s::addressing_mode::none, s::filtering_mode::linear);
@@ -309,9 +295,8 @@ void checkSamplerLinear() {
   // addressing_mode::clamp_to_edge
   {
     // Out-of-range
-    s::cl_float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f;
+    s::float4 Coord(0.0f, 1.5f, 2.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(56.0f, 57.0f, 58.0f, 59.0f) / 127.0f;
     auto Sampler = s::sampler(s::coordinate_normalization_mode::unnormalized,
                               s::addressing_mode::clamp_to_edge,
                               s::filtering_mode::linear);
@@ -320,8 +305,8 @@ void checkSamplerLinear() {
   }
   {
     // In-range
-    s::cl_float4 Coord(0.0f, 0.2f, 0.5f, 0.0f);
-    s::cl_float4 ExpectedValue = s::cl_float4(0.0f, 1.0f, 2.0f, 3.0f) / 127.0f;
+    s::float4 Coord(0.0f, 0.2f, 0.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(0.0f, 1.0f, 2.0f, 3.0f) / 127.0f;
     auto Sampler = s::sampler(s::coordinate_normalization_mode::unnormalized,
                               s::addressing_mode::clamp_to_edge,
                               s::filtering_mode::linear);
@@ -332,9 +317,8 @@ void checkSamplerLinear() {
   // addressing_mode::clamp
   {
     // Out-of-range
-    s::cl_float4 Coord(0.0f, 1.5f, 1.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(16.0f, 16.5f, 17.0f, 17.5f) / 127.0f;
+    s::float4 Coord(0.0f, 1.5f, 1.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(16.0f, 16.5f, 17.0f, 17.5f) / 127.0f;
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::unnormalized,
                    s::addressing_mode::clamp, s::filtering_mode::linear);
@@ -343,9 +327,8 @@ void checkSamplerLinear() {
   }
   {
     // In-range
-    s::cl_float4 Coord(0.0f, 0.2f, 0.5f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(0.0f, 0.35f, 0.7f, 1.05f) / 127.0f;
+    s::float4 Coord(0.0f, 0.2f, 0.5f, 0.0f);
+    s::float4 ExpectedValue = s::float4(0.0f, 0.35f, 0.7f, 1.05f) / 127.0f;
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::unnormalized,
                    s::addressing_mode::clamp, s::filtering_mode::linear);
@@ -356,9 +339,8 @@ void checkSamplerLinear() {
   // addressing_mode::none
   {
     // In-range for consistent return value.
-    s::cl_float4 Coord(1.0f, 2.0f, 3.0f, 0.0f);
-    s::cl_float4 ExpectedValue =
-        s::cl_float4(74.0f, 75.0f, 76.0f, 77.0f) / 127.0f;
+    s::float4 Coord(1.0f, 2.0f, 3.0f, 0.0f);
+    s::float4 ExpectedValue = s::float4(74.0f, 75.0f, 76.0f, 77.0f) / 127.0f;
     auto Sampler =
         s::sampler(s::coordinate_normalization_mode::unnormalized,
                    s::addressing_mode::none, s::filtering_mode::linear);
@@ -369,8 +351,8 @@ void checkSamplerLinear() {
 
 int main() {
 
-  // Note: Currently these functions only check for cl_float4 return datatype,
-  // the test case can be extended to test all return datatypes.
+  // Note: Currently these functions only check for vec<float, 4> return
+  // datatype, the test case can be extended to test all return datatypes.
   checkSamplerNearest();
   checkSamplerLinear();
 }
diff --git a/sycl/test-e2e/Basic/image/image_accessor_readwrite.cpp b/sycl/test-e2e/Basic/image/image_accessor_readwrite.cpp
index 0508e2f2e5f2b..1313bc8e96223 100644
--- a/sycl/test-e2e/Basic/image/image_accessor_readwrite.cpp
+++ b/sycl/test-e2e/Basic/image/image_accessor_readwrite.cpp
@@ -28,8 +28,8 @@ template <typename WriteDataT, int ImgType, int read_write> class kernel_class;
 
 template <typename ReadDataT,
           typename = typename std::enable_if<
-              (!(std::is_same_v<ReadDataT, s::cl_float4>) &&
-               !(std::is_same_v<ReadDataT, s::cl_half4>))>::type>
+              (!(std::is_same_v<ReadDataT, s::float4>) &&
+               !(std::is_same_v<ReadDataT, s::half4>))>::type>
 void check_read_data(ReadDataT ReadData, ReadDataT ExpectedColor) {
   using ReadDataType = typename ReadDataT::element_type;
   bool CorrectData = false;
@@ -59,11 +59,11 @@ void check_read_data(ReadDataT ReadData, ReadDataT ExpectedColor) {
 #endif
 }
 
-void check_read_data(s::cl_float4 ReadData, s::cl_float4 ExpectedColor) {
+void check_read_data(s::float4 ReadData, s::float4 ExpectedColor) {
   // Maximum difference of 1.5 ULP is allowed.
-  s::cl_int4 PixelDataInt = ReadData.template as<s::cl_int4>();
-  s::cl_int4 ExpectedDataInt = ExpectedColor.template as<s::cl_int4>();
-  s::cl_int4 Diff = ExpectedDataInt - PixelDataInt;
+  s::int4 PixelDataInt = ReadData.template as<s::int4>();
+  s::int4 ExpectedDataInt = ExpectedColor.template as<s::int4>();
+  s::int4 Diff = ExpectedDataInt - PixelDataInt;
   bool CorrectData = false;
   if ((Diff.x() <= 1 && Diff.x() >= -1) && (Diff.y() <= 1 && Diff.y() >= -1) &&
       (Diff.z() <= 1 && Diff.z() >= -1) && (Diff.w() <= 1 && Diff.w() >= -1))
@@ -89,10 +89,10 @@ void check_read_data(s::cl_float4 ReadData, s::cl_float4 ExpectedColor) {
 #endif
 }
 
-void check_read_data(s::cl_half4 ReadData, s::cl_half4 ExpectedColor) {
+void check_read_data(s::half4 ReadData, s::half4 ExpectedColor) {
   // Maximum difference of 1.5 ULP is allowed.
-  s::cl_float4 ReadDatafloat = ReadData.template convert<float>();
-  s::cl_float4 ExpectedColorfloat = ExpectedColor.template convert<float>();
+  s::float4 ReadDatafloat = ReadData.template convert<float>();
+  s::float4 ExpectedColorfloat = ExpectedColor.template convert<float>();
   check_read_data(ReadDatafloat, ExpectedColorfloat);
 }
 
@@ -142,102 +142,102 @@ void check_read_type_order(char *HostPtr, const s::image_channel_order ImgOrder,
 
 template <typename T> void check(char *);
 
-template <> void check<s::cl_int4>(char *HostPtr) {
+template <> void check<s::int4>(char *HostPtr) {
   // valid channel types:
   // s::image_channel_type::signed_int8,
-  write_type_order<s::cl_int4, s::image_channel_type::signed_int8>(
+  write_type_order<s::int4, s::image_channel_type::signed_int8>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_int4(std::numeric_limits<s::cl_int>::max(),
-                 std::numeric_limits<s::cl_int>::min(), 123, 0));
-  check_read_type_order<s::cl_int4, s::image_channel_type::signed_int8>(
+      s::int4(std::numeric_limits<int>::max(), std::numeric_limits<int>::min(),
+              123, 0));
+  check_read_type_order<s::int4, s::image_channel_type::signed_int8>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_int4(std::numeric_limits<s::cl_char>::max(),
-                 std::numeric_limits<s::cl_char>::min(), 123, 0));
+      s::int4(std::numeric_limits<char>::max(),
+              std::numeric_limits<char>::min(), 123, 0));
 
   // s::image_channel_type::signed_int16,
-  write_type_order<s::cl_int4, s::image_channel_type::signed_int16>(
+  write_type_order<s::int4, s::image_channel_type::signed_int16>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_int4(std::numeric_limits<s::cl_int>::max(),
-                 std::numeric_limits<s::cl_int>::min(), 123, 0));
-  check_read_type_order<s::cl_int4, s::image_channel_type::signed_int16>(
+      s::int4(std::numeric_limits<int>::max(), std::numeric_limits<int>::min(),
+              123, 0));
+  check_read_type_order<s::int4, s::image_channel_type::signed_int16>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_int4(std::numeric_limits<s::cl_short>::max(),
-                 std::numeric_limits<s::cl_short>::min(), 123, 0));
+      s::int4(std::numeric_limits<short>::max(),
+              std::numeric_limits<short>::min(), 123, 0));
 
   // s::image_channel_type::signed_int32.
-  write_type_order<s::cl_int4, s::image_channel_type::signed_int32>(
+  write_type_order<s::int4, s::image_channel_type::signed_int32>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_int4(std::numeric_limits<s::cl_int>::max(),
-                 std::numeric_limits<s::cl_int>::min(), 123, 0));
-  check_read_type_order<s::cl_int4, s::image_channel_type::signed_int32>(
+      s::int4(std::numeric_limits<int>::max(), std::numeric_limits<int>::min(),
+              123, 0));
+  check_read_type_order<s::int4, s::image_channel_type::signed_int32>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_int4(std::numeric_limits<s::cl_int>::max(),
-                 std::numeric_limits<s::cl_int>::min(), 123, 0));
+      s::int4(std::numeric_limits<int>::max(), std::numeric_limits<int>::min(),
+              123, 0));
 };
 
-template <> void check<s::cl_uint4>(char *HostPtr) {
-  // Calling only valid channel types with s::cl_uint4.
+template <> void check<s::uint4>(char *HostPtr) {
+  // Calling only valid channel types with s::uint4.
   // s::image_channel_type::signed_int8
-  write_type_order<s::cl_uint4, s::image_channel_type::unsigned_int8>(
+  write_type_order<s::uint4, s::image_channel_type::unsigned_int8>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_uint4(std::numeric_limits<s::cl_uint>::max(),
-                  std::numeric_limits<s::cl_uint>::min(), 123, 0));
-  check_read_type_order<s::cl_uint4, s::image_channel_type::unsigned_int8>(
+      s::uint4(std::numeric_limits<unsigned int>::max(),
+               std::numeric_limits<unsigned int>::min(), 123, 0));
+  check_read_type_order<s::uint4, s::image_channel_type::unsigned_int8>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_uint4(std::numeric_limits<s::cl_uchar>::max(),
-                  std::numeric_limits<s::cl_uchar>::min(), 123, 0));
+      s::uint4(std::numeric_limits<unsigned char>::max(),
+               std::numeric_limits<unsigned char>::min(), 123, 0));
 
   // s::image_channel_type::signed_int16
-  write_type_order<s::cl_uint4, s::image_channel_type::unsigned_int16>(
+  write_type_order<s::uint4, s::image_channel_type::unsigned_int16>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_uint4(std::numeric_limits<s::cl_uint>::max(),
-                  std::numeric_limits<s::cl_uint>::min(), 123, 0));
-  check_read_type_order<s::cl_uint4, s::image_channel_type::unsigned_int16>(
+      s::uint4(std::numeric_limits<unsigned int>::max(),
+               std::numeric_limits<unsigned int>::min(), 123, 0));
+  check_read_type_order<s::uint4, s::image_channel_type::unsigned_int16>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_uint4(std::numeric_limits<s::cl_ushort>::max(),
-                  std::numeric_limits<s::cl_ushort>::min(), 123, 0));
+      s::uint4(std::numeric_limits<unsigned short>::max(),
+               std::numeric_limits<unsigned short>::min(), 123, 0));
 
   // s::image_channel_type::signed_int32
-  write_type_order<s::cl_uint4, s::image_channel_type::unsigned_int32>(
+  write_type_order<s::uint4, s::image_channel_type::unsigned_int32>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_uint4(std::numeric_limits<s::cl_uint>::max(),
-                  std::numeric_limits<s::cl_uint>::min(), 123, 0));
-  check_read_type_order<s::cl_uint4, s::image_channel_type::unsigned_int32>(
+      s::uint4(std::numeric_limits<unsigned int>::max(),
+               std::numeric_limits<unsigned int>::min(), 123, 0));
+  check_read_type_order<s::uint4, s::image_channel_type::unsigned_int32>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_uint4(std::numeric_limits<s::cl_uint>::max(),
-                  std::numeric_limits<s::cl_uint>::min(), 123, 0));
+      s::uint4(std::numeric_limits<unsigned int>::max(),
+               std::numeric_limits<unsigned int>::min(), 123, 0));
 };
 
-template <> void check<s::cl_float4>(char *HostPtr) {
-  // Calling only valid channel types with s::cl_float4.
+template <> void check<s::float4>(char *HostPtr) {
+  // Calling only valid channel types with s::float4.
   // TODO: Correct the values below.
   // s::image_channel_type::snorm_int8,
-  write_type_order<s::cl_float4, s::image_channel_type::snorm_int8>(
-      HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0));
-  check_read_type_order<s::cl_float4, s::image_channel_type::snorm_int8>(
+  write_type_order<s::float4, s::image_channel_type::snorm_int8>(
+      HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0));
+  check_read_type_order<s::float4, s::image_channel_type::snorm_int8>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_float4(1, -1, ((float)48 / 127) /*0.3779527544975280762f*/, 0));
+      s::float4(1, -1, ((float)48 / 127) /*0.3779527544975280762f*/, 0));
 
   // s::image_channel_type::snorm_int16,
-  write_type_order<s::cl_float4, s::image_channel_type::snorm_int16>(
-      HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0));
-  check_read_type_order<s::cl_float4, s::image_channel_type::snorm_int16>(
+  write_type_order<s::float4, s::image_channel_type::snorm_int16>(
+      HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0));
+  check_read_type_order<s::float4, s::image_channel_type::snorm_int16>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_float4(1, -1, ((float)12288 / 32767) /*0.375011444091796875f*/, 0));
+      s::float4(1, -1, ((float)12288 / 32767) /*0.375011444091796875f*/, 0));
 
   // s::image_channel_type::unorm_int8,
-  write_type_order<s::cl_float4, s::image_channel_type::unorm_int8>(
-      HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0));
-  check_read_type_order<s::cl_float4, s::image_channel_type::unorm_int8>(
+  write_type_order<s::float4, s::image_channel_type::unorm_int8>(
+      HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0));
+  check_read_type_order<s::float4, s::image_channel_type::unorm_int8>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_float4(1, 0, ((float)96 / 255) /*0.3764705955982208252f*/, 0));
+      s::float4(1, 0, ((float)96 / 255) /*0.3764705955982208252f*/, 0));
 
   // s::image_channel_type::unorm_int16
-  write_type_order<s::cl_float4, s::image_channel_type::unorm_int16>(
-      HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0));
-  check_read_type_order<s::cl_float4, s::image_channel_type::unorm_int16>(
+  write_type_order<s::float4, s::image_channel_type::unorm_int16>(
+      HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0));
+  check_read_type_order<s::float4, s::image_channel_type::unorm_int16>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_float4(1, 0, ((float)24576 / 65535) /*0.3750057220458984375f*/, 0));
+      s::float4(1, 0, ((float)24576 / 65535) /*0.3750057220458984375f*/, 0));
 
   // s::image_channel_type::unorm_short_565, order::rgbx
   // Currently unsupported since OpenCL has no information on this.
@@ -247,37 +247,39 @@ template <> void check<s::cl_float4>(char *HostPtr) {
   // (CL_IMAGE_FORMAT_NOT_SUPPORTED) s::image_channel_type::unorm_short_555,
   // order::rgbx
   /*
-  write_type_order<s::cl_float4, s::image_channel_type::unorm_short_555>(
-      HostPtr, s::image_channel_order::rgbx, s::cl_float4(2, -2, 0.375f, 0));
+  write_type_order<s::float4, s::image_channel_type::unorm_short_555>(
+      HostPtr, s::image_channel_order::rgbx,
+      s::float4(2, -2, 0.375f, 0));
 
   // s::image_channel_type::unorm_int_101010, order::rgbx
-  write_type_order<s::cl_float4, s::image_channel_type::unorm_int_101010>(
-      HostPtr, s::image_channel_order::rgbx, s::cl_float4(2, -2, 0.375f, 0));
+  write_type_order<s::float4, s::image_channel_type::unorm_int_101010>(
+      HostPtr, s::image_channel_order::rgbx,
+      s::float4(2, -2, 0.375f, 0));
   */
 
   // s::image_channel_type::fp16
-  write_type_order<s::cl_float4, s::image_channel_type::fp16>(
-      HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0));
-  check_read_type_order<s::cl_float4, s::image_channel_type::fp16>(
-      HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0));
+  write_type_order<s::float4, s::image_channel_type::fp16>(
+      HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0));
+  check_read_type_order<s::float4, s::image_channel_type::fp16>(
+      HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0));
 
   // s::image_channel_type::fp32
-  write_type_order<s::cl_float4, s::image_channel_type::fp32>(
-      HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0));
-  check_read_type_order<s::cl_float4, s::image_channel_type::fp32>(
-      HostPtr, s::image_channel_order::rgba, s::cl_float4(2, -2, 0.375f, 0));
+  write_type_order<s::float4, s::image_channel_type::fp32>(
+      HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0));
+  check_read_type_order<s::float4, s::image_channel_type::fp32>(
+      HostPtr, s::image_channel_order::rgba, s::float4(2, -2, 0.375f, 0));
 };
 
 int main() {
   // Checking only for dimension=1.
-  // 4 datatypes possible: s::cl_uint4, s::cl_int4, s::cl_float4, s::cl_half4.
-  // half4 datatype is checked in a different test case.
-  // create image:
+  // 4 datatypes possible: s::uint4, s::int4, s::float4, s::half4.
+  // The s::half4 datatype is checked in a different test case.
+  // Create image:
   char HostPtr[100];
   for (int i = 0; i < 100; i++)
     HostPtr[i] = i;
 
-  check<s::cl_int4>(HostPtr);
-  check<s::cl_uint4>(HostPtr);
-  check<s::cl_float4>(HostPtr);
+  check<s::int4>(HostPtr);
+  check<s::uint4>(HostPtr);
+  check<s::float4>(HostPtr);
 }
diff --git a/sycl/test-e2e/Basic/image/image_accessor_readwrite_half.cpp b/sycl/test-e2e/Basic/image/image_accessor_readwrite_half.cpp
index 7fcd17a87302f..88cc8825f2b92 100644
--- a/sycl/test-e2e/Basic/image/image_accessor_readwrite_half.cpp
+++ b/sycl/test-e2e/Basic/image/image_accessor_readwrite_half.cpp
@@ -26,16 +26,14 @@ namespace s = sycl;
 
 template <typename WriteDataT, int ImgType, int read_write> class kernel_class;
 
-void check_read_data(s::cl_float4 ReadData, s::cl_float4 ExpectedColor) {
+void check_read_data(s::float4 ReadData, s::float4 ExpectedColor) {
   // Maximum difference of 1.5 ULP is allowed.
-  s::cl_int4 PixelDataInt = ReadData.template as<s::cl_int4>();
-  s::cl_int4 ExpectedDataInt = ExpectedColor.template as<s::cl_int4>();
-  s::cl_int4 Diff = ExpectedDataInt - PixelDataInt;
+  s::int4 PixelDataInt = ReadData.template as<s::int4>();
+  s::int4 ExpectedDataInt = ExpectedColor.template as<s::int4>();
+  s::int4 Diff = ExpectedDataInt - PixelDataInt;
   bool CorrectData = false;
-  if (((s::cl_int)Diff.x() <= 1 && (s::cl_int)Diff.x() >= -1) &&
-      ((s::cl_int)Diff.y() <= 1 && (s::cl_int)Diff.y() >= -1) &&
-      ((s::cl_int)Diff.z() <= 1 && (s::cl_int)Diff.z() >= -1) &&
-      ((s::cl_int)Diff.w() <= 1 && (s::cl_int)Diff.w() >= -1))
+  if ((Diff.x() <= 1 && Diff.x() >= -1) && (Diff.y() <= 1 && Diff.y() >= -1) &&
+      (Diff.z() <= 1 && Diff.z() >= -1) && (Diff.w() <= 1 && Diff.w() >= -1))
     CorrectData = true;
 
 #if DEBUG_OUTPUT
@@ -59,9 +57,9 @@ void check_read_data(s::cl_float4 ReadData, s::cl_float4 ExpectedColor) {
 #endif
 }
 
-void check_read_data(s::cl_half4 ReadData, s::cl_half4 ExpectedColor) {
-  s::cl_float4 ReadDatafloat = ReadData.convert<float>();
-  s::cl_float4 ExpectedColorfloat = ExpectedColor.convert<float>();
+void check_read_data(s::half4 ReadData, s::half4 ExpectedColor) {
+  s::float4 ReadDatafloat = ReadData.convert<float>();
+  s::float4 ExpectedColorfloat = ExpectedColor.convert<float>();
   check_read_data(ReadDatafloat, ExpectedColorfloat);
 }
 
@@ -111,40 +109,40 @@ void check_read_type_order(char *HostPtr, const s::image_channel_order ImgOrder,
 
 void check_half4(char *HostPtr) {
 
-  // Calling only valid channel types with s::cl_half4.
+  // Calling only valid channel types with s::half4.
   // s::image_channel_type::snorm_int8,
-  write_type_order<s::cl_half4, s::image_channel_type::snorm_int8>(
-      HostPtr, s::image_channel_order::rgba, s::cl_half4(2, -2, 0.375f, 0));
-  check_read_type_order<s::cl_half4, s::image_channel_type::snorm_int8>(
+  write_type_order<s::half4, s::image_channel_type::snorm_int8>(
+      HostPtr, s::image_channel_order::rgba, s::half4(2, -2, 0.375f, 0));
+  check_read_type_order<s::half4, s::image_channel_type::snorm_int8>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_half4(1, -1, ((float)48 / 127) /*0.3779527544975280762f*/, 0));
+      s::half4(1, -1, ((float)48 / 127) /*0.3779527544975280762f*/, 0));
 
   // s::image_channel_type::snorm_int16,
-  write_type_order<s::cl_half4, s::image_channel_type::snorm_int16>(
-      HostPtr, s::image_channel_order::rgba, s::cl_half4(2, -2, 0.375f, 0));
-  check_read_type_order<s::cl_half4, s::image_channel_type::snorm_int16>(
+  write_type_order<s::half4, s::image_channel_type::snorm_int16>(
+      HostPtr, s::image_channel_order::rgba, s::half4(2, -2, 0.375f, 0));
+  check_read_type_order<s::half4, s::image_channel_type::snorm_int16>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_half4(1, -1, ((float)12288 / 32767) /*0.375011444091796875f*/, 0));
+      s::half4(1, -1, ((float)12288 / 32767) /*0.375011444091796875f*/, 0));
 
   // s::image_channel_type::unorm_int8,
-  write_type_order<s::cl_half4, s::image_channel_type::unorm_int8>(
-      HostPtr, s::image_channel_order::rgba, s::cl_half4(2, -2, 0.375f, 0));
-  check_read_type_order<s::cl_half4, s::image_channel_type::unorm_int8>(
+  write_type_order<s::half4, s::image_channel_type::unorm_int8>(
+      HostPtr, s::image_channel_order::rgba, s::half4(2, -2, 0.375f, 0));
+  check_read_type_order<s::half4, s::image_channel_type::unorm_int8>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_half4(1, 0, ((float)96 / 255) /*0.3764705955982208252f*/, 0));
+      s::half4(1, 0, ((float)96 / 255) /*0.3764705955982208252f*/, 0));
 
   // s::image_channel_type::unorm_int16
-  write_type_order<s::cl_half4, s::image_channel_type::unorm_int16>(
-      HostPtr, s::image_channel_order::rgba, s::cl_half4(1, -1, 0.375f, 0));
-  check_read_type_order<s::cl_half4, s::image_channel_type::unorm_int16>(
+  write_type_order<s::half4, s::image_channel_type::unorm_int16>(
+      HostPtr, s::image_channel_order::rgba, s::half4(1, -1, 0.375f, 0));
+  check_read_type_order<s::half4, s::image_channel_type::unorm_int16>(
       HostPtr, s::image_channel_order::rgba,
-      s::cl_half4(1, 0, ((float)24576 / 65535) /*0.3750057220458984375f*/, 0));
+      s::half4(1, 0, ((float)24576 / 65535) /*0.3750057220458984375f*/, 0));
 
   // s::image_channel_type::fp16
-  write_type_order<s::cl_half4, s::image_channel_type::fp16>(
-      HostPtr, s::image_channel_order::rgba, s::cl_half4(2, -2, 0.375f, 0));
-  check_read_type_order<s::cl_half4, s::image_channel_type::fp16>(
-      HostPtr, s::image_channel_order::rgba, s::cl_half4(2, -2, 0.375f, 0));
+  write_type_order<s::half4, s::image_channel_type::fp16>(
+      HostPtr, s::image_channel_order::rgba, s::half4(2, -2, 0.375f, 0));
+  check_read_type_order<s::half4, s::image_channel_type::fp16>(
+      HostPtr, s::image_channel_order::rgba, s::half4(2, -2, 0.375f, 0));
 };
 
 int main() {
diff --git a/sycl/test-e2e/Basic/kernel_max_wg_size.cpp b/sycl/test-e2e/Basic/kernel_max_wg_size.cpp
index 7dfc2520920b2..3e83a657af9d0 100644
--- a/sycl/test-e2e/Basic/kernel_max_wg_size.cpp
+++ b/sycl/test-e2e/Basic/kernel_max_wg_size.cpp
@@ -51,5 +51,6 @@ int main() {
   });
 
   myQueue.wait();
+  free(result, myQueue);
   return 0;
 }
diff --git a/sycl/test-e2e/Basic/sycl_2020_images/common.hpp b/sycl/test-e2e/Basic/sycl_2020_images/common.hpp
index e03a1664e1f92..b5b97a66d2f2f 100644
--- a/sycl/test-e2e/Basic/sycl_2020_images/common.hpp
+++ b/sycl/test-e2e/Basic/sycl_2020_images/common.hpp
@@ -269,7 +269,7 @@ template <typename T, int Dims> bool AllTrue(const vec<T, Dims> &Vec) {
 
 template <typename T, int Dims>
 bool ApproxEq(const vec<T, Dims> &LHS, const vec<T, Dims> &RHS,
-              T Precision = 0.1) {
+              T Precision = (T)0.1) {
   if constexpr (std::is_integral_v<T>)
     return AllTrue(sycl::abs(LHS - RHS) <= Precision);
   else
diff --git a/sycl/test-e2e/Basic/vector/byte.cpp b/sycl/test-e2e/Basic/vector/byte.cpp
index f724841332aa1..3d1c372f79837 100644
--- a/sycl/test-e2e/Basic/vector/byte.cpp
+++ b/sycl/test-e2e/Basic/vector/byte.cpp
@@ -182,6 +182,7 @@ int main() {
     // std::byte is not an arithmetic type or a character type, so std::byte
     // and vec<std::byte> should not support artithmetic operations. In the
     // new implementation of vec<> class, the following will be removed.
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
     {
       // binary op for 2 vec
       auto vop = VecByte3A + VecByte3B;
@@ -352,6 +353,35 @@ int main() {
 
       auto bitv2 = !VecByte4A;
     }
+#else
+    {
+      // std::byte is not an arithmetic type and it only supports the following
+      // overloads of >> and << operators.
+      //
+      // 1 template <class IntegerType>
+      //   constexpr std::byte operator<<( std::byte b, IntegerType shift )
+      //   noexcept;
+      // 2 template <class IntegerType>
+      //   constexpr std::byte operator>>( std::byte b, IntegerType shift )
+      //   noexcept;
+      auto VecByte3Shift = VecByte3A << 3;
+      assert(VecByte3Shift[0] == VecByte3A[0] << 3 &&
+             VecByte3Shift[1] == VecByte3A[1] << 3 &&
+             VecByte3Shift[2] == VecByte3A[2] << 3);
+
+      VecByte3Shift = VecByte3A >> 1;
+      assert(VecByte3Shift[0] == VecByte3A[0] >> 1 &&
+             VecByte3Shift[1] == VecByte3A[1] >> 1 &&
+             VecByte3Shift[2] == VecByte3A[2] >> 1);
+
+      auto SwizByte2Shift = VecByte4A.lo();
+      using VecType = sycl::vec<std::byte, 2>;
+      auto SwizShiftRight = (VecType)(SwizByte2Shift >> 3);
+      auto SwizShiftLeft = (VecType)(SwizByte2Shift << 3);
+      assert(SwizShiftRight[0] == SwizByte2Shift[0] >> 3 &&
+             SwizShiftLeft[1] == SwizByte2Shift[1] << 3);
+    }
+#endif // __INTEL_PREVIEW_BREAKING_CHANGES
   }
 
   return 0;
diff --git a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_relational.cpp b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_relational.cpp
index 2ba7ff130bf72..e07e2548a840c 100644
--- a/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_relational.cpp
+++ b/sycl/test-e2e/Basic/vector/vec_binary_scalar_order_relational.cpp
@@ -2,9 +2,6 @@
 // RUN: %{build} -fpreview-breaking-changes -o %t.out
 // RUN: %{run} %t.out
 
-// This test currently fails on AMD HIP due to an unresolved memcmp function.
-// XFAIL: hip_amd
-
 // Checks scalar/vec relational operator ordering.
 
 #include "vec_binary_scalar_order.hpp"
diff --git a/sycl/test-e2e/CMakeLists.txt b/sycl/test-e2e/CMakeLists.txt
index b838d89e0786d..c8b096315a8d6 100644
--- a/sycl/test-e2e/CMakeLists.txt
+++ b/sycl/test-e2e/CMakeLists.txt
@@ -2,6 +2,9 @@ cmake_minimum_required(VERSION 3.20.0)
 
 message("Configuring SYCL End-to-End Tests")
 
+option(SYCL_E2E_LIT_ALLOW_UNKNOWN_ARCH
+  "Allow unknown architectures when configuring e2e tests" Off)
+
 if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
   project(sycl-e2e-test-suite CXX)
   set(SYCL_TEST_E2E_STANDALONE TRUE)
diff --git a/sycl/test-e2e/Config/config.cpp b/sycl/test-e2e/Config/config.cpp
index 701fd8c2961f0..1499a6f9908d5 100644
--- a/sycl/test-e2e/Config/config.cpp
+++ b/sycl/test-e2e/Config/config.cpp
@@ -6,15 +6,15 @@
 //
 //===----------------------------------------------------------------------===//
 // RUN: %{build} %debug_option -O0 -o %t.out
-// RUN: echo "SYCL_PRINT_EXECUTION_GRAPH=always" > %t.cfg
+// RUN: echo SYCL_PRINT_EXECUTION_GRAPH=always > %t.cfg
 // RUN: env SYCL_CONFIG_FILE_NAME=%t.cfg %t.out
-// RUN: ls | grep dot
+// RUN: cat *.dot > /dev/null
 // RUN: rm *.dot
 // RUN: env SYCL_PRINT_EXECUTION_GRAPH=always %t.out
-// RUN: ls | grep dot
+// RUN: cat *.dot > /dev/null
 // RUN: rm *.dot
 // RUN: %t.out
-// RUN: ls | not grep dot
+// RUN: not cat *.dot > /dev/null
 
 #include <sycl/detail/core.hpp>
 
diff --git a/sycl/test-e2e/DeviceArchitecture/device_architecture_unknown_on_host.cpp b/sycl/test-e2e/DeviceArchitecture/device_architecture_unknown_on_host.cpp
new file mode 100644
index 0000000000000..71cc158690ddf
--- /dev/null
+++ b/sycl/test-e2e/DeviceArchitecture/device_architecture_unknown_on_host.cpp
@@ -0,0 +1,28 @@
+// REQUIRES: accelerator
+
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// Test checks that device_architecture extension implementation correctly
+// handles unsupported HW. The unsupported HW in this test is any FPGA device,
+// as FPGA currently is not supported at all by the device_architecture
+// extension.
+
+#include <sycl/detail/core.hpp>
+
+int main() {
+  sycl::queue q;
+  sycl::device dev = q.get_device();
+
+  sycl::ext::oneapi::experimental::architecture arch = dev.get_info<
+      sycl::ext::oneapi::experimental::info::device::architecture>();
+
+  assert(arch == sycl::ext::oneapi::experimental::architecture::unknown);
+  // With arch expected to be architecture::unknown (asserted above), this
+  // verifies that ext_oneapi_architecture_is(unknown) returns true here.
+  assert(dev.ext_oneapi_architecture_is(arch));
+
+  // No exceptions are expected anywhere in this test.
+
+  return 0;
+}
diff --git a/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp b/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp
index a89cac0c9822c..c4ce47cb79991 100644
--- a/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp
+++ b/sycl/test-e2e/DeviceLib/cmath_fp64_test.cpp
@@ -20,13 +20,13 @@ namespace s = sycl;
 constexpr s::access::mode sycl_read = s::access::mode::read;
 constexpr s::access::mode sycl_write = s::access::mode::write;
 
-#define TEST_NUM 69
+#define TEST_NUM 73
 
-double ref[TEST_NUM] = {0,   -2,  1,   2,   1, 1, 1, 0, 1, 1, 0, 0, 0, 0,
-                        0,   1,   1,   0.5, 0, 2, 0, 0, 1, 0, 2, 0, 0, 0,
-                        0,   0,   1,   0,   1, 2, 0, 1, 2, 5, 0, 0, 0, 0,
-                        0.5, 0.5, NAN, NAN, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-                        0,   0,   0,   0,   0, 0, 0, 0, 0, 0, 0, 0, 0};
+double ref[TEST_NUM] = {
+    100, 0.5, 1.0, 0, 0, -2, 1, 2, 1,   1,   1,   0,   1, 1, 0, 0, 0, 0, 0,
+    1,   1,   0.5, 0, 2, 0,  0, 1, 0,   2,   0,   0,   0, 0, 0, 1, 0, 1, 2,
+    0,   1,   2,   5, 0, 0,  0, 0, 0.5, 0.5, NAN, NAN, 2, 0, 0, 0, 0, 0, 0,
+    0,   0,   0,   0, 0, 0,  0, 0, 0,   0,   0,   0,   0, 0, 0, 0};
 
 double refIptr = 1;
 
@@ -61,6 +61,10 @@ template <class T> void device_cmath_test(s::queue &deviceQueue) {
         T minus_infinity = -INFINITY;
         double subnormal;
         *((uint64_t *)&subnormal) = 0xFFFFFFFFFFFFFULL;
+        res_access[i++] = sycl::exp10(2.0);
+        res_access[i++] = sycl::rsqrt(4.0);
+        res_access[i++] = std::trunc(1.3);
+        res_access[i++] = sycl::sinpi(0.0);
         res_access[i++] = sycl::cospi(0.5);
         res_access[i++] = std::copysign(2, -1);
         res_access[i++] = std::fmin(2, 1);
diff --git a/sycl/test-e2e/DeviceLib/cmath_test.cpp b/sycl/test-e2e/DeviceLib/cmath_test.cpp
index 90c68f4e30249..6d3bc69e8690b 100644
--- a/sycl/test-e2e/DeviceLib/cmath_test.cpp
+++ b/sycl/test-e2e/DeviceLib/cmath_test.cpp
@@ -22,13 +22,13 @@ namespace s = sycl;
 constexpr s::access::mode sycl_read = s::access::mode::read;
 constexpr s::access::mode sycl_write = s::access::mode::write;
 
-#define TEST_NUM 66
+#define TEST_NUM 70
 
-float ref[TEST_NUM] = {0,   -2, 1,   2, 1, 1, 0, 1, 1, 0, 0, 0,   0,   0,
-                       1,   1,  0.5, 0, 0, 1, 0, 2, 0, 0, 0, 0,   0,   1,
-                       0,   1,  2,   0, 1, 2, 5, 0, 0, 0, 0, 0.5, 0.5, NAN,
-                       NAN, 2,  0,   0, 0, 0, 0, 0, 0, 0, 0, 0,   0,   0,
-                       0,   0,  0,   0, 0, 0, 0, 0, 0, 0};
+float ref[TEST_NUM] = {100, 0.5, 1.0, 0,   0,   -2, 1,   2, 1, 1, 0, 1, 1, 0,
+                       0,   0,   0,   0,   1,   1,  0.5, 0, 0, 1, 0, 2, 0, 0,
+                       0,   0,   0,   1,   0,   1,  2,   0, 1, 2, 5, 0, 0, 0,
+                       0,   0.5, 0.5, NAN, NAN, 2,  0,   0, 0, 0, 0, 0, 0, 0,
+                       0,   0,   0,   0,   0,   0,  0,   0, 0, 0, 0, 0, 0, 0};
 
 float refIptr = 1;
 
@@ -58,6 +58,10 @@ template <class T> void device_cmath_test_1(s::queue &deviceQueue) {
         float subnormal;
         *((uint32_t *)&subnormal) = 0x7FFFFF;
 
+        res_access[i++] = sycl::exp10(2.0f);
+        res_access[i++] = sycl::rsqrt(4.0f);
+        res_access[i++] = std::trunc(1.2f);
+        res_access[i++] = sycl::sinpi(0.0f);
         res_access[i++] = sycl::cospi(0.5f);
         res_access[i++] = std::copysign(2.0f, -10.0f);
         res_access[i++] = sycl::min(2.0f, 1.0f);
diff --git a/sycl/test-e2e/DiscardEvents/discard_events_l0_inorder.cpp b/sycl/test-e2e/DiscardEvents/discard_events_l0_inorder.cpp
index c23bb4a6175fc..4a329208f72fa 100644
--- a/sycl/test-e2e/DiscardEvents/discard_events_l0_inorder.cpp
+++ b/sycl/test-e2e/DiscardEvents/discard_events_l0_inorder.cpp
@@ -1,5 +1,8 @@
 // REQUIRES: level_zero
 //
+// https://github.com/intel/llvm/issues/14121
+// UNSUPPORTED: gpu-intel-dg2
+//
 // RUN: %{build} -o %t.out
 //
 // RUN: env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 SYCL_PI_LEVEL_ZERO_BATCH_SIZE=0 ONEAPI_DEVICE_SELECTOR="level_zero:*" %{run} %t.out
diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortK.cpp b/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortK.cpp
index 3f0dd433f6ca5..ccfaade989370 100644
--- a/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortK.cpp
+++ b/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortK.cpp
@@ -5,12 +5,12 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// UNSUPPORTED: gpu
 // REQUIRES: gpu-intel-dg2 && level_zero
+// UNSUPPORTED: windows
 
 // RUN: mkdir -p %t.dir && %{build} -o %t.dir/exec.out
 // RUN: env IGC_DumpToCustomDir=%t.dir IGC_ShaderDumpEnable=1 %{run} %t.dir/exec.out
-// RUN: python3 %S/instruction_count.py %t.dir 3452 VC_asmfc04983569d0d4c9__ZTSZZN11BitonicSort5SolveEPjS0_jENKUlRN4sycl3_V17handlerEE0_clES4_E5Merge.asm
+// RUN: python3 %S/instruction_count.py %t.dir 3452 ZTSZZN11BitonicSort5SolveEPjS0_jENKUlRN4sycl3_V17handlerEE0_clES4_E5Merge.asm
 // RUN: echo "Baseline from driver version 1.3.29138"
 
 #include "../BitonicSortK.cpp"
\ No newline at end of file
diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortKv2.cpp b/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortKv2.cpp
new file mode 100644
index 0000000000000..a122881412099
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/PerformanceTests/BitonicSortKv2.cpp
@@ -0,0 +1,16 @@
+//==---------------- BitonicSortKv2.cpp  - DPC++ ESIMD on-device test ------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-dg2 && level_zero
+// UNSUPPORTED: windows
+
+// RUN: mkdir -p %t.dir && %{build} -o %t.dir/exec.out
+// RUN: env IGC_DumpToCustomDir=%t.dir IGC_ShaderDumpEnable=1 %{run} %t.dir/exec.out
+// RUN: python3 %S/instruction_count.py %t.dir 3456 ZTSZZN11BitonicSort5SolveEPjS0_jENKUlRN4sycl3_V17handlerEE0_clES4_E5Merge.asm
+// RUN: echo "Baseline from driver version 1.3.29138"
+
+#include "../BitonicSortKv2.cpp"
diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/Stencil.cpp b/sycl/test-e2e/ESIMD/PerformanceTests/Stencil.cpp
new file mode 100644
index 0000000000000..18198c14cc71c
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/PerformanceTests/Stencil.cpp
@@ -0,0 +1,16 @@
+//==---------------- Stencil.cpp  - DPC++ ESIMD on-device test ------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-dg2 && level_zero
+// UNSUPPORTED: windows
+
+// RUN: mkdir -p %t.dir && %{build} -o %t.dir/exec.out
+// RUN: env IGC_DumpToCustomDir=%t.dir IGC_ShaderDumpEnable=1 %{run} %t.dir/exec.out
+// RUN: python3 %S/instruction_count.py %t.dir 1699 ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E14Stencil_kernel.asm
+// RUN: echo "Baseline from driver version 1.3.29138"
+
+#include "../Stencil.cpp"
\ No newline at end of file
diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/instruction_count.py b/sycl/test-e2e/ESIMD/PerformanceTests/instruction_count.py
index 5cf25ef38a198..44c7a69a5eea1 100644
--- a/sycl/test-e2e/ESIMD/PerformanceTests/instruction_count.py
+++ b/sycl/test-e2e/ESIMD/PerformanceTests/instruction_count.py
@@ -7,11 +7,16 @@ def main(directory, max_count, target_file):
     total_count = 0
     pattern = re.compile(r"//\.instCount (\d+)")
 
+    if not os.path.isdir(directory):
+        print(f"Directory {directory} does not exist.")
+        sys.exit(1)
+
     try:
         target_found = False
         for root, dirs, files in os.walk(directory):
             for file in files:
-                if file.endswith(".asm") and file == target_file:
+                print("File: ", file)
+                if file.endswith(".asm") and re.search(target_file + "$", file):
                     target_found = True
                     print("Checking file: ", file)
                     try:
diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/matrix_transpose.cpp b/sycl/test-e2e/ESIMD/PerformanceTests/matrix_transpose.cpp
new file mode 100644
index 0000000000000..b7c30b7126621
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/PerformanceTests/matrix_transpose.cpp
@@ -0,0 +1,16 @@
+//==---------------- matrix_transpose.cpp  - DPC++ ESIMD on-device test ----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-dg2 && level_zero
+// UNSUPPORTED: windows
+
+// RUN: mkdir -p %t.dir && %{build} -o %t.dir/exec.out
+// RUN: env IGC_DumpToCustomDir=%t.dir IGC_ShaderDumpEnable=1 %{run} %t.dir/exec.out
+// RUN: python3 %S/instruction_count.py %t.dir 1280 ZTSZZ7runTestjjjRdS_ENKUlRN4sycl3_V17handlerEE_clES3_E3K16.asm
+// RUN: echo "Baseline from driver version 1.3.29138"
+
+#include "../matrix_transpose.cpp"
\ No newline at end of file
diff --git a/sycl/test-e2e/ESIMD/PerformanceTests/stencil2.cpp b/sycl/test-e2e/ESIMD/PerformanceTests/stencil2.cpp
new file mode 100644
index 0000000000000..8857d286eee42
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/PerformanceTests/stencil2.cpp
@@ -0,0 +1,16 @@
+//==---------------- stencil2.cpp  - DPC++ ESIMD on-device test ------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-dg2 && level_zero
+// UNSUPPORTED: windows
+
+// RUN: mkdir -p %t.dir && %{build} -o %t.dir/exec.out
+// RUN: env IGC_DumpToCustomDir=%t.dir IGC_ShaderDumpEnable=1 %{run} %t.dir/exec.out
+// RUN: python3 %S/instruction_count.py %t.dir 1699 ZTSZZ4mainENKUlRN4sycl3_V17handlerEE_clES2_E14Stencil_kernel.asm
+// RUN: echo "Baseline from driver version 1.3.29138"
+
+#include "../stencil2.cpp"
\ No newline at end of file
diff --git a/sycl/test-e2e/ESIMD/clz_ctz.cpp b/sycl/test-e2e/ESIMD/clz_ctz.cpp
index 87bcac8f27528..c03e63a572319 100644
--- a/sycl/test-e2e/ESIMD/clz_ctz.cpp
+++ b/sycl/test-e2e/ESIMD/clz_ctz.cpp
@@ -20,7 +20,7 @@ template <typename T, bool CLZ> bool test(queue &q) {
   std::cout << "Running " << (CLZ ? "CLZ " : "CTZ ")
             << esimd_test::type_name<T>() << std::endl;
   constexpr unsigned VL = 16;
-  constexpr unsigned Size = 1024 * 128;
+  constexpr unsigned Size = 256;
 
   T *A = new T[Size];
   T *B = new T[Size];
@@ -72,7 +72,13 @@ template <typename T, bool CLZ> bool test(queue &q) {
 
   for (unsigned i = 0; i < Size; ++i) {
     int Expected =
-        CLZ ? (i == 0 ? sizeof(T) * 8 : __builtin_clz(i)) : __builtin_ctz(i);
+        i == 0 ? sizeof(T) * 8 : (CLZ ? __builtin_clz(i) : __builtin_ctz(i));
+    if (CLZ && i != 0 && sizeof(T) < sizeof(unsigned)) {
+      // __builtin_clz counts leading zeros of an `unsigned int`, so subtract
+      // the extra high bits it sees when T is narrower than `unsigned int`.
+      unsigned int Diff = (sizeof(unsigned) * 8) - (sizeof(T) * 8);
+      Expected -= Diff;
+    }
     int Computed = B[i];
     if (Expected != Computed && ++err_cnt < 10)
       std::cout << "Failure at " << std::to_string(i)
diff --git a/sycl/test-e2e/ESIMD/esimd_test_utils.hpp b/sycl/test-e2e/ESIMD/esimd_test_utils.hpp
index 2c2e0b01249e4..afa5d9d920ca3 100644
--- a/sycl/test-e2e/ESIMD/esimd_test_utils.hpp
+++ b/sycl/test-e2e/ESIMD/esimd_test_utils.hpp
@@ -626,86 +626,6 @@ template <typename... ArgT> void printTestLabel(queue Q, ArgT &&...Args) {
   std::cout << std::endl;
 }
 
-enum GPUDriverOS { Linux = 1, Windows = 2, LinuxAndWindows = 3 };
-
-/// This function returns true if it can detect the level-zero or opencl
-/// GPU driver and can determine that the current driver is same or newer
-/// than the one passed in \p RequiredVersion or \p WinOpenCLRequiredVersion.
-///
-/// Below are how driver versions look like:
-///   Linux/L0:       [1.3.26370]
-///   Linux/opencl:   [23.22.26370.18]
-///   Windows/L0:     [1.3.26370]
-///   Windows/opencl: [31.0.101.4502]
-///
-/// This function uses only the part of the driver identification:
-///   - the second half of the driver id on win/opencl, e.g. 101.4502";
-///   - the 5-digit id for 3 other platforms, e.g. 26370.
-///
-/// Note: For the previous & new driver version and their release dates
-/// for win/opencl see the link:
-/// https://www.intel.com/content/www/us/en/download/726609/intel-arc-iris-xe-graphics-whql-windows.html
-bool isGPUDriverGE(queue Q, GPUDriverOS OSCheck, std::string RequiredVersion,
-                   std::string WinOpenCLRequiredVersion = "",
-                   bool VerifyFormat = true) {
-  auto Dev = Q.get_device();
-  if (!Dev.is_gpu())
-    return false;
-
-  bool IsLinux = false;
-#if defined(__SYCL_RT_OS_LINUX)
-  IsLinux = true;
-#elif !defined(__SYCL_RT_OS_WINDOWS)
-  return false;
-#endif
-
-  // A and B must have digits at the same positions.
-  // Otherwise, A and B symbols must be equal, e.g. both be equal to '.'.
-  auto isExpectedDriverVersionFormat = [](const std::string &A,
-                                          const std::string &B) {
-    if (A.size() != B.size())
-      return false;
-    for (int I = 0; I < A.size(); I++) {
-      if ((A[I] >= '0' && A[I] <= '9' && !(B[I] >= '0' && B[I] <= '9')) &&
-          A[I] != B[I])
-        return false;
-    }
-    return true;
-  };
-
-  auto BE = Q.get_backend();
-  int Length = 5;              // extract 5 digits for 3 or 4 platforms
-  int Start = 4;               // start of the driver id for 2 of 4 platforms
-  if (BE == backend::opencl) { // opencl has less-standard versioning
-    if (IsLinux) {
-      Start = 6;
-    } else {
-      Start = 5;
-      Length = 8;
-      RequiredVersion = WinOpenCLRequiredVersion;
-    }
-  }
-
-  bool IsGE = true;
-  if (IsLinux && (OSCheck & GPUDriverOS::Linux) ||
-      !IsLinux && (OSCheck & GPUDriverOS::Windows)) {
-    auto CurrentVersion = Dev.get_info<sycl::info::device::driver_version>();
-    CurrentVersion = CurrentVersion.substr(Start, Length);
-    if (isExpectedDriverVersionFormat(CurrentVersion, RequiredVersion)) {
-      IsGE = CurrentVersion >= RequiredVersion;
-    } else if (VerifyFormat) {
-      std::string Msg =
-          std::string("Inconsistent expected & actual driver versions: ") +
-          CurrentVersion + " vs " + RequiredVersion;
-      throw std::runtime_error(
-          "Inconsistent expected & actual driver versions");
-    } else {
-      IsGE = false;
-    }
-  }
-  return IsGE;
-}
-
 template <typename T> T getRandomValue() {
   using Tuint = std::conditional_t<
       sizeof(T) == 1, uint8_t,
diff --git a/sycl/test-e2e/ESIMD/fma.cpp b/sycl/test-e2e/ESIMD/fma.cpp
index a33c334f2e66d..ef9e4add7b950 100644
--- a/sycl/test-e2e/ESIMD/fma.cpp
+++ b/sycl/test-e2e/ESIMD/fma.cpp
@@ -94,9 +94,7 @@ int main() {
   passed &= test<sycl::ext::oneapi::bfloat16>(q);
   passed &= test<ext::intel::experimental::esimd::tfloat32>(q);
 #endif
-  if (q.get_device().has(sycl::aspect::fp16) &&
-      esimd_test::isGPUDriverGE(q, esimd_test::GPUDriverOS::Windows, "28454",
-                                "101.5310"))
+  if (q.get_device().has(sycl::aspect::fp16))
     passed &= test<sycl::half>(q);
   if (q.get_device().has(sycl::aspect::fp64))
     passed &= test<double>(q);
diff --git a/sycl/test-e2e/ESIMD/local_accessor_block_load_store.cpp b/sycl/test-e2e/ESIMD/local_accessor_block_load_store.cpp
index e2748eb2da3cf..9d7a79c8fe2ad 100644
--- a/sycl/test-e2e/ESIMD/local_accessor_block_load_store.cpp
+++ b/sycl/test-e2e/ESIMD/local_accessor_block_load_store.cpp
@@ -102,9 +102,7 @@ int main() {
   Pass &= test<int, 16, Align16>(Q);
   Pass &= test<float, 16, Align16>(Q);
 
-  if (Dev.has(aspect::fp16) &&
-      esimd_test::isGPUDriverGE(Q, esimd_test::GPUDriverOS::LinuxAndWindows,
-                                "26032", "101.4502"))
+  if (Dev.has(aspect::fp16))
     Pass &= test<sycl::half, 16, Align16>(Q);
 
   // Check SLM load/store with vector size that is not power of 2
diff --git a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp
index 4466f395d6127..63f711f5dbde4 100644
--- a/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp
+++ b/sycl/test-e2e/ESIMD/lsc/Inputs/lsc_usm_block_load_prefetch.hpp
@@ -145,7 +145,14 @@ template <typename T> bool test_lsc_block_load() {
             << Q.get_device().get_info<sycl::info::device::name>() << std::endl;
 
   bool Passed = true;
-  Passed &= test<T, 64, DS, L1H, L2H, NoPrefetch, NoCheckMerge>(Q, 1, 4);
+  if constexpr (sizeof(T) * 64 < 256)
+    Passed &= test<T, 64, DS, L1H, L2H, NoPrefetch, NoCheckMerge>(Q, 1, 4);
+  else {
+#ifdef USE_PVC
+    Passed &= test<T, 64, DS, L1H, L2H, NoPrefetch, NoCheckMerge>(Q, 1, 4);
+#endif
+  }
+
   Passed &= test<T, 32, DS, L1H, L2H, NoPrefetch, NoCheckMerge>(Q, 1, 4);
   Passed &= test<T, 16, DS, L1H, L2H, NoPrefetch, NoCheckMerge>(Q, 2, 2);
   Passed &= test<T, 8, DS, L1H, L2H, NoPrefetch, NoCheckMerge>(Q, 2, 8);
@@ -154,6 +161,7 @@ template <typename T> bool test_lsc_block_load() {
     Passed &= test<T, 2, DS, L1H, L2H, NoPrefetch, NoCheckMerge>(Q, 5, 5);
   if constexpr (sizeof(T) >= sizeof(int))
     Passed &= test<T, 1, DS, L1H, L2H, NoPrefetch, CheckMerge>(Q, 3, 5);
+#ifdef USE_PVC
   if constexpr (sizeof(T) <= 4) {
     Passed &= test<T, 128, DS, L1H, L2H, NoPrefetch, CheckMerge,
                    __ESIMD_NS::overaligned_tag<8>>(Q, 1, 4);
@@ -172,8 +180,15 @@ template <typename T> bool test_lsc_block_load() {
                      __ESIMD_NS::overaligned_tag<8>>(Q, 1, 4);
     }
   }
-
-  Passed &= test<T, 64, DS, L1H, L2H, NoPrefetch, CheckMerge>(Q, 1, 4);
+#endif
+
+  if constexpr (sizeof(T) * 64 < 256)
+    Passed &= test<T, 64, DS, L1H, L2H, NoPrefetch, CheckMerge>(Q, 1, 4);
+  else {
+#ifdef USE_PVC
+    Passed &= test<T, 64, DS, L1H, L2H, NoPrefetch, CheckMerge>(Q, 1, 4);
+#endif
+  }
   Passed &= test<T, 32, DS, L1H, L2H, NoPrefetch, CheckMerge>(Q, 2, 2);
   Passed &= test<T, 16, DS, L1H, L2H, NoPrefetch, CheckMerge>(Q, 4, 4);
   Passed &= test<T, 8, DS, L1H, L2H, NoPrefetch, CheckMerge>(Q, 2, 8);
@@ -182,6 +197,7 @@ template <typename T> bool test_lsc_block_load() {
     Passed &= test<T, 2, DS, L1H, L2H, NoPrefetch, CheckMerge>(Q, 5, 5);
   if constexpr (sizeof(T) >= sizeof(int))
     Passed &= test<T, 1, DS, L1H, L2H, NoPrefetch, CheckMerge>(Q, 3, 5);
+#ifdef USE_PVC
   // Only 512-bits maximum can be loaded at once (i.e. 4*128 bytes).
   if constexpr (sizeof(T) <= 4)
     Passed &= test<T, 128, DS, L1H, L2H, NoPrefetch, CheckMerge,
@@ -192,7 +208,7 @@ template <typename T> bool test_lsc_block_load() {
   if constexpr (sizeof(T) == 1)
     Passed &= test<T, 512, DS, L1H, L2H, NoPrefetch, CheckMerge,
                    __ESIMD_NS::overaligned_tag<8>>(Q, 1, 4);
-
+#endif
   return Passed;
 }
 
@@ -209,7 +225,13 @@ std::enable_if_t<!IsGatherLikePrefetch, bool> test_lsc_prefetch() {
             << Q.get_device().get_info<sycl::info::device::name>() << std::endl;
 
   bool Passed = true;
-  Passed &= test<T, 64, DS, L1H, L2H, DoPrefetch>(Q, 1, 4);
+  if constexpr (sizeof(T) * 64 < 256)
+    Passed &= test<T, 64, DS, L1H, L2H, DoPrefetch>(Q, 1, 4);
+  else {
+#ifdef USE_PVC
+    Passed &= test<T, 64, DS, L1H, L2H, DoPrefetch>(Q, 1, 4);
+#endif
+  }
   Passed &= test<T, 32, DS, L1H, L2H, DoPrefetch>(Q, 1, 4);
   Passed &= test<T, 16, DS, L1H, L2H, DoPrefetch>(Q, 2, 2);
   Passed &= test<T, 8, DS, L1H, L2H, DoPrefetch>(Q, 2, 8);
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32.cpp
index 2b7b9186cb288..c327177486cd1 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -19,8 +19,10 @@ int main(void) {
 
   Passed &= test_lsc_block_load<uint32_t>();
   Passed &= test_lsc_block_load<float>();
+#ifdef USE_PVC
   Passed &=
       test_lsc_block_load<sycl::ext::intel::experimental::esimd::tfloat32>();
+#endif
 
   std::cout << (Passed ? "Passed\n" : "FAILED\n");
   return Passed ? 0 : 1;
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_64.cpp
index 582279fa822ef..0746b03503495 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_64.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_64.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_64_pvc.cpp
similarity index 60%
rename from sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp
rename to sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_64_pvc.cpp
index 7d74bf8055d6b..ec299a1ffdf59 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_colA_rowB_colC.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_64_pvc.cpp
@@ -1,19 +1,17 @@
-//==---------- joint_matrix_colA_rowB_colC.cpp - DPC++ joint_matrix---------==//
+//==--- lsc_usm_block_load_u32_64_pvc.cpp - DPC++ ESIMD on-device test----==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
+// REQUIRES: gpu-intel-pvc
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// XFAIL:gpu
-
-#include "../common.hpp"
+// PVC variant of the test
 
-constexpr size_t TN = 8;
+#define USE_64_BIT_OFFSET
+#define USE_PVC
 
-#include "../joint_matrix_colA_rowB_colC_impl.hpp"
+#include "lsc_usm_block_load_u32.cpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_pvc.cpp
similarity index 61%
rename from sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp
rename to sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_pvc.cpp
index 09c1a4ae32a92..13379ad04e183 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_array.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_pvc.cpp
@@ -1,17 +1,17 @@
-//==-------- joint_matrix_bfloat16_array.cpp  - DPC++ joint_matrix----------==//
+//==--- lsc_usm_block_load_u32_pvc.cpp - DPC++ ESIMD on-device test----==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
+// REQUIRES: gpu-intel-pvc
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-#include "../common.hpp"
+// PVC variant of the test
 
-static constexpr int TN = 8;
+#define USE_64_BIT_OFFSET
+#define USE_PVC
 
-#include "../joint_matrix_bfloat16_array_impl.hpp"
+#include "lsc_usm_block_load_u32.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_scalar_off.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_scalar_off.cpp
index b9163aceae6d8..458a199a9c321 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_scalar_off.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_scalar_off.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_scalar_off_pvc.cpp
similarity index 58%
rename from sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp
rename to sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_scalar_off_pvc.cpp
index a0a98e3f16d0c..1b8ad1b26d54e 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_transposeC.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u32_scalar_off_pvc.cpp
@@ -1,18 +1,17 @@
-//==----------- joint_matrix_transposeC.cpp  - DPC++ joint_matrix-----------==//
+//==-lsc_usm_block_load_u32_scalar_off_pvc.cpp - DPC++ ESIMD on-device test-==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-// REQUIRES-INTEL-DRIVER: lin: 28267
-
+// REQUIRES: gpu-intel-pvc
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-#include "../common.hpp"
+// PVC variant of the test
 
-constexpr size_t TN = 8;
+#define USE_SCALAR_OFFSET
+#define USE_PVC
 
-#include "../joint_matrix_transposeC_impl.hpp"
+#include "lsc_usm_block_load_u32.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64.cpp
index bf5a9ba9cd543..9e3cda113aba7 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -15,10 +15,11 @@ constexpr uint32_t Seed = 187;
 
 int main(void) {
   srand(Seed);
-
   bool Passed = true;
   Passed &= test_lsc_block_load<uint64_t>();
+#ifdef USE_PVC
   Passed &= test_lsc_block_load<double>();
+#endif
 
   std::cout << (Passed ? "Passed\n" : "FAILED\n");
   return Passed ? 0 : 1;
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_64.cpp
index 6174dff54baef..4051eabb2f34b 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_64.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_64.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_64_pvc.cpp
similarity index 57%
rename from sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp
rename to sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_64_pvc.cpp
index f42f37378514d..825e136b91768 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_unaligned_k.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_64_pvc.cpp
@@ -1,20 +1,17 @@
-//==-------- joint_matrix_unaligned_k.cpp - DPC++ joint_matrix--------------==//
+//==------- lsc_usm_block_load_u64_64_pvc.cpp - DPC++ ESIMD on-device test -==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
+// REQUIRES: gpu-intel-pvc
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
-// XFAIL:*
-
-#include "../common.hpp"
+// PVC variant of the test
 
-constexpr size_t TN = 8;
-constexpr size_t MATRIX_K = 1024 + 14;
+#define USE_64_BIT_OFFSET
+#define USE_PVC
 
-#include "../joint_matrix_out_bounds_impl.hpp"
+#include "lsc_usm_block_load_u64.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_pvc.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_pvc.cpp
new file mode 100644
index 0000000000000..3435223ee1b31
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_pvc.cpp
@@ -0,0 +1,16 @@
+//==------- lsc_usm_block_load_u64_pvc.cpp - DPC++ ESIMD on-device test ---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// PVC variant of the test
+
+#define USE_PVC
+
+#include "lsc_usm_block_load_u64.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_scalar_off.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_scalar_off.cpp
index e8632bd521895..c76d7de37602b 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_scalar_off.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_scalar_off.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_scalar_off_pvc.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_scalar_off_pvc.cpp
new file mode 100644
index 0000000000000..21c4d11706a61
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u64_scalar_off_pvc.cpp
@@ -0,0 +1,17 @@
+//==-lsc_usm_block_load_u64_scalar_off_pvc.cpp - DPC++ ESIMD on-device test-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// PVC variant of the test
+
+#define USE_SCALAR_OFFSET
+#define USE_PVC
+
+#include "lsc_usm_block_load_u64.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16.cpp
index 89c33c4282623..e502a3da785c6 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16_64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16_64.cpp
index ee1ec12323b8d..baae1dee0a6b4 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16_64.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16_64.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16_64_pvc.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16_64_pvc.cpp
new file mode 100644
index 0000000000000..5eee3d2ee2495
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16_64_pvc.cpp
@@ -0,0 +1,17 @@
+//==-- lsc_usm_block_load_u8_u16_64_pvc.cpp - DPC++ ESIMD on-device test ---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// PVC variant of the test
+
+#define USE_64_BIT_OFFSET
+#define USE_PVC
+
+#include "lsc_usm_block_load_u8_u16.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16_pvc.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16_pvc.cpp
new file mode 100644
index 0000000000000..75ebfd57b1cec
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_block_load_u8_u16_pvc.cpp
@@ -0,0 +1,17 @@
+//==----- lsc_usm_block_load_u8_u16_pvc.cpp - DPC++ ESIMD on-device test--==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// PVC variant of the test
+
+#define USE_64_BIT_OFFSET
+#define USE_PVC
+
+#include "lsc_usm_block_load_u8_u16.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32.cpp
index 0f1a1f2af3a49..47a8c9ad9cf4d 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -30,7 +30,9 @@ int main(void) {
 
   Passed &= tests<uint32_t>();
   Passed &= tests<float>();
+#ifdef USE_PVC
   Passed &= tests<sycl::ext::intel::experimental::esimd::tfloat32>();
+#endif
 
   std::cout << (Passed ? "Passed\n" : "FAILED\n");
   return Passed ? 0 : 1;
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_64.cpp
index 322f974814829..d3cc0c8118dc0 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_64.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_64.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_64_pvc.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_64_pvc.cpp
new file mode 100644
index 0000000000000..f9ed09d3226e7
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_64_pvc.cpp
@@ -0,0 +1,17 @@
+//==----- lsc_usm_prefetch_u32_64_pvc.cpp - DPC++ ESIMD on-device test -----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// PVC variant of the test
+
+#define USE_64_BIT_OFFSET
+#define USE_PVC
+
+#include "lsc_usm_prefetch_u32.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_pvc.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_pvc.cpp
new file mode 100644
index 0000000000000..a1bb140636773
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_pvc.cpp
@@ -0,0 +1,16 @@
+//==------- lsc_usm_prefetch_u32_pvc.cpp - DPC++ ESIMD on-device test -----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// PVC variant of the test
+
+#define USE_PVC
+
+#include "lsc_usm_prefetch_u32.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_scalar_off.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_scalar_off.cpp
index d04b1d3a3c4b0..3ab226b849c13 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_scalar_off.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_scalar_off.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_scalar_off_pvc.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_scalar_off_pvc.cpp
new file mode 100644
index 0000000000000..6aea74106f5ca
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u32_scalar_off_pvc.cpp
@@ -0,0 +1,17 @@
+//==- lsc_usm_prefetch_u32_scalar_off_pvc.cpp - DPC++ ESIMD on-device test -==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// PVC variant of the test
+
+#define USE_SCALAR_OFFSET
+#define USE_PVC
+
+#include "lsc_usm_prefetch_u32.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64.cpp
index c6c8c3348fee5..11cd13b2e6cfe 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -29,7 +29,9 @@ int main(void) {
   bool Passed = true;
 
   Passed &= tests<uint64_t>();
+#ifdef USE_PVC
   Passed &= tests<double>();
+#endif
 
   std::cout << (Passed ? "Passed\n" : "FAILED\n");
   return Passed ? 0 : 1;
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_64.cpp
index b82f3022fd279..e286e73342ab3 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_64.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_64.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_64_pvc.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_64_pvc.cpp
new file mode 100644
index 0000000000000..ac7537ceb5767
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_64_pvc.cpp
@@ -0,0 +1,17 @@
+//==---- lsc_usm_prefetch_u64_64_pvc.cpp - DPC++ ESIMD on-device test -----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// PVC variant of the test
+
+#define USE_64_BIT_OFFSET
+#define USE_PVC
+
+#include "lsc_usm_prefetch_u64.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_pvc.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_pvc.cpp
new file mode 100644
index 0000000000000..22378bc0e0309
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_pvc.cpp
@@ -0,0 +1,16 @@
+//==--------- lsc_usm_prefetch_u64_pvc.cpp - DPC++ ESIMD on-device test ---==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// PVC variant of the test
+
+#define USE_PVC
+
+#include "lsc_usm_prefetch_u64.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_scalar_off.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_scalar_off.cpp
index 178e30cc2ca83..5353d2860e682 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_scalar_off.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_scalar_off.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_scalar_off_pvc.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_scalar_off_pvc.cpp
new file mode 100644
index 0000000000000..8987f941e1a17
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_prefetch_u64_scalar_off_pvc.cpp
@@ -0,0 +1,17 @@
+//==- lsc_usm_prefetch_u64_scalar_off_pvc.cpp - DPC++ ESIMD on-device test-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// PVC variant of the test
+
+#define USE_SCALAR_OFFSET
+#define USE_PVC
+
+#include "lsc_usm_prefetch_u64.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32.cpp
index 0c5d807d92349..56f2ffc544007 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -31,10 +31,11 @@ template <int TestCastNum, typename T> bool tests() {
   passed &= test<TestCastNum + 10, T, 4, 4, 1, 4, true>();
 
   // large number of elements
+#ifdef USE_PVC
   passed &= test<TestCastNum + 11, T, 4, 4, 1, 128, true,
                  lsc_data_size::default_size, cache_hint::none,
                  cache_hint::none, __ESIMD_NS::overaligned_tag<8>>();
-
+#endif
   return passed;
 }
 
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32_64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32_64.cpp
index 23153d76cbf77..625b91da8ffd7 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32_64.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32_64.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-pvc || gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32_64_pvc.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32_64_pvc.cpp
new file mode 100644
index 0000000000000..e7643590a860a
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32_64_pvc.cpp
@@ -0,0 +1,17 @@
+//==------- lsc_usm_store_u32_64_pvc.cpp - DPC++ ESIMD on-device test ------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// PVC variant of the test
+
+#define USE_64_BIT_OFFSET
+#define USE_PVC
+
+#include "lsc_usm_store_u32.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32_pvc.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32_pvc.cpp
new file mode 100644
index 0000000000000..2aa05e627b7a0
--- /dev/null
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u32_pvc.cpp
@@ -0,0 +1,17 @@
+//==------- lsc_usm_store_u32_pvc.cpp - DPC++ ESIMD on-device test-------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu-intel-pvc
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+// PVC variant of the test
+
+#define USE_64_BIT_OFFSET
+#define USE_PVC
+
+#include "lsc_usm_store_u32.cpp"
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u64.cpp
index 55b4d083c12e8..9f007cf8da712 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u64.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u64.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-pvc || gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
@@ -36,9 +36,12 @@ template <int TestCastNum, typename T> bool tests() {
 int main(void) {
   srand(seed);
   bool passed = true;
+  auto Q = queue{gpu_selector_v};
 
   passed &= tests<0, uint64_t>();
-  passed &= tests<11, double>();
+  if (Q.get_device().has(sycl::aspect::fp64)) {
+    passed &= tests<11, double>();
+  }
 
   std::cout << (passed ? "Passed\n" : "FAILED\n");
   return passed ? 0 : 1;
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u64_64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u64_64.cpp
index 4dff03aac15d0..49eaf99086476 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u64_64.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u64_64.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-pvc || gpu-intel-dg2
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u8_u16.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u8_u16.cpp
index 9a7ba6ffb4055..9066483d30108 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u8_u16.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u8_u16.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-pvc || gpu-intel-dg2
 // TODO: GPU Driver fails with "add3 src operand only supports integer D/W type"
 // error. Enable the test when it is fixed.
 // UNSUPPORTED: gpu
diff --git a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u8_u16_64.cpp b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u8_u16_64.cpp
index a20b09faaced4..d42c40751f431 100644
--- a/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u8_u16_64.cpp
+++ b/sycl/test-e2e/ESIMD/lsc/lsc_usm_store_u8_u16_64.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: gpu-intel-pvc
+// REQUIRES: gpu-intel-pvc || gpu-intel-dg2
 // TODO: GPU Driver fails with "add3 src operand only supports integer D/W type"
 // error. Enable the test when it is fixed.
 // UNSUPPORTED: gpu
diff --git a/sycl/test-e2e/ESIMD/regression/bit_shift_vector_compilation_test.cpp b/sycl/test-e2e/ESIMD/regression/bit_shift_vector_compilation_test.cpp
index 8230f34400c6f..634ef40e40ebb 100644
--- a/sycl/test-e2e/ESIMD/regression/bit_shift_vector_compilation_test.cpp
+++ b/sycl/test-e2e/ESIMD/regression/bit_shift_vector_compilation_test.cpp
@@ -6,13 +6,12 @@
 //
 //===---------------------------------------===//
 
-// RUN: %{build} -fsycl-device-code-split=per_kernel -std=c++20 -o %t.out
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
 // RUN: %{run} %t.out
 
 // This is a basic test to validate the vector bit shifting functions.
 
 #include "../esimd_test_utils.hpp"
-#include <bit>
 
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
diff --git a/sycl/test-e2e/ESIMD/rotate.cpp b/sycl/test-e2e/ESIMD/rotate.cpp
index 34be5b62c1ce6..70313a90b9818 100644
--- a/sycl/test-e2e/ESIMD/rotate.cpp
+++ b/sycl/test-e2e/ESIMD/rotate.cpp
@@ -21,6 +21,19 @@
 #define NS sycl::ext::intel::esimd
 #endif
 
+// Replaces C++20 std::rotl/rotr: https://stackoverflow.com/questions/776508
+template <typename T> T rotl(T n, unsigned int c) {
+  const unsigned int mask = (CHAR_BIT * sizeof(n) - 1);
+  c &= mask;
+  return (n << c) | (n >> ((-c) & mask));
+}
+
+template <typename T> T rotr(T n, unsigned int c) {
+  const unsigned int mask = (CHAR_BIT * sizeof(n) - 1);
+  c &= mask;
+  return (n >> c) | (n << ((-c) & mask));
+}
+
 using namespace sycl;
 using namespace sycl::ext::intel::esimd;
 
@@ -85,14 +98,14 @@ template <typename T> bool test_rotate(sycl::queue &Queue) {
 
   for (int I = 0; I < VL; I++) {
     using OpT = std::make_unsigned_t<T>;
-    ExpectedRorScalar[I] = std::rotr<OpT>(
-        sycl::bit_cast<OpT>(ExpectedRorScalar[I]), ScalarRotateFactor);
-    ExpectedRolScalar[I] = std::rotl<OpT>(
-        sycl::bit_cast<OpT>(ExpectedRolScalar[I]), ScalarRotateFactor);
-    ExpectedRorVector[I] = std::rotr<OpT>(
-        sycl::bit_cast<OpT>(ExpectedRorVector[I]), VectorRotateFactor[I]);
-    ExpectedRolVector[I] = std::rotl<OpT>(
-        sycl::bit_cast<OpT>(ExpectedRolVector[I]), VectorRotateFactor[I]);
+    ExpectedRorScalar[I] = rotr<OpT>(sycl::bit_cast<OpT>(ExpectedRorScalar[I]),
+                                     ScalarRotateFactor);
+    ExpectedRolScalar[I] = rotl<OpT>(sycl::bit_cast<OpT>(ExpectedRolScalar[I]),
+                                     ScalarRotateFactor);
+    ExpectedRorVector[I] = rotr<OpT>(sycl::bit_cast<OpT>(ExpectedRorVector[I]),
+                                     VectorRotateFactor[I]);
+    ExpectedRolVector[I] = rotl<OpT>(sycl::bit_cast<OpT>(ExpectedRolVector[I]),
+                                     VectorRotateFactor[I]);
   }
   for (int I = 0; I < VL; I++) {
     if (ExpectedRorScalar[I] != OutputRorScalar[I]) {
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
index 4831c31ecf16e..28c5fb0b2783e 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/atomic_update_slm.hpp
@@ -620,9 +620,7 @@ bool test_int_types_and_sizes(queue q) {
   passed &= test_int_types<2, Op, UseMask, Features, UseAcc, SignMask>(q);
   passed &= test_int_types<4, Op, UseMask, Features, UseAcc, SignMask>(q);
   passed &= test_int_types<8, Op, UseMask, Features, UseAcc, SignMask>(q);
-  if (UseMask && Features == TestFeatures::Generic &&
-      esimd_test::isGPUDriverGE(q, esimd_test::GPUDriverOS::LinuxAndWindows,
-                                "26918", "101.4953", false)) {
+  if (UseMask && Features == TestFeatures::Generic) {
     passed &= test_int_types<16, Op, UseMask, Features, UseAcc, SignMask>(q);
     passed &= test_int_types<32, Op, UseMask, Features, UseAcc, SignMask>(q);
   }
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp
index 8cd554c475fea..af8168b94671b 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_load.hpp
@@ -166,17 +166,10 @@ template <typename T, TestFeatures Features> bool testUSM(queue Q) {
 
   // Intentionally check non-power-of-2 simd size - it must work.
   // Just pass element-size alignment.
-  // These test cases compute wrong values for for the few last elements
-  // if the driver is not new enough.
-  // TODO: windows version with the fix is not known. Enable it eventually.
-  if (sizeof(T) > 2 ||
-      esimd_test::isGPUDriverGE(Q, esimd_test::GPUDriverOS::LinuxAndWindows,
-                                "27556", "win.just.skip.test", false)) {
-    Passed &= testUSM<T, 33, !CheckMask, !CheckMerge, CheckProperties>(
-        Q, 2, 4, AlignElemProps);
-    Passed &= testUSM<T, 67, !CheckMask, !CheckMerge, CheckProperties>(
-        Q, 2, 4, AlignElemProps);
-  }
+  Passed &= testUSM<T, 33, !CheckMask, !CheckMerge, CheckProperties>(
+      Q, 2, 4, AlignElemProps);
+  Passed &= testUSM<T, 67, !CheckMask, !CheckMerge, CheckProperties>(
+      Q, 2, 4, AlignElemProps);
 
   // Intentionally check big simd size - it must work.
   Passed &= testUSM<T, 512, !CheckMask, !CheckMerge, CheckProperties>(
@@ -579,64 +572,55 @@ template <typename T, TestFeatures Features> bool testSLMAcc(queue Q) {
   Passed &= testSLMAcc<T, 3, !CheckMask, !CheckMerge, CheckProperties>(
       Q, 2, 4, AlignElemProps);
 
-  // These test case compute wrong values for for the few last elements
-  // if the driver is not new enough.
-  // TODO: windows version with the fix is not known. Enable it eventually.
-  if (sizeof(T) > 2 ||
-      esimd_test::isGPUDriverGE(Q, esimd_test::GPUDriverOS::LinuxAndWindows,
-                                "27556", "win.just.skip.test", false)) {
     Passed &= testSLMAcc<T, 17, !CheckMask, !CheckMerge, CheckProperties>(
         Q, 2, 4, AlignElemProps);
 
     Passed &= testSLMAcc<T, 113, !CheckMask, !CheckMerge, CheckProperties>(
         Q, 2, 4, AlignElemProps);
-  }
 
-  if constexpr (Features == TestFeatures::PVC ||
-                Features == TestFeatures::DG2) {
+    if constexpr (Features == TestFeatures::PVC ||
+                  Features == TestFeatures::DG2) {
 
-    // Using the mask adds the requirement to run tests on DG2/PVC.
-    // Also, DG2/PVC variant currently requires power-or-two elements and
-    // the number of bytes loaded per call must not exceed 512.
+      // Using the mask adds the requirement to run tests on DG2/PVC.
+      // Also, DG2/PVC variant currently requires power-of-two elements and
+      // the number of bytes loaded per call must not exceed 512.
 
-    constexpr int I32Factor =
-        std::max(static_cast<int>(sizeof(int) / sizeof(T)), 1);
-    constexpr size_t ReqiredAlignment = sizeof(T) <= 4 ? 4 : 8;
-    properties DG2OrPVCProps{alignment<ReqiredAlignment>};
+      constexpr int I32Factor =
+          std::max(static_cast<int>(sizeof(int) / sizeof(T)), 1);
+      constexpr size_t ReqiredAlignment = sizeof(T) <= 4 ? 4 : 8;
+      properties DG2OrPVCProps{alignment<ReqiredAlignment>};
 
-    // Test block_load() that is available on DG2/PVC:
-    // 1, 2, 3, 4, 8, ... N elements (up to 512-bytes).
-    Passed &=
-        testSLMAcc<T, 1 * I32Factor, CheckMask, !CheckMerge, CheckProperties>(
-            Q, 2, 4, DG2OrPVCProps);
-    Passed &=
-        testSLMAcc<T, 2 * I32Factor, CheckMask, CheckMerge, CheckProperties>(
-            Q, 1, 4, DG2OrPVCProps);
-    Passed &=
-        testSLMAcc<T, 3 * I32Factor, CheckMask, !CheckMerge, CheckProperties>(
-            Q, 2, 8, DG2OrPVCProps);
-    Passed &=
-        testSLMAcc<T, 4 * I32Factor, CheckMask, CheckMerge, CheckProperties>(
-            Q, 2, 4, DG2OrPVCProps);
-    Passed &=
-        testSLMAcc<T, 8 * I32Factor, CheckMask, !CheckMerge, CheckProperties>(
-            Q, 2, 4, DG2OrPVCProps);
-    Passed &=
-        testSLMAcc<T, 16 * I32Factor, CheckMask, CheckMerge, CheckProperties>(
-            Q, 2, 4, DG2OrPVCProps);
-    Passed &=
-        testSLMAcc<T, 32 * I32Factor, CheckMask, !CheckMerge, CheckProperties>(
-            Q, 2, 4, DG2OrPVCProps);
-    if constexpr (Features == TestFeatures::PVC) {
+      // Test block_load() that is available on DG2/PVC:
+      // 1, 2, 3, 4, 8, ... N elements (up to 512-bytes).
       Passed &=
-          testSLMAcc<T, 64 * I32Factor, CheckMask, CheckMerge, CheckProperties>(
+          testSLMAcc<T, 1 * I32Factor, CheckMask, !CheckMerge, CheckProperties>(
               Q, 2, 4, DG2OrPVCProps);
-
-      if constexpr (sizeof(T) <= 4)
-        Passed &= testSLMAcc<T, 128 * I32Factor, CheckMask, CheckMerge,
-                             CheckProperties>(Q, 2, 4, Align16Props);
-    }
-  } // TestPVCFeatures
+      Passed &=
+          testSLMAcc<T, 2 * I32Factor, CheckMask, CheckMerge, CheckProperties>(
+              Q, 1, 4, DG2OrPVCProps);
+      Passed &=
+          testSLMAcc<T, 3 * I32Factor, CheckMask, !CheckMerge, CheckProperties>(
+              Q, 2, 8, DG2OrPVCProps);
+      Passed &=
+          testSLMAcc<T, 4 * I32Factor, CheckMask, CheckMerge, CheckProperties>(
+              Q, 2, 4, DG2OrPVCProps);
+      Passed &=
+          testSLMAcc<T, 8 * I32Factor, CheckMask, !CheckMerge, CheckProperties>(
+              Q, 2, 4, DG2OrPVCProps);
+      Passed &=
+          testSLMAcc<T, 16 * I32Factor, CheckMask, CheckMerge, CheckProperties>(
+              Q, 2, 4, DG2OrPVCProps);
+      Passed &= testSLMAcc<T, 32 * I32Factor, CheckMask, !CheckMerge,
+                           CheckProperties>(Q, 2, 4, DG2OrPVCProps);
+      if constexpr (Features == TestFeatures::PVC) {
+        Passed &= testSLMAcc<T, 64 * I32Factor, CheckMask, CheckMerge,
+                             CheckProperties>(Q, 2, 4, DG2OrPVCProps);
+
+        if constexpr (sizeof(T) <= 4)
+          Passed &= testSLMAcc<T, 128 * I32Factor, CheckMask, CheckMerge,
+                               CheckProperties>(Q, 2, 4, Align16Props);
+      }
+    } // TestPVCFeatures
 
   return Passed;
 }
@@ -776,19 +760,15 @@ template <typename T, TestFeatures Features> bool testSLM(queue Q) {
   // Alignment that is smaller than 16-bytes is not assumed/expected by default
   // and requires explicit passing of the esimd::alignment property.
   //
-  // These test case may compute wrong values for some of elements
-  // if the driver is not new enough.
-  if (esimd_test::isGPUDriverGE(Q, esimd_test::GPUDriverOS::LinuxAndWindows,
-                                "27556", "win.just.skip.test", false)) {
-    Passed &= testSLM<T, 3, !CheckMask, !CheckMerge, CheckProperties>(
-        Q, 2, AlignElemProps);
-
-    Passed &= testSLM<T, 17, !CheckMask, !CheckMerge, CheckProperties>(
-        Q, 2, AlignElemProps);
-
-    Passed &= testSLM<T, 113, !CheckMask, !CheckMerge, CheckProperties>(
-        Q, 2, AlignElemProps);
-  }
+
+  Passed &= testSLM<T, 3, !CheckMask, !CheckMerge, CheckProperties>(
+      Q, 2, AlignElemProps);
+
+  Passed &= testSLM<T, 17, !CheckMask, !CheckMerge, CheckProperties>(
+      Q, 2, AlignElemProps);
+
+  Passed &= testSLM<T, 113, !CheckMask, !CheckMerge, CheckProperties>(
+      Q, 2, AlignElemProps);
 
   if constexpr (Features == TestFeatures::PVC ||
                 Features == TestFeatures::DG2) {
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_store.hpp b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_store.hpp
index 607840cdb7db0..8a61b54a2a2a3 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_store.hpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/Inputs/block_store.hpp
@@ -462,14 +462,8 @@ bool test_block_store_usm(queue Q) {
   // Intentionally check non-power-of-2 simd size - it must work.
   Passed &=
       testUSM<T, 33, !CheckMask, CheckProperties>(Q, 2, 4, AlignElemProps);
-  // This test case computes wrong values for for the few last elements
-  // if the driver is not new enough.
-  // TODO: windows version with the fix is not known. Enable it eventually.
-  if (sizeof(T) > 2 ||
-      esimd_test::isGPUDriverGE(Q, esimd_test::GPUDriverOS::LinuxAndWindows,
-                                "27556", "win.just.skip.test", false))
-    Passed &=
-        testUSM<T, 67, !CheckMask, CheckProperties>(Q, 1, 4, AlignElemProps);
+  Passed &=
+      testUSM<T, 67, !CheckMask, CheckProperties>(Q, 1, 4, AlignElemProps);
   // Intentionally check big simd size - it must work.
   Passed &=
       testUSM<T, 128, !CheckMask, CheckProperties>(Q, 2, 4, AlignElemProps);
@@ -631,23 +625,11 @@ bool test_block_store_slm(queue Q) {
   // alignment - it works even for byte- and word-vectors if mask is not used.
   // Alignment that is smaller than 16-bytes is not assumed/expected by default
   // and requires explicit passing of the esimd::alignment property.
-  //
-  // These test case may compute wrong values for some of elements
-  // if the driver is not new enough.
-#if 0
-  // TODO: Enable these cases when GPU driver is fixed. It seems the issue with
-  // non-power-of-2 N values was resolved for slm_block_load(), but not for
-  // slm_block_store().
-  if (esimd_test::isGPUDriverGE(Q, esimd_test::GPUDriverOS::LinuxAndWindows,
-                                "27556", "win.just.skip.test", false)) {
-    Passed &= testSLM<T, 3, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+  Passed &= testSLM<T, 3, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
 
-    Passed &= testSLM<T, 17, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
+  Passed &= testSLM<T, 17, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
 
-    Passed &=
-        testSLM<T, 113, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
-  }
-#endif
+  Passed &= testSLM<T, 113, !CheckMask, CheckProperties>(Q, 2, AlignElemProps);
 
   if constexpr (Features == TestFeatures::PVC ||
                 Features == TestFeatures::DG2) {
@@ -694,11 +676,6 @@ bool test_block_store_local_acc_slm(queue Q) {
 
   bool Passed = true;
 
-  // Many cases currently fail before this driver version.
-  if (!esimd_test::isGPUDriverGE(Q, esimd_test::GPUDriverOS::LinuxAndWindows,
-                                 "26957", "101.4824", false))
-    return Passed;
-
   // Test block_store() from SLM that doesn't use the mask is implemented
   // for any N > 1.
   // Ensure that for every call of block_store(local_accessor, offset, ...)
@@ -729,14 +706,10 @@ bool test_block_store_local_acc_slm(queue Q) {
   // Alignment that is smaller than 16-bytes is not assumed/expected by default
   // and requires explicit passing of the esimd::alignment property.
   //
-  // These test case may compute wrong values for some of elements
-  // if the driver is not new enough.
 #if 0
   // TODO: Enable these cases when GPU driver is fixed. It seems the issue with
   // non-power-of-2 N values was resolved for slm_block_load(), but not for
   // slm_block_store().
-  if (esimd_test::isGPUDriverGE(Q, esimd_test::GPUDriverOS::LinuxAndWindows,
-                                "27556", "win.just.skip.test", false)) {
     Passed &= testLocalAccSLM<T, 3, !CheckMask, CheckProperties>(
         Q, 2, AlignElemProps);
 
@@ -745,9 +718,7 @@ bool test_block_store_local_acc_slm(queue Q) {
 
     Passed &= testLocalAccSLM<T, 113, !CheckMask, CheckProperties>(
         Q, 2, AlignElemProps);
-  }
 #endif
-
   if constexpr (Features == TestFeatures::PVC ||
                 Features == TestFeatures::DG2) {
     // Using the mask adds the requirement to run tests on DG2/PVC.
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_acc_dg2_pvc_cmpxchg.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_acc_dg2_pvc_cmpxchg.cpp
index e5321c213cd6a..e9c7d06e75306 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_acc_dg2_pvc_cmpxchg.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/atomic_update_acc_dg2_pvc_cmpxchg.cpp
@@ -7,6 +7,7 @@
 //===---------------------------------------------------------------------===//
 
 // REQUIRES: gpu-intel-pvc || gpu-intel-dg2
+// REQUIRES-INTEL-DRIVER: win: 101.5660
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/block_load_slm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/block_load_slm.cpp
index 49e28ecc2846d..45cde2e97f270 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/block_load_slm.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/block_load_slm.cpp
@@ -5,14 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES-INTEL-DRIVER: win: 101.4887
-// Somehow the driver version check above does not work, i.e. Windows CI runs
-// the test with 31.0.101.4502 (if opencl:gpu) and 1.3.26370 (if level-zero:gpu)
-// It seems the driver check infrastructure may need some fix/tuning.
-// TODO: Enable the test when Windows CI driver reaches 101.4887 version, or
-// driver version check is fixed/tuned.
-// UNSUPPORTED: windows
-
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/block_store_slm.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/block_store_slm.cpp
index 9460917d57f53..e11fbf7653db6 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/block_store_slm.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/block_store_slm.cpp
@@ -5,14 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //==------------------------------------------------------------------------==//
-// REQUIRES-INTEL-DRIVER: win: 101.4887
-// Somehow the driver version check above does not work, i.e. Windows CI runs
-// the test with 31.0.101.4502 (if opencl:gpu) and 1.3.26370 (if level-zero:gpu)
-// It seems the driver check infrastructure may need some fix/tuning.
-// TODO: Enable the test when Windows CI driver reaches 101.4887 version, or
-// driver version check is fixed/tuned.
-// UNSUPPORTED: windows
-
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp
index 301392a247381..3781f17d883d0 100644
--- a/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp
+++ b/sycl/test-e2e/ESIMD/unified_memory_api/scatter_lacc.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES-INTEL-DRIVER: lin: 26816, win: 101.51086
+// REQUIRES-INTEL-DRIVER: lin: 26816, win: 101.5108
 // Use per-kernel compilation to have more information about failing cases.
 // RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/FilterSelector/select_device_acc.cpp b/sycl/test-e2e/FilterSelector/select_device_acc.cpp
index 0b2250a2cf3bb..4b3db255b0853 100644
--- a/sycl/test-e2e/FilterSelector/select_device_acc.cpp
+++ b/sycl/test-e2e/FilterSelector/select_device_acc.cpp
@@ -22,16 +22,14 @@ int main() {
     forcedPIs = envVal;
   }
   {
-    default_selector ds;
-    device d = ds.select_device();
+    device d(default_selector_v);
     string name = d.get_platform().get_info<info::platform::name>();
     assert(name.find("OpenCL") != string::npos &&
            "default selector failed to find acc device");
   }
   {
-    gpu_selector gs;
     try {
-      device d = gs.select_device();
+      device d(gpu_selector_v);
       std::cerr << "GPU Device is found in error: " << std::boolalpha
                 << d.is_gpu() << std::endl;
       return -1;
@@ -39,9 +37,8 @@ int main() {
     }
   }
   {
-    cpu_selector cs;
     try {
-      device d = cs.select_device();
+      device d(cpu_selector_v);
       std::cerr << "CPU Device is found in error: " << std::boolalpha
                 << d.is_cpu() << std::endl;
       return -1;
@@ -49,8 +46,7 @@ int main() {
     }
   }
   {
-    accelerator_selector as;
-    device d = as.select_device();
+    device d(accelerator_selector_v);
     string name = d.get_platform().get_info<info::platform::name>();
     assert(name.find("OpenCL") != string::npos &&
            "accelerator_selector failed to find acc device");
diff --git a/sycl/test-e2e/FilterSelector/select_device_cpu.cpp b/sycl/test-e2e/FilterSelector/select_device_cpu.cpp
index d20002edb3090..5d0294c55d547 100644
--- a/sycl/test-e2e/FilterSelector/select_device_cpu.cpp
+++ b/sycl/test-e2e/FilterSelector/select_device_cpu.cpp
@@ -22,30 +22,24 @@ int main() {
     forcedPIs = envVal;
   }
   {
-    default_selector ds;
-    device d = ds.select_device();
+    device d(default_selector_v);
     string name = d.get_platform().get_info<info::platform::name>();
     assert(name.find("OpenCL") != string::npos &&
            "default_selector failed to find cpu device");
   }
   {
-    gpu_selector gs;
     try {
-      device d = gs.select_device();
+      device d(gpu_selector_v);
       std::cerr << "GPU Device is found: " << std::boolalpha << d.is_gpu()
                 << std::endl;
       return -1;
     } catch (...) {
     }
   }
+  { device d(cpu_selector_v); }
   {
-    cpu_selector cs;
-    device d = cs.select_device();
-  }
-  {
-    accelerator_selector as;
     try {
-      device d = as.select_device();
+      device d(accelerator_selector_v);
       std::cerr << "ACC device is found in error: " << d.is_accelerator()
                 << std::endl;
       return -1;
diff --git a/sycl/test-e2e/FilterSelector/select_device_cuda.cpp b/sycl/test-e2e/FilterSelector/select_device_cuda.cpp
index 8a5662c198096..372c1771bcba8 100644
--- a/sycl/test-e2e/FilterSelector/select_device_cuda.cpp
+++ b/sycl/test-e2e/FilterSelector/select_device_cuda.cpp
@@ -23,30 +23,26 @@ int main() {
   }
 
   {
-    default_selector ds;
-    device d = ds.select_device();
+    device d(default_selector_v);
     string name = d.get_platform().get_info<info::platform::name>();
     assert(name.find("CUDA") != string::npos);
   }
   {
-    gpu_selector gs;
-    device d = gs.select_device();
+    device d(gpu_selector_v);
     string name = d.get_platform().get_info<info::platform::name>();
     assert(name.find("CUDA") != string::npos);
   }
   {
-    cpu_selector cs;
     try {
-      device d = cs.select_device();
+      device d(cpu_selector_v);
       cerr << "CPU device is found in error: " << d.is_cpu() << std::endl;
       return -1;
     } catch (...) {
     }
   }
   {
-    accelerator_selector as;
     try {
-      device d = as.select_device();
+      device d(accelerator_selector_v);
       cerr << "ACC device is found in error: " << d.is_accelerator()
            << std::endl;
     } catch (...) {
diff --git a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
index 9df03725c3999..0d9be32264592 100644
--- a/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
+++ b/sycl/test-e2e/Graph/Explicit/add_nodes_after_finalize.cpp
@@ -4,7 +4,9 @@
 // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
 // Extra run to check for immediate-command-list in Level Zero
 // RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
-//
+
+// Test is flaky on Windows; disabled until it can be fixed.
+// UNSUPPORTED: windows
 
 #define GRAPH_E2E_EXPLICIT
 
diff --git a/sycl/test-e2e/Graph/Inputs/work_group_size_prop.cpp b/sycl/test-e2e/Graph/Inputs/work_group_size_prop.cpp
index 550d6af01e95b..e052ab5acb3bf 100644
--- a/sycl/test-e2e/Graph/Inputs/work_group_size_prop.cpp
+++ b/sycl/test-e2e/Graph/Inputs/work_group_size_prop.cpp
@@ -68,19 +68,12 @@ int test(queue &Queue, PropertiesT Props, KernelType KernelFunc) {
 
     Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraph); });
     Queue.wait_and_throw();
-  } catch (nd_range_error &E) {
-    std::cerr << "Test case failed: unexpected "
-                 "nd_range_error exception: "
-              << E.what() << std::endl;
-    return 1;
-  } catch (runtime_error &E) {
-    std::cerr << "Test case failed: unexpected "
-                 "runtime_error exception: "
-              << E.what() << std::endl;
+  } catch (exception &E) {
+    std::cerr << "Test case failed: unexpected exception: " << E.what()
+              << std::endl;
     return 1;
   } catch (...) {
-    std::cerr << "Test case failed: something unexpected "
-                 "has been caught"
+    std::cerr << "Test case failed: something unexpected has been caught"
               << std::endl;
     return 1;
   }
@@ -101,26 +94,21 @@ int test(queue &Queue, PropertiesT Props, KernelType KernelFunc) {
     std::cerr << "Test case ReqdWGSizeNegativeA failed: no exception has been "
                  "thrown\n";
     return 1; // We shouldn't be here, exception is expected
-  } catch (nd_range_error &E) {
-    if (std::string(E.what()).find(
+  } catch (exception &E) {
+    if (E.code() != errc::nd_range ||
+        std::string(E.what()).find(
             "The specified local size " + rangeToString(repeatRange<Dims>(8)) +
             " doesn't match the required " +
             "work-group size specified in the program source " +
             rangeToString(range<Dims>(Is...))) == std::string::npos) {
       std::cerr
-          << "Test case ReqdWGSizeNegativeA failed: unexpected nd_range_error "
-             "exception: "
+          << "Test case ReqdWGSizeNegativeA failed: unexpected exception: "
           << E.what() << std::endl;
       return 1;
     }
-  } catch (runtime_error &E) {
-    std::cerr << "Test case ReqdWGSizeNegativeA failed: unexpected "
-                 "nd_range_error exception: "
-              << E.what() << std::endl;
-    return 1;
   } catch (...) {
-    std::cerr << "Test case ReqdWGSizeNegativeA failed: something unexpected "
-                 "has been caught"
+    std::cerr << "Test case ReqdWGSizeNegativeA failed: something "
+                 "unexpected has been caught"
               << std::endl;
     return 1;
   }
@@ -144,23 +132,18 @@ int test(queue &Queue, PropertiesT Props, KernelType KernelFunc) {
                  "has been "
                  "thrown\n";
     return 1; // We shouldn't be here, exception is expected
-  } catch (nd_range_error &E) {
-    if (std::string(E.what()).find(
+  } catch (exception &E) {
+    if (E.code() != errc::nd_range ||
+        std::string(E.what()).find(
             "The specified local size " + rangeToString(repeatRange<Dims>(8)) +
             " doesn't match the required " +
             "work-group size specified in the program source " +
             rangeToString(range<Dims>(Is...))) == std::string::npos) {
       std::cerr << "Test case ReqdWGSizeNegativeA shortcut failed: unexpected "
-                   "nd_range_error "
                    "exception: "
                 << E.what() << std::endl;
       return 1;
     }
-  } catch (runtime_error &E) {
-    std::cerr << "Test case ReqdWGSizeNegativeA shortcut failed: unexpected "
-                 "nd_range_error exception: "
-              << E.what() << std::endl;
-    return 1;
   } catch (...) {
     std::cerr << "Test case ReqdWGSizeNegativeA shortcut failed: something "
                  "unexpected has been caught"
diff --git a/sycl/test-e2e/Graph/RecordReplay/barrier_multi_graph.cpp b/sycl/test-e2e/Graph/RecordReplay/barrier_multi_graph.cpp
new file mode 100644
index 0000000000000..deee2abc9f390
--- /dev/null
+++ b/sycl/test-e2e/Graph/RecordReplay/barrier_multi_graph.cpp
@@ -0,0 +1,58 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Records a barrier into two graphs from one queue and verifies each replay.
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+
+  int *PtrA = malloc_device<int>(Size, Queue);
+  int *PtrB = malloc_device<int>(Size, Queue);
+
+  exp_ext::command_graph GraphA{Queue};
+  exp_ext::command_graph GraphB{Queue};
+  // GraphA: write i to PtrA, barrier on that kernel, then copy PtrA to PtrB.
+  GraphA.begin_recording(Queue);
+  auto EventA = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrA[it] = it; });
+  });
+  Queue.ext_oneapi_submit_barrier({EventA});
+  Queue.copy(PtrA, PtrB, Size);
+  GraphA.end_recording();
+  // GraphB: write 2*i to PtrA, barrier without an event list, copy to PtrB.
+  GraphB.begin_recording(Queue);
+  auto EventB = Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>{Size}, [=](id<1> it) { PtrA[it] = it * 2; });
+  });
+  Queue.ext_oneapi_submit_barrier();
+  Queue.copy(PtrA, PtrB, Size);
+  GraphB.end_recording();
+
+  auto ExecGraphA = GraphA.finalize();
+  auto ExecGraphB = GraphB.finalize();
+
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraphA); }).wait();
+  // Replaying ExecGraphA should leave PtrB holding the values 0..Size-1.
+  std::array<int, Size> Output;
+  Queue.memcpy(Output.data(), PtrB, sizeof(int) * Size).wait();
+
+  for (int i = 0; i < Size; i++) {
+    assert(Output[i] == i);
+  }
+  // Replaying ExecGraphB should overwrite PtrB with the doubled values.
+  Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecGraphB); }).wait();
+  Queue.memcpy(Output.data(), PtrB, sizeof(int) * Size).wait();
+
+  for (int i = 0; i < Size; i++) {
+    assert(Output[i] == 2 * i);
+  }
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp b/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp
index b3d40f4d0f89e..4c179b8b4c50a 100644
--- a/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp
+++ b/sycl/test-e2e/Graph/UnsupportedDevice/device_query.cpp
@@ -17,10 +17,10 @@ int main() {
   bool SupportsLimitedGraphs = Device.has(aspect::ext_oneapi_limited_graph);
   auto Backend = Device.get_backend();
 
-  if ((Backend == backend::ext_oneapi_level_zero)) {
-    assert(!SupportsGraphs);
+  if (Backend == backend::ext_oneapi_level_zero) {
+    // Full graph support is dependent on the Level Zero device & driver,
+    // and cannot be asserted without diving into these details.
     assert(SupportsLimitedGraphs);
-
   } else if ((Backend == backend::ext_oneapi_cuda) ||
              (Backend == backend::ext_oneapi_hip)) {
     assert(SupportsGraphs);
diff --git a/sycl/test-e2e/Graph/Update/Explicit/whole_update_double_buffer.cpp b/sycl/test-e2e/Graph/Update/Explicit/whole_update_double_buffer.cpp
index c787d5ea4b4e6..ee39ca1f06908 100644
--- a/sycl/test-e2e/Graph/Update/Explicit/whole_update_double_buffer.cpp
+++ b/sycl/test-e2e/Graph/Update/Explicit/whole_update_double_buffer.cpp
@@ -7,6 +7,4 @@
 
 #define GRAPH_E2E_EXPLICIT
 
-// UNSUPPORTED: cuda
-
 #include "../../Inputs/whole_update_double_buffer.cpp"
diff --git a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_double_buffer.cpp b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_double_buffer.cpp
index 3bc52c2af25d9..9baa1e53d813e 100644
--- a/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_double_buffer.cpp
+++ b/sycl/test-e2e/Graph/Update/RecordReplay/whole_update_double_buffer.cpp
@@ -7,6 +7,4 @@
 
 #define GRAPH_E2E_RECORD_REPLAY
 
-// UNSUPPORTED: cuda
-
 #include "../../Inputs/whole_update_double_buffer.cpp"
diff --git a/sycl/test-e2e/Graph/Update/update_with_indices_ptr_3D.cpp b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_3D.cpp
new file mode 100644
index 0000000000000..5459eb42de8d4
--- /dev/null
+++ b/sycl/test-e2e/Graph/Update/update_with_indices_ptr_3D.cpp
@@ -0,0 +1,92 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+//
+
+// Tests updating a 3D ND-Range graph kernel node using index-based explicit
+// update
+
+#include "../graph_common.hpp"
+
+int main() {
+  queue Queue{};
+
+  const range<3> GlobalWorkSize(1, 2, 2);
+  const range<3> LocalWorkSize(1, 2, 2);
+  const size_t N = GlobalWorkSize[0] * GlobalWorkSize[1] * GlobalWorkSize[2];
+
+  exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()};
+
+  int *PtrA = malloc_device<int>(N, Queue);
+  int *PtrB = malloc_device<int>(N, Queue);
+
+  std::vector<int> HostDataA(N);
+  std::vector<int> HostDataB(N);
+
+  // Zero both allocations so the buffer not yet targeted can be checked as 0.
+  Queue.memset(PtrA, 0, N * sizeof(int)).wait();
+  Queue.memset(PtrB, 0, N * sizeof(int)).wait();
+
+  // Kernel argument 0 is bound through a dynamic parameter (initially PtrA) so
+  // that it can be retargeted after the graph has been finalized.
+  exp_ext::dynamic_parameter DynParam(Graph, PtrA);
+
+  nd_range<3> NDRange{GlobalWorkSize, LocalWorkSize};
+  auto NodeA = Graph.add([&](handler &cgh) {
+    cgh.set_arg(0, DynParam);
+    // TODO: Use the free function kernel extension instead of regular kernels
+    // when available.
+    cgh.parallel_for(NDRange, [=](nd_item<3> Item) {
+      size_t GlobalID = Item.get_global_linear_id();
+      PtrA[GlobalID] = GlobalID;
+    });
+  });
+
+  // NodeB is ordered after NodeA and doubles each element NodeA wrote.
+  range<3> Range{GlobalWorkSize};
+  auto NodeB = Graph.add(
+      [&](handler &cgh) {
+        cgh.set_arg(0, DynParam);
+        // TODO: Use the free function kernel extension instead of regular
+        // kernels when available.
+        cgh.parallel_for(Range, [=](item<3> Item) {
+          size_t GlobalID = Item.get_linear_id();
+          PtrA[GlobalID] *= 2;
+        });
+      },
+      exp_ext::property::node::depends_on{NodeA});
+
+  auto ExecGraph = Graph.finalize(exp_ext::property::graph::updatable{});
+
+  // PtrA should be filled with values
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+
+  Queue.copy(PtrA, HostDataA.data(), N).wait();
+  Queue.copy(PtrB, HostDataB.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    assert(HostDataA[i] == (i * 2));
+    assert(HostDataB[i] == 0);
+  }
+
+  // Swap PtrB to be the input/output
+  DynParam.update(PtrB);
+  ExecGraph.update({NodeA, NodeB});
+  Queue.ext_oneapi_graph(ExecGraph).wait();
+
+  // Expect PtrA to keep the first run's results and PtrB to now match them.
+  Queue.copy(PtrA, HostDataA.data(), N).wait();
+  Queue.copy(PtrB, HostDataB.data(), N).wait();
+  for (size_t i = 0; i < N; i++) {
+    const size_t Ref = i * 2;
+    assert(HostDataA[i] == Ref);
+    assert(HostDataB[i] == Ref);
+  }
+
+  // Release the USM device allocations made above.
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  return 0;
+}
diff --git a/sycl/test-e2e/Graph/Update/whole_update_dynamic_param.cpp b/sycl/test-e2e/Graph/Update/whole_update_dynamic_param.cpp
index f71fc5eef6ef0..f907006c88701 100644
--- a/sycl/test-e2e/Graph/Update/whole_update_dynamic_param.cpp
+++ b/sycl/test-e2e/Graph/Update/whole_update_dynamic_param.cpp
@@ -31,6 +31,7 @@ int main() {
   Queue.copy(InputDataHost1.data(), InputDataDevice1, Size);
   Queue.copy(InputDataHost2.data(), InputDataDevice2, Size);
   Queue.copy(OutputDataHost1.data(), OutputDataDevice1, Size);
+  Queue.wait();
 
   exp_ext::command_graph GraphA{Queue.get_context(), Queue.get_device()};
 
diff --git a/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp b/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp
new file mode 100644
index 0000000000000..fee6ff18d94bf
--- /dev/null
+++ b/sycl/test-e2e/Graph/ValidUsage/linear_graph_copy.cpp
@@ -0,0 +1,102 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=0 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+// Extra run to check for immediate-command-list in Level Zero
+// RUN: %if level_zero %{env SYCL_PI_LEVEL_ZERO_USE_IMMEDIATE_COMMANDLISTS=1 %{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %}
+//
+
+// Tests that the optimization to use the L0 Copy Engine for memory commands
+// does not interfere with the linear graph optimization
+
+#include "../graph_common.hpp"
+
+#include <sycl/properties/queue_properties.hpp>
+
+int main() {
+  queue Queue{{sycl::property::queue::in_order{}}};
+
+  using T = int;
+
+  const T ModValue = 7;
+  std::vector<T> DataA(Size), DataB(Size), DataC(Size);
+
+  std::iota(DataA.begin(), DataA.end(), 1);
+  std::iota(DataB.begin(), DataB.end(), 10);
+  std::iota(DataC.begin(), DataC.end(), 1000);
+
+  // Host-side reference: replay the recorded kernel/memcpy chain per iteration
+  std::vector<T> ReferenceA(DataA), ReferenceB(DataB), ReferenceC(DataC);
+  for (size_t i = 0; i < Iterations; i++) {
+    for (size_t j = 0; j < Size; j++) {
+      ReferenceA[j] += ModValue;
+      ReferenceB[j] = ReferenceA[j];
+      ReferenceB[j] -= ModValue;
+      ReferenceC[j] = ReferenceB[j];
+      ReferenceC[j] += ModValue;
+    }
+  }
+
+  ext::oneapi::experimental::command_graph Graph{Queue.get_context(),
+                                                 Queue.get_device()};
+
+  T *PtrA = malloc_device<T>(Size, Queue);
+  T *PtrB = malloc_device<T>(Size, Queue);
+  T *PtrC = malloc_device<T>(Size, Queue);
+
+  Queue.copy(DataA.data(), PtrA, Size);
+  Queue.copy(DataB.data(), PtrB, Size);
+  Queue.copy(DataC.data(), PtrC, Size);
+  Queue.wait_and_throw();
+  // Record three kernels interleaved with two device memcpys into Graph
+  Graph.begin_recording(Queue);
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+      auto LinID = id.get_linear_id();
+      PtrA[LinID] += ModValue;
+    });
+  });
+
+  Queue.submit([&](handler &CGH) { CGH.memcpy(PtrB, PtrA, Size * sizeof(T)); });
+
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+      auto LinID = id.get_linear_id();
+      PtrB[LinID] -= ModValue;
+    });
+  });
+
+  Queue.submit([&](handler &CGH) { CGH.memcpy(PtrC, PtrB, Size * sizeof(T)); });
+
+  Queue.submit([&](handler &CGH) {
+    CGH.parallel_for(range<1>(Size), [=](item<1> id) {
+      auto LinID = id.get_linear_id();
+      PtrC[LinID] += ModValue;
+    });
+  });
+
+  Graph.end_recording();
+
+  auto GraphExec = Graph.finalize();
+  // Submit the executable graph Iterations times, keeping the last event
+  event Event;
+  for (unsigned n = 0; n < Iterations; n++) {
+    Event =
+        Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(GraphExec); });
+  }
+  // Copy results back to the host, depending on the last graph submission
+  Queue.copy(PtrA, DataA.data(), Size, Event);
+  Queue.copy(PtrB, DataB.data(), Size, Event);
+  Queue.copy(PtrC, DataC.data(), Size, Event);
+  Queue.wait_and_throw();
+
+  free(PtrA, Queue);
+  free(PtrB, Queue);
+  free(PtrC, Queue);
+
+  for (size_t i = 0; i < Size; i++) {
+    assert(check_value(i, ReferenceA[i], DataA[i], "DataA"));
+    assert(check_value(i, ReferenceB[i], DataB[i], "DataB"));
+    assert(check_value(i, ReferenceC[i], DataC[i], "DataC"));
+  }
+}
diff --git a/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/array_input_sort.cpp b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/array_input_sort.cpp
new file mode 100644
index 0000000000000..b7ced46f10edb
--- /dev/null
+++ b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/array_input_sort.cpp
@@ -0,0 +1,250 @@
+// REQUIRES: sg-8
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies sorting APIs for fixed-size array input from group_sort
+// extension.
+#include "common.hpp"
+#include <sycl/ext/oneapi/experimental/group_sort.hpp>
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <random>
+#include <vector>
+
+template <UseGroupT UseGroup, int Dims, size_t ElementsPerWorkItem,
+          class Property = sycl::ext::oneapi::experimental::empty_properties_t,
+          class T, class Compare>
+void RunSortOverGroupArray(sycl::queue &Q, const std::vector<T> &DataToSort,
+                           const Compare &Comp, Property Prop) {
+  // Sorts up to four copies of DataToSort on the device, then verifies them.
+  const size_t WorkSize = DataToSort.size() / ElementsPerWorkItem;
+  const size_t NumSubGroups = WorkSize / ReqSubGroupSize + 1;
+  // A single work-group spans all WorkSize work-items
+  const sycl::nd_range<Dims> NDRange = [&]() {
+    if constexpr (Dims == 1)
+      return sycl::nd_range<1>{{WorkSize}, {WorkSize}};
+    else
+      return sycl::nd_range<2>{{1, WorkSize}, {1, WorkSize}};
+    static_assert(Dims < 3,
+                  "Only one and two dimensional kernels are supported");
+  }();
+
+  using DefaultSorterT =
+      oneapi_exp::default_sorters::group_sorter<T, Compare,
+                                                ElementsPerWorkItem>;
+
+  using RadixSorterT = oneapi_exp::radix_sorters::group_sorter<
+      typename ConvertToSimpleType<T>::Type,
+      ConvertToSortingOrder<Compare>::Type, ElementsPerWorkItem>;
+
+  constexpr bool IsSG = (UseGroup == UseGroupT::SubGroup);
+  auto Scope =
+      IsSG ? sycl::memory_scope::sub_group : sycl::memory_scope::work_group;
+  auto WGSize = NDRange.get_local_range().size();
+  auto GroupSize = IsSG ? ReqSubGroupSize : WGSize;
+  std::size_t LocalMemorySizeDefault =
+      DefaultSorterT::memory_required(Scope, GroupSize);
+  std::size_t LocalMemorySizeRadix =
+      RadixSorterT::memory_required(Scope, GroupSize);
+  std::array<std::vector<T>, 4> DataToSortCase = {DataToSort, DataToSort,
+                                                  DataToSort, DataToSort};
+
+  // Sort data using 4 different invocations of the sort_over_group API
+  {
+    std::array<std::shared_ptr<sycl::buffer<T>>, 4> BufToSort;
+    for (int i = 0; i < 4; i++)
+      BufToSort[i].reset(new sycl::buffer<T>(DataToSortCase[i].data(),
+                                             DataToSortCase[i].size()));
+
+    Q.submit([&](sycl::handler &CGH) {
+       auto AccToSort0 = sycl::accessor(*BufToSort[0], CGH);
+       auto AccToSort1 = sycl::accessor(*BufToSort[1], CGH);
+       auto AccToSort2 = sycl::accessor(*BufToSort[2], CGH);
+       auto AccToSort3 = sycl::accessor(*BufToSort[3], CGH);
+
+       // Allocate local memory for all sub-groups in a work-group
+       const size_t TotalLocalMemSizeDefault =
+           IsSG ? LocalMemorySizeDefault * NumSubGroups
+                : LocalMemorySizeDefault;
+       sycl::local_accessor<std::byte, 1> ScratchDefault(
+           {TotalLocalMemSizeDefault}, CGH);
+
+       const size_t TotalLocalMemSizeRadix =
+           IsSG ? LocalMemorySizeRadix * NumSubGroups : LocalMemorySizeRadix;
+
+       sycl::local_accessor<std::byte, 1> ScratchRadix({TotalLocalMemSizeRadix},
+                                                       CGH);
+
+       CGH.parallel_for(
+           NDRange, [=](sycl::nd_item<Dims> id) [[intel::reqd_sub_group_size(
+                        ReqSubGroupSize)]] {
+             const size_t GlobalLinearID = id.get_global_linear_id();
+             using RadixSorterT = oneapi_exp::radix_sorters::group_sorter<
+                 typename ConvertToSimpleType<T>::Type,
+                 ConvertToSortingOrder<Compare>::Type, ElementsPerWorkItem>;
+
+             auto Group = [&]() {
+               if constexpr (IsSG)
+                 return id.get_sub_group();
+               else
+                 return id.get_group();
+             }();
+
+             // Each sub-group should use its own part of the scratch pad
+             const size_t ScratchShiftDefault =
+                 IsSG ? id.get_sub_group().get_group_linear_id() *
+                            LocalMemorySizeDefault
+                      : 0;
+             std::byte *ScratchPtrDefault =
+                 &ScratchDefault[0] + ScratchShiftDefault;
+
+             T ValsPrivate[ElementsPerWorkItem];
+
+             auto ReadToPrivate = [&](auto Acc) {
+               for (std::size_t I = 0; I < ElementsPerWorkItem; ++I)
+                 ValsPrivate[I] = Acc[GlobalLinearID * ElementsPerWorkItem + I];
+             };
+             auto WriteToGlobal = [&](auto Acc) {
+               for (std::size_t I = 0; I < ElementsPerWorkItem; ++I)
+                 Acc[GlobalLinearID * ElementsPerWorkItem + I] = ValsPrivate[I];
+             };
+
+             auto Scratch =
+                 sycl::span{ScratchPtrDefault, LocalMemorySizeDefault};
+             auto PrivateArr = sycl::span<T, ElementsPerWorkItem>{
+                 ValsPrivate, ValsPrivate + ElementsPerWorkItem};
+             if constexpr (std::is_same_v<Compare, std::less<T>>) {
+               ReadToPrivate(AccToSort0);
+               oneapi_exp::sort_over_group(
+                   oneapi_exp::group_with_scratchpad(Group, Scratch),
+                   PrivateArr, Prop); // (4)
+               WriteToGlobal(AccToSort0);
+             }
+
+             ReadToPrivate(AccToSort1);
+             oneapi_exp::sort_over_group(
+                 oneapi_exp::group_with_scratchpad(Group, Scratch), PrivateArr,
+                 Comp, Prop); // (5)
+             WriteToGlobal(AccToSort1);
+
+             ReadToPrivate(AccToSort2);
+             oneapi_exp::sort_over_group(Group, PrivateArr,
+                                         DefaultSorterT(Scratch), Prop); // (6)
+             WriteToGlobal(AccToSort2);
+
+             // Each sub-group should use its own part of the scratch pad
+             const size_t ScratchShiftRadix =
+                 IsSG ? id.get_sub_group().get_group_linear_id() *
+                            LocalMemorySizeRadix
+                      : 0;
+             std::byte *ScratchPtrRadix = &ScratchRadix[0] + ScratchShiftRadix;
+
+             // Radix doesn't support custom types
+             if constexpr (!std::is_same_v<CustomType, T>) {
+               ReadToPrivate(AccToSort3);
+               oneapi_exp::sort_over_group(
+                   Group, PrivateArr,
+                   RadixSorterT(
+                       sycl::span{ScratchPtrRadix, LocalMemorySizeRadix}),
+                   Prop); // (6)
+               WriteToGlobal(AccToSort3);
+             }
+           });
+     }).wait_and_throw();
+  }
+
+  // Verification
+  {
+    // Emulate independent sorting of each work-group/sub-group
+    const size_t ChunkSize = GroupSize * ElementsPerWorkItem;
+    std::vector<T> TempSorted = DataToSort;
+    auto It = TempSorted.begin();
+    for (; (It + ChunkSize) < TempSorted.end(); It += ChunkSize)
+      std::sort(It, It + ChunkSize, Comp);
+
+    // Sort the remainder (the last, possibly partial, chunk)
+    std::sort(It, TempSorted.end(), Comp);
+    std::vector<T> DataSorted;
+    DataSorted.resize(TempSorted.size());
+    writeBlockedOrStriped<T>(/*In */ TempSorted, /* Out */ DataSorted,
+                             GroupSize, ElementsPerWorkItem, Prop);
+
+    if constexpr (std::is_same_v<Compare, std::less<T>>)
+      assert(DataToSortCase[0] == DataSorted);
+
+    assert(DataToSortCase[1] == DataSorted);
+    assert(DataToSortCase[2] == DataSorted);
+    // Radix doesn't support custom types
+    if constexpr (!std::is_same_v<CustomType, T>)
+      assert(DataToSortCase[3] == DataSorted);
+  }
+}
+
+// Runs the array-input sort test; sub-group runs are skipped on CUDA and HIP.
+template <UseGroupT UseGroup, int Dim, size_t ElementsPerWorkItem, class T,
+          class Compare, class Properties = oneapi_exp::empty_properties_t>
+void RunOnData(sycl::queue &Q, const std::vector<T> &Data,
+               const Compare &Comparator, Properties Prop = {}) {
+  if constexpr (UseGroup == UseGroupT::SubGroup)
+    if (Q.get_backend() == sycl::backend::ext_oneapi_cuda ||
+        Q.get_backend() == sycl::backend::ext_oneapi_hip) {
+      std::cout << "Note! Skipping sub group testing on CUDA/HIP BE"
+                << std::endl;
+      return;
+    }
+
+  RunSortOverGroupArray<UseGroup, Dim, ElementsPerWorkItem>(Q, Data, Comparator,
+                                                            Prop);
+}
+
+template <class T> void RunOverType(sycl::queue &Q, size_t DataSize) {
+  constexpr size_t PerWI = 4;
+  std::vector<T> Input(DataSize * PerWI);
+
+  // Populate the input with normally distributed values (mean 10, stddev 2)
+  std::default_random_engine Engine;
+  std::normal_distribution<float> Dist(10.0, 2.0);
+  std::generate(Input.begin(), Input.end(),
+                [&]() { return T(Dist(Engine)); });
+
+  auto BlockedIn = oneapi_exp::properties{oneapi_exp::input_data_placement<
+      oneapi_exp::group_algorithm_data_placement::blocked>};
+  auto StripedIn = oneapi_exp::properties{oneapi_exp::input_data_placement<
+      oneapi_exp::group_algorithm_data_placement::striped>};
+  RunOnData<UseGroupT::WorkGroup, 1, PerWI>(Q, Input, std::less<T>{},
+                                            BlockedIn);
+  RunOnData<UseGroupT::WorkGroup, 1, PerWI>(Q, Input, std::greater<T>{},
+                                            StripedIn);
+  RunOnData<UseGroupT::WorkGroup, 2, PerWI>(Q, Input, std::less<T>{},
+                                            BlockedIn);
+  RunOnData<UseGroupT::WorkGroup, 2, PerWI>(Q, Input, std::greater<T>{},
+                                            StripedIn);
+  RunOnData<UseGroupT::SubGroup, 1, PerWI>(Q, Input, std::greater<T>{},
+                                           BlockedIn);
+  RunOnData<UseGroupT::SubGroup, 1, PerWI>(Q, Input, std::less<T>{},
+                                           StripedIn);
+  RunOnData<UseGroupT::SubGroup, 2, PerWI>(Q, Input, std::greater<T>{},
+                                           BlockedIn);
+  RunOnData<UseGroupT::SubGroup, 2, PerWI>(Q, Input, std::less<T>{},
+                                           StripedIn);
+}
+
+int main() {
+  try {
+    sycl::queue Q;
+
+    // Number of work-items; RunOverType sizes its input data from this value.
+    static constexpr size_t Size = 18;
+    RunOverType<std::int32_t>(Q, Size);
+    RunOverType<CustomType>(Q, Size);
+
+    std::cout << "Test passed." << std::endl;
+    return 0;
+  } catch (const std::exception &E) {
+    std::cout << "Test failed" << std::endl;
+    std::cout << E.what() << std::endl;
+    return 1;
+  }
+}
diff --git a/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/common.hpp b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/common.hpp
new file mode 100644
index 0000000000000..8f3317addb6e0
--- /dev/null
+++ b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/common.hpp
@@ -0,0 +1,92 @@
+
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/experimental/group_sort.hpp>
+
+#pragma once
+
+namespace oneapi_exp = sycl::ext::oneapi::experimental;
+// Selects whether a test sorts across a sub-group or a whole work-group.
+enum class UseGroupT { SubGroup = true, WorkGroup = false };
+
+// These classes are needed to pass non-type template parameters to KernelName
+template <int> class IntWrapper;
+template <UseGroupT> class UseGroupWrapper;
+
+// Wrapper around size_t with <, > and == so it can be sorted and compared.
+class CustomType {
+  size_t MVal = 0;
+
+public:
+  CustomType() : MVal(0) {}
+  CustomType(size_t Val) : MVal(Val) {}
+
+  bool operator==(const CustomType &Rhs) const { return MVal == Rhs.MVal; }
+  bool operator<(const CustomType &Rhs) const { return MVal < Rhs.MVal; }
+  bool operator>(const CustomType &Rhs) const { return MVal > Rhs.MVal; }
+};
+
+// Identity mapping by default: the type is used as-is.
+template <class ValueT> struct ConvertToSimpleType {
+  using Type = ValueT;
+};
+
+// Dummy specialization for CustomType, which is not supported by the radix
+// sorter: int is substituted so the sorter type can still be named.
+template <> struct ConvertToSimpleType<CustomType> { using Type = int; };
+
+template <class SorterT> struct ConvertToSortingOrder;
+// Specializations map a std comparator to the equivalent sorting_order.
+template <class T> struct ConvertToSortingOrder<std::greater<T>> {
+  static constexpr auto Type = oneapi_exp::sorting_order::descending;
+};
+
+template <class T> struct ConvertToSortingOrder<std::less<T>> {
+  static constexpr auto Type = oneapi_exp::sorting_order::ascending;
+};
+
+constexpr size_t ReqSubGroupSize = 8; // used with reqd_sub_group_size
+
+template <typename...> class KernelNameOverGroup;
+template <typename...> class KernelNameJoint;
+
+// True unless Properties carries an output_data_placement other than blocked.
+template <typename Properties>
+constexpr bool IsOutputBlocked(Properties properties) {
+  using KeyT = oneapi_exp::output_data_placement_key;
+  if constexpr (!properties.template has_property<KeyT>())
+    return true;
+  else
+    return properties.template get_property<KeyT>() ==
+           oneapi_exp::output_data_placement<
+               oneapi_exp::group_algorithm_data_placement::blocked>;
+}
+
+// Writes In to Out chunk by chunk, where a chunk holds
+// MaxGroupSize * ElementsPerWorkItem elements. With blocked output,
+// Out[index] = In[index]; otherwise element k of work-item j is read from
+// In[ChunkStart + k * GroupSize + j]. A trailing partial chunk is handled
+// with a proportionally smaller GroupSize.
+template <typename T, typename Properties>
+void writeBlockedOrStriped(const std::vector<T> &In, std::vector<T> &Out,
+                           size_t MaxGroupSize, size_t ElementsPerWorkItem,
+                           Properties Prop) {
+  assert(In.size() == Out.size());
+  const size_t ChunkSize = MaxGroupSize * ElementsPerWorkItem;
+  for (size_t ChunkStart = 0; ChunkStart < In.size();
+       ChunkStart += ChunkSize) {
+    const size_t GroupSize = (In.size() - ChunkStart) >= ChunkSize
+                                 ? MaxGroupSize
+                                 : (In.size() - ChunkStart) / ElementsPerWorkItem;
+    for (size_t j = 0; j < GroupSize; ++j) {
+      for (size_t k = 0; k < ElementsPerWorkItem; ++k) {
+        const size_t index = ChunkStart + j * ElementsPerWorkItem + k;
+        // Blocked placement reads the same position it writes.
+        size_t shift = index;
+        if constexpr (!IsOutputBlocked(Prop))
+          shift = ChunkStart + k * GroupSize + j;
+        // Defensive bounds check before touching either vector.
+        if (index < Out.size() && shift < In.size())
+          Out[index] = In[shift];
+      }
+    }
+  }
+}
diff --git a/sycl/test-e2e/GroupAlgorithm/SYCL2020/sort.cpp b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/group_and_joint_sort.cpp
similarity index 88%
rename from sycl/test-e2e/GroupAlgorithm/SYCL2020/sort.cpp
rename to sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/group_and_joint_sort.cpp
index 4dc00232b5d30..4abea0c6a7268 100644
--- a/sycl/test-e2e/GroupAlgorithm/SYCL2020/sort.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/group_and_joint_sort.cpp
@@ -1,6 +1,8 @@
 // REQUIRES: sg-8
-// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
-// RUN: %{run} %t.out
+// RUN: %{build} -fsycl-device-code-split=per_kernel -DVERSION=1 -o %t1.out
+// RUN: %{run} %t1.out
+// RUN: %{build} -fsycl-device-code-split=per_kernel -DVERSION=2 -o %t2.out
+// RUN: %{run} %t2.out
 // UNSUPPORTED: accelerator
 
 // The test verifies sort API extension.
@@ -27,6 +29,7 @@
 
 #include <sycl/detail/core.hpp>
 
+#include "common.hpp"
 #include <sycl/builtins.hpp>
 #include <sycl/ext/oneapi/experimental/group_sort.hpp>
 #include <sycl/group_algorithm.hpp>
@@ -37,30 +40,7 @@
 #include <random>
 #include <vector>
 
-namespace oneapi_exp = sycl::ext::oneapi::experimental;
-
-template <typename...> class KernelNameOverGroup;
-template <typename...> class KernelNameJoint;
-
-enum class UseGroupT { SubGroup = true, WorkGroup = false };
-
-// these classes are needed to pass non-type template parameters to KernelName
-template <int> class IntWrapper;
-template <UseGroupT> class UseGroupWrapper;
-
-class CustomType {
-public:
-  CustomType(size_t Val) : MVal(Val) {}
-  CustomType() : MVal(0) {}
-
-  bool operator<(const CustomType &RHS) const { return MVal < RHS.MVal; }
-  bool operator>(const CustomType &RHS) const { return MVal > RHS.MVal; }
-  bool operator==(const CustomType &RHS) const { return MVal == RHS.MVal; }
-
-private:
-  size_t MVal = 0;
-};
-
+#if VERSION == 1
 template <class CompT, class T> struct RadixSorterType;
 
 template <class T> struct RadixSorterType<std::greater<T>, T> {
@@ -83,8 +63,7 @@ template <> struct RadixSorterType<std::greater<CustomType>, CustomType> {
   using Type =
       oneapi_exp::radix_sorter<int, oneapi_exp::sorting_order::descending>;
 };
-
-constexpr size_t ReqSubGroupSize = 8;
+#endif
 
 template <UseGroupT UseGroup, int Dims, class T, class Compare>
 void RunJointSort(sycl::queue &Q, const std::vector<T> &DataToSort,
@@ -97,22 +76,40 @@ void RunJointSort(sycl::queue &Q, const std::vector<T> &DataToSort,
 
   constexpr size_t NumSubGroups = WGSize / ReqSubGroupSize;
 
+#if VERSION == 1
   using RadixSorterT = typename RadixSorterType<Compare, T>::Type;
+#else
+  using RadixSorterT = oneapi_exp::radix_sorters::joint_sorter<
+      typename ConvertToSimpleType<T>::Type,
+      ConvertToSortingOrder<Compare>::Type>;
+#endif
 
   std::size_t LocalMemorySizeDefault = 0;
   std::size_t LocalMemorySizeRadix = 0;
   if (UseGroup == UseGroupT::SubGroup) {
     // Each sub-group needs a piece of memory for sorting
+#if VERSION == 1
     LocalMemorySizeDefault =
         oneapi_exp::default_sorter<Compare>::template memory_required<T>(
             sycl::memory_scope::sub_group, ReqSubGroupSize * ElemsPerWI);
+#else
+    LocalMemorySizeDefault = oneapi_exp::default_sorters::joint_sorter<
+        Compare>::template memory_required<T>(sycl::memory_scope::sub_group,
+                                              ReqSubGroupSize * ElemsPerWI);
+#endif
     LocalMemorySizeRadix = RadixSorterT::memory_required(
         sycl::memory_scope::sub_group, ReqSubGroupSize * ElemsPerWI);
   } else {
     // A single chunk of memory for each work-group
+#if VERSION == 1
     LocalMemorySizeDefault =
         oneapi_exp::default_sorter<Compare>::template memory_required<T>(
             sycl::memory_scope::work_group, WGSize * ElemsPerWI);
+#else
+    LocalMemorySizeDefault = oneapi_exp::default_sorters::joint_sorter<
+        Compare>::template memory_required<T>(sycl::memory_scope::work_group,
+                                              WGSize * ElemsPerWI);
+#endif
     LocalMemorySizeRadix = RadixSorterT::memory_required(
         sycl::memory_scope::sub_group, WGSize * ElemsPerWI);
   }
@@ -206,8 +203,13 @@ void RunJointSort(sycl::queue &Q, const std::vector<T> &DataToSort,
 
              oneapi_exp::joint_sort(
                  Group, &AccToSort2[StartIdx], &AccToSort2[EndIdx],
+#if VERSION == 1
                  oneapi_exp::default_sorter<Compare>(sycl::span{
                      &ScratchDefault[LocalPartID], LocalMemorySizeDefault}));
+#else
+                 oneapi_exp::default_sorters::joint_sorter<Compare>(sycl::span{
+                     &ScratchDefault[LocalPartID], LocalMemorySizeDefault}));
+#endif
 
              const size_t LocalPartIDRadix =
                  UseGroup == UseGroupT::SubGroup
@@ -266,26 +268,54 @@ void RunSortOVerGroup(sycl::queue &Q, const std::vector<T> &DataToSort,
                   "Only one and two dimensional kernels are supported");
   }();
 
+#if VERSION == 1
   using RadixSorterT = typename RadixSorterType<Compare, T>::Type;
+#else
+  using RadixSorterT = oneapi_exp::radix_sorters::group_sorter<
+      typename ConvertToSimpleType<T>::Type,
+      ConvertToSortingOrder<Compare>::Type>;
+#endif
 
   std::size_t LocalMemorySizeDefault = 0;
   std::size_t LocalMemorySizeRadix = 0;
   if (UseGroup == UseGroupT::SubGroup) {
     // Each sub-group needs a piece of memory for sorting
+#if VERSION == 1
     LocalMemorySizeDefault =
         oneapi_exp::default_sorter<Compare>::template memory_required<T>(
             sycl::memory_scope::sub_group, sycl::range<1>{ReqSubGroupSize});
+#else
+    LocalMemorySizeDefault = oneapi_exp::default_sorters::group_sorter<
+        T, Compare, 1>::memory_required(sycl::memory_scope::sub_group,
+                                        ReqSubGroupSize);
+#endif
 
+#if VERSION == 1
     LocalMemorySizeRadix = RadixSorterT::template memory_required(
         sycl::memory_scope::sub_group, sycl::range<1>{ReqSubGroupSize});
+#else
+    LocalMemorySizeRadix = RadixSorterT::memory_required(
+        sycl::memory_scope::sub_group, ReqSubGroupSize);
+#endif
   } else {
     // A single chunk of memory for each work-group
+#if VERSION == 1
     LocalMemorySizeDefault =
         oneapi_exp::default_sorter<Compare>::template memory_required<T>(
             sycl::memory_scope::work_group, sycl::range<1>{NumOfElements});
+#else
+    LocalMemorySizeDefault = oneapi_exp::default_sorters::group_sorter<
+        T, Compare, 1>::memory_required(sycl::memory_scope::work_group,
+                                        NumOfElements);
+#endif
 
+#if VERSION == 1
     LocalMemorySizeRadix = RadixSorterT::template memory_required(
         sycl::memory_scope::work_group, sycl::range<1>{NumOfElements});
+#else
+    LocalMemorySizeRadix = RadixSorterT::memory_required(
+        sycl::memory_scope::work_group, NumOfElements);
+#endif
   }
 
   std::vector<T> DataToSortCase0 = DataToSort;
@@ -358,8 +388,13 @@ void RunSortOVerGroup(sycl::queue &Q, const std::vector<T> &DataToSort,
 
              AccToSort2[GlobalLinearID] = oneapi_exp::sort_over_group(
                  Group, AccToSort2[GlobalLinearID],
+#if VERSION == 1
                  oneapi_exp::default_sorter<Compare>(
                      sycl::span{ScratchPtrDefault, LocalMemorySizeDefault}));
+#else
+                 oneapi_exp::default_sorters::group_sorter<T, Compare, 1>(
+                     sycl::span{ScratchPtrDefault, LocalMemorySizeDefault}));
+#endif
 
              // Each sub-group should use it's own part of the scratch pad
              const size_t ScratchShiftRadix =
diff --git a/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/key_value_sort.cpp b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/key_value_sort.cpp
new file mode 100644
index 0000000000000..d1eca3aa087b8
--- /dev/null
+++ b/sycl/test-e2e/GroupAlgorithm/SYCL2020/group_sort/key_value_sort.cpp
@@ -0,0 +1,314 @@
+// REQUIRES: sg-8
+// RUN: %{build} -fsycl-device-code-split=per_kernel -o %t.out
+// RUN: %{run} %t.out
+
+// The test verifies key/value sorting from group_sort extension.
+#include "common.hpp"
+#include <sycl/ext/oneapi/experimental/group_sort.hpp>
+
+#include <algorithm>
+#include <iostream>
+#include <numeric>
+#include <random>
+#include <vector>
+
+template <typename...> class KernelNameOverGroup;
+
+template <UseGroupT UseGroup, int Dims, class KeyTy, class ValueTy,
+          class Compare>
+void RunSortKeyValueOverGroup(sycl::queue &Q,
+                              const std::vector<KeyTy> &KeysToSort,
+                              const std::vector<ValueTy> &DataToSort,
+                              const Compare &Comp) {
+
+  const size_t NumOfElements = DataToSort.size();
+  const size_t NumSubGroups = NumOfElements / ReqSubGroupSize + 1;
+
+  const sycl::nd_range<Dims> NDRange = [&]() {
+    if constexpr (Dims == 1)
+      return sycl::nd_range<1>{{NumOfElements}, {NumOfElements}};
+    else
+      return sycl::nd_range<2>{{1, NumOfElements}, {1, NumOfElements}};
+    static_assert(Dims < 3,
+                  "Only one and two dimensional kernels are supported");
+  }();
+
+  using RadixSorterT = oneapi_exp::radix_sorters::group_key_value_sorter<
+      typename ConvertToSimpleType<KeyTy>::Type, ValueTy,
+      ConvertToSortingOrder<Compare>::Type>;
+
+  std::size_t LocalMemorySizeDefault = 0;
+  std::size_t LocalMemorySizeRadix = 0;
+  if (UseGroup == UseGroupT::SubGroup) {
+    // Each sub-group needs a piece of memory for sorting
+    LocalMemorySizeDefault = oneapi_exp::default_sorters::
+        group_key_value_sorter<KeyTy, ValueTy, Compare>::memory_required(
+            sycl::memory_scope::sub_group, ReqSubGroupSize);
+
+    LocalMemorySizeRadix = RadixSorterT::memory_required(
+        sycl::memory_scope::sub_group, ReqSubGroupSize);
+  } else {
+    // A single chunk of memory for each work-group
+    LocalMemorySizeDefault = oneapi_exp::default_sorters::
+        group_key_value_sorter<KeyTy, ValueTy, Compare>::memory_required(
+            sycl::memory_scope::work_group, NumOfElements);
+
+    LocalMemorySizeRadix = RadixSorterT::memory_required(
+        sycl::memory_scope::work_group, NumOfElements);
+  }
+
+  std::vector<KeyTy> KeysToSortCase0 = KeysToSort;
+  std::vector<ValueTy> DataToSortCase0 = DataToSort;
+
+  std::vector<KeyTy> KeysToSortCase1 = KeysToSort;
+  std::vector<ValueTy> DataToSortCase1 = DataToSort;
+
+  std::vector<KeyTy> KeysToSortCase2 = KeysToSort;
+  std::vector<ValueTy> DataToSortCase2 = DataToSort;
+
+  std::vector<KeyTy> KeysToSortCase3 = KeysToSort;
+  std::vector<ValueTy> DataToSortCase3 = DataToSort;
+
+  // Sort data using 3 different versions of sort_over_group API
+  {
+    sycl::buffer<KeyTy> BufKeysToSort0(KeysToSortCase0.data(),
+                                       KeysToSortCase0.size());
+    sycl::buffer<ValueTy> BufDataToSort0(DataToSortCase0.data(),
+                                         DataToSortCase0.size());
+
+    sycl::buffer<KeyTy> BufKeysToSort1(KeysToSortCase1.data(),
+                                       KeysToSortCase1.size());
+    sycl::buffer<ValueTy> BufDataToSort1(DataToSortCase1.data(),
+                                         DataToSortCase1.size());
+
+    sycl::buffer<KeyTy> BufKeysToSort2(KeysToSortCase2.data(),
+                                       KeysToSortCase2.size());
+    sycl::buffer<ValueTy> BufDataToSort2(DataToSortCase2.data(),
+                                         DataToSortCase2.size());
+
+    sycl::buffer<KeyTy> BufKeysToSort3(KeysToSortCase3.data(),
+                                       KeysToSortCase3.size());
+    sycl::buffer<ValueTy> BufDataToSort3(DataToSortCase3.data(),
+                                         DataToSortCase3.size());
+
+    Q.submit([&](sycl::handler &CGH) {
+       auto AccKeysToSort0 = sycl::accessor(BufKeysToSort0, CGH);
+       auto AccDataToSort0 = sycl::accessor(BufDataToSort0, CGH);
+
+       auto AccKeysToSort1 = sycl::accessor(BufKeysToSort1, CGH);
+       auto AccDataToSort1 = sycl::accessor(BufDataToSort1, CGH);
+
+       auto AccKeysToSort2 = sycl::accessor(BufKeysToSort2, CGH);
+       auto AccDataToSort2 = sycl::accessor(BufDataToSort2, CGH);
+
+       auto AccKeysToSort3 = sycl::accessor(BufKeysToSort3, CGH);
+       auto AccDataToSort3 = sycl::accessor(BufDataToSort3, CGH);
+
+       // Allocate local memory for all sub-groups in a work-group
+       const size_t TotalLocalMemSizeDefault =
+           UseGroup == UseGroupT::SubGroup
+               ? LocalMemorySizeDefault * NumSubGroups
+               : LocalMemorySizeDefault;
+       sycl::local_accessor<std::byte, 1> ScratchDefault(
+           {TotalLocalMemSizeDefault}, CGH);
+
+       const size_t TotalLocalMemSizeRadix =
+           UseGroup == UseGroupT::SubGroup ? LocalMemorySizeRadix * NumSubGroups
+                                           : LocalMemorySizeRadix;
+
+       sycl::local_accessor<std::byte, 1> ScratchRadix({TotalLocalMemSizeRadix},
+                                                       CGH);
+
+       auto KeyValueSortKernel =
+           [=](sycl::nd_item<Dims> id) [[intel::reqd_sub_group_size(
+               ReqSubGroupSize)]] {
+             const size_t GlobalLinearID = id.get_global_linear_id();
+
+             auto Group = [&]() {
+               if constexpr (UseGroup == UseGroupT::SubGroup)
+                 return id.get_sub_group();
+               else
+                 return id.get_group();
+             }();
+
+             // Each sub-group should use its own part of the scratch pad
+             const size_t ScratchShiftDefault =
+                 UseGroup == UseGroupT::SubGroup
+                     ? id.get_sub_group().get_group_linear_id() *
+                           LocalMemorySizeDefault
+                     : 0;
+             std::byte *ScratchPtrDefault =
+                 &ScratchDefault[0] + ScratchShiftDefault;
+
+             if constexpr (std::is_same_v<Compare, std::less<KeyTy>>)
+               std::tie(AccKeysToSort0[GlobalLinearID],
+                        AccDataToSort0[GlobalLinearID]) =
+                   oneapi_exp::sort_key_value_over_group(
+                       oneapi_exp::group_with_scratchpad(
+                           Group, sycl::span{ScratchPtrDefault,
+                                             LocalMemorySizeDefault}),
+                       AccKeysToSort0[GlobalLinearID],
+                       AccDataToSort0[GlobalLinearID]); // (4)
+
+             std::tie(AccKeysToSort1[GlobalLinearID],
+                      AccDataToSort1[GlobalLinearID]) =
+                 oneapi_exp::sort_key_value_over_group(
+                     oneapi_exp::group_with_scratchpad(
+                         Group,
+                         sycl::span{ScratchPtrDefault, LocalMemorySizeDefault}),
+                     AccKeysToSort1[GlobalLinearID],
+                     AccDataToSort1[GlobalLinearID], Comp); // (5)
+
+             std::tie(AccKeysToSort2[GlobalLinearID],
+                      AccDataToSort2[GlobalLinearID]) =
+                 oneapi_exp::sort_key_value_over_group(
+                     Group, AccKeysToSort2[GlobalLinearID],
+                     AccDataToSort2[GlobalLinearID],
+                     oneapi_exp::default_sorters::group_key_value_sorter<
+                         KeyTy, ValueTy, Compare, /*ElementsPerWorkItem*/ 1>(
+                         sycl::span{ScratchPtrDefault,
+                                    LocalMemorySizeDefault})); // (6)
+
+             // Each sub-group should use its own part of the scratch pad
+             const size_t ScratchShiftRadix =
+                 UseGroup == UseGroupT::SubGroup
+                     ? id.get_sub_group().get_group_linear_id() *
+                           LocalMemorySizeRadix
+                     : 0;
+             std::byte *ScratchPtrRadix = &ScratchRadix[0] + ScratchShiftRadix;
+
+             // Radix doesn't support custom types
+             if constexpr (!std::is_same_v<CustomType, KeyTy>)
+               std::tie(AccKeysToSort3[GlobalLinearID],
+                        AccDataToSort3[GlobalLinearID]) =
+                   oneapi_exp::sort_key_value_over_group(
+                       Group, AccKeysToSort3[GlobalLinearID],
+                       AccDataToSort3[GlobalLinearID],
+                       RadixSorterT(
+                           sycl::span{ScratchPtrRadix,
+                                      LocalMemorySizeRadix})); // (6) radix
+           };
+
+       CGH.parallel_for<
+           KernelNameOverGroup<IntWrapper<Dims>, UseGroupWrapper<UseGroup>,
+                               KeyTy, ValueTy, Compare>>(NDRange,
+                                                         KeyValueSortKernel);
+     }).wait_and_throw();
+  }
+
+  // Verification
+  {
+    std::vector<std::pair<KeyTy, ValueTy>> KeyDataToSort;
+    KeyDataToSort.reserve(KeysToSort.size());
+    std::transform(
+        KeysToSort.begin(), KeysToSort.end(), DataToSort.begin(),
+        std::back_inserter(KeyDataToSort),
+        [](KeyTy Key, ValueTy Value) { return std::make_pair(Key, Value); });
+    // Emulate independent sorting of each work-group/sub-group
+    const size_t ChunkSize = UseGroup == UseGroupT::SubGroup
+                                 ? ReqSubGroupSize
+                                 : NDRange.get_local_range().size();
+    auto It = KeyDataToSort.begin();
+    auto KeyValueComp = [&](const std::pair<KeyTy, ValueTy> &A,
+                            const std::pair<KeyTy, ValueTy> &B) -> bool {
+      return Comp(A.first, B.first);
+    };
+    for (; (It + ChunkSize) < KeyDataToSort.end(); It += ChunkSize)
+      std::stable_sort(It, It + ChunkSize, KeyValueComp);
+
+    // Sort remainder
+    std::stable_sort(It, KeyDataToSort.end(), KeyValueComp);
+
+    std::vector<KeyTy> KeysSorted;
+    std::vector<ValueTy> DataSorted;
+    KeysSorted.reserve(KeyDataToSort.size());
+    DataSorted.reserve(KeyDataToSort.size());
+    std::transform(KeyDataToSort.begin(), KeyDataToSort.end(),
+                   std::back_inserter(KeysSorted),
+                   [](const std::pair<KeyTy, ValueTy> &KeyValue) {
+                     return KeyValue.first;
+                   });
+    std::transform(KeyDataToSort.begin(), KeyDataToSort.end(),
+                   std::back_inserter(DataSorted),
+                   [](const std::pair<KeyTy, ValueTy> &KeyValue) {
+                     return KeyValue.second;
+                   });
+
+    if constexpr (std::is_same_v<Compare, std::less<KeyTy>>) {
+      assert(KeysToSortCase0 == KeysSorted);
+      assert(DataToSortCase0 == DataSorted);
+    }
+
+    assert(KeysToSortCase1 == KeysSorted);
+    assert(DataToSortCase1 == DataSorted);
+    assert(KeysToSortCase2 == KeysSorted);
+    assert(DataToSortCase2 == DataSorted);
+    if constexpr (!std::is_same_v<CustomType, KeyTy>) {
+      assert(KeysToSortCase3 == KeysSorted);
+      assert(DataToSortCase3 == DataSorted);
+    }
+  }
+}
+
+template <class KeyTy, class ValueTy>
+void RunOverType(sycl::queue &Q, size_t DataSize) {
+  std::vector<KeyTy> KeysRandom(DataSize);
+  std::vector<ValueTy> DataRandom(DataSize);
+
+  // Fill keys, then values, from one default-seeded normal(10, 2) generator
+  {
+    std::default_random_engine generator;
+    std::normal_distribution<float> distribution((10.0), (2.0));
+    for (KeyTy &Elem : KeysRandom)
+      Elem = KeyTy(distribution(generator));
+
+    for (ValueTy &Elem : DataRandom)
+      Elem = ValueTy(distribution(generator));
+  }
+  // Runs every group kind / dimensionality combination for one comparator.
+  auto RunOnDataAndComp = [&](const std::vector<KeyTy> &Keys,
+                              const std::vector<ValueTy> &Data,
+                              const auto &Comparator) {
+    RunSortKeyValueOverGroup<UseGroupT::WorkGroup, 1>(Q, Data, Keys,
+                                                      Comparator);
+    RunSortKeyValueOverGroup<UseGroupT::WorkGroup, 2>(Q, Data, Keys,
+                                                      Comparator);
+    // The sub-group variants below are skipped on the CUDA and HIP backends.
+    if (Q.get_backend() == sycl::backend::ext_oneapi_cuda ||
+        Q.get_backend() == sycl::backend::ext_oneapi_hip) {
+      std::cout << "Note! Skipping sub group testing on CUDA/HIP" << std::endl;
+      return;
+    }
+
+    RunSortKeyValueOverGroup<UseGroupT::SubGroup, 1>(Q, Data, Keys, Comparator);
+    RunSortKeyValueOverGroup<UseGroupT::SubGroup, 2>(Q, Data, Keys, Comparator);
+  };
+  // Same input is sorted with both an ascending and a descending comparator.
+  RunOnDataAndComp(KeysRandom, DataRandom, std::less<KeyTy>{});
+  RunOnDataAndComp(KeysRandom, DataRandom, std::greater<KeyTy>{});
+}
+
+int main() {
+  try {
+    sycl::queue Q;
+    // Input sizes to test; every key/value type pairing runs for each size.
+    const std::vector<size_t> Sizes{18, 64};
+
+    for (size_t Size : Sizes) {
+      RunOverType<std::int32_t, char>(Q, Size);
+      RunOverType<char, std::int32_t>(Q, Size);
+      if (Q.get_device().has(sycl::aspect::fp16))
+        RunOverType<sycl::half, std::int32_t>(Q, Size);
+      if (Q.get_device().has(sycl::aspect::fp64))
+        RunOverType<double, char>(Q, Size);
+      RunOverType<CustomType, std::int32_t>(Q, Size);
+    }
+
+    std::cout << "Test passed." << std::endl;
+    return 0;
+  } catch (const std::exception &E) { // caught by const ref; only what() is read
+    std::cout << "Test failed" << std::endl;
+    std::cout << E.what() << std::endl;
+    return 1;
+  }
+}
diff --git a/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp b/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp
index cf9bb8cf5c850..ed705979ac4fe 100644
--- a/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/SYCL2020/reduce_over_group_size.cpp
@@ -1,8 +1,6 @@
-// Test hangs on AMD with https://github.com/intel/llvm/pull/8412
-// UNSUPPORTED: hip_amd
-
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
+// REQUIRES: aspect-usm_shared_allocations
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/GroupAlgorithm/root_group.cpp b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
index e33e2cc2dfd31..983f8e7ca003a 100644
--- a/sycl/test-e2e/GroupAlgorithm/root_group.cpp
+++ b/sycl/test-e2e/GroupAlgorithm/root_group.cpp
@@ -1,5 +1,5 @@
 // Fails with opencl non-cpu, enable when fixed.
-// XFAIL: (opencl && !cpu)
+// XFAIL: (opencl && !cpu && !accelerator)
 // RUN: %{build} -I . -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp b/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp
index d9677e1b93f91..6f19d9f3cdf2a 100644
--- a/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp
+++ b/sycl/test-e2e/HierPar/hier_par_wgscope_O0.cpp
@@ -5,10 +5,6 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-
-// Test hangs on AMD with https://github.com/intel/llvm/pull/8412
-// UNSUPPORTED: hip_amd
-
 // RUN: %{build} -O0 -o %t.out
 
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/HostInteropTask/host-task-dependency2.cpp b/sycl/test-e2e/HostInteropTask/host-task-dependency2.cpp
index 996a74ceabc0a..c6752de3923df 100644
--- a/sycl/test-e2e/HostInteropTask/host-task-dependency2.cpp
+++ b/sycl/test-e2e/HostInteropTask/host-task-dependency2.cpp
@@ -36,9 +36,9 @@ void test(queue &Q, size_t Count) {
     event E1 = Q.submit([&](handler &CGH) {
       std::cout << "Submit 1" << std::endl;
 
-      auto Acc0 = B0.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc1 = B1.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc2 = B2.get_access<mode::read_write, target::host_buffer>(CGH);
+      auto Acc0 = B0.get_host_access(CGH);
+      auto Acc1 = B1.get_host_access(CGH);
+      auto Acc2 = B2.get_host_access(CGH);
 
       auto Func = [=] {
         Acc0[0] = 1 * Idx;
@@ -53,8 +53,8 @@ void test(queue &Q, size_t Count) {
     event E2 = Q.submit([&](handler &CGH) {
       std::cout << "Submit 2" << std::endl;
 
-      auto Acc2 = B2.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc3 = B3.get_access<mode::read_write, target::host_buffer>(CGH);
+      auto Acc2 = B2.get_host_access(CGH);
+      auto Acc3 = B3.get_host_access(CGH);
 
       auto Func = [=] {
         Acc2[1] = 1 * Idx;
@@ -71,8 +71,8 @@ void test(queue &Q, size_t Count) {
 
       std::cout << "Submit 3" << std::endl;
 
-      auto Acc4 = B4.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc5 = B5.get_access<mode::read_write, target::host_buffer>(CGH);
+      auto Acc4 = B4.get_host_access(CGH);
+      auto Acc5 = B5.get_host_access(CGH);
 
       auto Func = [=] {
         Acc4[2] = 1 * Idx;
diff --git a/sycl/test-e2e/HostInteropTask/host-task-dependency3.cpp b/sycl/test-e2e/HostInteropTask/host-task-dependency3.cpp
index c49651c741ee8..3b4cdf1b53c36 100644
--- a/sycl/test-e2e/HostInteropTask/host-task-dependency3.cpp
+++ b/sycl/test-e2e/HostInteropTask/host-task-dependency3.cpp
@@ -44,7 +44,7 @@ void test(size_t Count) {
     Q.submit([&](handler &CGH) {
       std::cout << "Submit HT-1" << std::endl;
 
-      auto Acc0 = B0.get_access<mode::read_write, target::host_buffer>(CGH);
+      auto Acc0 = B0.get_host_access();
 
       CGH.host_task([=] {
         std::this_thread::sleep_for(SleepFor);
@@ -71,7 +71,7 @@ void test(size_t Count) {
     Q.submit([&](handler &CGH) {
       std::cout << "Submit HT-2" << std::endl;
 
-      auto Acc2 = B2.get_access<mode::read_write, target::host_buffer>(CGH);
+      auto Acc2 = B2.get_host_access();
 
       CGH.host_task([=] {
         std::this_thread::sleep_for(SleepFor);
@@ -84,9 +84,9 @@ void test(size_t Count) {
     event EHT3 = Q.submit([&](handler &CGH) {
       std::cout << "Submit HT-3" << std::endl;
 
-      auto Acc0 = B0.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc1 = B1.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc2 = B2.get_access<mode::read_write, target::host_buffer>(CGH);
+      auto Acc0 = B0.get_host_access();
+      auto Acc1 = B1.get_host_access();
+      auto Acc2 = B2.get_host_access();
 
       CGH.host_task([=] {
         std::this_thread::sleep_for(SleepFor);
@@ -104,7 +104,7 @@ void test(size_t Count) {
 
       CGH.depends_on(EHT3);
 
-      auto Acc5 = B5.get_access<mode::read_write, target::host_buffer>(CGH);
+      auto Acc5 = B5.get_host_access();
 
       CGH.host_task([=] { Acc5[5] = 1 * Idx; });
     });
diff --git a/sycl/test-e2e/HostInteropTask/host-task.cpp b/sycl/test-e2e/HostInteropTask/host-task.cpp
index 392f8c0972cd7..8dda4a1daaeb3 100644
--- a/sycl/test-e2e/HostInteropTask/host-task.cpp
+++ b/sycl/test-e2e/HostInteropTask/host-task.cpp
@@ -98,16 +98,16 @@ void test3(queue &Q) {
 
       std::cout << "Submit: " << Idx << std::endl;
 
-      auto Acc0 = B0.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc1 = B1.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc2 = B2.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc3 = B3.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc4 = B4.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc5 = B5.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc6 = B6.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc7 = B7.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc8 = B8.get_access<mode::read_write, target::host_buffer>(CGH);
-      auto Acc9 = B9.get_access<mode::read_write, target::host_buffer>(CGH);
+      auto Acc0 = B0.get_host_access();
+      auto Acc1 = B1.get_host_access();
+      auto Acc2 = B2.get_host_access();
+      auto Acc3 = B3.get_host_access();
+      auto Acc4 = B4.get_host_access();
+      auto Acc5 = B5.get_host_access();
+      auto Acc6 = B6.get_host_access();
+      auto Acc7 = B7.get_host_access();
+      auto Acc8 = B8.get_host_access();
+      auto Acc9 = B9.get_host_access();
 
       auto Func = [=] {
         uint64_t X = 0;
diff --git a/sycl/test-e2e/HostInteropTask/interop-task-cuda-buffer-migrate.cpp b/sycl/test-e2e/HostInteropTask/interop-task-cuda-buffer-migrate.cpp
new file mode 100644
index 0000000000000..b577fe16d458d
--- /dev/null
+++ b/sycl/test-e2e/HostInteropTask/interop-task-cuda-buffer-migrate.cpp
@@ -0,0 +1,68 @@
+// REQUIRES: cuda
+// XFAIL: cuda
+//
+// FIXME: this is broken with a multi device context
+//
+// RUN: %{build} -o %t.out -lcuda
+// RUN: %{run} %t.out
+//
+// Test for buffer use in a context with multiple devices (all found
+// root-devices)
+//
+// Make sure that memory migration works for buffers across devices in a context
+// when using host tasks.
+//
+
+#include <cuda.h>
+#include <iostream>
+#include <sycl/backend.hpp>
+#include <sycl/detail/core.hpp>
+#include <sycl/detail/host_task_impl.hpp>
+
+using namespace sycl;
+
+int main() {
+  // Data backs the buffer; Result is declared but not used in this function.
+  int Data = 0;
+  int Result = 0;
+  buffer<int, 1> buf(&Data, range<1>(1));
+
+  const auto &Devices =
+      platform(gpu_selector_v).get_devices(info::device_type::gpu);
+  std::cout << Devices.size() << " devices found" << std::endl;
+
+  if (Devices.size() == 1) {
+    // Since this is XFAIL for Devices.size() > 1 we need to return failure if
+    // test can't run
+    return 1;
+  }
+
+  context C(Devices);
+
+  int Index = 0;
+  for (auto D : Devices) {
+    std::cout << "Using on device " << Index << ": "
+              << D.get_info<info::device::name>() << std::endl;
+    // Host task bumps the value with raw CUDA copies (results unchecked).
+    queue Q(C, D);
+    Q.submit([&](handler &cgh) {
+      accessor acc{buf, cgh, read_write};
+      cgh.host_task([=](interop_handle ih) {
+        auto ptr = ih.get_native_mem<backend::ext_oneapi_cuda>(acc);
+        int tmp = 0;
+        cuMemcpyDtoH(&tmp, ptr, sizeof(int));
+        tmp++;
+        cuMemcpyHtoD(ptr, &tmp, sizeof(int));
+      });
+    });
+    Q.wait();
+    ++Index;
+  }
+  // One increment per device, so the expected final value is Index.
+  auto host_acc = buf.get_host_access();
+  auto passed = (host_acc[0] == Index);
+  std::cout << "Checking result on host: " << (passed ? "passed" : "FAILED")
+            << std::endl;
+  std::cout << host_acc[0] << " ?= " << Index << std::endl;
+  return !passed;
+}
diff --git a/sycl/test-e2e/HostInteropTask/interop-task-cuda.cpp b/sycl/test-e2e/HostInteropTask/interop-task-cuda.cpp
new file mode 100644
index 0000000000000..503fa4a51067e
--- /dev/null
+++ b/sycl/test-e2e/HostInteropTask/interop-task-cuda.cpp
@@ -0,0 +1,128 @@
+// RUN: %{build} -o %t.out -lcuda
+// RUN: %{run} %t.out
+// REQUIRES: cuda
+
+#include <iostream>
+#include <sycl/backend.hpp>
+#include <sycl/detail/core.hpp>
+#include <sycl/detail/host_task_impl.hpp>
+
+#include <cuda.h>
+
+using namespace sycl;
+using namespace sycl::access;
+
+static constexpr size_t BUFFER_SIZE = 1024;
+
+template <typename T> class Modifier;
+
+template <typename T> class Init;
+
+template <typename BufferT, typename ValueT>
+void checkBufferValues(BufferT Buffer, ValueT Value) {
+  auto Acc = Buffer.get_host_access();
+  for (size_t Idx = 0; Idx < Acc.size(); ++Idx) {
+    if (Acc[Idx] != Value) {
+      std::cerr << "buffer[" << Idx << "] = " << Acc[Idx]
+                << ", expected val = " << Value << '\n';
+      exit(1);
+    }
+  }
+}
+
+template <typename DataT>
+void copy(buffer<DataT, 1> &Src, buffer<DataT, 1> &Dst, queue &Q) {
+  Q.submit([&](handler &CGH) {
+    auto SrcA = Src.template get_access<mode::read>(CGH);
+    auto DstA = Dst.template get_access<mode::write>(CGH);
+    // The host task below copies Src to Dst through the native CUDA handles.
+    auto Func = [=](interop_handle IH) {
+      auto Stream = IH.get_native_queue<backend::ext_oneapi_cuda>();
+      auto SrcMem = IH.get_native_mem<backend::ext_oneapi_cuda>(SrcA);
+      auto DstMem = IH.get_native_mem<backend::ext_oneapi_cuda>(DstA);
+
+      if (cuMemcpyAsync(DstMem, SrcMem, sizeof(DataT) * SrcA.size(), Stream) !=
+          CUDA_SUCCESS) {
+        throw; // NOTE: bare rethrow; with no exception in flight it terminates
+      }
+      // Block until the asynchronous copy on the native stream has finished.
+      if (cuStreamSynchronize(Stream) != CUDA_SUCCESS) {
+        throw; // same bare rethrow as above
+      }
+      // The interop handle must report the same backend as the queue.
+      if (Q.get_backend() != IH.get_backend())
+        throw; // same bare rethrow as above
+    };
+    CGH.host_task(Func);
+  });
+}
+
+template <typename DataT> void modify(buffer<DataT, 1> &B, queue &Q) {
+  Q.submit([&](handler &CGH) {
+    auto Acc = B.template get_access<mode::read_write>(CGH);
+
+    auto Kernel = [=](item<1> Id) { Acc[Id] += 1; };
+
+    CGH.parallel_for<Modifier<DataT>>(Acc.size(), Kernel);
+  });
+}
+
+template <typename DataT, DataT B1Init, DataT B2Init>
+void init(buffer<DataT, 1> &B1, buffer<DataT, 1> &B2, queue &Q) {
+  Q.submit([&](handler &CGH) {
+    auto Acc1 = B1.template get_access<mode::write>(CGH);
+    auto Acc2 = B2.template get_access<mode::write>(CGH);
+
+    CGH.parallel_for<Init<DataT>>(BUFFER_SIZE, [=](item<1> Id) {
+      Acc1[Id] = B1Init;
+      Acc2[Id] = B2Init;
+    });
+  });
+}
+
+// Check that a single host-interop-task with a buffer will work.
+void test_ht_buffer(queue &Q) {
+  buffer<int, 1> Buffer{BUFFER_SIZE};
+
+  Q.submit([&](handler &CGH) {
+    auto Acc = Buffer.get_access<mode::write>(CGH);
+    auto Func = [=](interop_handle IH) { /*A no-op */ };
+    CGH.host_task(Func);
+  });
+}
+
+// A test that uses CUDA interop to copy data from buffer A to buffer B, by
+// getting CUDA ptrs and calling cuMemcpyAsync. Then run a SYCL kernel that
+// modifies the data in place for B (incrementing each element by one), then
+// copy back to buffer A. Run it in a loop to check that the dependencies are
+// honoured and that the reference counting of the objects is not leaked.
+void test_ht_kernel_dependencies(queue &Q) {
+  static constexpr int COUNT = 4;
+  buffer<int, 1> Buffer1{BUFFER_SIZE};
+  buffer<int, 1> Buffer2{BUFFER_SIZE};
+
+  // Init the buffers with a priori invalid data (-1 and -2).
+  init<int, -1, -2>(Buffer1, Buffer2, Q);
+
+  // Repeat a couple of times.
+  for (size_t Idx = 0; Idx < COUNT; ++Idx) {
+    copy(Buffer1, Buffer2, Q);
+    modify(Buffer2, Q);
+    copy(Buffer2, Buffer1, Q);
+  }
+  // Starting from -1, COUNT increments leave COUNT - 1 in both buffers.
+  checkBufferValues(Buffer1, COUNT - 1);
+  checkBufferValues(Buffer2, COUNT - 1);
+}
+
+void tests(queue &Q) {
+  test_ht_buffer(Q);
+  test_ht_kernel_dependencies(Q);
+}
+
+int main() {
+  queue Q;
+  tests(Q);
+  std::cout << "Test PASSED" << std::endl;
+  return 0;
+}
diff --git a/sycl/test-e2e/InorderQueue/in_order_get_property.cpp b/sycl/test-e2e/InorderQueue/in_order_get_property.cpp
index ccbfd488db323..b501376303a27 100644
--- a/sycl/test-e2e/InorderQueue/in_order_get_property.cpp
+++ b/sycl/test-e2e/InorderQueue/in_order_get_property.cpp
@@ -24,10 +24,11 @@ int main() {
     Queue1.get_property<property::queue::in_order>();
     assert(false && "Queue1 was created without any properties therefore get "
                     "property should fail.");
-  } catch (const invalid_object_error &e) {
+  } catch (const exception &e) {
     std::string ErrorMessage = e.what();
     assert(
-        (ErrorMessage.find("The property is not found") != std::string::npos) &&
+        (e.code() == errc::invalid &&
+         ErrorMessage.find("The property is not found") != std::string::npos) &&
         "Caught unexpected error!");
   }
 
diff --git a/sycl/test-e2e/InorderQueue/in_order_kernels.cpp b/sycl/test-e2e/InorderQueue/in_order_kernels.cpp
index 2bbb3cb915bf6..daa2f553a1ef9 100644
--- a/sycl/test-e2e/InorderQueue/in_order_kernels.cpp
+++ b/sycl/test-e2e/InorderQueue/in_order_kernels.cpp
@@ -80,7 +80,7 @@ int main() {
     q.submit([&](handler &cgh) {
       cgh.parallel_for_work_group<class WkGrp>(
           range<1>{N / 2}, range<1>{2}, [=](group<1> myGroup) {
-            auto j = myGroup.get_id(0);
+            auto j = myGroup.get_group_id(0);
             myGroup.parallel_for_work_item(
                 [&](h_item<1> it) { A[(j * 2) + it.get_local_id(0)]++; });
           });
@@ -89,7 +89,7 @@ int main() {
     q.submit([&](handler &cgh) {
       cgh.parallel_for_work_group(
           range<1>{N / 2}, range<1>{2}, [=](group<1> myGroup) {
-            auto j = myGroup.get_id(0);
+            auto j = myGroup.get_group_id(0);
             myGroup.parallel_for_work_item(
                 [&](h_item<1> it) { A[(j * 2) + it.get_local_id(0)]++; });
           });
diff --git a/sycl/test-e2e/InorderQueue/in_order_usm_implicit.cpp b/sycl/test-e2e/InorderQueue/in_order_usm_implicit.cpp
index f313be5606289..8245921ce18d6 100644
--- a/sycl/test-e2e/InorderQueue/in_order_usm_implicit.cpp
+++ b/sycl/test-e2e/InorderQueue/in_order_usm_implicit.cpp
@@ -41,7 +41,7 @@ int main() {
       dataB[i] = 0;
     }
 
-    Queue.mem_advise(dataA, numBytes, (pi_mem_advice)mem_advice);
+    Queue.mem_advise(dataA, numBytes, mem_advice);
 
     Queue.submit([&](handler &cgh) {
       auto myRange = range<1>(dataSize);
diff --git a/sycl/test-e2e/IntermediateLib/Inputs/simple_lib.cpp b/sycl/test-e2e/IntermediateLib/Inputs/simple_lib.cpp
new file mode 100644
index 0000000000000..7b97e2fb8663c
--- /dev/null
+++ b/sycl/test-e2e/IntermediateLib/Inputs/simple_lib.cpp
@@ -0,0 +1,58 @@
+/*
+    // compile to static lib
+    clang++ -fsycl -c -fPIC -o simple_lib.o simple_lib.cpp
+
+    // compile to dynamic lib
+    clang++ -fsycl  -fPIC -shared -o simple_lib.so simple_lib.cpp
+
+*/
+
+#include "simple_lib.h"
+#include <sycl/detail/core.hpp>
+
+const size_t BUFF_SIZE = 1;
+
+class Delay {
+public:
+  std::shared_ptr<sycl::buffer<int, 1>> sharedBuffer;
+  // Logs the call and drops this object's reference to the buffer.
+  void release() {
+    std::cout << "Delay.release()" << std::endl;
+    sharedBuffer.reset();
+  }
+  // Creates the buffer (BUFF_SIZE ints) on first use, then returns it.
+  const sycl::buffer<int, 1> &getBuffer() {
+    if (!sharedBuffer) {
+      sharedBuffer = std::make_shared<sycl::buffer<int, 1>>(BUFF_SIZE);
+    }
+    return *sharedBuffer;
+  }
+  // Starts without a buffer; destruction goes through release().
+  Delay() : sharedBuffer(nullptr) {}
+  ~Delay() { release(); }
+};
+
+#ifdef _WIN32
+static Delay theDelay;
+Delay *MyDelay = &theDelay;
+#else
+Delay *MyDelay = new Delay;
+
+__attribute__((destructor(101))) static void Unload101() {
+  std::cout << "lib unload - __attribute__((destructor(101)))" << std::endl;
+  delete MyDelay;
+}
+#endif
+
+EXPORTDECL int add_using_device(int a, int b) {
+  sycl::queue q;
+  sycl::buffer<int, 1> buf = MyDelay->getBuffer();
+  q.submit([&](sycl::handler &cgh) {
+     sycl::accessor acc(buf, cgh, sycl::write_only);
+     // The submitted task stores a + b in element 0 of the buffer.
+     cgh.single_task([=] { acc[0] = a + b; });
+   }).wait();
+  // The submission was waited on above; read element 0 back on the host.
+  sycl::host_accessor acc(buf);
+  return acc[0];
+}
diff --git a/sycl/test-e2e/IntermediateLib/Inputs/simple_lib.h b/sycl/test-e2e/IntermediateLib/Inputs/simple_lib.h
new file mode 100644
index 0000000000000..57c0b5a334464
--- /dev/null
+++ b/sycl/test-e2e/IntermediateLib/Inputs/simple_lib.h
@@ -0,0 +1,12 @@
+#ifndef SIMPLE_SYCL_LIB_H
+#define SIMPLE_SYCL_LIB_H
+
+#ifdef _WIN32
+#define EXPORTDECL extern "C" __declspec(dllexport)
+#else
+#define EXPORTDECL extern "C"
+#endif
+
+EXPORTDECL int add_using_device(int a, int b);
+
+#endif
diff --git a/sycl/test-e2e/IntermediateLib/dynamic_app_linux.cpp b/sycl/test-e2e/IntermediateLib/dynamic_app_linux.cpp
new file mode 100644
index 0000000000000..39299fe603a68
--- /dev/null
+++ b/sycl/test-e2e/IntermediateLib/dynamic_app_linux.cpp
@@ -0,0 +1,68 @@
+// REQUIRES: level_zero && linux
+
+// build shared library
+// RUN: %clangxx -fsycl -fPIC -shared -o %T/simple_lib.so %S/Inputs/simple_lib.cpp
+
+// build app
+// RUN: %clangxx -DSO_PATH="%T/simple_lib.so" -o %t.out %s
+
+// RUN: %{run} %t.out
+// RUN: env UR_L0_LEAKS_DEBUG=1 %{run} %t.out
+
+// In these tests we are building an intermediate library which uses SYCL and an
+// app that employs that intermediate library, using both static and dynamic
+// linking, and delayed release. This is to test that release and shutdown are
+// working correctly.
+
+/*
+    //library
+    clang++ -fsycl  -fPIC -shared -o simple_lib.so Inputs/simple_lib.cpp
+
+    //app
+    clang++ -DSO_PATH="simple_lib.so" -o dynamic_app.bin dynamic_app_linux.cpp
+
+    UR_L0_LEAKS_DEBUG=1 ./dynamic_app.bin
+
+*/
+
+#include "Inputs/simple_lib.h"
+#include <assert.h>
+#include <dlfcn.h>
+#include <iostream>
+
+void *handle = nullptr;
+
+__attribute__((destructor(101))) static void Unload101() {
+  std::cout << "app unload - __attribute__((destructor(101)))" << std::endl;
+  if (handle) {
+    dlclose(handle);
+    handle = nullptr;
+  }
+}
+
+#define STRINGIFY_HELPER(A) #A
+#define STRINGIFY(A) STRINGIFY_HELPER(A)
+#define SO_FNAME "" STRINGIFY(SO_PATH) ""
+
+int main() {
+  // SO_FNAME is the SO_PATH macro (passed with -D) turned into a string.
+  handle = dlopen(SO_FNAME, RTLD_NOW);
+  if (!handle) {
+    std::cout << "failed to load" << std::endl;
+    return 1;
+  }
+
+  // Function pointer to the exported function
+  int (*add_using_device)(int, int) =
+      (int (*)(int, int))dlsym(handle, "add_using_device");
+  if (!add_using_device) {
+    std::cout << "failed to get function" << std::endl;
+    return 2;
+  }
+  // The library is not closed here; Unload101() calls dlclose() on handle.
+  int result = add_using_device(3, 4);
+  std::cout << "Result: " << result << std::endl;
+  assert(result == 7);
+
+  return 0;
+}
\ No newline at end of file
diff --git a/sycl/test-e2e/IntermediateLib/static_app.cpp b/sycl/test-e2e/IntermediateLib/static_app.cpp
new file mode 100644
index 0000000000000..b5292e317d2d7
--- /dev/null
+++ b/sycl/test-e2e/IntermediateLib/static_app.cpp
@@ -0,0 +1,37 @@
+// REQUIRES: level_zero && linux
+
+// DEFINE: %{fPIC_flag} =  %if windows %{%} %else %{-fPIC%}
+// build static library
+// RUN: %clangxx -fsycl -c  %{fPIC_flag} -o simple_lib.o %S/Inputs/simple_lib.cpp
+
+// build app
+// RUN:  %clangxx -fsycl -o %t.out %s simple_lib.o
+
+// RUN: %{run} %t.out
+// RUN: env UR_L0_LEAKS_DEBUG=1 %{run} %t.out
+
+// In these tests we are building an intermediate library which uses SYCL and an
+// app that employs that intermediate library, using both static and dynamic
+// linking, and delayed release. This is to test that release and shutdown are
+// working correctly.
+
+/*
+    //library
+    clang++ -fsycl -c -fPIC -o simple_lib.o Inputs/simple_lib.cpp
+
+    //app
+    clang++ -fsycl -o static_app.bin static_app.cpp simple_lib.o
+
+    UR_L0_LEAKS_DEBUG=1 ./static_app.bin
+
+*/
+
+#include "Inputs/simple_lib.h"
+#include <assert.h>
+#include <iostream>
+
+int main() {
+  int result = add_using_device(3, 4);
+  std::cout << "result: " << result << std::endl;
+  assert(result == 7);
+}
diff --git a/sycl/test-e2e/InvokeSimd/Feature/popcnt_emu.cpp b/sycl/test-e2e/InvokeSimd/Feature/popcnt_emu.cpp
index 692dea0905f75..daee9ffd8df99 100644
--- a/sycl/test-e2e/InvokeSimd/Feature/popcnt_emu.cpp
+++ b/sycl/test-e2e/InvokeSimd/Feature/popcnt_emu.cpp
@@ -126,7 +126,9 @@ int main(void) {
             } else {
               res = id % 2;
             }
-            sg.store(out_accessor.get_pointer() + offset, res);
+            sg.store(out_accessor.get_multi_ptr<access::decorated::yes>() +
+                         offset,
+                     res);
           });
     });
     e.wait();
diff --git a/sycl/test-e2e/InvokeSimd/Feature/scale.cpp b/sycl/test-e2e/InvokeSimd/Feature/scale.cpp
index b3750d69498a4..c122b96f0612d 100644
--- a/sycl/test-e2e/InvokeSimd/Feature/scale.cpp
+++ b/sycl/test-e2e/InvokeSimd/Feature/scale.cpp
@@ -101,9 +101,14 @@ template <class T, class QueueTY> bool test(QueueTY q) {
             unsigned int offset = g.get_group_id() * g.get_local_range() +
                                   sg.get_group_id() * sg.get_max_local_range();
 
-            T va = sg.load(acca.get_pointer() + offset);
+            T va = sg.load(
+                acca.template get_multi_ptr<access::decorated::yes>().get() +
+                offset);
             T vc = invoke_simd(sg, SIMD_CALLEE_scale<T>, va, uniform{n});
-            sg.store(accc.get_pointer() + offset, vc);
+            sg.store(
+                accc.template get_multi_ptr<access::decorated::yes>().get() +
+                    offset,
+                vc);
           });
     });
     e.wait();
diff --git a/sycl/test-e2e/InvokeSimd/Feature/void_retval.cpp b/sycl/test-e2e/InvokeSimd/Feature/void_retval.cpp
index a626165040cf7..a512834d358b9 100644
--- a/sycl/test-e2e/InvokeSimd/Feature/void_retval.cpp
+++ b/sycl/test-e2e/InvokeSimd/Feature/void_retval.cpp
@@ -102,8 +102,10 @@ int main(void) {
 
             unsigned int offset = g.get_group_id() * g.get_local_range() +
                                   sg.get_group_id() * sg.get_max_local_range();
-            float va = sg.load(PA.get_pointer() + offset);
-            float vb = sg.load(PB.get_pointer() + offset);
+            float va = sg.load(
+                PA.get_multi_ptr<access::decorated::yes>().get() + offset);
+            float vb = sg.load(
+                PB.get_multi_ptr<access::decorated::yes>().get() + offset);
             // We need to get a pointer to the starting address of where the
             // result of the vector addition should be stored in/written back to
             // C. Returns the index (ordinal number) of the work-group to which
@@ -112,7 +114,8 @@ int main(void) {
             // absolute starting index of the work-group in the ND-range to
             // which the current work-item belongs.
             int group_offset = g.get_group_linear_id() * VL;
-            float *pvc = PC.get_pointer() + group_offset;
+            float *pvc =
+                PC.get_multi_ptr<access::decorated::yes>().get() + group_offset;
 
             // Invoke SIMD function:
             // va values from each work-item are combined into a simd<float,
diff --git a/sycl/test-e2e/InvokeSimd/Regression/ImplicitSubgroup/call_vadd_1d_loop_naive.cpp b/sycl/test-e2e/InvokeSimd/Regression/ImplicitSubgroup/call_vadd_1d_loop_naive.cpp
index 67e9bb239b9c6..f1373ae0ec512 100644
--- a/sycl/test-e2e/InvokeSimd/Regression/ImplicitSubgroup/call_vadd_1d_loop_naive.cpp
+++ b/sycl/test-e2e/InvokeSimd/Regression/ImplicitSubgroup/call_vadd_1d_loop_naive.cpp
@@ -1,5 +1,3 @@
-// https://github.com/intel/llvm/issues/10369
-// UNSUPPORTED: gpu
 //
 // Check that full compilation works:
 // RUN: %clangxx -DIMPL_SUBGROUP -fsycl -fno-sycl-device-code-split-esimd -Xclang -fsycl-allow-func-ptr %S/../call_vadd_1d_loop_naive.cpp -o %t.out
diff --git a/sycl/test-e2e/InvokeSimd/Regression/address_space_cast.cpp b/sycl/test-e2e/InvokeSimd/Regression/address_space_cast.cpp
index 5599bba0bb93c..3890c41037fb9 100644
--- a/sycl/test-e2e/InvokeSimd/Regression/address_space_cast.cpp
+++ b/sycl/test-e2e/InvokeSimd/Regression/address_space_cast.cpp
@@ -28,7 +28,7 @@ ESIMD_CALLEE(float *A, esimd::simd<float, VL> b, int i) SYCL_ESIMD_FUNCTION {
   global_ptr<float, access::decorated::yes> ptr =
       sycl::address_space_cast<access::address_space::global_space,
                                access::decorated::yes, float>(A);
-  a.copy_from(ptr + i);
+  a.copy_from(ptr.get() + i);
   return a + b;
 }
 
@@ -68,8 +68,8 @@ bool test() {
           Range, [=](nd_item<1> ndi) [[intel::reqd_sub_group_size(VL)]] {
             sub_group sg = ndi.get_sub_group();
             group<1> g = ndi.get_group();
-            uint32_t i =
-                sg.get_group_linear_id() * VL + g.get_linear_id() * GroupSize;
+            uint32_t i = sg.get_group_linear_id() * VL +
+                         g.get_group_linear_id() * GroupSize;
             uint32_t wi_id = i + sg.get_local_id();
             float res = 0;
 
diff --git a/sycl/test-e2e/InvokeSimd/Regression/call_vadd_1d_spill.cpp b/sycl/test-e2e/InvokeSimd/Regression/call_vadd_1d_spill.cpp
index 6e303bf8a3729..d4b2aaf816c55 100644
--- a/sycl/test-e2e/InvokeSimd/Regression/call_vadd_1d_spill.cpp
+++ b/sycl/test-e2e/InvokeSimd/Regression/call_vadd_1d_spill.cpp
@@ -108,14 +108,22 @@ bool test(QueueTY q, float *A, float *B, float *C, float *P, float *Q, float *R,
 
             unsigned int offset = g.get_group_id() * g.get_local_range() +
                                   sg.get_group_id() * sg.get_max_local_range();
-            float va = sg.load(PA.get_pointer() + offset);
-            float vb = sg.load(PB.get_pointer() + offset);
-            float vp = sg.load(PP.get_pointer() + offset);
-            float vq = sg.load(PQ.get_pointer() + offset);
-            float vr = sg.load(PR.get_pointer() + offset);
-            float vx = sg.load(PX.get_pointer() + offset);
-            float vy = sg.load(PY.get_pointer() + offset);
-            float vz = sg.load(PZ.get_pointer() + offset);
+            float va = sg.load(
+                PA.get_multi_ptr<access::decorated::yes>().get() + offset);
+            float vb = sg.load(
+                PB.get_multi_ptr<access::decorated::yes>().get() + offset);
+            float vp = sg.load(
+                PP.get_multi_ptr<access::decorated::yes>().get() + offset);
+            float vq = sg.load(
+                PQ.get_multi_ptr<access::decorated::yes>().get() + offset);
+            float vr = sg.load(
+                PR.get_multi_ptr<access::decorated::yes>().get() + offset);
+            float vx = sg.load(
+                PX.get_multi_ptr<access::decorated::yes>().get() + offset);
+            float vy = sg.load(
+                PY.get_multi_ptr<access::decorated::yes>().get() + offset);
+            float vz = sg.load(
+                PZ.get_multi_ptr<access::decorated::yes>().get() + offset);
 
             float vc;
 
@@ -126,7 +134,8 @@ bool test(QueueTY q, float *A, float *B, float *C, float *P, float *Q, float *R,
               vc = SPMD_CALLEE_doVadd(va, vb, vx, vy, vx, vy, vx, vy, vx, vy,
                                       vp, vq, vr, vz);
             }
-            sg.store(PC.get_pointer() + offset, vc);
+            sg.store(PC.get_multi_ptr<access::decorated::yes>().get() + offset,
+                     vc);
           });
     });
     e.wait();
diff --git a/sycl/test-e2e/InvokeSimd/Regression/debug_symbols.cpp b/sycl/test-e2e/InvokeSimd/Regression/debug_symbols.cpp
index 14b20ac8dc6d6..a28787ebd1d7d 100644
--- a/sycl/test-e2e/InvokeSimd/Regression/debug_symbols.cpp
+++ b/sycl/test-e2e/InvokeSimd/Regression/debug_symbols.cpp
@@ -84,15 +84,18 @@ int main(void) {
 
             unsigned int offset = g.get_group_id() * g.get_local_range() +
                                   sg.get_group_id() * sg.get_max_local_range();
-            float va = sg.load(PA.get_pointer() + offset);
-            float vb = sg.load(PB.get_pointer() + offset);
+            float va = sg.load(
+                PA.get_multi_ptr<access::decorated::yes>().get() + offset);
+            float vb = sg.load(
+                PB.get_multi_ptr<access::decorated::yes>().get() + offset);
 
             // Invoke SIMD function:
             // va values from each work-item are combined into a simd<float,
             // VL>. vb values from each work-item are combined into a
             // simd<float, VL>.
             float vc = invoke_simd(sg, SIMD_CALLEE_doVadd, va, vb);
-            sg.store(PC.get_pointer() + offset, vc);
+            sg.store(PC.get_multi_ptr<access::decorated::yes>().get() + offset,
+                     vc);
           });
     });
     e.wait();
diff --git a/sycl/test-e2e/InvokeSimd/Regression/matrix_multiply_accessor_get_pointer.cpp b/sycl/test-e2e/InvokeSimd/Regression/matrix_multiply_accessor_get_pointer.cpp
index d25b7efedac34..b1187b1f90821 100644
--- a/sycl/test-e2e/InvokeSimd/Regression/matrix_multiply_accessor_get_pointer.cpp
+++ b/sycl/test-e2e/InvokeSimd/Regression/matrix_multiply_accessor_get_pointer.cpp
@@ -150,8 +150,9 @@ int main(void) {
               if constexpr (use_invoke_simd) {
                 int res = invoke_simd(
                     ndi.get_sub_group(), SIMD_CALLEE_computeDotProducts,
-                    row_index, column_index, uniform{acc_a.get_pointer()},
-                    uniform{acc_b.get_pointer()});
+                    row_index, column_index,
+                    uniform{acc_a.get_multi_ptr<access::decorated::yes>()},
+                    uniform{acc_b.get_multi_ptr<access::decorated::yes>()});
                 acc_c[row_index][column_index] = res;
               } else {
                 for (int k = 0; k < N; ++k) {
diff --git a/sycl/test-e2e/InvokeSimd/Regression/slm_gather_scatter.cpp b/sycl/test-e2e/InvokeSimd/Regression/slm_gather_scatter.cpp
index 66a2ba9e92cae..f4cc97d3c34b6 100644
--- a/sycl/test-e2e/InvokeSimd/Regression/slm_gather_scatter.cpp
+++ b/sycl/test-e2e/InvokeSimd/Regression/slm_gather_scatter.cpp
@@ -13,7 +13,7 @@
  * Test check basic support of local memory access in invoke_simd.
  */
 
-#include "../invoke_simd_utils.hpp"
+#include "../../ESIMD/esimd_test_utils.hpp"
 
 #include <sycl/detail/core.hpp>
 #include <sycl/ext/intel/esimd.hpp>
@@ -52,8 +52,8 @@ ESIMD_INLINE void slm_load_store_test(
     dtype *C, esimd::simd<uint32_t, VL> GlobalByteOffsets) SYCL_ESIMD_FUNCTION {
 
   uint32_t LocalAccOffset =
-      static_cast<uint32_t>(
-          reinterpret_cast<std::uintptr_t>(LocalAcc.get_pointer().get())) +
+      static_cast<uint32_t>(reinterpret_cast<std::uintptr_t>(
+          LocalAcc.get_multi_ptr<access::decorated::yes>().get())) +
       LAByteOffset;
   esimd::simd<uint32_t, VL> Offsets(LocalAccOffset, sizeof(dtype));
   auto Local1 = esimd::slm_gather<dtype, VL>(Offsets);
diff --git a/sycl/test-e2e/InvokeSimd/Regression/slm_load_store.cpp b/sycl/test-e2e/InvokeSimd/Regression/slm_load_store.cpp
index 20409797bcdc2..c5c92ff6ac4fc 100644
--- a/sycl/test-e2e/InvokeSimd/Regression/slm_load_store.cpp
+++ b/sycl/test-e2e/InvokeSimd/Regression/slm_load_store.cpp
@@ -13,7 +13,7 @@
  * Test check basic support of local memory access in invoke_simd.
  */
 
-#include "../invoke_simd_utils.hpp"
+#include "../../ESIMD/esimd_test_utils.hpp"
 
 #include <sycl/detail/core.hpp>
 #include <sycl/ext/intel/esimd.hpp>
@@ -52,8 +52,8 @@ ESIMD_INLINE void slm_load_store_test(
     dtype *C, esimd::simd<uint32_t, VL> GlobalByteOffsets) SYCL_ESIMD_FUNCTION {
 
   uint32_t LocalAccOffset =
-      static_cast<uint32_t>(
-          reinterpret_cast<std::uintptr_t>(LocalAcc.get_pointer().get())) +
+      static_cast<uint32_t>(reinterpret_cast<std::uintptr_t>(
+          LocalAcc.get_multi_ptr<access::decorated::yes>().get())) +
       LAByteOffset;
   auto Local1 = esimd::slm_block_load<dtype, VL>(LocalAccOffset);
   auto Local2 = esimd::slm_block_load<dtype, VL>(LocalAccOffset +
diff --git a/sycl/test-e2e/InvokeSimd/Spec/ESIMD_to_unmarked_function.cpp b/sycl/test-e2e/InvokeSimd/Spec/ESIMD_to_unmarked_function.cpp
index b92295d4249b8..992427bcbbf94 100644
--- a/sycl/test-e2e/InvokeSimd/Spec/ESIMD_to_unmarked_function.cpp
+++ b/sycl/test-e2e/InvokeSimd/Spec/ESIMD_to_unmarked_function.cpp
@@ -118,8 +118,10 @@ int main(void) {
 
             unsigned int offset = g.get_group_id() * g.get_local_range() +
                                   sg.get_group_id() * sg.get_max_local_range();
-            float va = sg.load(PA.get_pointer() + offset);
-            float vb = sg.load(PB.get_pointer() + offset);
+            float va = sg.load(
+                PA.get_multi_ptr<access::decorated::yes>().get() + offset);
+            float vb = sg.load(
+                PB.get_multi_ptr<access::decorated::yes>().get() + offset);
             float vc;
 
             if constexpr (use_invoke_simd) {
@@ -127,7 +129,8 @@ int main(void) {
             } else {
               vc = doVadd(va, vb);
             }
-            sg.store(PC.get_pointer() + offset, vc);
+            sg.store(PC.get_multi_ptr<access::decorated::yes>().get() + offset,
+                     vc);
           });
     });
     e.wait();
diff --git a/sycl/test-e2e/InvokeSimd/Spec/function_overloads.cpp b/sycl/test-e2e/InvokeSimd/Spec/function_overloads.cpp
index a78c6ec5ff651..f2a997defaa6d 100644
--- a/sycl/test-e2e/InvokeSimd/Spec/function_overloads.cpp
+++ b/sycl/test-e2e/InvokeSimd/Spec/function_overloads.cpp
@@ -115,7 +115,8 @@ int main(void) {
 
             unsigned int offset = g.get_group_id() * g.get_local_range() +
                                   sg.get_group_id() * sg.get_max_local_range();
-            float va = sg.load(PA.get_pointer() + offset);
+            float va = sg.load(
+                PA.get_multi_ptr<access::decorated::yes>().get() + offset);
             float vc;
 
             // Invoke SIMD function:
@@ -128,7 +129,8 @@ int main(void) {
                   simd<float, VL>, simd<float, VL>)>(sg, SIMD_CALLEE_scale, va,
                                                      n);
 
-            sg.store(PC.get_pointer() + offset, vc);
+            sg.store(PC.get_multi_ptr<access::decorated::yes>().get() + offset,
+                     vc);
           });
     });
     e.wait();
diff --git a/sycl/test-e2e/InvokeSimd/Spec/nested_ESIMD_to_ESIMD.cpp b/sycl/test-e2e/InvokeSimd/Spec/nested_ESIMD_to_ESIMD.cpp
index 850ba834ab2d9..30681bb968e2b 100644
--- a/sycl/test-e2e/InvokeSimd/Spec/nested_ESIMD_to_ESIMD.cpp
+++ b/sycl/test-e2e/InvokeSimd/Spec/nested_ESIMD_to_ESIMD.cpp
@@ -102,8 +102,10 @@ int main(void) {
 
             unsigned int offset = g.get_group_id() * g.get_local_range() +
                                   sg.get_group_id() * sg.get_max_local_range();
-            float va = sg.load(PA.get_pointer() + offset);
-            float vb = sg.load(PB.get_pointer() + offset);
+            float va = sg.load(
+                PA.get_multi_ptr<access::decorated::yes>().get() + offset);
+            float vb = sg.load(
+                PB.get_multi_ptr<access::decorated::yes>().get() + offset);
             float vc;
 
             if constexpr (use_invoke_simd) {
@@ -111,7 +113,8 @@ int main(void) {
             } else {
               vc = SPMD_CALLEE_doVadd(va, vb);
             }
-            sg.store(PC.get_pointer() + offset, vc);
+            sg.store(PC.get_multi_ptr<access::decorated::yes>().get() + offset,
+                     vc);
           });
     });
     e.wait();
diff --git a/sycl/test-e2e/InvokeSimd/Spec/nested_SPMD_to_ESIMD.cpp b/sycl/test-e2e/InvokeSimd/Spec/nested_SPMD_to_ESIMD.cpp
index 79fbae6436ccb..7436ec702c2a0 100644
--- a/sycl/test-e2e/InvokeSimd/Spec/nested_SPMD_to_ESIMD.cpp
+++ b/sycl/test-e2e/InvokeSimd/Spec/nested_SPMD_to_ESIMD.cpp
@@ -107,8 +107,10 @@ int main(void) {
 
             unsigned int offset = g.get_group_id() * g.get_local_range() +
                                   sg.get_group_id() * sg.get_max_local_range();
-            float va = sg.load(PA.get_pointer() + offset);
-            float vb = sg.load(PB.get_pointer() + offset);
+            float va = sg.load(
+                PA.get_multi_ptr<access::decorated::yes>().get() + offset);
+            float vb = sg.load(
+                PB.get_multi_ptr<access::decorated::yes>().get() + offset);
             float vc;
 
             if constexpr (use_invoke_simd) {
@@ -116,7 +118,8 @@ int main(void) {
             } else {
               vc = SPMD_CALLEE_doVadd(va, vb);
             }
-            sg.store(PC.get_pointer() + offset, vc);
+            sg.store(PC.get_multi_ptr<access::decorated::yes>().get() + offset,
+                     vc);
           });
     });
     e.wait();
diff --git a/sycl/test-e2e/InvokeSimd/Spec/simd_size/simd8.cpp b/sycl/test-e2e/InvokeSimd/Spec/simd_size/simd8.cpp
index a6c1a3caffeda..a9f1323d862aa 100644
--- a/sycl/test-e2e/InvokeSimd/Spec/simd_size/simd8.cpp
+++ b/sycl/test-e2e/InvokeSimd/Spec/simd_size/simd8.cpp
@@ -17,7 +17,7 @@
  * This test also runs with all types of VISA link time optimizations enabled.
  */
 
-#include "../../invoke_simd_utils.hpp"
+#include "../../../ESIMD/esimd_test_utils.hpp"
 #include "Inputs/common.hpp"
 
 int main(void) {
diff --git a/sycl/test-e2e/InvokeSimd/Spec/uniform_retval.cpp b/sycl/test-e2e/InvokeSimd/Spec/uniform_retval.cpp
index 3bd3afde39b97..90f668472ed0f 100644
--- a/sycl/test-e2e/InvokeSimd/Spec/uniform_retval.cpp
+++ b/sycl/test-e2e/InvokeSimd/Spec/uniform_retval.cpp
@@ -171,7 +171,9 @@ template <class T, bool return_SIMD, class QueueTY> bool test(QueueTY q) {
             unsigned int offset = g.get_group_id() * g.get_local_range() +
                                   sg.get_group_id() * sg.get_max_local_range();
 
-            T va = sg.load(acca.get_pointer() + offset);
+            T va = sg.load(
+                acca.template get_multi_ptr<access::decorated::yes>().get() +
+                offset);
             T vc;
 
             if constexpr (return_SIMD)
@@ -181,7 +183,10 @@ template <class T, bool return_SIMD, class QueueTY> bool test(QueueTY q) {
               vc = invoke_simd(sg, SIMD_CALLEE_return_uniform_scalar<T>, va,
                                uniform{n});
 
-            sg.store(accc.get_pointer() + offset, vc);
+            sg.store(
+                accc.template get_multi_ptr<access::decorated::yes>().get() +
+                    offset,
+                vc);
           });
     });
     e.wait();
diff --git a/sycl/test-e2e/InvokeSimd/invoke_simd_conv.cpp b/sycl/test-e2e/InvokeSimd/invoke_simd_conv.cpp
index cc760f624a0ab..24bfc1e37e97e 100644
--- a/sycl/test-e2e/InvokeSimd/invoke_simd_conv.cpp
+++ b/sycl/test-e2e/InvokeSimd/invoke_simd_conv.cpp
@@ -50,22 +50,19 @@ template <class SimdElemT>
   return calc(val);
 }
 
-class ESIMDSelector : public device_selector {
-  // Require GPU device
-  virtual int operator()(const device &device) const {
-    if (const char *dev_filter = getenv("ONEAPI_DEVICE_SELECTOR")) {
-      std::string filter_string(dev_filter);
-      if (filter_string.find("gpu") != std::string::npos)
-        return device.is_gpu() ? 1000 : -1;
-      std::cerr << "Supported 'ONEAPI_DEVICE_SELECTOR' env var values is "
-                   "'*:gpu' and  '"
-                << filter_string << "' does not contain such substrings.\n";
-      return -1;
-    }
-    // If "ONEAPI_DEVICE_SELECTOR" not defined, only allow gpu device
-    return device.is_gpu() ? 1000 : -1;
+int ESIMD_selector_v(const device &device) {
+  if (const char *dev_filter = getenv("ONEAPI_DEVICE_SELECTOR")) {
+    std::string filter_string(dev_filter);
+    if (filter_string.find("gpu") != std::string::npos)
+      return device.is_gpu() ? 1000 : -1;
+    std::cerr << "Supported 'ONEAPI_DEVICE_SELECTOR' env var values is "
+                 "'*:gpu' and  '"
+              << filter_string << "' does not contain such substrings.\n";
+    return -1;
   }
-};
+  // ONEAPI_DEVICE_SELECTOR is unset: score GPUs 1000, everything else -1.
+  return device.is_gpu() ? 1000 : -1;
+}
 
 inline auto createExceptionHandler() {
   return [](exception_list l) {
@@ -150,7 +147,7 @@ template <class SpmdT, class SimdElemT, bool IsUniform> bool test(queue q) {
 }
 
 int main(void) {
-  queue q(ESIMDSelector{}, createExceptionHandler());
+  queue q(ESIMD_selector_v, createExceptionHandler());
 
   auto dev = q.get_device();
   std::cout << "Running on " << dev.get_info<sycl::info::device::name>()
diff --git a/sycl/test-e2e/InvokeSimd/invoke_simd_smoke.cpp b/sycl/test-e2e/InvokeSimd/invoke_simd_smoke.cpp
index ad349e2a7aa3d..d11dd4f5b579f 100644
--- a/sycl/test-e2e/InvokeSimd/invoke_simd_smoke.cpp
+++ b/sycl/test-e2e/InvokeSimd/invoke_simd_smoke.cpp
@@ -55,22 +55,19 @@ ESIMD_CALLEE(float *A, esimd::simd<float, VL> b, int i) SYCL_ESIMD_FUNCTION {
 
 float SPMD_CALLEE(float *A, float b, int i) { return A[i] + b; }
 
-class ESIMDSelector : public device_selector {
-  // Require GPU device
-  virtual int operator()(const device &device) const {
-    if (const char *dev_filter = getenv("ONEAPI_DEVICE_SELECTOR")) {
-      std::string filter_string(dev_filter);
-      if (filter_string.find("gpu") != std::string::npos)
-        return device.is_gpu() ? 1000 : -1;
-      std::cerr << "Supported 'ONEAPI_DEVICE_SELECTOR' env var values is "
-                   "'*:gpu' and '"
-                << filter_string << "' does not contain such substrings.\n";
-      return -1;
-    }
-    // If "ONEAPI_DEVICE_SELECTOR" not defined, only allow gpu device
-    return device.is_gpu() ? 1000 : -1;
+int ESIMD_selector_v(const device &device) {
+  if (const char *dev_filter = getenv("ONEAPI_DEVICE_SELECTOR")) {
+    std::string filter_string(dev_filter);
+    if (filter_string.find("gpu") != std::string::npos)
+      return device.is_gpu() ? 1000 : -1;
+    std::cerr << "Supported 'ONEAPI_DEVICE_SELECTOR' env var values is "
+                 "'*:gpu' and '"
+              << filter_string << "' does not contain such substrings.\n";
+    return -1;
   }
-};
+  // ONEAPI_DEVICE_SELECTOR is unset: score GPUs 1000, everything else -1.
+  return device.is_gpu() ? 1000 : -1;
+}
 
 inline auto createExceptionHandler() {
   return [](exception_list l) {
@@ -97,7 +94,7 @@ template <bool use_func_directly> bool test() {
   constexpr unsigned Size = 1024;
   constexpr unsigned GroupSize = 4 * VL;
 
-  queue q(ESIMDSelector{}, createExceptionHandler());
+  queue q(ESIMD_selector_v, createExceptionHandler());
 
   auto dev = q.get_device();
   std::cout << "Running with use_func_directly = " << use_func_directly
@@ -124,7 +121,7 @@ template <bool use_func_directly> bool test() {
         sub_group sg = ndi.get_sub_group();
         group<1> g = ndi.get_group();
         uint32_t i =
-            sg.get_group_linear_id() * VL + g.get_linear_id() * GroupSize;
+            sg.get_group_linear_id() * VL + g.get_group_linear_id() * GroupSize;
         uint32_t wi_id = i + sg.get_local_id();
         float res = 0;
 
diff --git a/sycl/test-e2e/InvokeSimd/invoke_simd_utils.hpp b/sycl/test-e2e/InvokeSimd/invoke_simd_utils.hpp
deleted file mode 100644
index e32eaf455dba3..0000000000000
--- a/sycl/test-e2e/InvokeSimd/invoke_simd_utils.hpp
+++ /dev/null
@@ -1,83 +0,0 @@
-#pragma once
-
-#include <string>
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/intel/esimd.hpp>
-
-using namespace sycl::ext::oneapi::experimental;
-using namespace sycl;
-namespace esimd = sycl::ext::intel::esimd;
-
-enum GPUDriverOS { Linux = 1, Windows = 2, LinuxAndWindows = 3 };
-
-/// This function returns true if it can detect the level-zero or opencl
-/// GPU driver and can determine that the current driver is same or newer
-/// than the one passed in \p RequiredVersion or \p WinOpenCLRequiredVersion.
-///
-/// Below are how driver versions look like:
-///   Linux/L0:       [1.3.26370]
-///   Linux/opencl:   [23.22.26370.18]
-///   Windows/L0:     [1.3.26370]
-///   Windows/opencl: [31.0.101.4502]
-///
-/// This function uses only the part of the driver identification:
-///   - the second half of the driver id on win/opencl, e.g. 101.4502";
-///   - the 5-digit id for 3 other platforms, e.g. 26370.
-///
-/// Note: For the previous & new driver version and their release dates
-/// for win/opencl see the link:
-/// https://www.intel.com/content/www/us/en/download/726609/intel-arc-iris-xe-graphics-whql-windows.html
-bool isGPUDriverGE(queue Q, GPUDriverOS OSCheck, std::string RequiredVersion,
-                   std::string WinOpenCLRequiredVersion = "") {
-  auto Dev = Q.get_device();
-  if (!Dev.is_gpu())
-    return false;
-
-  bool IsLinux = false;
-#if defined(__SYCL_RT_OS_LINUX)
-  IsLinux = true;
-#elif !defined(__SYCL_RT_OS_WINDOWS)
-  return false;
-#endif
-
-  // A and B must have digits at the same positions.
-  // Otherwise, A and B symbols must be equal, e.g. both be equal to '.'.
-  auto verifyDriverVersionFormat = [](const std::string &A,
-                                      const std::string &B) {
-    if (A.size() != B.size())
-      throw std::runtime_error(
-          "Inconsistent expected & actual driver versions");
-    for (int I = 0; I < A.size(); I++) {
-      if ((A[I] >= '0' && A[I] <= '9' && !(B[I] >= '0' && B[I] <= '9')) &&
-          A[I] != B[I])
-        throw std::runtime_error(
-            "Inconsistent expected & actual driver versions");
-    }
-  };
-
-  auto BE = Q.get_backend();
-  int Length = 5;              // extract 5 digits for 3 or 4 platforms
-  int Start = 4;               // start of the driver id for 2 of 4 platforms
-  if (BE == backend::opencl) { // opencl has less-standard versioning
-    if (IsLinux) {
-      Start = 6;
-    } else {
-      Start = 5;
-      Length = 8;
-      RequiredVersion = WinOpenCLRequiredVersion;
-    }
-  }
-
-  bool IsGE = true;
-  if (IsLinux && (OSCheck & GPUDriverOS::Linux) ||
-      !IsLinux && (OSCheck & GPUDriverOS::Windows)) {
-    auto CurrentVersion = Dev.get_info<sycl::info::device::driver_version>();
-    CurrentVersion = CurrentVersion.substr(Start, Length);
-    verifyDriverVersionFormat(CurrentVersion, RequiredVersion);
-    std::cout << "RequiredVersion = " << RequiredVersion << ", Start=" << Start
-              << ", Length=" << Length << std::endl;
-    std::cout << "CurrentVersion = " << CurrentVersion << std::endl;
-    IsGE &= CurrentVersion >= RequiredVersion;
-  }
-  return IsGE;
-}
diff --git a/sycl/test-e2e/KernelAndProgram/build-log.cpp b/sycl/test-e2e/KernelAndProgram/build-log.cpp
index 89ec9960d197f..02dd5dfde893c 100644
--- a/sycl/test-e2e/KernelAndProgram/build-log.cpp
+++ b/sycl/test-e2e/KernelAndProgram/build-log.cpp
@@ -40,12 +40,14 @@ void test() {
     Queue.submit(
         [&](sycl::handler &CGH) { CGH.single_task<class SingleTask>(Kernel); });
     assert(false && "There must be compilation error");
-  } catch (const sycl::compile_program_error &e) {
+  } catch (const sycl::exception &e) {
     std::string Msg(e.what());
     std::cerr << Msg << std::endl;
+    assert(e.code() == sycl::errc::build &&
+           "Caught exception was not a compilation error");
     assert(Msg.find("PI_ERROR_BUILD_PROGRAM_FAILURE") != std::string::npos);
   } catch (...) {
-    assert(false && "There must be sycl::compile_program_error");
+    assert(false && "Caught exception was not a compilation error");
   }
 }
 
diff --git a/sycl/test-e2e/KernelAndProgram/cache-build-result.cpp b/sycl/test-e2e/KernelAndProgram/cache-build-result.cpp
index 15dcabae2a716..c1c084a1c7671 100644
--- a/sycl/test-e2e/KernelAndProgram/cache-build-result.cpp
+++ b/sycl/test-e2e/KernelAndProgram/cache-build-result.cpp
@@ -32,19 +32,19 @@ void test() {
         CGH.single_task<class SingleTask>(Kernel);
       });
       assert(false && "There must be compilation error");
-    } catch (const sycl::compile_program_error &e) {
+    } catch (const sycl::exception &e) {
       fprintf(stderr, "Exception: %s, %d\n", e.what(), e.code().value());
+      assert(e.code() == sycl::errc::build &&
+             "Caught exception was not a compilation error");
       if (Idx == 0) {
         Msg = e.what();
-        Result = e.code().value();
       } else {
         // Exception constantly adds info on its error code in the message
         assert(Msg.find_first_of(e.what()) == 0 &&
                "PI_ERROR_BUILD_PROGRAM_FAILURE");
-        assert(Result == e.code().value() && "Exception code differs");
       }
     } catch (...) {
-      assert(false && "There must be sycl::compile_program_error");
+      assert(false && "Caught exception was not a compilation error");
     }
   }
 }
diff --git a/sycl/test-e2e/KernelFusion/GroupAlgorithm/all_of.cpp b/sycl/test-e2e/KernelFusion/GroupAlgorithm/all_of.cpp
index 43cd14a01db2c..479e25724234d 100644
--- a/sycl/test-e2e/KernelFusion/GroupAlgorithm/all_of.cpp
+++ b/sycl/test-e2e/KernelFusion/GroupAlgorithm/all_of.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -I . -o %t.out
+// RUN: %{build} %{embed-ir} -I . -o %t.out
 // RUN: %{run} %t.out
 
 #include "../helpers.hpp"
diff --git a/sycl/test-e2e/KernelFusion/GroupAlgorithm/exclusive_scan.cpp b/sycl/test-e2e/KernelFusion/GroupAlgorithm/exclusive_scan.cpp
index 26b6d4671fb26..89dd25f7f55e9 100644
--- a/sycl/test-e2e/KernelFusion/GroupAlgorithm/exclusive_scan.cpp
+++ b/sycl/test-e2e/KernelFusion/GroupAlgorithm/exclusive_scan.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -I . -o %t.out
+// RUN: %{build} %{embed-ir} -I . -o %t.out
 // RUN: %{run} %t.out
 
 #include "../../helpers.hpp"
diff --git a/sycl/test-e2e/KernelFusion/GroupAlgorithm/permute.cpp b/sycl/test-e2e/KernelFusion/GroupAlgorithm/permute.cpp
index 76be3cb81fd78..33824088a9609 100644
--- a/sycl/test-e2e/KernelFusion/GroupAlgorithm/permute.cpp
+++ b/sycl/test-e2e/KernelFusion/GroupAlgorithm/permute.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test fusion works with permute and remapping.
diff --git a/sycl/test-e2e/KernelFusion/GroupFunctions/group_barrier.cpp b/sycl/test-e2e/KernelFusion/GroupFunctions/group_barrier.cpp
index c587252f44732..470c32031ca13 100644
--- a/sycl/test-e2e/KernelFusion/GroupFunctions/group_barrier.cpp
+++ b/sycl/test-e2e/KernelFusion/GroupFunctions/group_barrier.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete_fusion preserves barriers by launching a kernel that requires a
diff --git a/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast.cpp b/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast.cpp
index 1237450c80705..08c724d2c4dc7 100644
--- a/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast.cpp
+++ b/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test fusion works with group_broadcast.
diff --git a/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast_remapping.cpp b/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast_remapping.cpp
index 9578d17937a52..03350be09c4d8 100644
--- a/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast_remapping.cpp
+++ b/sycl/test-e2e/KernelFusion/GroupFunctions/group_broadcast_remapping.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test fusion works with group_broadcast and remapping.
diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp
index e35d36b897cd2..133781a94d32c 100644
--- a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp
+++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_atomic_cross_wg.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 #include "./reduction.hpp"
diff --git a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp
index 0868428cc2ef6..75e96c7afe351 100644
--- a/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp
+++ b/sycl/test-e2e/KernelFusion/Reduction/group_reduce_and_last_wg_detection.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // COM: When ran on HIP and CUDA, this algorithm launches 'memcpy' commands
diff --git a/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp
index fb3702907e17f..0bb33f335dfba 100644
--- a/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp
+++ b/sycl/test-e2e/KernelFusion/Reduction/local_atomic_and_atomic_cross_wg.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 #include "./reduction.hpp"
diff --git a/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp b/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp
index 53768affcbbf0..db0bb20164e0b 100644
--- a/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp
+++ b/sycl/test-e2e/KernelFusion/Reduction/local_mem_tree_and_atomic_cross_wg.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 #include "./reduction.hpp"
diff --git a/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp b/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp
index 2c59b4e0369eb..3d81cd7818e54 100644
--- a/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp
+++ b/sycl/test-e2e/KernelFusion/Reduction/range_basic.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 #include "./reduction.hpp"
diff --git a/sycl/test-e2e/KernelFusion/abort_fusion.cpp b/sycl/test-e2e/KernelFusion/abort_fusion.cpp
index ba8294e5b49d8..a0cc7291b47f7 100644
--- a/sycl/test-e2e/KernelFusion/abort_fusion.cpp
+++ b/sycl/test-e2e/KernelFusion/abort_fusion.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
 
 // Test fusion being aborted: Different scenarios causing the JIT compiler
diff --git a/sycl/test-e2e/KernelFusion/abort_internalization.cpp b/sycl/test-e2e/KernelFusion/abort_internalization.cpp
index f1838e96fde0f..854a730d1e994 100644
--- a/sycl/test-e2e/KernelFusion/abort_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/abort_internalization.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -O2 -fsycl-embed-ir -o %t.out
+// RUN: %{build} -O2 %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 SYCL_ENABLE_FUSION_CACHING=0 %{run} %t.out 2>&1 | FileCheck %s
 
 // Test incomplete internalization: Different scenarios causing the JIT compiler
diff --git a/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp b/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp
index c23690605bb52..0f83ec288afe3 100644
--- a/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp
+++ b/sycl/test-e2e/KernelFusion/abort_internalization_stored_ptr.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not "Computation error" --implicit-check-not "Internalized" --check-prefix=CHECK %if hip %{ --check-prefix=CHECK-HIP %} %else %{ --check-prefix=CHECK-NON-HIP %}
 
 // Test pointers being stored are not internalized.
diff --git a/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp b/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp
index bad4a9e8e8be1..d450a548fd840 100644
--- a/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/barrier_local_internalization.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with local internalization and a combination of kernels
diff --git a/sycl/test-e2e/KernelFusion/buffer_internalization.cpp b/sycl/test-e2e/KernelFusion/buffer_internalization.cpp
index 1509b7d27d178..0f952cd6c39ad 100644
--- a/sycl/test-e2e/KernelFusion/buffer_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/buffer_internalization.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/cached_ndrange.cpp b/sycl/test-e2e/KernelFusion/cached_ndrange.cpp
index b3dae4d6ce550..b869b5c2a29ec 100644
--- a/sycl/test-e2e/KernelFusion/cached_ndrange.cpp
+++ b/sycl/test-e2e/KernelFusion/cached_ndrange.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not "COMPUTATION ERROR"
 // UNSUPPORTED: hip
 
diff --git a/sycl/test-e2e/KernelFusion/cancel_fusion.cpp b/sycl/test-e2e/KernelFusion/cancel_fusion.cpp
index e5d788f4fce0f..9be8d8cbbd140 100644
--- a/sycl/test-e2e/KernelFusion/cancel_fusion.cpp
+++ b/sycl/test-e2e/KernelFusion/cancel_fusion.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test cancel fusion
diff --git a/sycl/test-e2e/KernelFusion/complete_fusion.cpp b/sycl/test-e2e/KernelFusion/complete_fusion.cpp
index 81efaa4e360d3..f49bb70fc6d9b 100644
--- a/sycl/test-e2e/KernelFusion/complete_fusion.cpp
+++ b/sycl/test-e2e/KernelFusion/complete_fusion.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion without any internalization
diff --git a/sycl/test-e2e/KernelFusion/cooperative_kernel.cpp b/sycl/test-e2e/KernelFusion/cooperative_kernel.cpp
index 99485766f2c7c..0faf05d84e2e0 100644
--- a/sycl/test-e2e/KernelFusion/cooperative_kernel.cpp
+++ b/sycl/test-e2e/KernelFusion/cooperative_kernel.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=2 %{run} %t.out 2>&1 | FileCheck %s
 
 // Test cooperative kernels are not fused
diff --git a/sycl/test-e2e/KernelFusion/diamond_shape.cpp b/sycl/test-e2e/KernelFusion/diamond_shape.cpp
index 0ab2767d6bc89..7be3c248026b0 100644
--- a/sycl/test-e2e/KernelFusion/diamond_shape.cpp
+++ b/sycl/test-e2e/KernelFusion/diamond_shape.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp b/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp
index ce5df8fbc20cd..4ffe33599892c 100644
--- a/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp
+++ b/sycl/test-e2e/KernelFusion/diamond_shape_local.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with local internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp b/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp
index 3992293a6e03b..76b75d8c3a44d 100644
--- a/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp
+++ b/sycl/test-e2e/KernelFusion/event_wait_cancel.cpp
@@ -1,5 +1,5 @@
 // REQUIRES: aspect-usm_shared_allocations
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test validity of events after cancel_fusion.
diff --git a/sycl/test-e2e/KernelFusion/event_wait_complete.cpp b/sycl/test-e2e/KernelFusion/event_wait_complete.cpp
index b663d98ba1132..7ab2a80ea48a6 100644
--- a/sycl/test-e2e/KernelFusion/event_wait_complete.cpp
+++ b/sycl/test-e2e/KernelFusion/event_wait_complete.cpp
@@ -1,5 +1,5 @@
 // REQUIRES: aspect-usm_shared_allocations
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test validity of events after complete_fusion.
diff --git a/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp b/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp
index 5b168cfca9332..6653bff5e9857 100644
--- a/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp
+++ b/sycl/test-e2e/KernelFusion/existing_local_accessor.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with local internalization and an local accessor that
diff --git a/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp b/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp
index 584894f5b3465..693abaf37f916 100644
--- a/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp
+++ b/sycl/test-e2e/KernelFusion/internal_explicit_dependency.cpp
@@ -1,5 +1,5 @@
 // REQUIRES: aspect-usm_shared_allocations
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion where one kernel in the fusion list specifies an
diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp
index 0d8d4e12f79b5..38c94b1d851d9 100644
--- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test internalization of a nested array type.
diff --git a/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp
index e019c720a3156..d2c8949acec8a 100644
--- a/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_array_wrapper_local.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test local internalization of a nested array type.
diff --git a/sycl/test-e2e/KernelFusion/internalize_deep.cpp b/sycl/test-e2e/KernelFusion/internalize_deep.cpp
index 4515a8b994bfb..6cae5c3a516fc 100644
--- a/sycl/test-e2e/KernelFusion/internalize_deep.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_deep.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with internalization of a deep struct type.
diff --git a/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp b/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp
index 8023f219a39ab..790c9ee82a57b 100644
--- a/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_multi_ptr.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp b/sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp
index caa8edca3d221..0e337675f3dd3 100644
--- a/sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_non_unit_localsize.cpp
@@ -1,5 +1,5 @@
 // REQUIRES: fusion
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test private internalization with "LocalSize" == 3 on buffers that trigger
diff --git a/sycl/test-e2e/KernelFusion/internalize_vec.cpp b/sycl/test-e2e/KernelFusion/internalize_vec.cpp
index c256803b8f6df..bed6cce2da760 100644
--- a/sycl/test-e2e/KernelFusion/internalize_vec.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_vec.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with internalization of a struct type.
diff --git a/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp b/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp
index d8e59c2d1544b..e1269b151679d 100644
--- a/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/jit_caching.cpp b/sycl/test-e2e/KernelFusion/jit_caching.cpp
index d13131006c2fe..02e51c80e42f6 100644
--- a/sycl/test-e2e/KernelFusion/jit_caching.cpp
+++ b/sycl/test-e2e/KernelFusion/jit_caching.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not "COMPUTATION ERROR" --implicit-check-not "WRONG INTERNALIZATION"
 
 // Test caching for JIT fused kernels. Also test for debug messages being
diff --git a/sycl/test-e2e/KernelFusion/lit.local.cfg b/sycl/test-e2e/KernelFusion/lit.local.cfg
index 05b47667da5c9..1d0db3020f754 100644
--- a/sycl/test-e2e/KernelFusion/lit.local.cfg
+++ b/sycl/test-e2e/KernelFusion/lit.local.cfg
@@ -6,3 +6,7 @@ config.unsupported_features += ['accelerator']
 # TODO: enable on Windows once kernel fusion is supported on Windows.
 if platform.system() != "Linux":
    config.unsupported = True
+
+config.substitutions.append(
+    ("%{embed-ir}", "%if any-device-is-hip || any-device-is-cuda %{ -fsycl-embed-ir %}")
+)
diff --git a/sycl/test-e2e/KernelFusion/local_internalization.cpp b/sycl/test-e2e/KernelFusion/local_internalization.cpp
index c18ab5cf92d36..501dd36695358 100644
--- a/sycl/test-e2e/KernelFusion/local_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/local_internalization.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with local internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/math_function.cpp b/sycl/test-e2e/KernelFusion/math_function.cpp
index 9d17c1428f35d..49f2fd04c5b93 100644
--- a/sycl/test-e2e/KernelFusion/math_function.cpp
+++ b/sycl/test-e2e/KernelFusion/math_function.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test fusion of a kernel using a math function.
diff --git a/sycl/test-e2e/KernelFusion/non-kernel-cg.cpp b/sycl/test-e2e/KernelFusion/non-kernel-cg.cpp
index a4da945970287..3ae4b5be72712 100644
--- a/sycl/test-e2e/KernelFusion/non-kernel-cg.cpp
+++ b/sycl/test-e2e/KernelFusion/non-kernel-cg.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=2 %{run} %t.out 2>&1 | FileCheck %s
 
 // Test non-kernel device command groups are not fused
diff --git a/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp b/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp
index 120c02bb57418..db369b8ee0f37 100644
--- a/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp
+++ b/sycl/test-e2e/KernelFusion/non_unit_local_size.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with local internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp b/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp
index 4f27b675f90bc..a5ec0a57926ee 100644
--- a/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp
+++ b/sycl/test-e2e/KernelFusion/pointer_arg_function.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 // This test currently fails because InferAddressSpace is not able to remove all
 // address-space casts, causing internalization to fail.
diff --git a/sycl/test-e2e/KernelFusion/private_internalization.cpp b/sycl/test-e2e/KernelFusion/private_internalization.cpp
index 38f14c5be2f65..f53aa40e90389 100644
--- a/sycl/test-e2e/KernelFusion/private_internalization.cpp
+++ b/sycl/test-e2e/KernelFusion/private_internalization.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:512 %{run} %t.out
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/queue-shortcut-functions.cpp b/sycl/test-e2e/KernelFusion/queue-shortcut-functions.cpp
index 50feac19c2f42..6adcd29ab3071 100644
--- a/sycl/test-e2e/KernelFusion/queue-shortcut-functions.cpp
+++ b/sycl/test-e2e/KernelFusion/queue-shortcut-functions.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 \
 // RUN:   | FileCheck %s --implicit-check-not=ERROR
 
diff --git a/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp b/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp
index 41e658bf2a38d..6c3c2d3237d5e 100644
--- a/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp
+++ b/sycl/test-e2e/KernelFusion/ranged_offset_accessor.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with private internalization on accessors with different
diff --git a/sycl/test-e2e/KernelFusion/struct_with_array.cpp b/sycl/test-e2e/KernelFusion/struct_with_array.cpp
index 5f35d0d2e7c23..c64ed6e4979c5 100644
--- a/sycl/test-e2e/KernelFusion/struct_with_array.cpp
+++ b/sycl/test-e2e/KernelFusion/struct_with_array.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with private internalization on a kernel functor with an
diff --git a/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp
index 1107c8a3b555f..825b0818eb344 100644
--- a/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_acc_mem_op.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
 
 // Windows doesn't yet have full shutdown().
diff --git a/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp b/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp
index 56e935dbacc40..a221568e4e4f8 100644
--- a/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_buffer_destruction.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
 
 // Windows doesn't yet have full shutdown().
diff --git a/sycl/test-e2e/KernelFusion/sync_event_wait.cpp b/sycl/test-e2e/KernelFusion/sync_event_wait.cpp
index 6729ebeda0a9d..9f93852dbcc82 100644
--- a/sycl/test-e2e/KernelFusion/sync_event_wait.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_event_wait.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
 
 // Test fusion cancellation on event::wait() happening before
diff --git a/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp b/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp
index c3ca9891ae11d..7aa702767d93e 100644
--- a/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_host_accessor.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
 
 // Windows doesn't yet have full shutdown().
diff --git a/sycl/test-e2e/KernelFusion/sync_host_task.cpp b/sycl/test-e2e/KernelFusion/sync_host_task.cpp
index cf42331b2dcef..5b1eaaac745b8 100644
--- a/sycl/test-e2e/KernelFusion/sync_host_task.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_host_task.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
 
 // Windows doesn't yet have full shutdown().
diff --git a/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp b/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp
index c1e7bec1fe23d..24d50197571c3 100644
--- a/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_queue_destruction.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
 
 // Windows doesn't yet have full shutdown().
diff --git a/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp b/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp
index e33508a673430..7cf6197c7d1ec 100644
--- a/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_queue_wait.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
 
 // Windows doesn't yet have full shutdown().
diff --git a/sycl/test-e2e/KernelFusion/sync_second_queue.cpp b/sycl/test-e2e/KernelFusion/sync_second_queue.cpp
index 93ada86575d1d..836d9984e824f 100644
--- a/sycl/test-e2e/KernelFusion/sync_second_queue.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_second_queue.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
 
 // Windows doesn't yet have full shutdown().
diff --git a/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp b/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp
index 475a6694f8a1d..9bc209552c35a 100644
--- a/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_two_queues_requirement.cpp
@@ -1,5 +1,5 @@
 // For this test, complete_fusion must be supported.
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
 
 // Test fusion cancellation for requirement between two active fusions.
diff --git a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
index 927540bf2b40b..fa0ae3f1b31d9 100644
--- a/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
+++ b/sycl/test-e2e/KernelFusion/sync_usm_mem_op.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: env SYCL_RT_WARNING_LEVEL=1 %{run} %t.out 2>&1 | FileCheck %s
 // Windows doesn't yet have full shutdown().
 // UNSUPPORTED: ze_debug && windows
diff --git a/sycl/test-e2e/KernelFusion/three_dimensional.cpp b/sycl/test-e2e/KernelFusion/three_dimensional.cpp
index 48d2339a61cc7..9dcdcd264e7b9 100644
--- a/sycl/test-e2e/KernelFusion/three_dimensional.cpp
+++ b/sycl/test-e2e/KernelFusion/three_dimensional.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/two_dimensional.cpp b/sycl/test-e2e/KernelFusion/two_dimensional.cpp
index 1f4f2c5f62af3..aa098009d5f23 100644
--- a/sycl/test-e2e/KernelFusion/two_dimensional.cpp
+++ b/sycl/test-e2e/KernelFusion/two_dimensional.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
+// RUN: %{build} %{embed-ir} -O2 -o %t.out
 // RUN: env SYCL_PARALLEL_FOR_RANGE_ROUNDING_PARAMS=16:32:64 %{run} %t.out
 
 // Test complete fusion with private internalization specified on the
diff --git a/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp b/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp
index 570ac943bf723..f3152cee4c413 100644
--- a/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp
+++ b/sycl/test-e2e/KernelFusion/usm_no_dependencies.cpp
@@ -1,5 +1,5 @@
 // REQUIRES: aspect-usm_shared_allocations
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion using USM pointers.
diff --git a/sycl/test-e2e/KernelFusion/work_group_barrier.cpp b/sycl/test-e2e/KernelFusion/work_group_barrier.cpp
index 74bab49c46a65..39a5a4cdf57ca 100644
--- a/sycl/test-e2e/KernelFusion/work_group_barrier.cpp
+++ b/sycl/test-e2e/KernelFusion/work_group_barrier.cpp
@@ -1,4 +1,4 @@
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion with a combination of kernels that require a work-group
diff --git a/sycl/test-e2e/KernelFusion/wrapped_usm.cpp b/sycl/test-e2e/KernelFusion/wrapped_usm.cpp
index 584c55a3469ea..1254026223aa7 100644
--- a/sycl/test-e2e/KernelFusion/wrapped_usm.cpp
+++ b/sycl/test-e2e/KernelFusion/wrapped_usm.cpp
@@ -1,5 +1,5 @@
 // REQUIRES: aspect-usm_shared_allocations
-// RUN: %{build} -fsycl-embed-ir -o %t.out
+// RUN: %{build} %{embed-ir} -o %t.out
 // RUN: %{run} %t.out
 
 // Test complete fusion using an wrapped USM pointer as kernel functor argument.
diff --git a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp
index 5b77ec89fd997..80e0c0c6b845d 100644
--- a/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/SG32/get_coord_int8_matB.cpp
@@ -5,7 +5,9 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -o %t.out
@@ -13,12 +15,7 @@
 // XFAIL: cpu
 
 #include "../common.hpp"
-#include <iostream>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
 
 #define SG_SZ 32
-constexpr size_t TN = 16;
 
 #include "../get_coord_int8_matB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp
index b93985f8e594e..10334f93afa80 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache.cpp
@@ -5,21 +5,18 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
-// RUN: %{build} -o %t_gpu.out -ffp-model=precise
-// RUN: %if gpu %{ %{run} %t_gpu.out %}
-
-// RUN: %{build}  -ffp-model=precise -o %t_cpu.out -DtM=16 -DtK=32 -DNCACHE1=32 -DKCACHE1=32
-// RUN: %if cpu %{ %{run} %t_cpu.out %}
+// RUN: %{build} -o %t.out -ffp-model=precise
+// RUN: %{run} %t.out
 
 // -ffp-model=precise is added to not depend on compiler defaults.
 
 #include "../common.hpp"
-#include <cstddef>
 
 #define SG_SZ 32
-constexpr size_t TN = 16;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp
index 10391f2e7e319..32af965ec431a 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_init.cpp
@@ -5,7 +5,9 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix, gpu
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix, gpu
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -o %t.out -DINIT_LIST -ffp-model=precise
@@ -14,9 +16,7 @@
 // -ffp-model=precise is added to not depend on compiler defaults.
 
 #include "../common.hpp"
-#include <cstddef>
 
 #define SG_SZ 32
-constexpr size_t TN = 16;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp
index 994a2217d681f..1c7533e331e73 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll.cpp
@@ -5,23 +5,20 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
-// RUN: %{build} -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_gpu.out -DMANUAL_UNROLL
-// RUN: %if gpu %{ %{run} %t_gpu.out %}
-
-// RUN: %{build} -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_cpu.out -DMANUAL_UNROLL -DtM=16 -DtK=32 -DNCACHE1=32 -DKCACHE1=32
-// RUN: %if cpu %{ %{run} %t_cpu.out %}
+// RUN: %{build} -mllvm -inline-threshold=5000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL
+// RUN: %{run} %t.out
 
 // -mllvm -inline-threshold added as a workaround,
 // since IGC doesn't support some variants of IR for Joint Matrix currently
 // -ffp-model=precise is added to not depend on compiler defaults.
 
 #include "../common.hpp"
-#include <cstddef>
 
 #define SG_SZ 32
-constexpr size_t TN = 16;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
index 4f7e3638daaf3..f8d30cdc26756 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
@@ -5,7 +5,9 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix, gpu
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix, gpu
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -mllvm -inline-threshold=5000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL
@@ -16,9 +18,7 @@
 // -ffp-model=precise is added to not depend on compiler defaults.
 
 #include "../common.hpp"
-#include <cstddef>
 
 #define SG_SZ 32
-constexpr size_t TN = 16;
 
 #include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_array.cpp
index f9a113af731a5..87fd837446618 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_array.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_bfloat16_array.cpp
@@ -5,16 +5,16 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
 #include "../common.hpp"
-#include <cstddef>
 
-constexpr std::size_t SG_SZ = 32;
-static constexpr int TN = 16;
+#define SG_SZ 32
 
 #include "../joint_matrix_bfloat16_array_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp
index 214dd10f5158f..6cea5a248e0b2 100644
--- a/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp
+++ b/sycl/test-e2e/Matrix/SG32/joint_matrix_transposeC.cpp
@@ -5,7 +5,9 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// SG size = 32 is not currently supported for SYCL Joint Matrix by IGC on DG2
+// UNSUPPORTED: gpu-intel-dg2
+// REQUIRES: aspect-ext_intel_matrix
 // REQUIRES-INTEL-DRIVER: lin: 27501, win: 101.4943
 
 // RUN: %{build} -o %t.out
@@ -14,6 +16,5 @@
 #include "../common.hpp"
 
 #define SG_SZ 32
-constexpr size_t TN = 16;
 
 #include "../joint_matrix_transposeC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp
deleted file mode 100644
index 4c4d6c6eb5765..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/get_coord_int8_matB.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//==----------- get_coord_int8_matB.cpp  - DPC++ joint_matrix---------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-// XFAIL: *
-
-#include "../common.hpp"
-#include <iostream>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-constexpr size_t TN = 8;
-
-#include "../get_coord_int8_matB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp
deleted file mode 100644
index fbcd21be62f75..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//==--- joint_matrix_bf16_fill_k_cache.cpp  - DPC++ joint_matrix----------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
-// RUN: %{build} -o %t.out -ffp-model=precise
-// RUN: %{run} %t.out
-
-// -ffp-model=precise is added to not depend on compiler defaults.
-
-#include "../common.hpp"
-#include <cstddef>
-
-constexpr size_t TN = 8;
-
-#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp
deleted file mode 100644
index c5e399bc98f48..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_init.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//== joint_matrix_bf16_fill_k_cache_init.cpp  - DPC++ joint_matrix----------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
-// RUN: %{build} -o %t.out -DINIT_LIST -ffp-model=precise
-// RUN: %{run} %t.out
-
-// -ffp-model=precise is added to not depend on compiler defaults.
-
-#include "../common.hpp"
-#include <cstddef>
-
-constexpr size_t TN = 8;
-
-#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp
deleted file mode 100644
index ba24ea0dfc4b8..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//===joint_matrix_bf16_fill_k_cache_unroll.cpp - DPC++ joint_matrix--------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
-// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL
-// RUN: %{run} %t.out
-
-// -mllvm -inline-threshold=2000 added as a workaround,
-// since IGC doesn't support some variants of IR for Joint Matrix currently
-// -ffp-model=precise is added to not depend on compiler defaults.
-
-#include "../common.hpp"
-#include <cstddef>
-
-constexpr size_t TN = 8;
-
-#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
deleted file mode 100644
index 9d88c89c50f41..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//==joint_matrix_bf16_fill_k_cache_unroll_init.cpp  - DPC++ joint_matrix----==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
-// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DINIT_LIST -DMANUAL_UNROLL
-// RUN: %{run} %t.out
-
-// -mllvm -inline-threshold=2000 added as a workaround,
-// since IGC doesn't support some variants of IR for Joint Matrix currently
-// -ffp-model=precise is added to not depend on compiler defaults.
-
-#include "../common.hpp"
-#include <cstddef>
-
-constexpr size_t TN = 8;
-
-#include "../joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
deleted file mode 100644
index 5a41f19bc2ac1..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bfloat16_32x64.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//==----- joint_matrix_bfloat16_32x64.cpp  - DPC++ joint_matrix-------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// XFAIL: *
-
-#include "../common.hpp"
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-constexpr size_t TN = 8;
-
-#include "../joint_matrix_bfloat16_32x64_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
deleted file mode 100644
index 30b3522ad2442..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_opt_kernel_feature.cpp
+++ /dev/null
@@ -1,10 +0,0 @@
-// REQUIRES: matrix-xmx8
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// Test checks that exception will be thrown in case matrix parameters are
-// incompatible on the current device
-
-#include "../common.hpp"
-#include "../joint_matrix_opt_kernel_feature_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp
deleted file mode 100644
index 0ba69032465b9..0000000000000
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_out_bounds.cpp
+++ /dev/null
@@ -1,20 +0,0 @@
-//==-------- joint_matrix_out_bounds.cpp - DPC++ joint_matrix--------------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// REQUIRES: matrix-xmx8
-
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// XFAIL:*
-
-#include "../common.hpp"
-
-constexpr size_t TN = 8;
-static constexpr size_t MATRIX_K = 1024 + 24;
-
-#include "../joint_matrix_out_bounds_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
index ad064fd82fc0a..0b7f520888fd1 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matB.cpp
@@ -5,14 +5,11 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
-// XFAIL: cpu
+// XFAIL: cpu, gpu-intel-dg2
 
 #include "common.hpp"
-
-constexpr size_t TN = 16;
-
 #include "get_coord_int8_matB_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp b/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp
index 480f01ca77ceb..8b63dadc029b3 100644
--- a/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp
+++ b/sycl/test-e2e/Matrix/get_coord_int8_matB_impl.hpp
@@ -5,22 +5,23 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+#include <sycl/atomic_ref.hpp>
 #include <sycl/group_algorithm.hpp>
 
-constexpr size_t TK = 32;
-constexpr size_t VF = 4;
+template <size_t TileRows, size_t TileCols> class add_cols;
 
-template <typename T, size_t K, size_t N>
-void sum_cols_ref(host_accessor<T, 2, access::mode::read_write> B,
-                  host_accessor<int, 1, access::mode::read_write> sum_cols) {
-  int sum_cols_ref[N] = {0};
-  for (size_t j = 0; j < N; j++) {
-    for (size_t i = 0; i < K; i++) {
+template <typename T, typename TResult, size_t Rows, size_t Cols>
+void sum_cols_ref(
+    host_accessor<T, 2, access::mode::read_write> B,
+    host_accessor<TResult, 1, access::mode::read_write> sum_cols) {
+  TResult sum_cols_ref[Cols] = {0};
+  for (size_t j = 0; j < Cols; j++) {
+    for (size_t i = 0; i < Rows; i++) {
       sum_cols_ref[j] += B[i][j];
     }
     auto diff = sum_cols[j] - sum_cols_ref[j];
-    assert(std::fabs(static_cast<int>(diff)) <=
-           std::numeric_limits<int>::epsilon());
+    assert(std::fabs(static_cast<TResult>(diff)) <=
+           std::numeric_limits<TResult>::epsilon());
   }
 }
 
@@ -93,26 +94,27 @@ wi [1,0] -->    i=0, [8, 0]
 
 // clang-format on
 
-template <typename T, size_t K, size_t N>
-void matrix_sum_cols(big_matrix<T, K, N> &B,
-                     big_matrix<T, K / VF, N * VF> &Bvnni) {
-  buffer<int8_t, 2> bufB(B.get_data(), range<2>(K, N));
-  buffer<int8_t, 2> bufBvnni(Bvnni.get_data(), range<2>(K / VF, N * VF));
+template <typename T, typename TResult, size_t Rows, size_t Cols,
+          size_t TileRows, size_t TileCols, size_t VNNI>
+void matrix_sum_cols(big_matrix<T, Rows, Cols> &B,
+                     big_matrix<T, Rows / VNNI, Cols * VNNI> &Bvnni) {
+  buffer<T, 2> bufB(B.get_data(), range<2>(Rows, Cols));
+  buffer<T, 2> bufBvnni(Bvnni.get_data(), range<2>(Rows / VNNI, Cols * VNNI));
 
-  int sum_cols[N] = {0};
-  buffer<int> sum_cols_v(sum_cols, N);
+  TResult sum_cols[Cols] = {0};
+  buffer<TResult> sum_cols_v(sum_cols, Cols);
 
-  size_t NDRangeK = K / TK;
-  size_t NDRangeN = N / TN;
+  size_t NDRangeK = Rows / TileRows;
+  size_t NDRangeN = Cols / TileCols;
   queue q;
-  size_t sg_size = get_sg_size<class sum>(q);
+  size_t sg_size = get_sg_size<add_cols<TileRows, TileCols>>(q);
   nd_range<2> r({NDRangeK, NDRangeN * sg_size}, {1, 1 * sg_size});
 
   q.submit([&](handler &cgh) {
-     auto accB = bufBvnni.get_access<access::mode::read_write>(cgh);
-     auto v = sum_cols_v.get_access<access::mode::atomic>(cgh);
+     sycl::accessor accB{bufBvnni, cgh, sycl::read_write};
+     sycl::accessor v{sum_cols_v, cgh, sycl::read_write};
 
-     cgh.parallel_for<class sum>(
+     cgh.parallel_for<add_cols<TileRows, TileCols>>(
          r, [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
                 [[intel::reqd_sub_group_size(SG_SZ)]]
@@ -125,57 +127,88 @@ void matrix_sum_cols(big_matrix<T, K, N> &B,
 
            sycl::sub_group sg = spmd_item.get_sub_group();
 
-           joint_matrix<sub_group, int8_t, use::b, TK, TN,
+           joint_matrix<sub_group, T, use::b, TileRows, TileCols,
                         layout::ext_intel_packed>
                sub_b;
 
            joint_matrix_load(
                sg, sub_b,
                accB.template get_multi_ptr<access::decorated::no>() +
-                   (sg_startx * (TK / VF) * N * VF) +
-                   sg_starty / sg_size * TN * VF,
-               N * VF);
+                   (sg_startx * (TileRows / VNNI) * Cols * VNNI) +
+                   sg_starty / sg_size * TileCols * VNNI,
+               Cols * VNNI);
 
-           int32_t sum_local_cols[N] = {0};
+           TResult sum_local_cols[Cols] = {0};
            ext::intel::experimental::matrix::joint_matrix_apply(
-               sg, sub_b, [&](int8_t &x, size_t row, size_t col) {
-                 // the coordinates returned are in the logical range [K,N]
-                 // If users want to retrieve the VNNIed coordinates, they can
-                 // be obtained using colVNNI = col/VF rowVNNI = row*VF
-                 size_t global_index = col + global_idy / sg_size * TN;
+               sg, sub_b, [&](T &x, size_t row, size_t col) {
+                 // The coordinates returned are in the logical range
+                 // [Rows, Cols]. If users want to retrieve the VNNIed
+                 // coordinates, they can be obtained using
+                 // colVNNI = col/VNNI and rowVNNI = row*VNNI.
+                 size_t global_index = col + global_idy / sg_size * TileCols;
                  sum_local_cols[global_index] += x;
                });
 
-           for (int i = 0; i < N; i++) {
+           for (int i = 0; i < Cols; i++) {
              sum_local_cols[i] =
                  reduce_over_group(sg, sum_local_cols[i], sycl::plus<>());
-             if (global_idy % sg_size == 0)
-               atomic_fetch_add(v[i], sum_local_cols[i]);
+             if (global_idy % sg_size == 0) {
+               sycl::atomic_ref<TResult, sycl::memory_order::relaxed,
+                                sycl::memory_scope::device>
+                   aref(v[i]);
+               aref.fetch_add(sum_local_cols[i]);
+             }
            }
          }); // parallel for
    }).wait();
-  sum_cols_ref<T, K, N>(bufB.get_host_access(), sum_cols_v.get_host_access());
+  sum_cols_ref<T, TResult, Rows, Cols>(bufB.get_host_access(),
+                                       sum_cols_v.get_host_access());
 }
 
-int main() {
+template <typename T, typename TResult, size_t VNNI, size_t TK, size_t TN>
+void test() {
   static constexpr size_t scale = 2;
   static constexpr size_t MATRIX_K = TK * scale;
   static constexpr size_t MATRIX_N = TN * scale;
 
-  int8_t B[MATRIX_K][MATRIX_N];
-  big_matrix<int8_t, MATRIX_K, MATRIX_N> MB((int8_t *)&B);
+  T B[MATRIX_K][MATRIX_N];
+  big_matrix<T, MATRIX_K, MATRIX_N> MB((T *)&B);
 
-  int8_t Bvnni[MATRIX_K / VF][MATRIX_N * VF];
-  big_matrix<int8_t, MATRIX_K / VF, MATRIX_N * VF> MBvnni((int8_t *)&Bvnni);
+  T Bvnni[MATRIX_K / VNNI][MATRIX_N * VNNI];
+  big_matrix<T, MATRIX_K / VNNI, MATRIX_N * VNNI> MBvnni((T *)&Bvnni);
 
   for (int i = 0; i < MATRIX_K; i++) {
     for (int j = 0; j < MATRIX_N; j++) {
       B[i][j] = i + j;
     }
   }
-  matrix_vnni<int8_t>(MATRIX_K, MATRIX_N, *B, *Bvnni, VF);
+  matrix_vnni<T>(MATRIX_K, MATRIX_N, *B, *Bvnni, VNNI);
   // This test calculates sum of columns in the non VNNI B matrix
-  matrix_sum_cols<int8_t, MATRIX_K, MATRIX_N>(MB, MBvnni);
-  std::cout << "Passed\n";
+  matrix_sum_cols<T, TResult, MATRIX_K, MATRIX_N, TK, TN, VNNI>(MB, MBvnni);
+}
+
+int main() {
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test<int8_t, int32_t, 4, /*TK*/ 64, /*TN*/ 16>();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test<int8_t, int32_t, 4, /*TK*/ 32, /*TN*/ 16>();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test<int8_t, int32_t, 4, /*TK*/ 32, /*TN*/ 8>();
+      break;
+    }
+  }
   return 0;
 }
\ No newline at end of file
diff --git a/sycl/test-e2e/Matrix/joint_matrix_apply_cuda.hpp b/sycl/test-e2e/Matrix/joint_matrix_apply_cuda.hpp
index 6613efa7dfe17..185a410fb3aef 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_apply_cuda.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_apply_cuda.hpp
@@ -50,9 +50,11 @@ void matrix_verify_lambda(queue q,
 
             auto sg = spmd_item.get_sub_group();
 
-            joint_matrix<sub_group, T, use::a, M, K, layout::row_major> sub_a;
-            joint_matrix<sub_group, T, use::b, K, N, layout::row_major> sub_b;
-            joint_matrix<sub_group, T2, use::accumulator, M, N> sub_c;
+            joint_matrix<sycl::sub_group, T, use::a, M, K, layout::row_major>
+                sub_a;
+            joint_matrix<sycl::sub_group, T, use::b, K, N, layout::row_major>
+                sub_b;
+            joint_matrix<sycl::sub_group, T2, use::accumulator, M, N> sub_c;
 
             joint_matrix_fill(sg, sub_a, 3);
             joint_matrix_fill(sg, sub_b, 1);
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp
index abee7d7259f28..2be4c14615799 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache.cpp
@@ -5,19 +5,12 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 
-// RUN: %{build} -o %t_gpu.out -ffp-model=precise
-// RUN: %if gpu %{ %{run} %t_gpu.out %}
-
-// RUN: %{build}  -ffp-model=precise -o %t_cpu.out -DtM=16 -DtK=32 -DNCACHE1=32 -DKCACHE1=32
-// RUN: %if cpu %{ %{run} %t_cpu.out %}
+// RUN: %{build} -o %t.out -ffp-model=precise
+// RUN: %{run} %t.out
 
 // -ffp-model=precise is added to not depend on compiler defaults.
 
 #include "common.hpp"
-#include <cstddef>
-
-constexpr size_t TN = 16;
-
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_OOB.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_OOB.cpp
index 46d0acd79a1b5..4d84656c3d451 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_OOB.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_OOB.cpp
@@ -16,9 +16,4 @@
 // -ffp-model=precise is added to not depend on compiler defaults.
 
 #include "common.hpp"
-#include <cstddef>
-
-#define SG_SZ 16
-constexpr size_t TN = 16;
-
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
index e389ea7137428..56250cf9fb3e1 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_impl.hpp
@@ -7,14 +7,8 @@
 //===-------------------------------------------------------------------------===//
 
 #include <random>
-#include <sycl/detail/core.hpp>
-#include <sycl/ext/oneapi/matrix/matrix.hpp>
 #include <sycl/usm.hpp>
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-using bfloat16 = sycl::ext::oneapi::bfloat16;
-
 // number of test iterations
 constexpr unsigned int testIterations = 100;
 // start recording time after X iterations
@@ -24,36 +18,6 @@ constexpr unsigned int recordThresh = 10;
 #define MATRIX_SIZE 256
 #endif
 
-#ifndef tM
-#define tM 8
-#endif
-#ifndef tN
-#define tN TN
-#endif
-#ifndef tK
-#define tK 16
-#endif
-
-#ifndef MCACHE1
-#define MCACHE1 32
-#endif
-#ifndef NCACHE1
-#define NCACHE1 (TN * 4)
-#endif
-#ifndef KCACHE1
-#define KCACHE1 16
-#endif
-
-#ifndef MCACHE2
-#define MCACHE2 256
-#endif
-#ifndef NCACHE2
-#define NCACHE2 256
-#endif
-#ifndef KCACHE2
-#define KCACHE2 32
-#endif
-
 #ifdef MANUAL_UNROLL
 template <class T, T... inds, class F>
 static constexpr void loop(std::integer_sequence<T, inds...>, F &&f) {
@@ -66,25 +30,28 @@ static constexpr void manually_unroll_loop(F &&f) {
 }
 #endif
 
-template <unsigned int rowsA, unsigned int colsA, unsigned int rowsB,
-          unsigned int colsB, unsigned int vnniFactor, typename TOperand,
-          typename TResult>
+template <size_t TM, size_t TN, size_t TK> class MatMul;
+
+template <size_t rowsA, size_t colsA, size_t rowsB, size_t colsB, size_t VNNI,
+          typename TOperand, typename TResult, size_t TM, size_t TN, size_t TK,
+          size_t MCache1, size_t NCache1, size_t KCache1, size_t MCache2,
+          size_t NCache2, size_t KCache2>
 double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
-  size_t sgSize = get_sg_size<class MatMul>(q);
-  range<2> global{rowsA / MCACHE1, (colsB / NCACHE1) * sgSize};
-  range<2> cachelocal{MCACHE2 / MCACHE1, NCACHE2 / NCACHE1 * sgSize};
+  size_t sgSize = get_sg_size<MatMul<TM, TN, TK>>(q);
+  range<2> global{rowsA / MCache1, (colsB / NCache1) * sgSize};
+  range<2> cachelocal{MCache2 / MCache1, NCache2 / NCache1 * sgSize};
 
   // throw error if padding needed
   assert(colsA == rowsB);
-  assert(rowsA % tM == 0);
-  assert(colsA % tK == 0);
-  assert(colsB % tN == 0);
+  assert(rowsA % TM == 0);
+  assert(colsA % TK == 0);
+  assert(colsB % TN == 0);
   // submit main kernel
   std::chrono::high_resolution_clock::time_point start =
       std::chrono::high_resolution_clock::now();
 
   q.submit([&](handler &h) {
-    h.parallel_for<class MatMul>( // cache layer#1
+    h.parallel_for<MatMul<TM, TN, TK>>( // cache layer#1
         nd_range<2>{global, cachelocal},
         // loop global
         // loop localrange
@@ -107,33 +74,20 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
           auto m1 = it.get_local_id(0);
           auto n1 = it.get_local_id(1) / sgSize;
           auto sg = it.get_sub_group();
-          joint_matrix<sub_group, TResult, use::accumulator, tM, tN>
-              tC[MCACHE1 / tM][NCACHE1 / tN]
+          joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
+              tC[MCache1 / TM][NCache1 / TN]
 #ifdef INIT_LIST
-              = {joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>(),
-                 joint_matrix<sub_group, TResult, use::accumulator, tM, tN>()}
+              = {}; // value-initialization of all array elements
+#else
+              ; // no initializer: elements are default-initialized
 #endif
-          ;
+
 #ifdef MANUAL_UNROLL
-          manually_unroll_loop<unsigned int, MCACHE1 / tM>([&](auto m) {
-            manually_unroll_loop<unsigned int, NCACHE1 / tN>([&](auto n) {
+          manually_unroll_loop<unsigned int, MCache1 / TM>([&](auto m) {
+            manually_unroll_loop<unsigned int, NCache1 / TN>([&](auto n) {
 #else
-          for (unsigned int m = 0; m < MCACHE1 / tM; m++) {
-            for (unsigned int n = 0; n < NCACHE1 / tN; n++) {
+          for (unsigned int m = 0; m < MCache1 / TM; m++) {
+            for (unsigned int n = 0; n < NCache1 / TN; n++) {
 #endif
               joint_matrix_fill(sg, tC[m][n], 0);
 #ifdef MANUAL_UNROLL
@@ -144,75 +98,45 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
           }
 #endif
 
-          for (unsigned int k2 = 0; k2 < colsA / KCACHE2; k2++) {
-            joint_matrix<sub_group, TOperand, use::a, tM, tK, layout::row_major>
-                tA[MCACHE1 / tM][KCACHE2 / KCACHE1]
+          for (unsigned int k2 = 0; k2 < colsA / KCache2; k2++) {
+            joint_matrix<sub_group, TOperand, use::a, TM, TK, layout::row_major>
+                tA[MCache1 / TM][KCache2 / KCache1]
 #ifdef INIT_LIST
-                = {joint_matrix<sub_group, TOperand, use::a, tM, tK,
-                                layout::row_major>(),
-                   joint_matrix<sub_group, TOperand, use::a, tM, tK,
-                                layout::row_major>(),
-                   joint_matrix<sub_group, TOperand, use::a, tM, tK,
-                                layout::row_major>(),
-                   joint_matrix<sub_group, TOperand, use::a, tM, tK,
-                                layout::row_major>(),
-                   joint_matrix<sub_group, TOperand, use::a, tM, tK,
-                                layout::row_major>(),
-                   joint_matrix<sub_group, TOperand, use::a, tM, tK,
-                                layout::row_major>(),
-                   joint_matrix<sub_group, TOperand, use::a, tM, tK,
-                                layout::row_major>(),
-                   joint_matrix<sub_group, TOperand, use::a, tM, tK,
-                                layout::row_major>()}
+                = {}; // value-initialization of all array elements
+#else
+                ; // no initializer: elements are default-initialized
 #endif
-            ;
 
-            joint_matrix<sub_group, TOperand, use::b, tK, tN,
+            joint_matrix<sub_group, TOperand, use::b, TK, TN,
                          layout::ext_intel_packed>
-                tB[NCACHE1 / tN][KCACHE2 / KCACHE1]
+                tB[NCache1 / TN][KCache2 / KCache1]
 #ifdef INIT_LIST
-                =
-                    {
-                        joint_matrix<sub_group, TOperand, use::b, tK, tN,
-                                     layout::ext_intel_packed>(),
-                        joint_matrix<sub_group, TOperand, use::b, tK, tN,
-                                     layout::ext_intel_packed>(),
-                        joint_matrix<sub_group, TOperand, use::b, tK, tN,
-                                     layout::ext_intel_packed>(),
-                        joint_matrix<sub_group, TOperand, use::b, tK, tN,
-                                     layout::ext_intel_packed>(),
-                        joint_matrix<sub_group, TOperand, use::b, tK, tN,
-                                     layout::ext_intel_packed>(),
-                        joint_matrix<sub_group, TOperand, use::b, tK, tN,
-                                     layout::ext_intel_packed>(),
-                        joint_matrix<sub_group, TOperand, use::b, tK, tN,
-                                     layout::ext_intel_packed>(),
-                        joint_matrix<sub_group, TOperand, use::b, tK, tN,
-                                     layout::ext_intel_packed>(),
-                    }
+                = {}; // value-initialization of all array elements
+#else
+                ; // no initializer: elements are default-initialized
 #endif
-            ;
+
 #ifdef MANUAL_UNROLL
-            manually_unroll_loop<unsigned int, KCACHE2 / KCACHE1>([&](auto k1) {
+            manually_unroll_loop<unsigned int, KCache2 / KCache1>([&](auto k1) {
 #else
-            for (unsigned int k1 = 0; k1 < KCACHE2 / KCACHE1; k1++) {
+            for (unsigned int k1 = 0; k1 < KCache2 / KCache1; k1++) {
 #endif
               // physical layer
-              unsigned int k = (k2 * KCACHE2 + k1 * KCACHE1) / tK;
+              unsigned int k = (k2 * KCache2 + k1 * KCache1) / TK;
 #ifdef MANUAL_UNROLL
-              manually_unroll_loop<unsigned int, MCACHE1 / tM>([&](auto m) {
+              manually_unroll_loop<unsigned int, MCache1 / TM>([&](auto m) {
 #else
-              for (unsigned int m = 0; m < MCACHE1 / tM; m++) {
+              for (unsigned int m = 0; m < MCache1 / TM; m++) {
 #endif
 #ifdef OOB
                 ext::intel::experimental::matrix::joint_matrix_load_checked(
                     sg, tA[m][k1], pA, colsA, rowsA, colsA,
-                    m2 * MCACHE2 + m1 * MCACHE1 + m * tM, k * tK);
+                    m2 * MCache2 + m1 * MCache1 + m * TM, k * TK);
 #else
                 joint_matrix_load(
                     sg, tA[m][k1],
-                    pA + (m2 * MCACHE2 + m1 * MCACHE1 + m * tM) * colsA +
-                        k * tK,
+                    pA + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsA +
+                        k * TK,
                     colsA);
 #endif
 #ifdef MANUAL_UNROLL
@@ -221,21 +145,21 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
               } // m
 #endif
 #ifdef MANUAL_UNROLL
-              manually_unroll_loop<unsigned int, NCACHE1 / tN>([&](auto n) {
+              manually_unroll_loop<unsigned int, NCache1 / TN>([&](auto n) {
 #else
-              for (unsigned int n = 0; n < NCACHE1 / tN; n++) {
+              for (unsigned int n = 0; n < NCache1 / TN; n++) {
 #endif
 #ifdef OOB
                 ext::intel::experimental::matrix::joint_matrix_load_checked(
-                    sg, tB[n][k1], pB, colsB * vnniFactor, rowsB / vnniFactor,
-                    colsB * vnniFactor, k * tK / vnniFactor,
-                    (n2 * NCACHE2 + n1 * NCACHE1 + n * tN) * vnniFactor);
+                    sg, tB[n][k1], pB, colsB * VNNI, rowsB / VNNI, colsB * VNNI,
+                    k * TK / VNNI,
+                    (n2 * NCache2 + n1 * NCache1 + n * TN) * VNNI);
 #else
-                joint_matrix_load(
-                    sg, tB[n][k1],
-                    pB + (k * tK / vnniFactor) * (colsB * vnniFactor) +
-                        (n2 * NCACHE2 + n1 * NCACHE1 + n * tN) * vnniFactor,
-                    colsB * vnniFactor);
+                joint_matrix_load(sg, tB[n][k1],
+                                  pB + (k * TK / VNNI) * (colsB * VNNI) +
+                                      (n2 * NCache2 + n1 * NCache1 + n * TN) *
+                                          VNNI,
+                                  colsB * VNNI);
 #endif
 #ifdef MANUAL_UNROLL
               });
@@ -243,14 +167,14 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
               } // n
 #endif
 #ifdef MANUAL_UNROLL
-              manually_unroll_loop<unsigned int, MCACHE1 / tM>([&](auto m) {
+              manually_unroll_loop<unsigned int, MCache1 / TM>([&](auto m) {
 #else
-              for (unsigned int m = 0; m < MCACHE1 / tM; m++) {
+              for (unsigned int m = 0; m < MCache1 / TM; m++) {
 #endif
 #ifdef MANUAL_UNROLL
-                manually_unroll_loop<unsigned int, NCACHE1 / tN>([&](auto n) {
+                manually_unroll_loop<unsigned int, NCache1 / TN>([&](auto n) {
 #else
-                for (unsigned int n = 0; n < NCACHE1 / tN; n++) {
+                for (unsigned int n = 0; n < NCache1 / TN; n++) {
 
 #endif
                   joint_matrix_mad(sg, tC[m][n], tA[m][k1], tB[n][k1],
@@ -266,25 +190,25 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
 #endif
           } // for k2
 #ifdef MANUAL_UNROLL
-          manually_unroll_loop<unsigned int, MCACHE1 / tM>([&](auto m) {
+          manually_unroll_loop<unsigned int, MCache1 / TM>([&](auto m) {
 #else
-          for (unsigned int m = 0; m < MCACHE1 / tM; m++) {
+          for (unsigned int m = 0; m < MCache1 / TM; m++) {
 #endif
 #ifdef MANUAL_UNROLL
-            manually_unroll_loop<unsigned int, NCACHE1 / tN>([&](auto n) {
+            manually_unroll_loop<unsigned int, NCache1 / TN>([&](auto n) {
 #else
-            for (unsigned int n = 0; n < NCACHE1 / tN; n++) {
+            for (unsigned int n = 0; n < NCache1 / TN; n++) {
 #endif
 #ifdef OOB
               ext::intel::experimental::matrix::joint_matrix_store_checked(
                   sg, tC[m][n], pC, colsB, layout::row_major, rowsA, colsB,
-                  m2 * MCACHE2 + m1 * MCACHE1 + m * tM,
-                  n2 * NCACHE2 + n1 * NCACHE1 + n * tN);
+                  m2 * MCache2 + m1 * MCache1 + m * TM,
+                  n2 * NCache2 + n1 * NCache1 + n * TN);
 #else
               joint_matrix_store(
                   sg, tC[m][n],
-                  pC + (m2 * MCACHE2 + m1 * MCACHE1 + m * tM) * colsB +
-                      (n2 * NCACHE2 + n1 * NCACHE1 + n * tN),
+                  pC + (m2 * MCache2 + m1 * MCache1 + m * TM) * colsB +
+                      (n2 * NCache2 + n1 * NCache1 + n * TN),
                   colsB, layout::row_major);
 #endif
 #ifdef MANUAL_UNROLL
@@ -305,60 +229,46 @@ double joint_matmul(TOperand *A, TOperand *B, TResult *C, queue &q, int i) {
   return duration.count();
 }
 
-void fill_matrix(bfloat16 *M) {
-  std::random_device dev;
-  std::uniform_real_distribution<float> fdistr(-1.0, 1.0);
-  for (unsigned int i = 0; i < MATRIX_SIZE; i++) {
-    for (unsigned int j = 0; j < MATRIX_SIZE; j++) {
-      M[i * MATRIX_SIZE + j] = bfloat16(fdistr(dev));
-    }
-  }
-}
-
-void native_matmul(bfloat16 *A, bfloat16 *B, float *C) {
-  memset(C, 0, sizeof(float) * MATRIX_SIZE * MATRIX_SIZE);
-  for (unsigned int i = 0; i < MATRIX_SIZE; i++) {
-    for (unsigned int k = 0; k < MATRIX_SIZE; k++) {
-      for (unsigned int j = 0; j < MATRIX_SIZE; j++) {
-        C[i * MATRIX_SIZE + j] += make_fp32(A[i * MATRIX_SIZE + k]) *
-                                  make_fp32(B[k * MATRIX_SIZE + j]);
-      }
-    }
-  }
-}
-
-int main(void) {
-  assert(MATRIX_SIZE >= tM && MATRIX_SIZE >= tK && MATRIX_SIZE >= tN &&
+template <typename T, typename TResult, size_t VNNI, size_t TM, size_t TN,
+          size_t TK, size_t MCache1, size_t NCache1, size_t KCache1,
+          size_t MCache2, size_t NCache2, size_t KCache2>
+void test() {
+  assert(MATRIX_SIZE >= TM && MATRIX_SIZE >= TK && MATRIX_SIZE >= TN &&
          "invalid matrix size");
-  assert((MATRIX_SIZE % tM) == 0 && (MATRIX_SIZE % tN) == 0 &&
-         (MATRIX_SIZE % tK) == 0 &&
-         "invalid matrix size detected: not a multiple of <tM,tN,tK>");
+  assert((MATRIX_SIZE % TM) == 0 && (MATRIX_SIZE % TN) == 0 &&
+         (MATRIX_SIZE % TK) == 0 &&
+         "invalid matrix size detected: not a multiple of <TM,TN,TK>");
+
+  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
+            << " [TM x TN x TK]" << std::endl;
 
   queue q;
-  bfloat16 *A = malloc_shared<bfloat16>(MATRIX_SIZE * MATRIX_SIZE, q);
-  bfloat16 *B = malloc_shared<bfloat16>(MATRIX_SIZE * MATRIX_SIZE, q);
-  bfloat16 *vnniB = malloc_shared<bfloat16>(MATRIX_SIZE * MATRIX_SIZE, q);
-  float *C = malloc_shared<float>(MATRIX_SIZE * MATRIX_SIZE, q);
-  float *refC = malloc_shared<float>(MATRIX_SIZE * MATRIX_SIZE, q);
-
-  // Initialize; fill matrices
-  fill_matrix(A);
-  fill_matrix(B);
-  matrix_vnni<bfloat16>(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, 2);
-  native_matmul(A, B, refC);
+  T *A = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
+  T *B = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
+  T *vnniB = malloc_shared<T>(MATRIX_SIZE * MATRIX_SIZE, q);
+  TResult *C = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
+  TResult *refC = malloc_shared<TResult>(MATRIX_SIZE * MATRIX_SIZE, q);
+
+  matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, A, T(1));
+  matrix_rand<T>(MATRIX_SIZE, MATRIX_SIZE, B, T(1));
+  matrix_vnni<T>(MATRIX_SIZE, MATRIX_SIZE, B, vnniB, VNNI);
+
+  matrix_multiply_ref<T, T, TResult, 1>(A, B, refC, MATRIX_SIZE, MATRIX_SIZE,
+                                        MATRIX_SIZE);
 
   // run testIterations time, aggregate and calculate average run time
   double totalDuration = 0;
   for (unsigned int i = 0; i < testIterations; i++) {
     double duration =
-        joint_matmul<MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, 2,
-                     bfloat16, float>(A, vnniB, C, q, i);
+        joint_matmul<MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, VNNI,
+                     T, TResult, TM, TN, TK, MCache1, NCache1, KCache1, MCache2,
+                     NCache2, KCache2>(A, vnniB, C, q, i);
     if (i >= recordThresh) {
       totalDuration += duration;
     }
   }
 
-  bool result = matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC);
+  assert(matrix_compare(MATRIX_SIZE, MATRIX_SIZE, C, refC));
 
   double msecPerMatrixMul =
       totalDuration / static_cast<double>(testIterations - recordThresh);
@@ -373,6 +283,55 @@ int main(void) {
   free(vnniB, q);
   free(C, q);
   free(refC, q);
+}
+
+int main() {
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
 
-  return !result;
+  constexpr size_t MCache1 = 32;
+  constexpr size_t MCache2 = 256;
+  constexpr size_t NCache2 = 256;
+  constexpr size_t KCache2 = 32;
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      constexpr size_t NCache1 = 32;
+      constexpr size_t KCache1 = 32;
+
+      test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      constexpr size_t NCache1 = 4 * /*TN*/ 16;
+      constexpr size_t KCache1 = 16;
+
+      test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16, MCache1, NCache1,
+           KCache1, MCache2, NCache2, KCache2>();
+#if (!defined(SG_SZ) || SG_SZ != 32)
+      // These combinations are not currently supported for subgroup size = 32
+      // in IGC
+      test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>();
+      test<bfloat16, float, 2, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16, MCache1,
+           NCache1, KCache1, MCache2, NCache2, KCache2>();
+#endif
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      constexpr size_t NCache1 = 4 * /*TN*/ 8;
+      constexpr size_t KCache1 = 16;
+
+      test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16, MCache1, NCache1,
+           KCache1, MCache2, NCache2, KCache2>();
+      break;
+    }
+  }
+  return 0;
 }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp
index d839f3db8f481..0770e7881edc7 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_init.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix, gpu
+// REQUIRES: aspect-ext_intel_matrix, gpu
 
 // RUN: %{build} -o %t.out -DINIT_LIST -ffp-model=precise
 // RUN: %{run} %t.out
@@ -13,8 +13,4 @@
 // -ffp-model=precise is added to not depend on compiler defaults.
 
 #include "common.hpp"
-#include <cstddef>
-
-constexpr size_t TN = 16;
-
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp
index 1800901e24111..4f5616d7e7f4f 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll.cpp
@@ -5,21 +5,14 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 
-// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t_gpu.out -DMANUAL_UNROLL
-// RUN: %if gpu %{ %{run} %t_gpu.out %}
-
-// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t_cpu.out -DMANUAL_UNROLL -DtM=16 -DtK=32 -DNCACHE1=32 -DKCACHE1=32
-// RUN: %if cpu %{ %{run} %t_cpu.out %}
+// RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL
+// RUN: %{run} %t.out
 
 // -mllvm -inline-threshold=2000 added as a workaround,
 // since IGC doesn't support some variants of IR for Joint Matrix currently
 // -ffp-model=precise is added to not depend on compiler defaults.
 
 #include "common.hpp"
-#include <cstddef>
-
-constexpr size_t TN = 16;
-
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
index 701c17741f576..ff4c29251200d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bf16_fill_k_cache_unroll_init.cpp
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix, gpu
+// REQUIRES: aspect-ext_intel_matrix, gpu
 
 // RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t_gpu.out -DINIT_LIST -DMANUAL_UNROLL
 // RUN: %{run} %t_gpu.out
@@ -15,8 +15,4 @@
 // -ffp-model=precise is added to not depend on compiler defaults.
 
 #include "common.hpp"
-#include <cstddef>
-
-constexpr size_t TN = 16;
-
 #include "joint_matrix_bf16_fill_k_cache_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp
index 98ed155b297ad..5cd2a4dc1962f 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array.cpp
@@ -5,13 +5,10 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-static constexpr int TN = 16;
-
 #include "joint_matrix_bfloat16_array_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
index 9aefc370bd0c6..f393eaa5e8436 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_bfloat16_array_impl.hpp
@@ -6,30 +6,28 @@
 //
 //===-------------------------------------------------------------------------===//
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
+template <typename T, size_t TM, size_t TN, size_t TK> class mult;
 
-static constexpr int TM = 8;
-static constexpr int TK = 16;
 static constexpr int JM_ARRAY_SZ = 2;
 
-template <typename T1, typename T2, size_t M, size_t N, size_t K>
-void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
-                     big_matrix<T2, K / 2, N * 2> &B) {
+template <typename TResult, typename T, size_t M, size_t N, size_t K, size_t TM,
+          size_t TN, size_t TK, size_t VNNI>
+void matrix_multiply(big_matrix<TResult, M, N> &C, big_matrix<T, M, K> &A,
+                     big_matrix<T, K / VNNI, N * VNNI> &B) {
   size_t NDRangeM = M / (TM * JM_ARRAY_SZ);
   size_t NDRangeN = N / TN;
-  buffer<bfloat16, 2> bufA(A.get_data(), range<2>(M, K));
-  buffer<bfloat16, 2> bufB(B.get_data(), range<2>(K, N));
-  buffer<float, 2> bufC((float *)C.get_data(), range<2>(M, N));
+  buffer<T, 2> bufA(A.get_data(), range<2>(M, K));
+  buffer<T, 2> bufB(B.get_data(), range<2>(K, N));
+  buffer<TResult, 2> bufC((TResult *)C.get_data(), range<2>(M, N));
 
   queue q;
-  size_t sg_size = get_sg_size<class imatrix>(q);
+  size_t sg_size = get_sg_size<mult<T, TM, TN, TK>>(q);
   q.submit([&](handler &cgh) {
-     auto accC = bufC.get_access<access::mode::read_write>(cgh);
-     auto accA = bufA.get_access<access::mode::read_write>(cgh);
-     auto accB = bufB.get_access<access::mode::read_write>(cgh);
+     sycl::accessor accA{bufA, cgh, sycl::read_write};
+     sycl::accessor accB{bufB, cgh, sycl::read_write};
+     sycl::accessor accC{bufC, cgh, sycl::read_write};
 
-     cgh.parallel_for<class imatrix>(
+     cgh.parallel_for<mult<T, TM, TN, TK>>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
          [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
@@ -45,25 +43,25 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
            const auto sg_starty = global_idy - spmd_item.get_local_id(1);
 
            sub_group sg = spmd_item.get_sub_group();
-           joint_matrix<sub_group, bfloat16, use::a, TM, TK, layout::row_major>
+           joint_matrix<sub_group, T, use::a, TM, TK, layout::row_major>
                sub_a[JM_ARRAY_SZ];
 
            // For B, we assume B has been already VNNIed.
-           joint_matrix<sub_group, bfloat16, use::b, TK, TN,
-                        layout::ext_intel_packed>
+           joint_matrix<sub_group, T, use::b, TK, TN, layout::ext_intel_packed>
                sub_b;
-           joint_matrix<sub_group, float, use::accumulator, TM, TN>
+           joint_matrix<sub_group, TResult, use::accumulator, TM, TN>
                sub_c[JM_ARRAY_SZ];
 
            for (int i = 0; i < JM_ARRAY_SZ; ++i)
-             joint_matrix_fill(sg, sub_c[i], 1.0);
+             joint_matrix_fill(sg, sub_c[i], TResult(1));
 
            for (int k = 0; k < K / TK; ++k) {
              joint_matrix_load(
                  sg, sub_b,
                  accB.template get_multi_ptr<access::decorated::no>() +
-                     (k * TK / 2) * (N * 2) + sg_starty / sg_size * TN * 2,
-                 N * 2);
+                     (k * TK / VNNI) * (N * VNNI) +
+                     sg_starty / sg_size * TN * VNNI,
+                 N * VNNI);
 
              for (int i = 0; i < JM_ARRAY_SZ; ++i) {
                joint_matrix_load(
@@ -86,35 +84,70 @@ void matrix_multiply(big_matrix<T1, M, N> &C, big_matrix<T2, M, K> &A,
    }).wait();
 }
 
-int main() {
+template <typename T, typename TResult, size_t VNNI, size_t TM, size_t TN,
+          size_t TK>
+void test() {
+  std::cout << "Testing: " << TM << " x " << TN << " x " << TK
+            << " [TM x TN x TK]" << std::endl;
   static constexpr size_t MATRIX_M = TM * 2;
   static constexpr size_t MATRIX_N = TN * 2;
   static constexpr size_t MATRIX_K = TK * 2;
 
-  bfloat16 A[MATRIX_M][MATRIX_K];
-  bfloat16 B[MATRIX_K / 2][MATRIX_N * 2];
+  T A[MATRIX_M][MATRIX_K];
+  T B[MATRIX_K / VNNI][MATRIX_N * VNNI];
+
+  TResult C[MATRIX_M][MATRIX_N];
+  TResult D[MATRIX_M][MATRIX_N];
 
-  float C[MATRIX_M][MATRIX_N];
-  float D[MATRIX_M][MATRIX_N];
+  matrix_fill(MATRIX_M, MATRIX_K, (T *)A,
+              [](int i, int j) { return TResult(1) * (i + j); });
+  matrix_fill(MATRIX_K / VNNI, MATRIX_N * VNNI, (T *)B,
+              [](int i, int j) { return TResult(2) * i + TResult(3) * j; });
+  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)C, TResult(1));
+  matrix_fill(MATRIX_M, MATRIX_N, (TResult *)D, TResult(1));
 
-  matrix_fill(MATRIX_M, MATRIX_K, (bfloat16 *)A,
-              [](int i, int j) { return 1.0f * (i + j); });
-  matrix_fill(MATRIX_K / 2, MATRIX_N * 2, (bfloat16 *)B,
-              [](int i, int j) { return 2.0f * i + 3.0f * j; });
-  matrix_fill(MATRIX_M, MATRIX_N, (float *)C, 1.0f);
-  matrix_fill(MATRIX_M, MATRIX_N, (float *)D, 1.0f);
+  big_matrix<TResult, MATRIX_M, MATRIX_N> MC((TResult *)&C);
+  big_matrix<TResult, MATRIX_M, MATRIX_N> MD((TResult *)&D);
+  big_matrix<T, MATRIX_M, MATRIX_K> MA((T *)&A);
+  big_matrix<T, MATRIX_K / VNNI, MATRIX_N * VNNI> MB((T *)&B);
 
-  big_matrix<float, MATRIX_M, MATRIX_N> MC((float *)&C);
-  big_matrix<float, MATRIX_M, MATRIX_N> MD((float *)&D);
-  big_matrix<bfloat16, MATRIX_M, MATRIX_K> MA((bfloat16 *)&A);
-  big_matrix<bfloat16, MATRIX_K / 2, MATRIX_N * 2> MB((bfloat16 *)&B);
+  matrix_multiply<TResult, T, MATRIX_M, MATRIX_N, MATRIX_K, TM, TN, TK, VNNI>(
+      MC, MA, MB);
+  matrix_multiply_ref<T, T, TResult, VNNI>((T *)A, (T *)B, (TResult *)D,
+                                           MATRIX_M, MATRIX_N, MATRIX_K / VNNI);
 
-  matrix_multiply(MC, MA, MB);
-  matrix_multiply_ref<bfloat16, bfloat16, float, 2>(
-      (bfloat16 *)A, (bfloat16 *)B, (float *)D, MATRIX_M, MATRIX_N,
-      MATRIX_K / 2);
+  assert(matrix_compare(MATRIX_M, MATRIX_N, (TResult *)C, (TResult *)D));
+}
 
-  bool res = matrix_compare(MATRIX_M, MATRIX_N, (float *)C, (float *)D);
-  std::cout << (res ? "passed" : "failed") << std::endl;
-  return !res;
+int main() {
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+  // Run the shapes of the first recognized device once, then stop scanning.
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 32>();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 16, /*TK*/ 16>();
+#if (!defined(SG_SZ) || SG_SZ != 32)
+      // These combinations are not currently supported for subgroup size = 32
+      // in IGC
+      test<bfloat16, float, 2, /*TM*/ 16, /*TN*/ 16, /*TK*/ 16>();
+      test<bfloat16, float, 2, /*TM*/ 1, /*TN*/ 64, /*TK*/ 16>();
+      test<bfloat16, float, 2, /*TM*/ 32, /*TN*/ 64, /*TK*/ 16>();
+#endif
+      break; // outside the #if so SG_SZ == 32 builds stop here too
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      test<bfloat16, float, 2, /*TM*/ 8, /*TN*/ 8, /*TK*/ 16>();
+      break;
+    }
+  }
+  return 0;
 }
diff --git a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
index 3be8789531ee9..230a20a62e1c7 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_gemm_cuda.hpp
@@ -229,7 +229,8 @@ void test(queue &q) {
         auto res_device =
             matrix_ref_mn<Big_N, Big_K, Big_M, layout_A, layout_B>(m, n, A, B,
                                                                    C);
-        assert(fabs(2 * (D[index_D] - res_device)) / (D[index_D] + res_device) <
+        assert(sycl::fabs(2 * (D[index_D] - res_device)) /
+                   (D[index_D] + res_device) <
                bf16_eps * 2);
       } else {
         assert((D[index_D] ==
diff --git a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
index 5acc54a412096..a6b72f80a989d 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_opt_kernel_feature.cpp
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
diff --git a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
index 721e44d2d6f58..b383c07f018a6 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_tensorcores_sm70.cpp
@@ -13,6 +13,12 @@
 // This tests the unified matrix extension interfaces for the cuda backend.
 // This test must be compiled with -Xsycl-target-backend --cuda-gpu-arch=sm_xx,
 // where sm_xx >= sm_70.
+// For some devices it is important to use the sm version (Compute Capability)
+// corresponding to the device that will run the program when specifying e.g.
+// `-fsycl-targets=nvidia_gpu_sm_xx` during compilation. This particularly
+// affects matrix operations using `half` such as those in this test. For more
+// information on this issue consult
+// https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#wmma-restrictions
 
 #include "joint_matrix_apply_cuda.hpp"
 #include "joint_matrix_gemm_cuda.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp
index bd04b157cf667..b81093293cd33 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC.cpp
@@ -5,13 +5,10 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-// REQUIRES: matrix
+// REQUIRES: aspect-ext_intel_matrix
 
 // RUN: %{build} -o %t.out
 // RUN: %{run} %t.out
 
 #include "common.hpp"
-
-constexpr size_t TN = 16;
-
 #include "joint_matrix_transposeC_impl.hpp"
diff --git a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
index 24ba24a264f0d..278e5da5cf441 100644
--- a/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
+++ b/sycl/test-e2e/Matrix/joint_matrix_transposeC_impl.hpp
@@ -8,10 +8,7 @@
 
 #include <sycl/usm.hpp>
 
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-template <size_t TM> class LS;
+template <size_t TileRows, size_t TileCols> class LS;
 
 template <size_t TM, size_t TN, typename T1, size_t NUM_ROWS, size_t NUM_COLS>
 void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major,
@@ -24,10 +21,10 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major,
 
   size_t NDRangeM = M / TM;
   size_t NDRangeN = N / TN;
-  size_t sg_size = get_sg_size<class LS<TM>>(q);
+  size_t sg_size = get_sg_size<class LS<TM, TN>>(q);
 
   q.submit([&](handler &cgh) {
-     cgh.parallel_for<class LS<TM>>(
+     cgh.parallel_for<class LS<TM, TN>>(
          nd_range<2>({NDRangeM, NDRangeN * sg_size}, {1, 1 * sg_size}),
          [=](nd_item<2> spmd_item)
 #ifdef SG_SZ
@@ -51,7 +48,7 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major,
            const auto sg_starty = global_idy - spmd_item.get_local_id(1);
 
            sub_group sg = spmd_item.get_sub_group();
-           joint_matrix<sub_group, float, use::accumulator, TM, TN> sub_matrix;
+           joint_matrix<sub_group, T1, use::accumulator, TM, TN> sub_matrix;
 
            auto row_major_offset =
                (sg_startx * TM) * N + (sg_starty / sg_size * TN);
@@ -72,32 +69,33 @@ void matrix_load_and_store(T1 *input, T1 *out_col_major, T1 *out_row_major,
    }).wait();
 }
 
-template <size_t TM> void run_matrix_test() {
+template <typename T, size_t TM, size_t TN> void run_matrix_test() {
   static constexpr size_t MATRIX_M = TM * 16;
   static constexpr size_t MATRIX_N = TN * 16;
 
   queue q;
-  float *input = malloc_shared<float>(MATRIX_M * MATRIX_N, q);
-  float *out_col_major = malloc_shared<float>(MATRIX_M * MATRIX_N, q);
-  float *out_row_major = malloc_shared<float>(MATRIX_M * MATRIX_N, q);
-  float *ref_col_major = malloc_shared<float>(MATRIX_M * MATRIX_N, q);
+  T *input = malloc_shared<T>(MATRIX_M * MATRIX_N, q);
+  T *out_col_major = malloc_shared<T>(MATRIX_M * MATRIX_N, q);
+  T *out_row_major = malloc_shared<T>(MATRIX_M * MATRIX_N, q);
+  T *ref_col_major = malloc_shared<T>(MATRIX_M * MATRIX_N, q);
 
   // input is column majot matrix so it is of NxM shape
-  matrix_rand(MATRIX_N, MATRIX_M, input, (float)5.0);
-  matrix_fill(MATRIX_M, MATRIX_N, out_col_major, (float)0);
-  matrix_fill(MATRIX_N, MATRIX_M, out_row_major, (float)0);
+  matrix_rand(MATRIX_N, MATRIX_M, input, (T)5.0);
+  matrix_fill(MATRIX_M, MATRIX_N, out_col_major, (T)0);
+  matrix_fill(MATRIX_N, MATRIX_M, out_row_major, (T)0);
   matrix_transpose(MATRIX_N, MATRIX_M, ref_col_major, input);
 
-  matrix_load_and_store<TM, TN, float, MATRIX_M, MATRIX_N>(input, out_col_major,
-                                                           out_row_major, q);
+  matrix_load_and_store<TM, TN, T, MATRIX_M, MATRIX_N>(input, out_col_major,
+                                                       out_row_major, q);
 
   // we use exact comparison as no low precision calculation is used in this
   // test
-  std::cout << "compare results for TM " << TM << "\n";
-  bool res = matrix_compare<float, float, true>(MATRIX_M, MATRIX_N,
-                                                out_col_major, ref_col_major) &&
-             matrix_compare<float, float, true>(MATRIX_N, MATRIX_M,
-                                                out_row_major, input);
+  std::cout << "compare results for: " << TM << " x " << TN << " [TM x TN]"
+            << std::endl;
+  bool res =
+      matrix_compare<T, T, true>(MATRIX_M, MATRIX_N, out_col_major,
+                                 ref_col_major) &&
+      matrix_compare<T, T, true>(MATRIX_N, MATRIX_M, out_row_major, input);
   free(input, q);
   free(out_col_major, q);
   free(out_row_major, q);
@@ -106,15 +104,48 @@ template <size_t TM> void run_matrix_test() {
 }
 
 int main() {
-  run_matrix_test<8>();
-  run_matrix_test<7>();
-  run_matrix_test<6>();
-  run_matrix_test<5>();
-  run_matrix_test<4>();
-  run_matrix_test<3>();
-  run_matrix_test<2>();
-  run_matrix_test<1>();
-
-  std::cout << "Passed\n";
+  queue q;
+  std::vector<combination> combinations =
+      q.get_device()
+          .get_info<sycl::ext::oneapi::experimental::info::device::
+                        matrix_combinations>();
+
+  for (unsigned int i = 0; i < combinations.size(); i++) {
+    if (combinations[i].nsize == 0) { // Intel AMX
+      run_matrix_test<float, /*TM*/ 8, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 7, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 6, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 5, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 4, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 3, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 2, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 1, /*TN*/ 16>();
+      break;
+    }
+
+    if (combinations[i].nsize == 16) { // architecture::intel_gpu_pvc
+      run_matrix_test<float, /*TM*/ 8, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 7, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 6, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 5, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 4, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 3, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 2, /*TN*/ 16>();
+      run_matrix_test<float, /*TM*/ 1, /*TN*/ 16>();
+      break;
+    }
+
+    if (combinations[i].nsize == 8) { // architecture::intel_gpu_dg2*
+      run_matrix_test<float, /*TM*/ 8, /*TN*/ 8>();
+      run_matrix_test<float, /*TM*/ 7, /*TN*/ 8>();
+      run_matrix_test<float, /*TM*/ 6, /*TN*/ 8>();
+      run_matrix_test<float, /*TM*/ 5, /*TN*/ 8>();
+      run_matrix_test<float, /*TM*/ 4, /*TN*/ 8>();
+      run_matrix_test<float, /*TM*/ 3, /*TN*/ 8>();
+      run_matrix_test<float, /*TM*/ 2, /*TN*/ 8>();
+      run_matrix_test<float, /*TM*/ 1, /*TN*/ 8>();
+      break;
+    }
+  }
   return 0;
 }
diff --git a/sycl/test-e2e/NewOffloadDriver/buffer.cpp b/sycl/test-e2e/NewOffloadDriver/buffer.cpp
new file mode 100644
index 0000000000000..1f48325ff9b5c
--- /dev/null
+++ b/sycl/test-e2e/NewOffloadDriver/buffer.cpp
@@ -0,0 +1,58 @@
+//==------------------- buffer.cpp - SYCL buffer basic test ----------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A basic test using the --offload-new-driver flag.
+
+// REQUIRES: level_zero
+// RUN: %clangxx -fsycl --offload-new-driver %s -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/detail/core.hpp>
+
+int main() {
+  // Creating buffer of 4 elements to be used inside the kernel code.
+  sycl::buffer<size_t, 1> Buffer(4);
+
+  // Creating SYCL queue.
+  sycl::queue Queue;
+
+  // Size of index space for kernel.
+  sycl::range<1> NumOfWorkItems{Buffer.size()};
+
+  // Submitting command group(work) to queue.
+  Queue.submit([&](sycl::handler &cgh) {
+    // Getting write only access to the buffer on a device.
+    sycl::accessor Accessor{Buffer, cgh, sycl::write_only};
+    // Executing kernel.
+    cgh.parallel_for<class FillBuffer>(NumOfWorkItems, [=](sycl::id<1> WIid) {
+      // Fill buffer with indexes.
+      Accessor[WIid] = WIid.get(0);
+    });
+  });
+
+  // Getting read only access to the buffer on the host.
+  // Implicit barrier waiting for queue to complete the work.
+  sycl::host_accessor HostAccessor{Buffer, sycl::read_only};
+
+  // Check the results.
+  bool MismatchFound = false;
+  for (size_t I = 0; I < Buffer.size(); ++I) {
+    if (HostAccessor[I] != I) {
+      std::cout << "The result is incorrect for element: " << I
+                << " , expected: " << I << " , got: " << HostAccessor[I]
+                << std::endl;
+      MismatchFound = true;
+    }
+  }
+
+  if (!MismatchFound) {
+    std::cout << "The results are correct!" << std::endl;
+  }
+
+  return MismatchFound;
+}
diff --git a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
index 8d1d17df461df..3f7ae71566a2e 100644
--- a/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
+++ b/sycl/test-e2e/NonUniformGroups/ballot_group_algorithms.cpp
@@ -5,6 +5,10 @@
 // REQUIRES: sg-32
 // REQUIRES: aspect-ext_oneapi_ballot_group
 
+// Fails in Nightly testing on the self-hosted CUDA runner:
+// https://github.com/intel/llvm/issues/12995.
+// UNSUPPORTED: cuda
+
 #include <sycl/detail/core.hpp>
 #include <sycl/ext/oneapi/experimental/ballot_group.hpp>
 #include <sycl/group_algorithm.hpp>
diff --git a/sycl/test-e2e/Plugin/cuda-max-local-mem-size.cpp b/sycl/test-e2e/Plugin/cuda-max-local-mem-size.cpp
index 5e31a461379ee..aefbbd99a685f 100644
--- a/sycl/test-e2e/Plugin/cuda-max-local-mem-size.cpp
+++ b/sycl/test-e2e/Plugin/cuda-max-local-mem-size.cpp
@@ -1,8 +1,8 @@
 // REQUIRES: cuda
 
 // RUN: %{build} -o %t.out
-// RUN: not %{run} SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE=0 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-ZERO %s
-// RUN: not %{run} SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE=100000000 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-OVERALLOCATE %s
+// RUN: %{run} SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE=0 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-ZERO %s
+// RUN: %{run} SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE=100000000 %t.out 2>&1 | FileCheck --check-prefixes=CHECK-OVERALLOCATE %s
 
 //==---------------------- cuda-max-local-mem-size.cpp --------------------===//
 //==--- SYCL test to test SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE env var----------===//
@@ -16,15 +16,19 @@
 #include <sycl/detail/core.hpp>
 
 int main() {
-  sycl::queue Q{};
-  auto LocalSize =
-      Q.get_device().get_info<sycl::info::device::local_mem_size>();
-  Q.submit([&](sycl::handler &cgh) {
-     auto LocalAcc = sycl::local_accessor<float>(LocalSize + 1, cgh);
-     cgh.parallel_for(sycl::nd_range<1>{32, 32}, [=](sycl::nd_item<1> idx) {
-       LocalAcc[idx.get_global_linear_id()] *= 2;
-     });
-   }).wait();
+  try {
+    sycl::queue Q{};
+    auto LocalSize =
+        Q.get_device().get_info<sycl::info::device::local_mem_size>();
+    Q.submit([&](sycl::handler &cgh) {
+       auto LocalAcc = sycl::local_accessor<float>(LocalSize + 1, cgh);
+       cgh.parallel_for(sycl::nd_range<1>{32, 32}, [=](sycl::nd_item<1> idx) {
+         LocalAcc[idx.get_global_linear_id()] *= 2;
+       });
+     }).wait();
+  } catch (const std::exception &e) {
+    std::puts(e.what());
+  }
   // CHECK-ZERO: Local memory for kernel exceeds the amount requested using SYCL_PI_CUDA_MAX_LOCAL_MEM_SIZE
   // CHECK-OVERALLOCATE: Excessive allocation of local memory on the device
 }
diff --git a/sycl/test-e2e/Plugin/cuda_queue_priority.cpp b/sycl/test-e2e/Plugin/cuda_queue_priority.cpp
index e51b4b37ce4b4..031dc252c578f 100644
--- a/sycl/test-e2e/Plugin/cuda_queue_priority.cpp
+++ b/sycl/test-e2e/Plugin/cuda_queue_priority.cpp
@@ -1,5 +1,4 @@
-// REQUIRES: gpu, cuda
-
+// REQUIRES: gpu, cuda, cuda_dev_kit
 // RUN: %{build} %cuda_options -o %t.out
 // RUN: %{run} %t.out
 //
diff --git a/sycl/test-e2e/Plugin/interop-cuda-experimental.cpp b/sycl/test-e2e/Plugin/interop-cuda-experimental.cpp
index 3d82b27d5a720..b9229b4863f75 100644
--- a/sycl/test-e2e/Plugin/interop-cuda-experimental.cpp
+++ b/sycl/test-e2e/Plugin/interop-cuda-experimental.cpp
@@ -3,6 +3,9 @@
 // RUN: %{build} %cuda_options -o %t.out
 // RUN: %{run} %t.out
 
+// An issue has been reported in https://github.com/intel/llvm/issues/14116
+// XFAIL: *
+
 #define SYCL_EXT_ONEAPI_BACKEND_CUDA_EXPERIMENTAL 1
 #include <sycl/backend.hpp>
 #include <sycl/detail/core.hpp>
diff --git a/sycl/test-e2e/Plugin/interop-experimental-single-TU-SYCL-CUDA-compilation.cpp b/sycl/test-e2e/Plugin/interop-experimental-single-TU-SYCL-CUDA-compilation.cpp
index 052578d9a20b4..7cce3c854dc87 100644
--- a/sycl/test-e2e/Plugin/interop-experimental-single-TU-SYCL-CUDA-compilation.cpp
+++ b/sycl/test-e2e/Plugin/interop-experimental-single-TU-SYCL-CUDA-compilation.cpp
@@ -2,6 +2,9 @@
 // RUN: %{build} %cuda_options -lcudart -lcuda -x cuda -o %t.out
 // RUN: %{run} %t.out
 
+// An issue has been reported in https://github.com/intel/llvm/issues/14115
+// XFAIL: *
+
 #include <cuda.h>
 #include <sycl/detail/core.hpp>
 
diff --git a/sycl/test-e2e/Plugin/sycl-ls-gpu-default-any.cpp b/sycl/test-e2e/Plugin/sycl-ls-gpu-default-any.cpp
index 373c95869ad02..298e12236e41e 100644
--- a/sycl/test-e2e/Plugin/sycl-ls-gpu-default-any.cpp
+++ b/sycl/test-e2e/Plugin/sycl-ls-gpu-default-any.cpp
@@ -4,9 +4,9 @@
 // RUN: env --unset=SYCL_DEVICE_FILTER  --unset=ONEAPI_DEVICE_SELECTOR sycl-ls --verbose >%t.default.out
 // RUN: FileCheck %s --check-prefixes=CHECK-GPU-BUILTIN,CHECK-GPU-CUSTOM --input-file %t.default.out
 
-// CHECK-GPU-BUILTIN: gpu_selector(){{.*}}gpu, {{.*}}{{Level-Zero|CUDA}}
+// CHECK-GPU-BUILTIN: gpu_selector(){{.*}}gpu, {{.*}}{{Level-Zero|CUDA|OpenCL}}
 // clang-format off
-// CHECK-GPU-CUSTOM: custom_selector(gpu){{.*}}gpu, {{.*}}{{Level-Zero|CUDA}}
+// CHECK-GPU-CUSTOM: custom_selector(gpu){{.*}}gpu, {{.*}}{{Level-Zero|CUDA|OpenCL}}
 // clang-format on
 
 //==--------------------- sycl-ls-gpu-default-any.cpp ----------------------==//
diff --git a/sycl/test-e2e/ProfilingTag/default_queue.cpp b/sycl/test-e2e/ProfilingTag/default_queue.cpp
index 0c18517cb92fa..4277deae3b7df 100644
--- a/sycl/test-e2e/ProfilingTag/default_queue.cpp
+++ b/sycl/test-e2e/ProfilingTag/default_queue.cpp
@@ -8,6 +8,11 @@
 // https://github.com/intel/llvm/issues/12904
 // UNSUPPORTED: hip
 
+// CUDA backend seems to fail sporadically for expected profiling tag time
+// query orderings.
+// https://github.com/intel/llvm/issues/14053
+// UNSUPPORTED: cuda
+
 #include "common.hpp"
 
 int main() {
diff --git a/sycl/test-e2e/ProfilingTag/in_order_profiling_queue.cpp b/sycl/test-e2e/ProfilingTag/in_order_profiling_queue.cpp
index 3dfaca5c20ba0..24fbc9e1b683b 100644
--- a/sycl/test-e2e/ProfilingTag/in_order_profiling_queue.cpp
+++ b/sycl/test-e2e/ProfilingTag/in_order_profiling_queue.cpp
@@ -16,6 +16,11 @@
 // https://github.com/intel/llvm/issues/12904
 // UNSUPPORTED: hip
 
+// CUDA backend seems to fail sporadically for expected profiling tag time
+// query orderings.
+// https://github.com/intel/llvm/issues/14053
+// UNSUPPORTED: cuda
+
 #include "common.hpp"
 
 int main() {
diff --git a/sycl/test-e2e/ProfilingTag/in_order_queue.cpp b/sycl/test-e2e/ProfilingTag/in_order_queue.cpp
index 3b094dea3fadc..f9b579a2f7905 100644
--- a/sycl/test-e2e/ProfilingTag/in_order_queue.cpp
+++ b/sycl/test-e2e/ProfilingTag/in_order_queue.cpp
@@ -8,6 +8,11 @@
 // https://github.com/intel/llvm/issues/12904
 // UNSUPPORTED: hip
 
+// CUDA backend seems to fail sporadically for expected profiling tag time
+// query orderings.
+// https://github.com/intel/llvm/issues/14053
+// UNSUPPORTED: cuda
+
 #include "common.hpp"
 
 int main() {
diff --git a/sycl/test-e2e/ProfilingTag/profiling_queue.cpp b/sycl/test-e2e/ProfilingTag/profiling_queue.cpp
index 7ae3235d20ff0..d0da7612d4ea9 100644
--- a/sycl/test-e2e/ProfilingTag/profiling_queue.cpp
+++ b/sycl/test-e2e/ProfilingTag/profiling_queue.cpp
@@ -19,6 +19,10 @@
 // FPGA emulator seems to return unexpected start time for the fallback barrier.
 // UNSUPPORTED: accelerator
 
+// Flaky on CUDA
+// https://github.com/intel/llvm/issues/14053
+// UNSUPPORTED: cuda
+
 #include "common.hpp"
 
 int main() {
diff --git a/sycl/test-e2e/README.md b/sycl/test-e2e/README.md
index 3767fe226f452..f054c51874ccf 100644
--- a/sycl/test-e2e/README.md
+++ b/sycl/test-e2e/README.md
@@ -233,6 +233,7 @@ unavailable.
  * **llvm-link** - llvm-link tool availability;
  * **fusion**: - Runtime supports kernel fusion;
  * **aspect-\<name\>**: - SYCL aspects supported by a device;
+ * **architecture-\<name\>** - [SYCL architecture](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_device_architecture.asciidoc) of a device (e.g. architecture-intel_gpu_pvc);
 
 ## llvm-lit parameters
 
diff --git a/sycl/test-e2e/Regression/in_order_barrier_profiling.cpp b/sycl/test-e2e/Regression/in_order_barrier_profiling.cpp
new file mode 100644
index 0000000000000..309cdab7b6da7
--- /dev/null
+++ b/sycl/test-e2e/Regression/in_order_barrier_profiling.cpp
@@ -0,0 +1,51 @@
+// RUN: %{build} -o %t.out
+// RUN: %{run} %t.out
+
+//==----------------- in_order_barrier_profiling.cpp -----------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Level Zero adapter has a similar in-order queue barrier optimization that
+// leads to incorrect profiling values.
+// https://github.com/intel/llvm/issues/14186
+// UNSUPPORTED: level_zero || (linux && opencl && gpu-intel-gen12)
+#include <cassert>
+#include <cstdint>
+
+#include <sycl/detail/core.hpp>
+
+#include <sycl/properties/all_properties.hpp>
+
+using namespace sycl;
+
+// Checks that the barrier profiling info is consistent with the previous
+// command, despite the fact that the latter started after the barrier was
+// submitted.
+int main() {
+  queue Q({property::queue::in_order(), property::queue::enable_profiling()});
+
+  buffer<int, 1> Buf(range<1>(1));
+  event KernelEvent;
+  event BarrierEvent;
+  {
+    // Holding a host accessor here is meant to keep the kernel below, which
+    // requests the same buffer, from starting until this scope is left, i.e.
+    // until after the barrier has been submitted.
+    auto HostAcc = Buf.get_access();
+    KernelEvent = Q.submit([&](handler &cgh) {
+      auto Acc = Buf.get_access(cgh);
+      cgh.single_task([=]() {});
+    });
+    BarrierEvent = Q.ext_oneapi_submit_barrier();
+  }
+  uint64_t KernelEnd =
+      KernelEvent.get_profiling_info<info::event_profiling::command_end>();
+  uint64_t BarrierStart =
+      BarrierEvent.get_profiling_info<info::event_profiling::command_start>();
+  // In an in-order queue the barrier must not report a start time that
+  // precedes the end of the kernel submitted before it.
+  assert(KernelEnd <= BarrierStart);
+}
diff --git a/sycl/test-e2e/Regression/nop_event_profiling.cpp b/sycl/test-e2e/Regression/nop_event_profiling.cpp
deleted file mode 100644
index 65f0f065e5f83..0000000000000
--- a/sycl/test-e2e/Regression/nop_event_profiling.cpp
+++ /dev/null
@@ -1,38 +0,0 @@
-// RUN: %{build} -o %t.out
-// RUN: %{run} %t.out
-
-// Test to check that it is possible to get profiling info from the event
-// returned by barrier which turns into NOP.
-
-#include <sycl/detail/core.hpp>
-
-#include <sycl/properties/all_properties.hpp>
-
-int main() {
-  sycl::event start;
-  sycl::event stop;
-  sycl::queue q{sycl::property_list(sycl::property::queue::in_order(),
-                                    sycl::property::queue::enable_profiling())};
-  float elapsed = 0;
-
-  start = q.ext_oneapi_submit_barrier();
-  std::cout << "before parallel_for" << std::endl;
-  q.parallel_for(
-      sycl::nd_range<3>(sycl::range<3>(1, 1, 16) * sycl::range<3>(1, 1, 16),
-                        sycl::range<3>(1, 1, 16)),
-      [=](sycl::nd_item<3> item_ct1) {
-        int d = 123;
-        for (int i = 0; i < 10000; i++) {
-          d = d * i;
-        }
-      });
-  std::cout << "after parallel_for" << std::endl;
-  stop = q.ext_oneapi_submit_barrier();
-  stop.wait_and_throw();
-  elapsed =
-      (stop.get_profiling_info<sycl::info::event_profiling::command_end>() -
-       start.get_profiling_info<sycl::info::event_profiling::command_start>()) /
-      1000000.0f;
-  std::cout << "elapsed:" << elapsed << std::endl;
-  return 0;
-}
diff --git a/sycl/test-e2e/Regression/unoptimized_stream.cpp b/sycl/test-e2e/Regression/unoptimized_stream.cpp
index 82b2b1ad0d789..12c1eea03fd03 100644
--- a/sycl/test-e2e/Regression/unoptimized_stream.cpp
+++ b/sycl/test-e2e/Regression/unoptimized_stream.cpp
@@ -1,6 +1,3 @@
-// Test hangs on AMD with https://github.com/intel/llvm/pull/8412
-// UNSUPPORTED: hip_amd
-
 // RUN: %{build} -O0 -o %t.out
 // RUN: %{run} %t.out
 
diff --git a/sycl/test-e2e/Scheduler/DataMovement.cpp b/sycl/test-e2e/Scheduler/DataMovement.cpp
index afe7790ab7601..c06f11fef7c59 100644
--- a/sycl/test-e2e/Scheduler/DataMovement.cpp
+++ b/sycl/test-e2e/Scheduler/DataMovement.cpp
@@ -71,7 +71,7 @@ int main() {
 
   Queue1.wait_and_throw();
 
-  { auto HostAcc = Buf.get_access<sycl_access_mode::read>(); }
+  { sycl::host_accessor HostAcc(Buf, sycl::read_only); }
 
   Queue2.submit([&](sycl::handler &CGH) {
     auto BufAcc = Buf.get_access<sycl_access_mode::read_write>(CGH);
@@ -80,7 +80,7 @@ int main() {
 
   Queue2.wait_and_throw();
 
-  { auto HostAcc = Buf.get_access<sycl_access_mode::read>(); }
+  { sycl::host_accessor HostAcc(Buf, sycl::read_only); }
 
   return 0;
 }
diff --git a/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp b/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp
index b16745b507631..69f582fb6a1e4 100644
--- a/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp
+++ b/sycl/test-e2e/Scheduler/InOrderQueueDeps.cpp
@@ -30,8 +30,7 @@ int main() {
   int val;
   sycl::buffer<int, 1> Buf{&val, sycl::range<1>(1)};
 
-  sycl::default_selector DeviceSelector;
-  sycl::device Dev = DeviceSelector.select_device();
+  sycl::device Dev(sycl::default_selector_v);
   sycl::context Ctx{Dev};
 
   sycl::queue InOrderQueueA{Ctx, Dev, sycl::property::queue::in_order()};
diff --git a/sycl/test-e2e/Scheduler/MemObjRemapping.cpp b/sycl/test-e2e/Scheduler/MemObjRemapping.cpp
index 9d56822ff4d69..d12c58d7815da 100644
--- a/sycl/test-e2e/Scheduler/MemObjRemapping.cpp
+++ b/sycl/test-e2e/Scheduler/MemObjRemapping.cpp
@@ -33,7 +33,7 @@ int main() {
     // CHECK-NEXT: :
     // CHECK-NEXT: :
     // CHECK-NEXT: : 1
-    auto AccA = BufA.get_access<access::mode::read>();
+    host_accessor AccA(BufA, read_only);
     for (std::size_t I = 0; I < Size; ++I) {
       assert(AccA[I] == I);
     }
@@ -45,13 +45,13 @@ int main() {
     // CHECK-NEXT: :
     // CHECK-NEXT: :
     // CHECK-NEXT: : 3
-    auto AccA = BufA.get_access<access::mode::write>();
+    host_accessor AccA(BufA, write_only);
     for (std::size_t I = 0; I < Size; ++I)
       AccA[I] = 2 * I;
   }
 
   // CHECK-NOT: piEnqueueMemBufferMap
-  auto AccA = BufA.get_access<access::mode::read>();
+  host_accessor AccA(BufA, read_only);
   for (std::size_t I = 0; I < Size; ++I) {
     assert(AccA[I] == 2 * I);
   }
diff --git a/sycl/test-e2e/Scheduler/MultipleDevices.cpp b/sycl/test-e2e/Scheduler/MultipleDevices.cpp
index fb7982c678520..3641e5d58b5ad 100644
--- a/sycl/test-e2e/Scheduler/MultipleDevices.cpp
+++ b/sycl/test-e2e/Scheduler/MultipleDevices.cpp
@@ -93,36 +93,30 @@ int multidevice_test(queue MyQueue1, queue MyQueue2) {
 }
 
 int main() {
-  cpu_selector CPUSelector;
-  gpu_selector GPUSelector;
 
   int Result = -1;
   try {
-    queue MyQueue1(CPUSelector);
-    queue MyQueue2(CPUSelector);
+    queue MyQueue1(cpu_selector_v);
+    queue MyQueue2(cpu_selector_v);
     Result &= multidevice_test(MyQueue1, MyQueue2);
-  } catch (sycl::runtime_error &) {
+  } catch (sycl::exception &) {
     std::cout << "Skipping CPU and CPU" << std::endl;
   }
 
   try {
-    queue MyQueue1(CPUSelector);
-    queue MyQueue2(GPUSelector);
+    queue MyQueue1(cpu_selector_v);
+    queue MyQueue2(gpu_selector_v);
     Result &= multidevice_test(MyQueue1, MyQueue2);
-  } catch (sycl::runtime_error &) {
-    std::cout << "Skipping CPU and GPU" << std::endl;
-  } catch (sycl::compile_program_error &) {
+  } catch (sycl::exception &) {
     std::cout << "Skipping CPU and GPU" << std::endl;
   }
 
   try {
-    queue MyQueue1(GPUSelector);
-    queue MyQueue2(GPUSelector);
+    queue MyQueue1(gpu_selector_v);
+    queue MyQueue2(gpu_selector_v);
     Result &= multidevice_test(MyQueue1, MyQueue2);
-  } catch (sycl::runtime_error &) {
+  } catch (sycl::exception &) {
     std::cout << "Skipping GPU and GPU" << std::endl;
-  } catch (sycl::compile_program_error &) {
-    std::cout << "Skipping CPU and GPU" << std::endl;
   }
 
   return Result;
diff --git a/sycl/test-e2e/USM/P2P/p2p_access.cpp b/sycl/test-e2e/USM/P2P/p2p_access.cpp
index 76e84ad788670..5d9fd6f7065a3 100644
--- a/sycl/test-e2e/USM/P2P/p2p_access.cpp
+++ b/sycl/test-e2e/USM/P2P/p2p_access.cpp
@@ -1,6 +1,6 @@
-// REQUIRES: cuda
-// RUN: %{build} -o %t.out
-// RUN: %if cuda %{ %{run} %t.out %}
+// REQUIRES: cuda || hip || level_zero
+// RUN:  %{build} -o %t.out
+// RUN:  %{run} %t.out
 
 #include <cassert>
 #include <sycl/detail/core.hpp>
@@ -10,17 +10,8 @@ using namespace sycl;
 
 int main() {
 
-  // Note that this code will largely be removed: it is temporary due to the
-  // temporary lack of multiple devices per sycl context in the Nvidia backend.
-  // A portable implementation, using a single gpu platform, should be possible
-  // once the Nvidia context issues are resolved.
-  ////////////////////////////////////////////////////////////////////////
-  std::vector<sycl::device> Devs;
-  for (const auto &plt : sycl::platform::get_platforms()) {
+  auto Devs = platform(gpu_selector_v).get_devices(info::device_type::gpu);
 
-    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
-      Devs.push_back(plt.get_devices()[0]);
-  }
   if (Devs.size() < 2) {
     std::cout << "Cannot test P2P capabilities, at least two devices are "
                  "required, exiting."
diff --git a/sycl/test-e2e/USM/P2P/p2p_atomics.cpp b/sycl/test-e2e/USM/P2P/p2p_atomics.cpp
index 950762aa2cc71..3975573394c0d 100644
--- a/sycl/test-e2e/USM/P2P/p2p_atomics.cpp
+++ b/sycl/test-e2e/USM/P2P/p2p_atomics.cpp
@@ -1,6 +1,6 @@
-// REQUIRES: cuda
-// RUN: %if any-device-is-cuda %{ %{build} -Xsycl-target-backend --cuda-gpu-arch=sm_61 -o %t.out %}
-// RUN: %if cuda %{ %{run} %t.out %}
+// REQUIRES: cuda || hip || level_zero
+// RUN:  %{build} %if any-device-is-cuda %{ -Xsycl-target-backend --cuda-gpu-arch=sm_61 %} -o %t.out
+// RUN:  %{run} %t.out
 
 #include <cassert>
 #include <numeric>
@@ -18,17 +18,8 @@ constexpr size_t N = 512;
 
 int main() {
 
-  // Note that this code will largely be removed: it is temporary due to the
-  // temporary lack of multiple devices per sycl context in the Nvidia backend.
-  // A portable implementation, using a single gpu platform, should be possible
-  // once the Nvidia context issues are resolved.
-  ////////////////////////////////////////////////////////////////////////
-  std::vector<sycl::device> Devs;
-  for (const auto &plt : sycl::platform::get_platforms()) {
+  auto Devs = platform(gpu_selector_v).get_devices(info::device_type::gpu);
 
-    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
-      Devs.push_back(plt.get_devices()[0]);
-  }
   if (Devs.size() < 2) {
     std::cout << "Cannot test P2P capabilities, at least two devices are "
                  "required, exiting."
@@ -51,18 +42,18 @@ int main() {
   // Enables Devs[1] to access Devs[0] memory.
   Devs[1].ext_oneapi_enable_peer_access(Devs[0]);
 
-  std::vector<double> input(N);
+  std::vector<int> input(N);
   std::iota(input.begin(), input.end(), 0);
 
-  double h_sum = 0.;
+  int h_sum = 0.;
   for (const auto &value : input) {
     h_sum += value;
   }
 
-  double *d_sum = malloc_shared<double>(1, Queues[0]);
-  double *d_in = malloc_device<double>(N, Queues[0]);
+  int *d_sum = malloc_shared<int>(1, Queues[0]);
+  int *d_in = malloc_device<int>(N, Queues[0]);
 
-  Queues[0].memcpy(d_in, &input[0], N * sizeof(double));
+  Queues[0].memcpy(d_in, &input[0], N * sizeof(int));
   Queues[0].wait();
 
   range global_range{N};
@@ -70,7 +61,7 @@ int main() {
   *d_sum = 0.;
   Queues[1].submit([&](handler &h) {
     h.parallel_for<class peer_atomic>(global_range, [=](id<1> i) {
-      sycl::atomic_ref<double, sycl::memory_order::relaxed,
+      sycl::atomic_ref<int, sycl::memory_order::relaxed,
                        sycl::memory_scope::system,
                        access::address_space::global_space>(*d_sum) += d_in[i];
     });
diff --git a/sycl/test-e2e/USM/P2P/p2p_copy.cpp b/sycl/test-e2e/USM/P2P/p2p_copy.cpp
index 0ddff1c0aa6ca..724c0b7ad6bb7 100644
--- a/sycl/test-e2e/USM/P2P/p2p_copy.cpp
+++ b/sycl/test-e2e/USM/P2P/p2p_copy.cpp
@@ -1,6 +1,6 @@
-// REQUIRES: cuda
-// RUN: %{build} -o %t.out
-// RUN: %if cuda %{ %{run} %t.out %}
+// REQUIRES: cuda || hip || level_zero
+// RUN:  %{build} -o %t.out
+// RUN:  %{run} %t.out
 
 #include <cassert>
 #include <numeric>
@@ -15,17 +15,8 @@ constexpr int N = 100;
 
 int main() {
 
-  // Note that this code will largely be removed: it is temporary due to the
-  // temporary lack of multiple devices per sycl context in the Nvidia backend.
-  // A portable implementation, using a single gpu platform, should be possible
-  // once the Nvidia context issues are resolved.
-  ////////////////////////////////////////////////////////////////////////
-  std::vector<sycl::device> Devs;
-  for (const auto &plt : sycl::platform::get_platforms()) {
+  auto Devs = platform(gpu_selector_v).get_devices(info::device_type::gpu);
 
-    if (plt.get_backend() == sycl::backend::ext_oneapi_cuda)
-      Devs.push_back(plt.get_devices()[0]);
-  }
   if (Devs.size() < 2) {
     std::cout << "Cannot test P2P capabilities, at least two devices are "
                  "required, exiting."
diff --git a/sycl/test-e2e/USM/dep_events.cpp b/sycl/test-e2e/USM/dep_events.cpp
index 20ee05a309c7a..01f80564bc144 100644
--- a/sycl/test-e2e/USM/dep_events.cpp
+++ b/sycl/test-e2e/USM/dep_events.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 // REQUIRES: aspect-usm_shared_allocations
+// UNSUPPORTED: cuda && windows
 // RUN: %{build} -o %t1.out
 // RUN: %{run} %t1.out
 
diff --git a/sycl/test-e2e/USM/memcpy.cpp b/sycl/test-e2e/USM/memcpy.cpp
index 57739533239f7..fc1029964103b 100644
--- a/sycl/test-e2e/USM/memcpy.cpp
+++ b/sycl/test-e2e/USM/memcpy.cpp
@@ -86,7 +86,7 @@ void check_on_device(queue q, int *arr) {
           [&](handler &cgh) { cgh.memcpy(nullptr, ARR, sizeof(int) * N); });   \
       q.wait_and_throw();                                                      \
       assert(false && "Expected error from copying to nullptr");               \
-    } catch (runtime_error e) {                                                \
+    } catch (exception e) {                                                    \
     }                                                                          \
     /* Copying to nullptr should throw. */                                     \
     q.submit([&](handler &cgh) { cgh.memcpy(nullptr, ARR, 0); });              \
diff --git a/sycl/test-e2e/USM/memset.cpp b/sycl/test-e2e/USM/memset.cpp
index ff0d597e85036..dcd201677be5e 100644
--- a/sycl/test-e2e/USM/memset.cpp
+++ b/sycl/test-e2e/USM/memset.cpp
@@ -127,7 +127,7 @@ int main() {
     q.submit([&](handler &cgh) { cgh.memset(nullptr, 0, N * sizeof(char)); });
     q.wait_and_throw();
     assert(false && "Expected error from writing to nullptr");
-  } catch (runtime_error e) {
+  } catch (exception e) {
   }
 
   // Filling to nullptr is skipped if the number of bytes to fill is 0.
diff --git a/sycl/test-e2e/USM/pointer_query.cpp b/sycl/test-e2e/USM/pointer_query.cpp
index c8c66cd2cfdb3..84282d4859ba8 100644
--- a/sycl/test-e2e/USM/pointer_query.cpp
+++ b/sycl/test-e2e/USM/pointer_query.cpp
@@ -90,7 +90,7 @@ int main() {
   }
   try {
     D = get_pointer_device(array, ctxt);
-  } catch (runtime_error) {
+  } catch (exception) {
     free(array);
     return 0;
   }
diff --git a/sycl/test-e2e/USM/queue_wait.cpp b/sycl/test-e2e/USM/queue_wait.cpp
index 0f0ebbf02304b..0aa3d375b4120 100644
--- a/sycl/test-e2e/USM/queue_wait.cpp
+++ b/sycl/test-e2e/USM/queue_wait.cpp
@@ -36,13 +36,13 @@ int main() {
     Q.memset(nullptr, 42, Size);
     Q.wait_and_throw();
     assert(false && "Expected to have an exception throw instead of assert");
-  } catch (runtime_error e) {
+  } catch (exception e) {
   }
   try {
     Q.memcpy(nullptr, DevArr, Size);
     Q.wait_and_throw();
     assert(false && "Expected to have an exception throw instead of assert");
-  } catch (runtime_error e) {
+  } catch (exception e) {
   }
 
   Q.memset(nullptr, 42, 0);
diff --git a/sycl/test-e2e/UserDefinedReductions/user_defined_reductions.cpp b/sycl/test-e2e/UserDefinedReductions/user_defined_reductions.cpp
index aa44c88689f8d..b67cc16eb9baf 100644
--- a/sycl/test-e2e/UserDefinedReductions/user_defined_reductions.cpp
+++ b/sycl/test-e2e/UserDefinedReductions/user_defined_reductions.cpp
@@ -134,7 +134,7 @@ void test(queue q, InputContainer input, OutputContainer output,
                     it.get_group(), sycl::span(&scratch[0], temp_memory_size));
 
             const InputT *first =
-                in.template get_multi_ptr<access::decorated::no>();
+                in.template get_multi_ptr<access::decorated::no>().get();
             const InputT *last = first + N;
             // check reduce_over_group w/o init
             out[0] = sycl::ext::oneapi::experimental::reduce_over_group(
diff --git a/sycl/test-e2e/UserDefinedReductions/user_defined_reductions_wg_size_larger_than_data_size.cpp b/sycl/test-e2e/UserDefinedReductions/user_defined_reductions_wg_size_larger_than_data_size.cpp
index eeda906c446ed..1bae39ec9fb9a 100644
--- a/sycl/test-e2e/UserDefinedReductions/user_defined_reductions_wg_size_larger_than_data_size.cpp
+++ b/sycl/test-e2e/UserDefinedReductions/user_defined_reductions_wg_size_larger_than_data_size.cpp
@@ -46,9 +46,9 @@ void test(queue q, InputContainer input, OutputContainer output,
       cgh.parallel_for(
           nd_range<1>(workgroup_size, workgroup_size), [=](nd_item<1> it) {
             const InputT *segment_begin =
-                in.template get_multi_ptr<access::decorated::no>();
+                in.template get_multi_ptr<access::decorated::no>().get();
             const InputT *segment_end =
-                in.template get_multi_ptr<access::decorated::no>() +
+                in.template get_multi_ptr<access::decorated::no>().get() +
                 segment_size;
             auto handle =
                 sycl::ext::oneapi::experimental::group_with_scratchpad(
diff --git a/sycl/test-e2e/XPTI/buffer/accessors.cpp b/sycl/test-e2e/XPTI/buffer/accessors.cpp
index 535c6ce795933..65f1b62d6b2b5 100644
--- a/sycl/test-e2e/XPTI/buffer/accessors.cpp
+++ b/sycl/test-e2e/XPTI/buffer/accessors.cpp
@@ -45,12 +45,12 @@ int main() {
       (void)A6;
     });
   });
-  // CHECK: {{[0-9]+}}|Construct accessor|[[BUFFERID]]|[[ACCID7:.*]]|2018|1024|{{.*}}accessors.cpp:[[# @LINE + 1]]:15
-  { auto HA = Buf.get_access<mode::read>(); }
-  // CHECK: {{[0-9]+}}|Construct accessor|[[BUFFERID]]|[[ACCID8:.*]]|2018|1025|{{.*}}accessors.cpp:[[# @LINE + 1]]:15
-  { auto HA = Buf.get_access<mode::write>(); }
-  // CHECK: {{[0-9]+}}|Construct accessor|[[BUFFERID]]|[[ACCID9:.*]]|2018|1026|{{.*}}accessors.cpp:[[# @LINE + 1]]:15
-  { auto HA = Buf.get_access<mode::read_write>(); }
+  // CHECK: {{[0-9]+}}|Construct accessor|[[BUFFERID]]|[[ACCID7:.*]]|2018|1024|{{.*}}accessors.cpp:[[# @LINE + 1]]:25
+  { sycl::host_accessor HA(Buf, sycl::read_only); }
+  // CHECK: {{[0-9]+}}|Construct accessor|[[BUFFERID]]|[[ACCID8:.*]]|2018|1025|{{.*}}accessors.cpp:[[# @LINE + 1]]:25
+  { sycl::host_accessor HA(Buf, sycl::write_only); }
+  // CHECK: {{[0-9]+}}|Construct accessor|[[BUFFERID]]|[[ACCID9:.*]]|2018|1026|{{.*}}accessors.cpp:[[# @LINE + 1]]:25
+  { sycl::host_accessor HA(Buf, sycl::read_write); }
   // CHECK: {{[0-9]+}}|Construct accessor|[[BUFFERID]]|[[ACCID10:.*]]|2018|1027|{{.*}}accessors.cpp:[[# @LINE + 1]]:15
   { auto HA = Buf.get_access<mode::discard_write>(); }
   // CHECK: {{[0-9]+}}|Construct accessor|[[BUFFERID]]|[[ACCID11:.*]]|2018|1028|{{.*}}accessors.cpp:[[# @LINE + 1]]:15
diff --git a/sycl/test-e2e/XPTI/buffer/in_cycle.cpp b/sycl/test-e2e/XPTI/buffer/in_cycle.cpp
index 75fd3b3b96e85..7dbd84d336289 100644
--- a/sycl/test-e2e/XPTI/buffer/in_cycle.cpp
+++ b/sycl/test-e2e/XPTI/buffer/in_cycle.cpp
@@ -32,7 +32,7 @@ bool func(sycl::queue &Queue, int depth = 0) {
   // Get read only access to the buffer on the host.
   // This introduces an implicit barrier which blocks execution until the
   // command group above completes.
-  const auto HostAccessor = Buffer.get_access<sycl::access::mode::read>();
+  const sycl::host_accessor HostAccessor(Buffer, sycl::read_only);
 
   // Check the results.
   for (size_t I = 0; I < Buffer.size(); ++I) {
diff --git a/sycl/test-e2e/XPTI/buffer/multiple_buffers.cpp b/sycl/test-e2e/XPTI/buffer/multiple_buffers.cpp
index 874f81c11b481..247a5182413c8 100644
--- a/sycl/test-e2e/XPTI/buffer/multiple_buffers.cpp
+++ b/sycl/test-e2e/XPTI/buffer/multiple_buffers.cpp
@@ -35,8 +35,8 @@ int main() {
     });
   });
 
-  const auto HostAccessor1 = Buffer1.get_access<sycl::access::mode::read>();
-  const auto HostAccessor2 = Buffer2.get_access<sycl::access::mode::read>();
+  const sycl::host_accessor HostAccessor1(Buffer1, sycl::read_only);
+  const sycl::host_accessor HostAccessor2(Buffer2, sycl::read_only);
 
   // Check the results.
   for (size_t I = 0; I < Buffer1.size(); ++I) {
diff --git a/sycl/test-e2e/XPTI/buffer/multiple_queues.cpp b/sycl/test-e2e/XPTI/buffer/multiple_queues.cpp
index 4593ca88ae0ca..659a982669c6e 100644
--- a/sycl/test-e2e/XPTI/buffer/multiple_queues.cpp
+++ b/sycl/test-e2e/XPTI/buffer/multiple_queues.cpp
@@ -15,7 +15,10 @@
 int main() {
   bool MismatchFound = false;
 
-  sycl::device Device{sycl::ext::oneapi::filter_selector{"cpu,accelerator"}};
+  auto selector_v = [](const sycl::device &d) {
+    return std::max(cpu_selector_v(d), accelerator_selector_v(d));
+  };
+  sycl::device Device{selector_v};
   auto Devices = Device.create_sub_devices<
       sycl::info::partition_property::partition_equally>(2);
 
diff --git a/sycl/test-e2e/XPTI/buffer/recursion.cpp b/sycl/test-e2e/XPTI/buffer/recursion.cpp
index a8a4f4ab65a2c..00d90390311a1 100644
--- a/sycl/test-e2e/XPTI/buffer/recursion.cpp
+++ b/sycl/test-e2e/XPTI/buffer/recursion.cpp
@@ -32,7 +32,7 @@ bool func(sycl::queue &Queue, int depth = 0) {
   // Get read only access to the buffer on the host.
   // This introduces an implicit barrier which blocks execution until the
   // command group above completes.
-  const auto HostAccessor = Buffer.get_access<sycl::access::mode::read>();
+  const sycl::host_accessor HostAccessor(Buffer, sycl::read_only);
 
   // Check the results.
   for (size_t I = 0; I < Buffer.size(); ++I) {
diff --git a/sycl/test-e2e/XPTI/buffer/sub_buffer.cpp b/sycl/test-e2e/XPTI/buffer/sub_buffer.cpp
index 7bc46f33baa8e..ea45c2fab6aa9 100644
--- a/sycl/test-e2e/XPTI/buffer/sub_buffer.cpp
+++ b/sycl/test-e2e/XPTI/buffer/sub_buffer.cpp
@@ -33,8 +33,8 @@ int main() {
             Accessor1[WIid] = static_cast<int>(WIid.get(0));
           });
     });
-    // CHECK: {{[0-9]+}}|Construct accessor|[[USERID1]]|[[ACCID2:.*]]|2018|1024|{{.*}}sub_buffer.cpp:[[# @LINE + 1]]:22
-    auto Accessor1 = Buffer1.get_access<sycl::access::mode::read>();
+    // CHECK: {{[0-9]+}}|Construct accessor|[[USERID1]]|[[ACCID2:.*]]|2018|1024|{{.*}}sub_buffer.cpp:[[# @LINE + 1]]:25
+    sycl::host_accessor Accessor1(Buffer1, sycl::read_only);
     for (size_t I = 32; I < 64; ++I) {
       if (Accessor1[I] != I - 32) {
         std::cout << "The result is incorrect for element: " << I
diff --git a/sycl/test-e2e/XPTI/kernel/basic.cpp b/sycl/test-e2e/XPTI/kernel/basic.cpp
index f4066ae6245c3..59bafe844625e 100644
--- a/sycl/test-e2e/XPTI/kernel/basic.cpp
+++ b/sycl/test-e2e/XPTI/kernel/basic.cpp
@@ -82,8 +82,8 @@ int main() {
   // CHECK: Wait end|{{.*}}.cpp:[[# @LINE + 1]]:3
   Queue.wait();
 
-  // CHECK: {{[0-9]+}}|Construct accessor|[[BUFFERID]]|[[ACCID3:.*]]|2018|1024|{{.*}}.cpp:[[# @LINE + 1]]:15
-  { auto HA = Buf.get_access<mode::read>(); }
+  // CHECK: {{[0-9]+}}|Construct accessor|[[BUFFERID]]|[[ACCID3:.*]]|2018|1024|{{.*}}.cpp:[[# @LINE + 1]]:25
+  { sycl::host_accessor HA(Buf, sycl::read_only); }
 
   Queue.submit([&](sycl::handler &cgh) {
     // CHECK: {{[0-9]+}}|Construct accessor|[[BUFFERID]]|[[ACCID4:.+]]|2014|1026|{{.*}}.cpp:[[# @LINE + 1]]:16
diff --git a/sycl/test-e2e/bindless_images/array/read_write_unsampled_array.cpp b/sycl/test-e2e/bindless_images/array/read_write_unsampled_array.cpp
index f6a7726e84c7d..668063963cf92 100644
--- a/sycl/test-e2e/bindless_images/array/read_write_unsampled_array.cpp
+++ b/sycl/test-e2e/bindless_images/array/read_write_unsampled_array.cpp
@@ -1,3 +1,4 @@
+// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/bindless_images/bindless_helpers.hpp b/sycl/test-e2e/bindless_images/bindless_helpers.hpp
index f8668d054e0ae..618c29fb06745 100644
--- a/sycl/test-e2e/bindless_images/bindless_helpers.hpp
+++ b/sycl/test-e2e/bindless_images/bindless_helpers.hpp
@@ -53,6 +53,8 @@ static void fill_rand(std::vector<sycl::vec<DType, NChannels>> &v,
       return std::uniform_real_distribution<float>(0.0, 100.0);
     } else if constexpr (std::is_floating_point_v<DType>) {
       return std::uniform_real_distribution<DType>(0.0, 100.0);
+    } else if constexpr (sizeof(DType) == 1) {
+      return std::uniform_int_distribution<unsigned short>(0, 100);
     } else {
       return std::uniform_int_distribution<DType>(0, 100);
     }
@@ -61,7 +63,7 @@ static void fill_rand(std::vector<sycl::vec<DType, NChannels>> &v,
     sycl::vec<DType, NChannels> temp;
 
     for (int j = 0; j < NChannels; j++) {
-      temp[j] = distribution(generator);
+      temp[j] = static_cast<DType>(distribution(generator));
     }
 
     v[i] = temp;
diff --git a/sycl/test-e2e/bindless_images/cubemap/cubemap_sampled.cpp b/sycl/test-e2e/bindless_images/cubemap/cubemap_sampled.cpp
index 63d213586e4f5..71f9253b239a2 100644
--- a/sycl/test-e2e/bindless_images/cubemap/cubemap_sampled.cpp
+++ b/sycl/test-e2e/bindless_images/cubemap/cubemap_sampled.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: linux,cuda,aspect-ext_oneapi_cubemap
+// REQUIRES: cuda,aspect-ext_oneapi_cubemap
 // REQUIRES: aspect-ext_oneapi_cubemap_seamless_filtering
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/bindless_images/cubemap/cubemap_unsampled.cpp b/sycl/test-e2e/bindless_images/cubemap/cubemap_unsampled.cpp
index 383440d7835a2..413045190e54c 100644
--- a/sycl/test-e2e/bindless_images/cubemap/cubemap_unsampled.cpp
+++ b/sycl/test-e2e/bindless_images/cubemap/cubemap_unsampled.cpp
@@ -1,4 +1,4 @@
-// REQUIRES: linux,cuda,aspect-ext_oneapi_cubemap
+// REQUIRES: cuda,aspect-ext_oneapi_cubemap
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
 // RUN: %t.out
diff --git a/sycl/test-e2e/bindless_images/device_to_device_copy.cpp b/sycl/test-e2e/bindless_images/device_to_device_copy.cpp
index 3ca37772e4f5e..4a9263e44a13e 100644
--- a/sycl/test-e2e/bindless_images/device_to_device_copy.cpp
+++ b/sycl/test-e2e/bindless_images/device_to_device_copy.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.cpp b/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.cpp
new file mode 100644
index 0000000000000..eacf3a40ebfd5
--- /dev/null
+++ b/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.cpp
@@ -0,0 +1,479 @@
+// REQUIRES: cuda
+// REQUIRES: windows
+
+// RUN: %{build} -l d3d12 -l dxgi -l dxguid -o %t.out
+// RUN: %t.out
+
+#pragma clang diagnostic ignored "-Waddress-of-temporary"
+
+#include "read_write_unsampled.h"
+
+void DX12InteropTest::initDX12Device() {
+  // Create the DXGI factory (no factory flags) used to enumerate adapters.
+  ThrowIfFailed(CreateDXGIFactory2(0 /* dxgiFactoryFlags */,
+                                   IID_PPV_ARGS(&m_dx12Factory)));
+
+  // Pick a hardware adapter; see getDX12Adapter for the selection criteria.
+  getDX12Adapter(m_dx12Factory.Get(), &m_dx12HardwareAdapter);
+
+  // Create the D3D12 device on that adapter, requiring feature level 12.0.
+  ThrowIfFailed(D3D12CreateDevice(m_dx12HardwareAdapter.Get(),
+                                  D3D_FEATURE_LEVEL_12_0,
+                                  IID_PPV_ARGS(&m_dx12Device)));
+}
+
+void DX12InteropTest::initDX12CommandList() {
+  // Describe and create a direct command queue (remaining fields are zero).
+  D3D12_COMMAND_QUEUE_DESC queueDesc = {D3D12_COMMAND_LIST_TYPE_DIRECT, 0,
+                                        D3D12_COMMAND_QUEUE_FLAG_NONE, 0};
+  ThrowIfFailed(m_dx12Device->CreateCommandQueue(
+      &queueDesc, IID_PPV_ARGS(&m_dx12CommandQueue)));
+
+  // Create the command allocator that backs the direct command list.
+  ThrowIfFailed(m_dx12Device->CreateCommandAllocator(
+      D3D12_COMMAND_LIST_TYPE_DIRECT, IID_PPV_ARGS(&m_dx12CommandAllocator)));
+
+  // Create the command list on that allocator with no initial pipeline state.
+  ThrowIfFailed(m_dx12Device->CreateCommandList(
+      0, D3D12_COMMAND_LIST_TYPE_DIRECT, m_dx12CommandAllocator.Get(), nullptr,
+      IID_PPV_ARGS(&m_dx12CommandList)));
+}
+
+void DX12InteropTest::initDX12Resources() {
+
+  // Heap properties for the texture: default heap type, node masks set to 1.
+  D3D12_HEAP_PROPERTIES defaultHeapProperties = {};
+  defaultHeapProperties.Type = D3D12_HEAP_TYPE_DEFAULT;
+  defaultHeapProperties.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
+  defaultHeapProperties.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
+  defaultHeapProperties.CreationNodeMask = 1;
+  defaultHeapProperties.VisibleNodeMask = 1;
+
+  // 1D, R32_UINT, m_width texels. NOTE(review): confirm MipLevels = 0 is meant.
+  D3D12_RESOURCE_DESC textureResourceDesc = {};
+  textureResourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_TEXTURE1D;
+  textureResourceDesc.Alignment = 0;
+  textureResourceDesc.Width = m_width;
+  textureResourceDesc.Height = 1;
+  textureResourceDesc.DepthOrArraySize = 1;
+  textureResourceDesc.MipLevels = 0;
+  textureResourceDesc.Format = DXGI_FORMAT_R32_UINT;
+  textureResourceDesc.SampleDesc = DXGI_SAMPLE_DESC{1, 0};
+  textureResourceDesc.Layout = D3D12_TEXTURE_LAYOUT_UNKNOWN;
+  textureResourceDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
+
+  // Create the texture on a shareable heap, initially in the COPY_DEST state.
+  ThrowIfFailed(m_dx12Device->CreateCommittedResource(
+      &defaultHeapProperties, D3D12_HEAP_FLAG_SHARED, &textureResourceDesc,
+      D3D12_RESOURCE_STATE_COPY_DEST, nullptr, IID_PPV_ARGS(&m_dx12Texture)));
+
+  // Create the NT handle through which the texture memory is imported to SYCL.
+  ThrowIfFailed(m_dx12Device->CreateSharedHandle(m_dx12Texture.Get(), nullptr,
+                                                 GENERIC_ALL, nullptr,
+                                                 &m_sharedMemoryHandle));
+
+  D3D12_RESOURCE_ALLOCATION_INFO textureAllocationInfo;
+  textureAllocationInfo =
+      m_dx12Device->GetResourceAllocationInfo(1, 1, &textureResourceDesc);
+  size_t allocationSize = textureAllocationInfo.SizeInBytes;
+
+  // Import the texture into SYCL using the allocation size D3D12 reported.
+  importDX12SharedMemoryHandle(allocationSize);
+
+  // Create a shareable fence whose initial value is m_sharedFenceValue.
+  ThrowIfFailed(m_dx12Device->CreateFence(
+      m_sharedFenceValue, D3D12_FENCE_FLAG_SHARED, IID_PPV_ARGS(&m_dx12Fence)));
+
+  ThrowIfFailed(m_dx12Device->CreateSharedHandle(m_dx12Fence.Get(), nullptr,
+                                                 GENERIC_ALL, nullptr,
+                                                 &m_sharedSemaphoreHandle));
+
+  // Import the fence into SYCL as an external semaphore.
+  importDX12SharedSemaphoreHandle();
+
+  // Auto-reset, initially non-signalled event that waitDX12Fence blocks on.
+  m_dx12FenceEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr);
+  if (m_dx12FenceEvent == nullptr) {
+    ThrowIfFailed(HRESULT_FROM_WIN32(GetLastError()));
+  }
+
+  populateDX12Texture();
+}
+
+void DX12InteropTest::importDX12SharedMemoryHandle(size_t allocationSize) {
+  syclexp::external_mem_descriptor<syclexp::resource_win32_handle> extMemDesc{
+      m_sharedMemoryHandle,
+      syclexp::external_mem_handle_type::win32_nt_dx12_resource,
+      allocationSize};
+  // Import the DX12 allocation described above as SYCL external memory.
+  m_syclInteropMemHandle =
+      syclexp::import_external_memory(extMemDesc, m_syclQueue);
+  // Map the imported memory as image memory using the test's image descriptor.
+  m_syclImageMemHandle = syclexp::map_external_image_memory(
+      m_syclInteropMemHandle, m_syclImageDesc, m_syclQueue);
+  // Create the image handle from the mapped image memory.
+  m_syclImageHandle =
+      syclexp::create_image(m_syclImageMemHandle, m_syclImageDesc, m_syclQueue);
+}
+
+void DX12InteropTest::importDX12SharedSemaphoreHandle() {
+  syclexp::external_semaphore_descriptor<syclexp::resource_win32_handle>
+      extSemDesc{m_sharedSemaphoreHandle,
+                 syclexp::external_semaphore_handle_type::win32_nt_dx12_fence};
+  // Import the shared DX12 fence handle as a SYCL external semaphore.
+  m_syclInteropSemaphoreHandle =
+      syclexp::import_external_semaphore(extSemDesc, m_syclQueue);
+}
+
+void DX12InteropTest::callSYCLKernel() {
+  // Doubles every texel of the shared texture from SYCL, fenced on both sides.
+  // First wait on the imported semaphore, which `populateDX12Texture`
+  // signalled with the current fence value after its upload.
+  m_syclQueue.ext_oneapi_wait_external_semaphore(m_syclInteropSemaphoreHandle,
+                                                 m_sharedFenceValue);
+
+  // We can't capture the image handle through `this` in the lambda.
+  // If we do the kernel will crash.
+  auto imgHandle = m_syclImageHandle;
+
+  // Submit our SYCL kernel. All we do is double the value of each pixel in the
+  // texture.
+  try {
+    m_syclQueue.submit([&](sycl::handler &cgh) {
+      cgh.parallel_for<class TestKernel>(
+          sycl::nd_range<1>{{m_width}, {1}}, [=](sycl::nd_item<1> it) {
+            size_t dim0 = it.get_global_id(0);
+
+            uint32_t px = syclexp::fetch_image<uint32_t>(imgHandle, int(dim0));
+
+            px *= 2;
+
+            syclexp::write_image(imgHandle, int(dim0), px);
+          });
+    });
+  } catch (const sycl::exception &e) {
+    std::cerr << "\tKernel submission failed! " << e.what() << std::endl;
+    exit(-1);
+  } catch (...) {
+    std::cerr << "\tKernel submission failed!" << std::endl;
+    exit(-1);
+  }
+
+  // Increment the fence value so the signal below is a new value to wait on.
+  m_sharedFenceValue++;
+
+  // Signal the imported semaphore with the incremented value.
+  m_syclQueue.submit([&](sycl::handler &cgh) {
+    cgh.ext_oneapi_signal_external_semaphore(m_syclInteropSemaphoreHandle,
+                                             m_sharedFenceValue);
+  });
+
+  // Use DX12 to wait for the semaphore signalled by SYCL above.
+  waitDX12Fence();
+}
+
+void DX12InteropTest::populateDX12Texture() {
+  // Fill the shared texture with the values 0..m_width-1 through a staging
+  // buffer, then signal the shared fence without waiting for it.
+  std::vector<uint32_t> uploadData(m_width);
+  for (uint32_t i = 0; i < m_width; ++i) {
+    uploadData[i] = i;
+  }
+
+  // Get required staging buffer size.
+  uint64_t stagingBufferSize = 0;
+  const D3D12_RESOURCE_DESC textureDesc = m_dx12Texture->GetDesc();
+  m_dx12Device->GetCopyableFootprints(&textureDesc, 0, 1, 0, nullptr, nullptr,
+                                      nullptr, &stagingBufferSize);
+
+  // Define upload heap properties.
+  D3D12_HEAP_PROPERTIES uploadHeapProperties = {};
+  uploadHeapProperties.Type = D3D12_HEAP_TYPE_UPLOAD;
+  uploadHeapProperties.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
+  uploadHeapProperties.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
+  uploadHeapProperties.CreationNodeMask = 1;
+  uploadHeapProperties.VisibleNodeMask = 1;
+
+  // Define upload buffer resource descriptor.
+  D3D12_RESOURCE_DESC uploadBufferResourceDesc = {};
+  uploadBufferResourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+  uploadBufferResourceDesc.Alignment = 0;
+  uploadBufferResourceDesc.Width = stagingBufferSize;
+  uploadBufferResourceDesc.Height = 1;
+  uploadBufferResourceDesc.DepthOrArraySize = 1;
+  uploadBufferResourceDesc.MipLevels = 1;
+  uploadBufferResourceDesc.Format = DXGI_FORMAT_UNKNOWN;
+  uploadBufferResourceDesc.SampleDesc = DXGI_SAMPLE_DESC{1, 0};
+  uploadBufferResourceDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+  uploadBufferResourceDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
+
+  // Allocate the staging upload buffer.
+  ComPtr<ID3D12Resource> stagingBuffer;
+  ThrowIfFailed(m_dx12Device->CreateCommittedResource(
+      &uploadHeapProperties, D3D12_HEAP_FLAG_NONE, &uploadBufferResourceDesc,
+      D3D12_RESOURCE_STATE_GENERIC_READ, nullptr,
+      IID_PPV_ARGS(&stagingBuffer)));
+
+  // Map the staging buffer; the CPU only writes, so pass an empty read range.
+  D3D12_RANGE noReadRange{0, 0};
+  uint32_t *pStagingBufferData{};
+  ThrowIfFailed(stagingBuffer->Map(
+      0, &noReadRange, reinterpret_cast<void **>(&pStagingBufferData)));
+
+  // Populate the staging buffer with our upload data.
+  for (uint32_t i = 0; i < m_width; ++i) {
+    pStagingBufferData[i] = uploadData[i];
+  }
+
+  // Unmap the staging buffer, reporting the byte range the CPU wrote.
+  D3D12_RANGE writtenRange{0, m_width * sizeof(uint32_t)};
+  stagingBuffer->Unmap(0, &writtenRange);
+
+  // Reset the command list (result unchecked; it may already be recording).
+  m_dx12CommandList->Reset(m_dx12CommandAllocator.Get(), nullptr);
+
+  // Set the copy source and destination footprint/locations.
+  D3D12_PLACED_SUBRESOURCE_FOOTPRINT bufferFootprint = {};
+  bufferFootprint.Footprint.Width = m_width;
+  bufferFootprint.Footprint.Height = 1;
+  bufferFootprint.Footprint.Depth = 1;
+  bufferFootprint.Footprint.RowPitch = static_cast<uint32_t>(stagingBufferSize);
+  bufferFootprint.Footprint.Format = DXGI_FORMAT_R32_UINT;
+
+  D3D12_TEXTURE_COPY_LOCATION copyDest = {};
+  copyDest.pResource = m_dx12Texture.Get();
+  copyDest.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+  copyDest.SubresourceIndex = 0;
+
+  D3D12_TEXTURE_COPY_LOCATION copySrc = {};
+  copySrc.pResource = stagingBuffer.Get();
+  copySrc.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
+  copySrc.PlacedFootprint = bufferFootprint;
+
+  // Copy the upload buffer data to our texture.
+  m_dx12CommandList->CopyTextureRegion(&copyDest, 0, 0, 0, &copySrc, nullptr);
+
+  D3D12_RESOURCE_BARRIER transitionResourceBarrier = {};
+  transitionResourceBarrier.Type = D3D12_RESOURCE_BARRIER_TYPE_TRANSITION;
+  transitionResourceBarrier.Flags = D3D12_RESOURCE_BARRIER_FLAG_NONE;
+  transitionResourceBarrier.Transition.pResource = m_dx12Texture.Get();
+  transitionResourceBarrier.Transition.StateBefore =
+      D3D12_RESOURCE_STATE_COPY_DEST;
+  transitionResourceBarrier.Transition.StateAfter =
+      D3D12_RESOURCE_STATE_COPY_SOURCE;
+  transitionResourceBarrier.Transition.Subresource =
+      D3D12_RESOURCE_BARRIER_ALL_SUBRESOURCES;
+
+  m_dx12CommandList->ResourceBarrier(1, &transitionResourceBarrier);
+
+  // Execute the command list.
+  ThrowIfFailed(m_dx12CommandList->Close());
+  ID3D12CommandList *ppCommandLists[] = {m_dx12CommandList.Get()};
+  m_dx12CommandQueue->ExecuteCommandLists(_countof(ppCommandLists),
+                                          ppCommandLists);
+  ThrowIfFailed(
+      m_dx12CommandQueue->Signal(m_dx12Fence.Get(), m_sharedFenceValue));
+
+  // Don't wait for the fence here. We will use the SYCL API to wait for this
+  // fence in `callSYCLKernel`.
+}
+
+bool DX12InteropTest::validateOutput() {
+  // Read the texture back through DX12; returns true if every texel i == 2 * i.
+  // Start by resetting the command list for the readback copy.
+  ThrowIfFailed(
+      m_dx12CommandList->Reset(m_dx12CommandAllocator.Get(), nullptr));
+
+  // Get intermediate readback buffer size.
+  uint64_t readbackBufferSize = 0;
+  const D3D12_RESOURCE_DESC textureDesc = m_dx12Texture->GetDesc();
+  m_dx12Device->GetCopyableFootprints(&textureDesc, 0, 1, 0, nullptr, nullptr,
+                                      nullptr, &readbackBufferSize);
+
+  // Define readback heap properties.
+  D3D12_HEAP_PROPERTIES readbackHeapProperties = {};
+  readbackHeapProperties.Type = D3D12_HEAP_TYPE_READBACK;
+  readbackHeapProperties.CPUPageProperty = D3D12_CPU_PAGE_PROPERTY_UNKNOWN;
+  readbackHeapProperties.MemoryPoolPreference = D3D12_MEMORY_POOL_UNKNOWN;
+  readbackHeapProperties.CreationNodeMask = 1;
+  readbackHeapProperties.VisibleNodeMask = 1;
+
+  // Define readback buffer resource descriptor.
+  D3D12_RESOURCE_DESC readbackBufferResourceDesc = {};
+  readbackBufferResourceDesc.Dimension = D3D12_RESOURCE_DIMENSION_BUFFER;
+  readbackBufferResourceDesc.Alignment = 0;
+  readbackBufferResourceDesc.Width = readbackBufferSize;
+  readbackBufferResourceDesc.Height = 1;
+  readbackBufferResourceDesc.DepthOrArraySize = 1;
+  readbackBufferResourceDesc.MipLevels = 1;
+  readbackBufferResourceDesc.Format = DXGI_FORMAT_UNKNOWN;
+  readbackBufferResourceDesc.SampleDesc = DXGI_SAMPLE_DESC{1, 0};
+  readbackBufferResourceDesc.Layout = D3D12_TEXTURE_LAYOUT_ROW_MAJOR;
+  readbackBufferResourceDesc.Flags = D3D12_RESOURCE_FLAG_NONE;
+
+  // Create the readback buffer.
+  ComPtr<ID3D12Resource> readbackBuffer;
+  ThrowIfFailed(m_dx12Device->CreateCommittedResource(
+      &readbackHeapProperties, D3D12_HEAP_FLAG_NONE,
+      &readbackBufferResourceDesc, D3D12_RESOURCE_STATE_COPY_DEST, nullptr,
+      IID_PPV_ARGS(&readbackBuffer)));
+
+  // Set the copy source and destination footprint/locations.
+  D3D12_PLACED_SUBRESOURCE_FOOTPRINT bufferFootprint = {};
+  bufferFootprint.Footprint.Width = m_width;
+  bufferFootprint.Footprint.Height = 1;
+  bufferFootprint.Footprint.Depth = 1;
+  bufferFootprint.Footprint.RowPitch =
+      static_cast<uint32_t>(readbackBufferSize);
+  bufferFootprint.Footprint.Format = DXGI_FORMAT_R32_UINT;
+
+  D3D12_TEXTURE_COPY_LOCATION copyDest = {};
+  copyDest.pResource = readbackBuffer.Get();
+  copyDest.Type = D3D12_TEXTURE_COPY_TYPE_PLACED_FOOTPRINT;
+  copyDest.PlacedFootprint = bufferFootprint;
+
+  D3D12_TEXTURE_COPY_LOCATION copySrc = {};
+  copySrc.pResource = m_dx12Texture.Get();
+  copySrc.Type = D3D12_TEXTURE_COPY_TYPE_SUBRESOURCE_INDEX;
+  copySrc.SubresourceIndex = 0;
+
+  // Copy the texture to our readback buffer.
+  m_dx12CommandList->CopyTextureRegion(&copyDest, 0, 0, 0, &copySrc, nullptr);
+
+  // Execute the command list.
+  ThrowIfFailed(m_dx12CommandList->Close());
+  ID3D12CommandList *ppCommandLists[] = {m_dx12CommandList.Get()};
+  m_dx12CommandQueue->ExecuteCommandLists(_countof(ppCommandLists),
+                                          ppCommandLists);
+  ThrowIfFailed(
+      m_dx12CommandQueue->Signal(m_dx12Fence.Get(), m_sharedFenceValue));
+
+  // Wait for the command list to finish execution and increment the fence
+  // value.
+  waitDX12Fence();
+  m_sharedFenceValue++;
+
+  // Map the readback buffer; the read range is in bytes, not in texels.
+  D3D12_RANGE readbackBufferRange{0, m_width * sizeof(uint32_t)};
+  uint32_t *pReadbackBufferData{};
+  ThrowIfFailed(
+      readbackBuffer->Map(0, &readbackBufferRange,
+                          reinterpret_cast<void **>(&pReadbackBufferData)));
+
+  // Wait for the GPU. Sometimes the Mapped memory isn't immediately visible to
+  // the host
+  ThrowIfFailed(
+      m_dx12CommandQueue->Signal(m_dx12Fence.Get(), m_sharedFenceValue));
+  waitDX12Fence();
+  m_sharedFenceValue++;
+
+  // Validate the texels; without VERBOSE_PRINT stop at the first mismatch.
+  bool validated = true;
+  for (uint32_t i = 0; i < m_width; ++i) {
+    bool mismatch = false;
+    const uint32_t expected = i * 2;
+    const uint32_t actual = pReadbackBufferData[i];
+
+    if (actual != expected) {
+      mismatch = true;
+      validated = false;
+    }
+
+    if (mismatch) {
+#ifdef VERBOSE_PRINT
+      std::cout << "Result mismatch at " << i << "! Expected: " << expected
+                << ", Actual: " << actual << std::endl;
+#else
+      break;
+#endif
+    }
+  }
+
+  // Unmap the readback buffer; the empty range says the CPU wrote nothing.
+  D3D12_RANGE emptyRange{0, 0};
+  readbackBuffer->Unmap(0, &emptyRange);
+
+  // Signal the fence to wait upon before we can clean up DX12 later.
+  ThrowIfFailed(
+      m_dx12CommandQueue->Signal(m_dx12Fence.Get(), m_sharedFenceValue));
+
+  return validated;
+}
+
+void DX12InteropTest::waitDX12Fence(DWORD timeoutMilliseconds) {
+  // The fence has already reached the shared value: nothing to wait for.
+  const bool alreadyReached =
+      m_dx12Fence->GetCompletedValue() >= m_sharedFenceValue;
+  if (alreadyReached)
+    return;
+  // Have the fence set our event at the shared value, then block on the event.
+  ThrowIfFailed(m_dx12Fence->SetEventOnCompletion(m_sharedFenceValue,
+                                                  m_dx12FenceEvent));
+  WaitForSingleObject(m_dx12FenceEvent, timeoutMilliseconds);
+}
+
+void DX12InteropTest::cleanupDX12() {
+  // Wait until the fence reaches the current shared fence value.
+  waitDX12Fence();
+
+  // Close the raw Win32 handles: semaphore, memory, then the fence event.
+  // ComPtr members are released automatically and need no work here.
+  const HANDLE openedHandles[] = {m_sharedSemaphoreHandle, m_sharedMemoryHandle,
+                                  m_dx12FenceEvent};
+  for (HANDLE handle : openedHandles)
+    CloseHandle(handle);
+}
+
+// Finds the first non-software adapter that supports Direct3D 12 at feature
+// level 12.0 and returns it, with ownership, through `ppAdapter`.
+// `*ppAdapter` is left null when enumeration ends without a match.
+void DX12InteropTest::getDX12Adapter(IDXGIFactory2 *pFactory,
+                                     IDXGIAdapter1 **ppAdapter) {
+  *ppAdapter = nullptr;
+
+  // Query each index afresh so the loop stops at DXGI_ERROR_NOT_FOUND (or any
+  // other enumeration failure) instead of reusing the first call's result.
+  for (uint32_t adapterIndex = 0;; ++adapterIndex) {
+    ComPtr<IDXGIAdapter1> adapter;
+    if (FAILED(pFactory->EnumAdapters1(adapterIndex, &adapter))) {
+      return;
+    }
+
+    DXGI_ADAPTER_DESC1 desc;
+    if (FAILED(adapter->GetDesc1(&desc)) ||
+        (desc.Flags & DXGI_ADAPTER_FLAG_SOFTWARE)) {
+      // We don't want a software adapter; `continue` still advances the index.
+      continue;
+    }
+
+    // Check to see if the adapter supports Direct3D 12, but don't create the
+    // actual device yet.
+    if (SUCCEEDED(D3D12CreateDevice(adapter.Get(), D3D_FEATURE_LEVEL_12_0,
+                                    _uuidof(ID3D12Device), nullptr))) {
+      // Hand the matching adapter's reference to the caller.
+      *ppAdapter = adapter.Detach();
+      return;
+    }
+  }
+}
+
+int main() {
+  // Run the full interop round trip on a 1024-wide 1D texture: DX12 setup,
+  // SYCL import and kernel, then DX12 readback and validation.
+  DX12InteropTest test(1024);
+  test.initDX12Device();
+  test.initDX12CommandList();
+  test.initDX12Resources();
+  test.callSYCLKernel();
+  const bool passed = test.validateOutput();
+  test.cleanupDX12();
+
+  // Report the outcome; the exit code is non-zero on a validation failure.
+  if (!passed) {
+    std::cerr << "Test failed!" << std::endl;
+    return 1;
+  }
+
+  std::cout << "Test passed!" << std::endl;
+  return 0;
+}
diff --git a/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.h b/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.h
new file mode 100644
index 0000000000000..fab2512d3bbdb
--- /dev/null
+++ b/sycl/test-e2e/bindless_images/dx12_interop/read_write_unsampled.h
@@ -0,0 +1,94 @@
+
+#pragma once
+
+// Reduce the size of Win32 header files
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+
+// Define NOMINMAX to enable compilation on Windows
+#define NOMINMAX
+
+#include <windows.h>
+
+#include <d3d12.h>
+#include <dxgi1_4.h>
+
+#include <iostream>
+#include <string>
+#include <wrl.h>
+
+#include <sycl/ext/oneapi/bindless_images.hpp>
+
+#include <sycl/properties/queue_properties.hpp>
+
+using Microsoft::WRL::ComPtr;
+namespace syclexp = sycl::ext::oneapi::experimental;
+
+inline std::string ResultToString(HRESULT result) {
+  char text[64] = {}; // Room for the fixed prefix plus eight hex digits.
+  sprintf_s(text, "Error result == 0x%08X", static_cast<uint32_t>(result));
+  return text;
+}
+
+inline void ThrowIfFailed(HRESULT result) {
+  // Anything other than S_OK, even another success code, is treated as fatal.
+  if (result == S_OK) return;
+  throw std::runtime_error(ResultToString(result));
+}
+
+// Test fixture that shares a 1D DX12 texture and a DX12 fence with SYCL
+// through Win32 handles. `width` is the texture width in texels.
+class DX12InteropTest {
+public:
+  explicit DX12InteropTest(uint32_t width)
+      : m_width(width), m_sharedFenceValue(1),
+        m_syclImageDesc({m_width}, 1,
+                        sycl::image_channel_type::unsigned_int32) {
+    m_syclQueue =
+        sycl::queue{m_syclDevice, {sycl::property::queue::in_order{}}};
+  }
+  ~DX12InteropTest() {}
+
+  void initDX12Device();
+  void initDX12CommandList();
+  void initDX12Resources();
+  void cleanupDX12();
+  void callSYCLKernel();
+  bool validateOutput();
+
+private:
+  void waitDX12Fence(DWORD timeoutMilliseconds = INFINITE);
+  void populateDX12Texture();
+  void getDX12Adapter(IDXGIFactory2 *pFactory, IDXGIAdapter1 **ppAdapter);
+  void importDX12SharedMemoryHandle(size_t allocSize);
+  void importDX12SharedSemaphoreHandle();
+
+  // Dimensions of image
+  uint32_t m_width;
+
+  // DX12 Objects
+  ComPtr<IDXGIFactory4> m_dx12Factory;
+  ComPtr<IDXGIAdapter1> m_dx12HardwareAdapter;
+  ComPtr<ID3D12Device> m_dx12Device;
+  ComPtr<ID3D12CommandQueue> m_dx12CommandQueue;
+  ComPtr<ID3D12GraphicsCommandList> m_dx12CommandList;
+  ComPtr<ID3D12CommandAllocator> m_dx12CommandAllocator;
+  ComPtr<ID3D12Resource> m_dx12Texture;
+  ComPtr<ID3D12Fence> m_dx12Fence;
+  HANDLE m_dx12FenceEvent = nullptr;
+
+  // Shared handles and values (handles stay null until they are created)
+  uint64_t m_sharedFenceValue;
+  HANDLE m_sharedMemoryHandle = nullptr;
+  HANDLE m_sharedSemaphoreHandle = nullptr;
+
+  // SYCL Objects
+  sycl::queue m_syclQueue;
+  sycl::device m_syclDevice;
+  syclexp::image_descriptor m_syclImageDesc;
+  syclexp::interop_mem_handle m_syclInteropMemHandle;
+  syclexp::interop_semaphore_handle m_syclInteropSemaphoreHandle;
+  syclexp::image_mem_handle m_syclImageMemHandle;
+  syclexp::unsampled_image_handle m_syclImageHandle;
+};
diff --git a/sycl/test-e2e/bindless_images/image_get_info.cpp b/sycl/test-e2e/bindless_images/image_get_info.cpp
index b9dc8d19fc1c9..e30eded427da1 100644
--- a/sycl/test-e2e/bindless_images/image_get_info.cpp
+++ b/sycl/test-e2e/bindless_images/image_get_info.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp
index 003c48a7aac07..91f725dad01e3 100644
--- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp
+++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_1D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
@@ -160,13 +159,13 @@ int main() {
 
   failed += runTest<int, sycl::image_channel_type::signed_int32>();
 
-  failed += runTest<uint, sycl::image_channel_type::unsigned_int32>();
+  failed += runTest<unsigned int, sycl::image_channel_type::unsigned_int32>();
 
   failed += runTest<float, sycl::image_channel_type::fp32>();
 
   failed += runTest<short, sycl::image_channel_type::signed_int16>();
 
-  failed += runTest<ushort, sycl::image_channel_type::unsigned_int16>();
+  failed += runTest<unsigned short, sycl::image_channel_type::unsigned_int16>();
 
   failed += runTest<char, sycl::image_channel_type::signed_int8>();
 
diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp
index 472f31487eded..afedf976077cf 100644
--- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp
+++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_2D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
@@ -167,13 +166,13 @@ int main() {
 
   failed += runTest<int, sycl::image_channel_type::signed_int32>();
 
-  failed += runTest<uint, sycl::image_channel_type::unsigned_int32>();
+  failed += runTest<unsigned int, sycl::image_channel_type::unsigned_int32>();
 
   failed += runTest<float, sycl::image_channel_type::fp32>();
 
   failed += runTest<short, sycl::image_channel_type::signed_int16>();
 
-  failed += runTest<ushort, sycl::image_channel_type::unsigned_int16>();
+  failed += runTest<unsigned short, sycl::image_channel_type::unsigned_int16>();
 
   failed += runTest<char, sycl::image_channel_type::signed_int8>();
 
diff --git a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp
index e3d11bcb7c567..b2c5f7ae42b71 100644
--- a/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp
+++ b/sycl/test-e2e/bindless_images/mipmap/mipmap_read_3D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
@@ -159,13 +158,13 @@ int main() {
 
   failed += runTest<int, sycl::image_channel_type::signed_int32>();
 
-  failed += runTest<uint, sycl::image_channel_type::unsigned_int32>();
+  failed += runTest<unsigned int, sycl::image_channel_type::unsigned_int32>();
 
   failed += runTest<float, sycl::image_channel_type::fp32>();
 
   failed += runTest<short, sycl::image_channel_type::signed_int16>();
 
-  failed += runTest<ushort, sycl::image_channel_type::unsigned_int16>();
+  failed += runTest<unsigned short, sycl::image_channel_type::unsigned_int16>();
 
   failed += runTest<char, sycl::image_channel_type::signed_int8>();
 
diff --git a/sycl/test-e2e/bindless_images/read_1D.cpp b/sycl/test-e2e/bindless_images/read_1D.cpp
index 624e3d69aa8a0..c85157cf7f8b2 100644
--- a/sycl/test-e2e/bindless_images/read_1D.cpp
+++ b/sycl/test-e2e/bindless_images/read_1D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/read_2D.cpp b/sycl/test-e2e/bindless_images/read_2D.cpp
index 3b3ff18f8421d..5c5dbb5fc59f9 100644
--- a/sycl/test-e2e/bindless_images/read_2D.cpp
+++ b/sycl/test-e2e/bindless_images/read_2D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/read_2D_dynamic.cpp b/sycl/test-e2e/bindless_images/read_2D_dynamic.cpp
index e5de53c8c5fa3..377e0103b56c9 100644
--- a/sycl/test-e2e/bindless_images/read_2D_dynamic.cpp
+++ b/sycl/test-e2e/bindless_images/read_2D_dynamic.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/read_3D.cpp b/sycl/test-e2e/bindless_images/read_3D.cpp
index 556a37e555e7d..b77bfaa298752 100644
--- a/sycl/test-e2e/bindless_images/read_3D.cpp
+++ b/sycl/test-e2e/bindless_images/read_3D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/read_norm_types.cpp b/sycl/test-e2e/bindless_images/read_norm_types.cpp
index 8cb3b2117c175..aa2daa29b1bad 100644
--- a/sycl/test-e2e/bindless_images/read_norm_types.cpp
+++ b/sycl/test-e2e/bindless_images/read_norm_types.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/bindless_images/read_sampled.cpp b/sycl/test-e2e/bindless_images/read_sampled.cpp
index 7fe41261f2f5d..0972c3401b7b9 100644
--- a/sycl/test-e2e/bindless_images/read_sampled.cpp
+++ b/sycl/test-e2e/bindless_images/read_sampled.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/read_write_1D.cpp b/sycl/test-e2e/bindless_images/read_write_1D.cpp
index d6b985b203dd9..7f366ae682039 100644
--- a/sycl/test-e2e/bindless_images/read_write_1D.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_1D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp b/sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp
index f866af948a52c..3572150cbd12f 100644
--- a/sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_1D_subregion.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/read_write_2D.cpp b/sycl/test-e2e/bindless_images/read_write_2D.cpp
index 28246fb6211a7..6fa09ea4a1eea 100644
--- a/sycl/test-e2e/bindless_images/read_write_2D.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_2D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp b/sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp
index dddb9100ab85a..c227734d3a00a 100644
--- a/sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_2D_subregion.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/read_write_3D.cpp b/sycl/test-e2e/bindless_images/read_write_3D.cpp
index 1bcbed6a9fe4d..efb87a8ff9f4a 100644
--- a/sycl/test-e2e/bindless_images/read_write_3D.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_3D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp b/sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp
index adbee77c2873f..6631260dc15f1 100644
--- a/sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_3D_subregion.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/read_write_unsampled.cpp b/sycl/test-e2e/bindless_images/read_write_unsampled.cpp
index 1ac11302224d2..cae8cbc32b3d2 100644
--- a/sycl/test-e2e/bindless_images/read_write_unsampled.cpp
+++ b/sycl/test-e2e/bindless_images/read_write_unsampled.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_1D_USM.cpp b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_1D_USM.cpp
index d8d7e6e50fb30..965dc9f00c1c4 100644
--- a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_1D_USM.cpp
+++ b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_1D_USM.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 // REQUIRES: aspect-ext_oneapi_bindless_sampled_image_fetch_1d_usm
 
diff --git a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D.cpp b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D.cpp
index c85852dba64fd..0a6da2d97f136 100644
--- a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D.cpp
+++ b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 // REQUIRES: aspect-ext_oneapi_bindless_sampled_image_fetch_2d
 
diff --git a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D_USM.cpp b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D_USM.cpp
index 2258e4f098494..834ec5b6e8c79 100644
--- a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D_USM.cpp
+++ b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_2D_USM.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 // REQUIRES: aspect-ext_oneapi_bindless_sampled_image_fetch_2d_usm
 
diff --git a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_3D.cpp b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_3D.cpp
index 3c534e7329f96..ccb096dbfbdc5 100644
--- a/sycl/test-e2e/bindless_images/sampled_fetch/fetch_3D.cpp
+++ b/sycl/test-e2e/bindless_images/sampled_fetch/fetch_3D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 // REQUIRES: aspect-ext_oneapi_bindless_sampled_image_fetch_3d
 
diff --git a/sycl/test-e2e/bindless_images/sampling_1D.cpp b/sycl/test-e2e/bindless_images/sampling_1D.cpp
index b80a640aa370b..ef184c112568c 100644
--- a/sycl/test-e2e/bindless_images/sampling_1D.cpp
+++ b/sycl/test-e2e/bindless_images/sampling_1D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/sampling_2D.cpp b/sycl/test-e2e/bindless_images/sampling_2D.cpp
index 33dac190a4b68..92a26df5afc38 100644
--- a/sycl/test-e2e/bindless_images/sampling_2D.cpp
+++ b/sycl/test-e2e/bindless_images/sampling_2D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp b/sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp
index aefd953b6f46d..52775a8b0806c 100644
--- a/sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp
+++ b/sycl/test-e2e/bindless_images/sampling_2D_USM_shared.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 // REQUIRES: aspect-ext_oneapi_bindless_images_shared_usm
 
diff --git a/sycl/test-e2e/bindless_images/sampling_2D_half.cpp b/sycl/test-e2e/bindless_images/sampling_2D_half.cpp
index fc34dfcb13c50..aeb57976df5aa 100644
--- a/sycl/test-e2e/bindless_images/sampling_2D_half.cpp
+++ b/sycl/test-e2e/bindless_images/sampling_2D_half.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 // REQUIRES: aspect-fp16
 
diff --git a/sycl/test-e2e/bindless_images/sampling_3D.cpp b/sycl/test-e2e/bindless_images/sampling_3D.cpp
index 518b0e873d583..47d98aaf0be97 100644
--- a/sycl/test-e2e/bindless_images/sampling_3D.cpp
+++ b/sycl/test-e2e/bindless_images/sampling_3D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
diff --git a/sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp b/sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp
index eea7f53c5c0e3..895f7082adce6 100644
--- a/sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp
+++ b/sycl/test-e2e/bindless_images/sampling_unique_addr_modes.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/bindless_images/user_types/mipmap_read_user_type_2D.cpp b/sycl/test-e2e/bindless_images/user_types/mipmap_read_user_type_2D.cpp
index 1e55ac489c2a3..77913a2836565 100644
--- a/sycl/test-e2e/bindless_images/user_types/mipmap_read_user_type_2D.cpp
+++ b/sycl/test-e2e/bindless_images/user_types/mipmap_read_user_type_2D.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/bindless_images/user_types/read_write_user_type.cpp b/sycl/test-e2e/bindless_images/user_types/read_write_user_type.cpp
index 7a55e31e40fba..db9347f9895e6 100644
--- a/sycl/test-e2e/bindless_images/user_types/read_write_user_type.cpp
+++ b/sycl/test-e2e/bindless_images/user_types/read_write_user_type.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 
 // RUN: %{build} -o %t.out
diff --git a/sycl/test-e2e/bindless_images/vulkan_interop/mipmaps.cpp b/sycl/test-e2e/bindless_images/vulkan_interop/mipmaps.cpp
index 2618e1df37001..bd4b72d551e15 100644
--- a/sycl/test-e2e/bindless_images/vulkan_interop/mipmaps.cpp
+++ b/sycl/test-e2e/bindless_images/vulkan_interop/mipmaps.cpp
@@ -1,4 +1,3 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 // REQUIRES: vulkan
 
@@ -8,6 +7,9 @@
 // Uncomment to print additional test information
 // #define VERBOSE_PRINT
 
+// Define NOMINMAX to enable compilation on Windows
+#define NOMINMAX
+
 #include "../bindless_helpers.hpp"
 #include "vulkan_common.hpp"
 
@@ -19,14 +21,23 @@ struct handles_t {
   syclexp::interop_mem_handle inputInteropMemHandle;
 };
 
+template <typename InteropMemHandleT>
 handles_t create_handles(sycl::context &ctxt, sycl::device &dev,
                          const syclexp::bindless_image_sampler &samp,
-                         int input_image_fd, syclexp::image_descriptor desc,
-                         size_t imgSize) {
+                         InteropMemHandleT inputImgInteropHandle,
+                         syclexp::image_descriptor desc, size_t imgSize) {
 
   // Extension: external memory descriptor
+#ifdef _WIN32
+  syclexp::external_mem_descriptor<syclexp::resource_win32_handle>
+      inputExtMemDesc{inputImgInteropHandle,
+                      syclexp::external_mem_handle_type::win32_nt_handle,
+                      imgSize};
+#else
   syclexp::external_mem_descriptor<syclexp::resource_fd> inputExtMemDesc{
-      input_image_fd, imgSize};
+      inputImgInteropHandle, syclexp::external_mem_handle_type::opaque_fd,
+      imgSize};
+#endif
 
   // Extension: interop mem handle imported from file descriptor
   syclexp::interop_mem_handle inputInteropMemHandle =
@@ -45,9 +56,11 @@ handles_t create_handles(sycl::context &ctxt, sycl::device &dev,
 }
 
 template <int NDims, typename DType, int NChannels,
-          sycl::image_channel_type CType, typename KernelName>
+          sycl::image_channel_type CType, typename InteropMemHandleT,
+          typename KernelName>
 bool run_sycl(sycl::range<NDims> globalSize, sycl::range<NDims> localSize,
-              int input_image_fd, size_t mipLevels, size_t reqSize) {
+              InteropMemHandleT inputImgInteropHandle, size_t mipLevels,
+              size_t reqSize) {
   sycl::device dev;
   sycl::queue q(dev);
   auto ctxt = q.get_context();
@@ -78,7 +91,8 @@ bool run_sycl(sycl::range<NDims> globalSize, sycl::range<NDims> localSize,
 
   using VecType = sycl::vec<DType, NChannels>;
 
-  auto handles = create_handles(ctxt, dev, samp, input_image_fd, desc, reqSize);
+  auto handles =
+      create_handles(ctxt, dev, samp, inputImgInteropHandle, desc, reqSize);
 
   std::vector<VecType> out(mip0Elems);
   try {
@@ -367,9 +381,14 @@ bool run_test(sycl::range<NDims> dims, sycl::range<NDims> localSize,
 
   printString("Getting memory file descriptors and calling into SYCL\n");
   // Pass memory to SYCL for modification
-  auto input_fd = vkutil::getMemoryOpaqueFD(inputMemory);
-  bool result = run_sycl<NDims, DType, NChannels, CType, KernelName>(
-      dims, localSize, input_fd, mipLevels, memRequirements.size);
+#ifdef _WIN32
+  auto inputMemHandle = vkutil::getMemoryWin32Handle(inputMemory);
+#else
+  auto inputMemHandle = vkutil::getMemoryOpaqueFD(inputMemory);
+#endif
+  bool result = run_sycl<NDims, DType, NChannels, CType,
+                         decltype(inputMemHandle), KernelName>(
+      dims, localSize, inputMemHandle, mipLevels, memRequirements.size);
 
   // Cleanup
   vkDestroyBuffer(vk_device, inputStagingBuffer, nullptr);
@@ -423,7 +442,11 @@ int main() {
     return EXIT_FAILURE;
   }
 
-  if (vkutil::setupDevice("NVIDIA") != VK_SUCCESS) {
+  const char *devices[] = {"Intel", "NVIDIA"};
+  if (std::none_of(std::begin(devices), std::end(devices),
+                   [](const char *device) {
+                     return vkutil::setupDevice(device) == VK_SUCCESS;
+                   })) {
     std::cerr << "Device setup failed!\n";
     return EXIT_FAILURE;
   }
diff --git a/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp b/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp
index 206c6bf4510d2..aff8b3716a204 100644
--- a/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp
+++ b/sycl/test-e2e/bindless_images/vulkan_interop/sampled_images.cpp
@@ -1,9 +1,8 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 // REQUIRES: vulkan
 
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %link-vulkan %s -o %t.out
-// RUN: %t.out
+// RUN: %{build} %link-vulkan -o %t.out
+// RUN: %{run} %t.out
 
 // Uncomment to print additional test information
 // #define VERBOSE_PRINT
@@ -19,14 +18,22 @@ struct handles_t {
   syclexp::interop_mem_handle inputInteropMemHandle;
 };
 
+template <typename InteropHandleT>
 handles_t create_test_handles(sycl::context &ctxt, sycl::device &dev,
                               const syclexp::bindless_image_sampler &samp,
-                              int input_image_fd,
+                              InteropHandleT interopHandle,
                               syclexp::image_descriptor desc,
                               const size_t imgSize) {
   // Extension: external memory descriptor
+#ifdef _WIN32
+  syclexp::external_mem_descriptor<syclexp::resource_win32_handle>
+      inputExtMemDesc{interopHandle,
+                      syclexp::external_mem_handle_type::win32_nt_handle,
+                      imgSize};
+#else
   syclexp::external_mem_descriptor<syclexp::resource_fd> inputExtMemDesc{
-      input_image_fd, imgSize};
+      interopHandle, syclexp::external_mem_handle_type::opaque_fd, imgSize};
+#endif
 
   // Extension: interop mem handle imported from file descriptor
   syclexp::interop_mem_handle inputInteropMemHandle =
@@ -44,10 +51,10 @@ handles_t create_test_handles(sycl::context &ctxt, sycl::device &dev,
   return {imgInput, inputMappedMemHandle, inputInteropMemHandle};
 }
 
-template <int NDims, typename DType, int NChannels,
+template <typename InteropHandleT, int NDims, typename DType, int NChannels,
           sycl::image_channel_type CType, typename KernelName>
-bool run_sycl(sycl::range<NDims> globalSize, sycl::range<NDims> localSize,
-              int input_image_fd) {
+bool run_sycl(InteropHandleT inputInteropMemHandle,
+              sycl::range<NDims> globalSize, sycl::range<NDims> localSize) {
   sycl::device dev;
   sycl::queue q(dev);
   auto ctxt = q.get_context();
@@ -78,12 +85,11 @@ bool run_sycl(sycl::range<NDims> globalSize, sycl::range<NDims> localSize,
 
   using VecType = sycl::vec<DType, NChannels>;
 
-  auto handles =
-      create_test_handles(ctxt, dev, samp, input_image_fd, desc, img_size);
+  auto handles = create_test_handles(ctxt, dev, samp, inputInteropMemHandle,
+                                     desc, img_size);
 
   std::vector<VecType> out(numElems);
   try {
-
     sycl::buffer<VecType, NDims> buf((VecType *)out.data(), outBufferRange);
     q.submit([&](sycl::handler &cgh) {
       auto outAcc = buf.template get_access<sycl::access_mode::write>(
@@ -289,11 +295,20 @@ bool run_test(sycl::range<NDims> dims, sycl::range<NDims> localSize,
     VK_CHECK_CALL(vkQueueWaitIdle(vk_transfer_queue));
   }
 
-  printString("Getting memory file descriptors and calling into SYCL\n");
+  printString("Getting memory file descriptors\n");
   // Pass memory to SYCL for modification
-  auto input_fd = vkutil::getMemoryOpaqueFD(inputMemory);
-  bool result = run_sycl<NDims, DType, NChannels, CType, KernelName>(
-      dims, localSize, input_fd);
+
+#ifdef _WIN32
+  auto input_mem_handle = vkutil::getMemoryWin32Handle(inputMemory);
+#else
+  auto input_mem_handle = vkutil::getMemoryOpaqueFD(inputMemory);
+#endif
+
+  printString("Calling into SYCL with interop memory handle\n");
+
+  bool validated =
+      run_sycl<decltype(input_mem_handle), NDims, DType, NChannels, CType,
+               KernelName>(input_mem_handle, dims, localSize);
 
   // Cleanup
   vkDestroyBuffer(vk_device, inputStagingBuffer, nullptr);
@@ -301,7 +316,7 @@ bool run_test(sycl::range<NDims> dims, sycl::range<NDims> localSize,
   vkFreeMemory(vk_device, inputStagingMemory, nullptr);
   vkFreeMemory(vk_device, inputMemory, nullptr);
 
-  return result;
+  return validated;
 }
 
 bool run_tests() {
@@ -347,7 +362,11 @@ int main() {
     return EXIT_FAILURE;
   }
 
-  if (vkutil::setupDevice("NVIDIA") != VK_SUCCESS) {
+  const char *devices[] = {"Intel", "NVIDIA"};
+  if (std::none_of(std::begin(devices), std::end(devices),
+                   [](const char *device) {
+                     return vkutil::setupDevice(device) == VK_SUCCESS;
+                   })) {
     std::cerr << "Device setup failed!\n";
     return EXIT_FAILURE;
   }
diff --git a/sycl/test-e2e/bindless_images/vulkan_interop/unsampled_images.cpp b/sycl/test-e2e/bindless_images/vulkan_interop/unsampled_images.cpp
index d78f1b792cb66..332aeb642aec0 100644
--- a/sycl/test-e2e/bindless_images/vulkan_interop/unsampled_images.cpp
+++ b/sycl/test-e2e/bindless_images/vulkan_interop/unsampled_images.cpp
@@ -1,15 +1,15 @@
-// REQUIRES: linux
 // REQUIRES: cuda
 // REQUIRES: vulkan
 
-// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %link-vulkan %s -o %t.out
-// RUN: %t.out
+// RUN: %{build} %link-vulkan -o %t.out
+// RUN: %{run} %t.out
 
 // Uncomment to print additional test information
 // #define VERBOSE_PRINT
 
 #include "../bindless_helpers.hpp"
 #include "vulkan_common.hpp"
+#include <sycl/properties/queue_properties.hpp>
 
 #include <random>
 
@@ -27,19 +27,42 @@ struct handles_t {
   syclexp::unsampled_image_handle input_1, input_2, output;
 };
 
+template <typename InteropMemHandleT, typename InteropSemHandleT>
 handles_t
 create_test_handles(sycl::context &ctxt, sycl::device &dev,
-                    int input_image_fd_1, int input_image_fd_2,
-                    int output_image_fd, int sycl_wait_semaphore_fd,
-                    int sycl_done_semaphore_fd, const size_t img_size,
+                    InteropMemHandleT img_in_interop_handle_1,
+                    InteropMemHandleT img_in_interop_handle_2,
+                    InteropMemHandleT img_out_interop_handle,
+                    InteropSemHandleT sycl_wait_semaphore_handle,
+                    InteropSemHandleT sycl_done_semaphore_handle,
+                    const size_t img_size,
                     sycl::ext::oneapi::experimental::image_descriptor &desc) {
   // Extension: map the external memory descriptors
+
+#ifdef _WIN32
+  syclexp::external_mem_descriptor<syclexp::resource_win32_handle>
+      input_ext_mem_desc_1{img_in_interop_handle_1,
+                           syclexp::external_mem_handle_type::win32_nt_handle,
+                           img_size};
+  syclexp::external_mem_descriptor<syclexp::resource_win32_handle>
+      input_ext_mem_desc_2{img_in_interop_handle_2,
+                           syclexp::external_mem_handle_type::win32_nt_handle,
+                           img_size};
+  syclexp::external_mem_descriptor<syclexp::resource_win32_handle>
+      output_ext_mem_desc{img_out_interop_handle,
+                          syclexp::external_mem_handle_type::win32_nt_handle,
+                          img_size};
+#else
   syclexp::external_mem_descriptor<syclexp::resource_fd> input_ext_mem_desc_1{
-      input_image_fd_1, img_size};
+      img_in_interop_handle_1, syclexp::external_mem_handle_type::opaque_fd,
+      img_size};
   syclexp::external_mem_descriptor<syclexp::resource_fd> input_ext_mem_desc_2{
-      input_image_fd_2, img_size};
+      img_in_interop_handle_2, syclexp::external_mem_handle_type::opaque_fd,
+      img_size};
   syclexp::external_mem_descriptor<syclexp::resource_fd> output_ext_mem_desc{
-      output_image_fd, img_size};
+      img_out_interop_handle, syclexp::external_mem_handle_type::opaque_fd,
+      img_size};
+#endif
 
   // Extension: create interop memory handles
   syclexp::interop_mem_handle input_interop_mem_handle_1 =
@@ -69,10 +92,26 @@ create_test_handles(sycl::context &ctxt, sycl::device &dev,
       syclexp::create_image(output_mapped_mem_handle, desc, dev, ctxt);
 
   // Extension: import semaphores
+#ifdef _WIN32
+  syclexp::external_semaphore_descriptor<syclexp::resource_win32_handle>
+      sycl_wait_external_semaphore_desc{
+          sycl_wait_semaphore_handle,
+          syclexp::external_semaphore_handle_type::win32_nt_handle};
+  syclexp::external_semaphore_descriptor<syclexp::resource_win32_handle>
+      sycl_done_external_semaphore_desc{
+          sycl_done_semaphore_handle,
+          syclexp::external_semaphore_handle_type::win32_nt_handle};
+#else
   syclexp::external_semaphore_descriptor<syclexp::resource_fd>
-      sycl_wait_external_semaphore_desc{sycl_wait_semaphore_fd};
+      sycl_wait_external_semaphore_desc{
+          sycl_wait_semaphore_handle,
+          syclexp::external_semaphore_handle_type::opaque_fd};
   syclexp::external_semaphore_descriptor<syclexp::resource_fd>
-      sycl_done_external_semaphore_desc{sycl_done_semaphore_fd};
+      sycl_done_external_semaphore_desc{
+          sycl_done_semaphore_handle,
+          syclexp::external_semaphore_handle_type::opaque_fd};
+#endif
+
   syclexp::interop_semaphore_handle sycl_wait_interop_semaphore_handle =
       syclexp::import_external_semaphore(sycl_wait_external_semaphore_desc, dev,
                                          ctxt);
@@ -115,16 +154,20 @@ void cleanup_test(sycl::context &ctxt, sycl::device &dev, handles_t handles) {
                                    ctxt);
 }
 
-template <int NDims, typename DType, sycl::image_channel_type CType,
-          int NChannels, typename KernelName>
+template <typename InteropMemHandleT, typename InteropSemHandleT, int NDims,
+          typename DType, sycl::image_channel_type CType, int NChannels,
+          typename KernelName>
 void run_ndim_test(sycl::range<NDims> global_size,
-                   sycl::range<NDims> local_size, int input_image_fd_1,
-                   int input_image_fd_2, int output_image_fd,
-                   int sycl_wait_semaphore_fd, int sycl_done_semaphore_fd) {
+                   sycl::range<NDims> local_size,
+                   InteropMemHandleT img_in_interop_handle_1,
+                   InteropMemHandleT img_in_interop_handle_2,
+                   InteropMemHandleT img_out_interop_handle,
+                   InteropSemHandleT sycl_wait_semaphore_handle,
+                   InteropSemHandleT sycl_done_semaphore_handle) {
   using VecType = sycl::vec<DType, NChannels>;
 
   sycl::device dev;
-  sycl::queue q(dev);
+  sycl::queue q{dev, {sycl::property::queue::in_order{}}};
   auto ctxt = q.get_context();
 
   // Image descriptor - mapped to Vulkan image layout
@@ -133,8 +176,9 @@ void run_ndim_test(sycl::range<NDims> global_size,
   const size_t img_size = global_size.size() * sizeof(DType) * NChannels;
 
   auto handles = create_test_handles(
-      ctxt, dev, input_image_fd_1, input_image_fd_2, output_image_fd,
-      sycl_wait_semaphore_fd, sycl_done_semaphore_fd, img_size, desc);
+      ctxt, dev, img_in_interop_handle_1, img_in_interop_handle_2,
+      img_out_interop_handle, sycl_wait_semaphore_handle,
+      sycl_done_semaphore_handle, img_size, desc);
 
   // Extension: wait for imported semaphore
   q.ext_oneapi_wait_external_semaphore(
@@ -328,7 +372,11 @@ bool run_test(sycl::range<NDims> dims, sycl::range<NDims> local_size,
   {
     VkExportSemaphoreCreateInfo esci = {};
     esci.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO;
+#ifdef _WIN32
+    esci.handleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT;
+#else
     esci.handleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT;
+#endif
 
     VkSemaphoreCreateInfo sci = {};
     sci.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
@@ -341,7 +389,11 @@ bool run_test(sycl::range<NDims> dims, sycl::range<NDims> local_size,
   {
     VkExportSemaphoreCreateInfo esci = {};
     esci.sType = VK_STRUCTURE_TYPE_EXPORT_SEMAPHORE_CREATE_INFO;
+#ifdef _WIN32
+    esci.handleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT;
+#else
     esci.handleTypes = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_FD_BIT;
+#endif
 
     VkSemaphoreCreateInfo sci = {};
     sci.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
@@ -386,21 +438,46 @@ bool run_test(sycl::range<NDims> dims, sycl::range<NDims> local_size,
                                 &submission, VK_NULL_HANDLE /*fence*/));
   }
 
-  printString("Getting memory file descriptors and calling into SYCL\n");
-  // Pass memory to SYCL for modification
+  printString("Getting memory interop handles\n");
 
+  // Pass memory to SYCL for modification
   auto global_size = dims;
-  auto input_fd_1 = vkutil::getMemoryOpaqueFD(inVkImgRes1.imageMemory);
-  auto input_fd_2 = vkutil::getMemoryOpaqueFD(inVkImgRes2.imageMemory);
-  auto output_fd = vkutil::getMemoryOpaqueFD(outVkImgRes.imageMemory);
+#ifdef _WIN32
+  auto input_mem_handle_1 =
+      vkutil::getMemoryWin32Handle(inVkImgRes1.imageMemory);
+  auto input_mem_handle_2 =
+      vkutil::getMemoryWin32Handle(inVkImgRes2.imageMemory);
+  auto output_mem_handle =
+      vkutil::getMemoryWin32Handle(outVkImgRes.imageMemory);
+#else
+  auto input_mem_handle_1 = vkutil::getMemoryOpaqueFD(inVkImgRes1.imageMemory);
+  auto input_mem_handle_2 = vkutil::getMemoryOpaqueFD(inVkImgRes2.imageMemory);
+  auto output_mem_handle = vkutil::getMemoryOpaqueFD(outVkImgRes.imageMemory);
+#endif
+
+  printString("Getting semaphore interop handles\n");
 
   // Pass semaphores to SYCL for synchronization
-  int sycl_wait_semaphore_fd = vkutil::getSemaphoreOpaqueFD(syclWaitSemaphore);
-  int sycl_done_semaphore_fd = vkutil::getSemaphoreOpaqueFD(syclDoneSemaphore);
-
-  util::run_ndim_test<NDims, DType, CType, NChannels, KernelName>(
-      global_size, local_size, input_fd_1, input_fd_2, output_fd,
-      sycl_wait_semaphore_fd, sycl_done_semaphore_fd);
+#ifdef _WIN32
+  auto sycl_wait_semaphore_handle =
+      vkutil::getSemaphoreWin32Handle(syclWaitSemaphore);
+  auto sycl_done_semaphore_handle =
+      vkutil::getSemaphoreWin32Handle(syclDoneSemaphore);
+#else
+  auto sycl_wait_semaphore_handle =
+      vkutil::getSemaphoreOpaqueFD(syclWaitSemaphore);
+  auto sycl_done_semaphore_handle =
+      vkutil::getSemaphoreOpaqueFD(syclDoneSemaphore);
+#endif
+
+  printString("Calling into SYCL with interop memory and semaphore handles\n");
+
+  util::run_ndim_test<decltype(input_mem_handle_1),
+                      decltype(sycl_wait_semaphore_handle), NDims, DType, CType,
+                      NChannels, KernelName>(
+      global_size, local_size, input_mem_handle_1, input_mem_handle_2,
+      output_mem_handle, sycl_wait_semaphore_handle,
+      sycl_done_semaphore_handle);
 
   printString("Copying image memory to staging memory\n");
   // Copy main image memory to staging
@@ -544,8 +621,11 @@ int main() {
     return EXIT_FAILURE;
   }
 
-  // Currently only Nvidia devices are tested
-  if (vkutil::setupDevice("NVIDIA") != VK_SUCCESS) {
+  const char *devices[] = {"Intel", "NVIDIA"};
+  if (std::none_of(std::begin(devices), std::end(devices),
+                   [](const char *device) {
+                     return vkutil::setupDevice(device) == VK_SUCCESS;
+                   })) {
     std::cerr << "Device setup failed!\n";
     return EXIT_FAILURE;
   }
diff --git a/sycl/test-e2e/bindless_images/vulkan_interop/vulkan_common.hpp b/sycl/test-e2e/bindless_images/vulkan_interop/vulkan_common.hpp
index e6d703e2cc0eb..6c02e30778563 100644
--- a/sycl/test-e2e/bindless_images/vulkan_interop/vulkan_common.hpp
+++ b/sycl/test-e2e/bindless_images/vulkan_interop/vulkan_common.hpp
@@ -1,21 +1,30 @@
 #pragma once
+
+#ifdef _WIN32
+#define VK_USE_PLATFORM_WIN32_KHR
+#endif
+
 #include <vulkan/vulkan.h>
 
+#include <algorithm>
 #include <cstdlib>
 #include <iostream>
+#include <set>
 #include <vector>
 
 void printString(std::string str) {
 #ifdef VERBOSE_PRINT
-  std::cout << str;
+  std::cout << str << std::endl;
 #endif
 }
 
 #define VK_CHECK_CALL_RET(call)                                                \
   {                                                                            \
     VkResult err = call;                                                       \
-    if (err != VK_SUCCESS)                                                     \
+    if (err != VK_SUCCESS) {                                                   \
+      std::cerr << #call << " failed. Code: " << err << "\n";                  \
       return err;                                                              \
+    }                                                                          \
   }
 
 #define VK_CHECK_CALL(call)                                                    \
@@ -32,8 +41,13 @@ static VkDevice vk_device;
 static VkQueue vk_compute_queue;
 static VkQueue vk_transfer_queue;
 
+#ifdef _WIN32
+static PFN_vkGetMemoryWin32HandleKHR vk_GetMemoryWin32HandleKHR;
+static PFN_vkGetSemaphoreWin32HandleKHR vk_getSemaphoreWin32HandleKHR;
+#else
 static PFN_vkGetMemoryFdKHR vk_getMemoryFdKHR;
 static PFN_vkGetSemaphoreFdKHR vk_getSemaphoreFdKHR;
+#endif
 
 static uint32_t vk_computeQueueFamilyIndex;
 static uint32_t vk_transferQueueFamilyIndex;
@@ -44,6 +58,8 @@ static VkCommandPool vk_transferCmdPool;
 static VkCommandBuffer vk_computeCmdBuffer;
 static VkCommandBuffer vk_transferCmdBuffers[2];
 
+// A static debug callback function that relays messages from the Vulkan
+// validation layer to the terminal.
 static VKAPI_ATTR VkBool32 VKAPI_CALL
 debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
               VkDebugUtilsMessageTypeFlagsEXT messageType,
@@ -51,13 +67,42 @@ debugCallback(VkDebugUtilsMessageSeverityFlagBitsEXT messageSeverity,
               void *pUserData) {
   // Only print errors from validation layer
   if (messageSeverity & VK_DEBUG_UTILS_MESSAGE_SEVERITY_ERROR_BIT_EXT) {
-    std::cerr << pCallbackData->pMessage << "\n";
+    std::cerr << pCallbackData->pMessage << "\n\n";
   }
   return VK_FALSE;
 }
 
 namespace vkutil {
+
+// Appends the names of all supported Vulkan instance extensions to the list.
+VkResult
+getSupportedInstanceExtensions(std::vector<std::string> &supportedExtensions) {
+  uint32_t count = 0;
+  VK_CHECK_CALL_RET(
+      vkEnumerateInstanceExtensionProperties(nullptr, &count, nullptr));
+
+  std::vector<VkExtensionProperties> extensionProperties(count);
+
+  VK_CHECK_CALL_RET(vkEnumerateInstanceExtensionProperties(
+      nullptr, &count, extensionProperties.data()));
+
+  for (auto &extension : extensionProperties) {
+    supportedExtensions.push_back(extension.extensionName);
+  }
+
+  return VK_SUCCESS;
+}
+
+/*
+In this function we set up the Vulkan instance, which is one of the first
+steps in setting up a Vulkan application.
+When creating an instance we need to specify some information about our
+application, most importantly, we need to specify some extensions that we
+require to perform interop operations.
+*/
 VkResult setupInstance() {
+  // Generic application information. The specific values are not important to
+  // the execution of the Vulkan program.
   VkApplicationInfo ai = {};
   ai.sType = VK_STRUCTURE_TYPE_APPLICATION_INFO;
   ai.pApplicationName = "SYCL-Vulkan-Interop";
@@ -66,6 +111,11 @@ VkResult setupInstance() {
   ai.engineVersion = VK_MAKE_VERSION(1, 0, 0);
   ai.apiVersion = VK_API_VERSION_1_0;
 
+  // Query the number of available layers and retrieve their names. One example
+  // of a layer is the validation layer, this layer allows for runtime debug
+  // messages to be returned if anything goes wrong in the Vulkan application.
+  // We will set up a callback function to print debug information if the
+  // validation layer is available.
   uint32_t layerCount;
   VK_CHECK_CALL_RET(vkEnumerateInstanceLayerProperties(&layerCount, nullptr));
 
@@ -73,23 +123,48 @@ VkResult setupInstance() {
   VK_CHECK_CALL_RET(
       vkEnumerateInstanceLayerProperties(&layerCount, availableLayers.data()));
 
+  // Query the supported instance extensions.
+  std::vector<std::string> supportedInstanceExtensions;
+  VK_CHECK_CALL_RET(
+      getSupportedInstanceExtensions(supportedInstanceExtensions));
+
+  // We have some instance extensions that we require for the tests to function.
+  std::vector<const char *> requiredInstanceExtensions = {
+      VK_EXT_DEBUG_UTILS_EXTENSION_NAME,
+      VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME,
+      VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME,
+      VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME};
+
+  // Make sure that our required instance extensions are supported by the
+  // running Vulkan instance.
+  for (const char *requiredExtension : requiredInstanceExtensions) {
+    // std::find compares each std::string with the C string by content.
+    if (std::find(supportedInstanceExtensions.begin(),
+                  supportedInstanceExtensions.end(),
+                  requiredExtension) == supportedInstanceExtensions.end())
+      return VK_ERROR_EXTENSION_NOT_PRESENT;
+  }
+
+  // Create the vulkan instance with our required extensions and layers.
   VkInstanceCreateInfo ci = {};
   ci.sType = VK_STRUCTURE_TYPE_INSTANCE_CREATE_INFO;
   ci.pApplicationInfo = &ai;
-  std::vector<const char *> extensions = {
-      VK_EXT_DEBUG_UTILS_EXTENSION_NAME,
-      VK_KHR_EXTERNAL_MEMORY_CAPABILITIES_EXTENSION_NAME,
-      VK_KHR_EXTERNAL_SEMAPHORE_CAPABILITIES_EXTENSION_NAME,
-      VK_KHR_EXTERNAL_FENCE_CAPABILITIES_EXTENSION_NAME,
-      VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME};
-  ci.enabledExtensionCount = extensions.size();
-  ci.ppEnabledExtensionNames = extensions.data();
-  std::vector<const char *> layers = {"VK_LAYER_KHRONOS_validation"};
+  ci.enabledExtensionCount = requiredInstanceExtensions.size();
+  ci.ppEnabledExtensionNames = requiredInstanceExtensions.data();
+  std::vector<const char *> layers;
+  // layerName is a char array: compare its contents, not its address.
+  constexpr std::string_view validation = "VK_LAYER_KHRONOS_validation";
+  if (std::any_of(availableLayers.begin(), availableLayers.end(),
+                  [&](auto &layer) { return layer.layerName == validation; })) {
+    layers.push_back(validation.data());
+  }
   ci.enabledLayerCount = layers.size();
   ci.ppEnabledLayerNames = layers.data();
 
   VK_CHECK_CALL_RET(vkCreateInstance(&ci, nullptr, &vk_instance));
 
+  // Create a debug utils messenger. This will allow us to print debug
+  // information from the Vulkan validation layer.
   VkDebugUtilsMessengerCreateInfoEXT dumci = {};
   dumci.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT;
   dumci.messageSeverity = VK_DEBUG_UTILS_MESSAGE_SEVERITY_VERBOSE_BIT_EXT |
@@ -108,20 +183,33 @@ VkResult setupInstance() {
     return VK_ERROR_EXTENSION_NOT_PRESENT;
   }
 
-  vk_getMemoryFdKHR = (PFN_vkGetMemoryFdKHR)vkGetInstanceProcAddr(
-      vk_instance, "vkGetMemoryFdKHR");
+  return VK_SUCCESS;
+}
+
+// Fills `extensions` with all extensions supported by the given device.
+VkResult
+getSupportedDeviceExtensions(std::vector<VkExtensionProperties> &extensions,
+                             VkPhysicalDevice device) {
+  uint32_t numExtensions = 0;
+
+  VK_CHECK_CALL_RET(vkEnumerateDeviceExtensionProperties(
+      device, nullptr, &numExtensions, nullptr));
 
-  vk_getSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetInstanceProcAddr(
-      vk_instance, "vkGetSemaphoreFdKHR");
+  extensions.resize(numExtensions);
+  VK_CHECK_CALL_RET(vkEnumerateDeviceExtensionProperties(
+      device, nullptr, &numExtensions, extensions.data()));
 
   return VK_SUCCESS;
 }
 
+// Set up the Vulkan device.
 VkResult setupDevice(std::string device) {
   uint32_t physicalDeviceCount = 0;
+  // Get all physical devices.
   VK_CHECK_CALL_RET(
       vkEnumeratePhysicalDevices(vk_instance, &physicalDeviceCount, nullptr));
   if (physicalDeviceCount == 0) {
+    // If no physical devices found, return error.
     return VK_ERROR_DEVICE_LOST;
   }
   std::vector<VkPhysicalDevice> physicalDevices(physicalDeviceCount);
@@ -130,23 +218,61 @@ VkResult setupDevice(std::string device) {
 
   bool foundDevice = false;
 
+  // Define the required device extensions to run the tests.
+  static constexpr std::string_view requiredExtensions[] = {
+      VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
+      VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME,
+#ifdef _WIN32
+      VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
+      VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME,
+#else
+      VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
+      VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
+#endif
+  };
+
+  // From all physical devices, find the first one that supports all our
+  // required device extensions.
   for (int i = 0; i < physicalDeviceCount; i++) {
     vk_physical_device = physicalDevices[i];
     VkPhysicalDeviceProperties props;
     vkGetPhysicalDeviceProperties(vk_physical_device, &props);
-    std::string str(props.deviceName);
+    std::string name(props.deviceName);
 
-    if (str.find(device) != std::string::npos) {
-      foundDevice = true;
-      break;
+    if (name.find(device) == std::string::npos) {
+      continue;
     }
+
+    std::vector<VkExtensionProperties> supportedDeviceExtensions;
+    getSupportedDeviceExtensions(supportedDeviceExtensions, vk_physical_device);
+    const bool hasRequiredExtensions = std::all_of(
+        std::begin(requiredExtensions), std::end(requiredExtensions),
+        [&](std::string_view requiredExt) {
+          auto it = std::find_if(std::begin(supportedDeviceExtensions),
+                                 std::end(supportedDeviceExtensions),
+                                 [&](const VkExtensionProperties &ext) {
+                                   return (ext.extensionName == requiredExt);
+                                 });
+          return (it != std::end(supportedDeviceExtensions));
+        });
+    if (!hasRequiredExtensions) {
+      continue;
+    }
+
+    foundDevice = true;
+    std::cout << "Found suitable Vulkan device: " << name << std::endl;
+    break;
   }
 
+  // If no device was found that supports all our required extensions return an
+  // error.
   if (!foundDevice) {
     std::cerr << "Failed to find suitable device!\n";
     return VK_ERROR_DEVICE_LOST;
   }
 
+  // Get queue families and assign queue family indices for compute and transfer
+  // queues.
   uint32_t queueFamilyCount = 0;
   vkGetPhysicalDeviceQueueFamilyProperties(vk_physical_device,
                                            &queueFamilyCount, nullptr);
@@ -164,8 +290,8 @@ VkResult setupDevice(std::string device) {
     ++i;
   }
 
+  // Populate queue information prior to Vulkan device creation.
   float queuePriority = 1.f;
-
   std::vector<VkDeviceQueueCreateInfo> qcis;
   if (vk_computeQueueFamilyIndex == vk_transferQueueFamilyIndex) {
     qcis.resize(1);
@@ -188,12 +314,21 @@ VkResult setupDevice(std::string device) {
 
   VkPhysicalDeviceFeatures deviceFeatures = {};
 
+  // Store our required device extensions. To be passed to the Vulkan device
+  // creation function.
   std::vector<const char *> extensions = {
       VK_KHR_EXTERNAL_MEMORY_EXTENSION_NAME,
-      VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
       VK_KHR_EXTERNAL_SEMAPHORE_EXTENSION_NAME,
-      VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME};
+#ifdef _WIN32
+      VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME,
+      VK_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME,
+#else
+      VK_KHR_EXTERNAL_SEMAPHORE_FD_EXTENSION_NAME,
+      VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME,
+#endif
+  };
 
+  // Create the Vulkan device with the above queues, extensions, and layers.
   VkDeviceCreateInfo dci = {};
   dci.sType = VK_STRUCTURE_TYPE_DEVICE_CREATE_INFO;
   dci.pQueueCreateInfos = qcis.data();
@@ -205,13 +340,59 @@ VkResult setupDevice(std::string device) {
   VK_CHECK_CALL_RET(
       vkCreateDevice(vk_physical_device, &dci, nullptr, &vk_device));
 
+  // Get the Vulkan queues from the device.
   vkGetDeviceQueue(vk_device, vk_transferQueueFamilyIndex, 0,
                    &vk_transfer_queue);
   vkGetDeviceQueue(vk_device, vk_computeQueueFamilyIndex, 0, &vk_compute_queue);
 
+  // Get function pointers for memory and semaphore handle exportation.
+  // Functions will depend on the OS being compiled for.
+#ifdef _WIN32
+  vk_GetMemoryWin32HandleKHR =
+      (PFN_vkGetMemoryWin32HandleKHR)vkGetDeviceProcAddr(
+          vk_device, "vkGetMemoryWin32HandleKHR");
+  if (!vk_GetMemoryWin32HandleKHR) {
+    std::cerr
+        << "Could not get func pointer to \"vkGetMemoryWin32HandleKHR\"!\n";
+    return VK_ERROR_UNKNOWN;
+  }
+  vk_getSemaphoreWin32HandleKHR =
+      (PFN_vkGetSemaphoreWin32HandleKHR)vkGetDeviceProcAddr(
+          vk_device, "vkGetSemaphoreWin32HandleKHR");
+  if (!vk_getSemaphoreWin32HandleKHR) {
+    std::cerr
+        << "Could not get func pointer to \"vkGetSemaphoreWin32HandleKHR\"!\n";
+    return VK_ERROR_UNKNOWN;
+  }
+#else
+  vk_getMemoryFdKHR =
+      (PFN_vkGetMemoryFdKHR)vkGetDeviceProcAddr(vk_device, "vkGetMemoryFdKHR");
+  if (!vk_getMemoryFdKHR) {
+    std::cerr << "Could not get func pointer to \"vkGetMemoryFdKHR\"!\n";
+    return VK_ERROR_UNKNOWN;
+  }
+  vk_getSemaphoreFdKHR = (PFN_vkGetSemaphoreFdKHR)vkGetDeviceProcAddr(
+      vk_device, "vkGetSemaphoreFdKHR");
+  if (!vk_getSemaphoreFdKHR) {
+    std::cerr << "Could not get func pointer to \"vkGetSemaphoreFdKHR\"!\n";
+    return VK_ERROR_UNKNOWN;
+  }
+#endif
+
   return VK_SUCCESS;
 }
 
+/*
+This function sets up Vulkan command buffers.
+Firstly we create command pools for each of the queues that can be used.
+We have two queue types which can be used:
+  - A transfer queue, used for data movement operations
+  - A compute queue, used for shader invocation operations
+We allocate command buffers from these command pools.
+Note that some Vulkan instances may provide queues with transfer and compute
+capabilities. If this is the case, we only create one command pool, and one
+command buffer.
+*/
 VkResult setupCommandBuffers() {
   VkCommandPoolCreateInfo cpci = {};
   cpci.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO;
@@ -254,6 +435,9 @@ VkResult setupCommandBuffers() {
   return VK_SUCCESS;
 }
 
+/*
+Create a Vulkan buffer with a specified size and usage.
+*/
 VkBuffer createBuffer(size_t size, VkBufferUsageFlags usage) {
   VkBufferCreateInfo bci = {};
   bci.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO;
@@ -269,6 +453,12 @@ VkBuffer createBuffer(size_t size, VkBufferUsageFlags usage) {
   return buffer;
 }
 
+/*
+Create a Vulkan image with a specified image type, format, extent, and usage.
+This function also allows users to specify whether the image will be exportable,
+in which case the appropriate extension struct is populated based on the OS the
+program is compiled for.
+*/
 VkImage createImage(VkImageType type, VkFormat format, VkExtent3D extent,
                     VkImageUsageFlags usage, size_t mipLevels,
                     bool exportable = true) {
@@ -279,16 +469,18 @@ VkImage createImage(VkImageType type, VkFormat format, VkExtent3D extent,
   ici.extent = extent;
   ici.mipLevels = mipLevels;
   ici.arrayLayers = 1;
-  // ici.tiling = VK_IMAGE_TILING_LINEAR;
   ici.usage = usage;
   ici.sharingMode = VK_SHARING_MODE_EXCLUSIVE;
   ici.samples = VK_SAMPLE_COUNT_1_BIT;
-  // ici.initialLayout = VK_IMAGE_LAYOUT_PREINITIALIZED;
 
   VkExternalMemoryImageCreateInfo emici = {};
   if (exportable) {
     emici.sType = VK_STRUCTURE_TYPE_EXTERNAL_MEMORY_IMAGE_CREATE_INFO;
+#ifdef _WIN32
+    emici.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
+#else
     emici.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
+#endif
 
     ici.pNext = &emici;
   }
@@ -301,6 +493,12 @@ VkImage createImage(VkImageType type, VkFormat format, VkExtent3D extent,
   return image;
 }
 
+/*
+Allocate `size` of device memory of the specified memory type.
+This function also allows users to specify whether the memory will be
+exportable, in which case the appropriate extension struct is populated based on
+the OS the program is compiled for.
+*/
 VkDeviceMemory allocateDeviceMemory(size_t size, uint32_t memoryTypeIndex,
                                     bool exportable = true) {
   VkMemoryAllocateInfo mai = {};
@@ -311,8 +509,11 @@ VkDeviceMemory allocateDeviceMemory(size_t size, uint32_t memoryTypeIndex,
   VkExportMemoryAllocateInfo emai = {};
   if (exportable) {
     emai.sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO;
+#ifdef _WIN32
+    emai.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
+#else
     emai.handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT;
-
+#endif
     mai.pNext = &emai;
   }
 
@@ -321,9 +522,14 @@ VkDeviceMemory allocateDeviceMemory(size_t size, uint32_t memoryTypeIndex,
     std::cerr << "Could not allocate device memory!\n";
     return VK_NULL_HANDLE;
   }
+
   return memory;
 }
 
+/*
+Retrieve the image memory type index for the Vulkan device based on the memory
+property flags passed.
+*/
 uint32_t getImageMemoryTypeIndex(VkImage image, VkMemoryPropertyFlags flags,
                                  VkMemoryRequirements &memRequirements) {
   vkGetImageMemoryRequirements(vk_device, image, &memRequirements);
@@ -341,6 +547,10 @@ uint32_t getImageMemoryTypeIndex(VkImage image, VkMemoryPropertyFlags flags,
   return 0;
 }
 
+/*
+Retrieve the buffer memory type index for the Vulkan device based on the memory
+property flags passed.
+*/
 uint32_t getBufferMemoryTypeIndex(VkBuffer buffer,
                                   VkMemoryPropertyFlags flags) {
   VkMemoryRequirements memRequirements;
@@ -359,6 +569,10 @@ uint32_t getBufferMemoryTypeIndex(VkBuffer buffer,
   return 0;
 }
 
+/*
+Destroy Vulkan objects.
+This function is called towards the end of Vulkan program execution.
+*/
 VkResult cleanup() {
 
   if (vk_computeQueueFamilyIndex == vk_transferQueueFamilyIndex) {
@@ -379,6 +593,57 @@ VkResult cleanup() {
   return VK_SUCCESS;
 }
 
+#ifdef _WIN32
+
+/*
+Export a win32 handle for a given Vulkan device memory allocation.
+Prints an error and returns 0 when the export function pointer is not set.
+*/
+HANDLE getMemoryWin32Handle(VkDeviceMemory memory) {
+  // Without the extension entry point there is nothing to call.
+  if (vk_GetMemoryWin32HandleKHR == nullptr) {
+    std::cerr << "Could not get win32 handle!\n";
+    return 0;
+  }
+
+  // Describe which allocation to export and as which handle type.
+  VkMemoryGetWin32HandleInfoKHR mgwhi = {};
+  mgwhi.sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR;
+  mgwhi.memory = memory;
+  mgwhi.handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT;
+
+  HANDLE retHandle = 0;
+  VK_CHECK_CALL(vk_GetMemoryWin32HandleKHR(vk_device, &mgwhi, &retHandle));
+  return retHandle;
+}
+
+/*
+Retrieve a win32 handle for a given Vulkan semaphore object.
+Prints an error and returns 0 when the export function pointer is not set.
+*/
+HANDLE getSemaphoreWin32Handle(VkSemaphore semaphore) {
+  HANDLE retHandle = 0;
+
+  VkSemaphoreGetWin32HandleInfoKHR sghwi = {};
+  sghwi.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_WIN32_HANDLE_INFO_KHR;
+  sghwi.semaphore = semaphore;
+  sghwi.handleType = VK_EXTERNAL_SEMAPHORE_HANDLE_TYPE_OPAQUE_WIN32_BIT;
+
+  if (vk_getSemaphoreWin32HandleKHR != nullptr) {
+    VK_CHECK_CALL(vk_getSemaphoreWin32HandleKHR(vk_device, &sghwi, &retHandle));
+  } else {
+    std::cerr << "Could not get semaphore win32 handle!\n";
+    return 0;
+  }
+
+  return retHandle;
+}
+
+#else
+
+/*
+Retrieve an opaque file descriptor handle for a given Vulkan memory allocation.
+*/
 int getMemoryOpaqueFD(VkDeviceMemory memory) {
   VkMemoryGetFdInfoKHR mgfi = {};
   mgfi.sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR;
@@ -388,10 +653,17 @@ int getMemoryOpaqueFD(VkDeviceMemory memory) {
   int fd = 0;
   if (vk_getMemoryFdKHR != nullptr) {
     VK_CHECK_CALL(vk_getMemoryFdKHR(vk_device, &mgfi, &fd));
+  } else {
+    std::cerr << "Could not get memory opaque file descriptor!\n";
+    return 0;
   }
+
   return fd;
 }
 
+/*
+Retrieve an opaque file descriptor handle for a given Vulkan semaphore object.
+*/
 int getSemaphoreOpaqueFD(VkSemaphore semaphore) {
   VkSemaphoreGetFdInfoKHR sgfi = {};
   sgfi.sType = VK_STRUCTURE_TYPE_SEMAPHORE_GET_FD_INFO_KHR;
@@ -401,10 +673,20 @@ int getSemaphoreOpaqueFD(VkSemaphore semaphore) {
   int fd = 0;
   if (vk_getSemaphoreFdKHR != nullptr) {
     VK_CHECK_CALL(vk_getSemaphoreFdKHR(vk_device, &sgfi, &fd));
+  } else {
+    std::cerr << "Could not get semaphore opaque file descriptor!\n";
+    return 0;
   }
+
   return fd;
 }
+#endif
 
+/*
+Populate a generic image memory barrier for a specific Vulkan image.
+This function assumes we are transitioning from an undefined image layout to a
+general image layout, which is sufficient for our current Vulkan tests.
+*/
 auto createImageMemoryBarrier(VkImage &img, size_t mipLevels) {
   VkImageMemoryBarrier barrierInput = {};
   barrierInput.sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER;
@@ -421,6 +703,14 @@ auto createImageMemoryBarrier(VkImage &img, size_t mipLevels) {
   return barrierInput;
 }
 
+/*
+This struct contains Vulkan resources used in test files, and is used to
+simplify the code within these tests.
+The constructor creates images, allocates device memory required for those
+images, and binds that memory to the created image.
+The destructor cleans up the memory allocations and destroys the image and
+staging buffer used to transfer data to that image.
+*/
 struct vulkan_image_test_resources_t {
   VkImage vkImage;
   VkDeviceMemory imageMemory;
@@ -462,6 +752,10 @@ struct vulkan_image_test_resources_t {
   }
 };
 
+/*
+Convert a SYCL image channel order and image channel type to a corresponding
+Vulkan format.
+*/
 VkFormat to_vulkan_format(sycl::image_channel_order order,
                           sycl::image_channel_type channel_type) {
   if (channel_type == sycl::image_channel_type::signed_int8) {
diff --git a/sycl/test-e2e/lit.cfg.py b/sycl/test-e2e/lit.cfg.py
index effda4949e0a8..9a1676da29bb7 100644
--- a/sycl/test-e2e/lit.cfg.py
+++ b/sycl/test-e2e/lit.cfg.py
@@ -464,7 +464,60 @@
     )
 
 if "cuda:gpu" in config.sycl_devices:
+    if "CUDA_PATH" not in os.environ:
+        if platform.system() == "Windows":
+            cuda_root = r"C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA"
+            cuda_versions = []
+            if os.path.exists(cuda_root):
+                for entry in os.listdir(cuda_root):
+                    if os.path.isdir(
+                        os.path.join(cuda_root, entry)
+                    ) and entry.startswith("v"):
+                        version = entry[1:]  # Remove the leading 'v'
+                        if re.match(r"^\d+\.\d+$", version):  # e.g. 12.3
+                            cuda_versions.append(version)
+            # max() raises ValueError on an empty list; leave CUDA_PATH unset.
+            if cuda_versions:
+                latest_cuda_version = max(
+                    cuda_versions, key=lambda v: [int(i) for i in v.split(".")]
+                )
+                os.environ["CUDA_PATH"] = os.path.join(
+                    cuda_root, f"v{latest_cuda_version}"
+                )
+        else:
+            cuda_root = "/usr/local"
+            cuda_versions = []
+            if os.path.exists(cuda_root):
+                for entry in os.listdir(cuda_root):
+                    if os.path.isdir(
+                        os.path.join(cuda_root, entry)
+                    ) and entry.startswith("cuda-"):
+                        version = entry.split("-")[1]
+                        if re.match(r"^\d+\.\d+$", version):  # e.g. 12.3
+                            cuda_versions.append(version)
+            # max() raises ValueError on an empty list; leave CUDA_PATH unset.
+            if cuda_versions:
+                latest_cuda_version = max(
+                    cuda_versions, key=lambda v: [int(i) for i in v.split(".")]
+                )
+                os.environ["CUDA_PATH"] = os.path.join(
+                    cuda_root, f"cuda-{latest_cuda_version}"
+                )
+
+    if "CUDA_PATH" not in os.environ:
+        lit_config.error("Cannot run tests for CUDA without valid CUDA_PATH.")
+
     llvm_config.with_system_environment("CUDA_PATH")
+    if platform.system() == "Windows":
+        config.cuda_libs_dir = (
+            '"' + os.path.join(os.environ["CUDA_PATH"], r"lib\x64") + '"'
+        )
+        config.cuda_include = (
+            '"' + os.path.join(os.environ["CUDA_PATH"], "include") + '"'
+        )
+    else:
+        config.cuda_libs_dir = os.path.join(os.environ["CUDA_PATH"], r"lib64")
+        config.cuda_include = os.path.join(os.environ["CUDA_PATH"], "include")
 
 # FIXME: This needs to be made per-device as well, possibly with a helper.
 if "hip:gpu" in config.sycl_devices and config.hip_platform == "AMD":
@@ -636,6 +689,7 @@
 
     dev_aspects = []
     dev_sg_sizes = []
+    architectures = set()
     # See format.py's parse_min_intel_driver_req for explanation.
     is_intel_driver = False
     intel_driver_ver = {}
@@ -660,6 +714,9 @@
             # str.removeprefix isn't universally available...
             sg_sizes_str = line.strip().replace("info::device::sub_group_sizes: ", "")
             dev_sg_sizes.append(sg_sizes_str.strip().split(" "))
+        if re.match(r" *Architecture:", line):
+            _, architecture = line.strip().split(":", 1)
+            architectures.add(architecture.strip())
 
     if dev_aspects == []:
         lit_config.error(
@@ -685,11 +742,32 @@
     sg_sizes = set(dev_sg_sizes[0]).intersection(*dev_sg_sizes)
     lit_config.note("SG sizes for {}: {}".format(sycl_device, ", ".join(sg_sizes)))
 
+    # For fpga, the architecture reported by sycl-ls is always unknown,
+    # because sycl_ext_oneapi_device_architecture does not yet specify any
+    # fpga architectures. Skip adding architecture features
+    # in this case.
+    if sycl_device == "opencl:fpga":
+        architectures = set()
+    else:
+        lit_config.note(
+            "Architectures for {}: {}".format(sycl_device, ", ".join(architectures))
+        )
+        if len(architectures) != 1 or "unknown" in architectures:
+            if not config.allow_unknown_arch:
+                lit_config.error(
+                    "Cannot detect architecture for {}\nstdout:\n{}\nstderr:\n{}".format(
+                        sycl_device, sp.stdout, sp.stderr
+                    )
+                )
+            architectures = set()
+
     aspect_features = set("aspect-" + a for a in aspects)
     sg_size_features = set("sg-" + s for s in sg_sizes)
+    architecture_feature = set("architecture-" + s for s in architectures)
     features = set()
     features.update(aspect_features)
     features.update(sg_size_features)
+    features.update(architecture_feature)
 
     be, dev = sycl_device.split(":")
     features.add(dev.replace("fpga", "accelerator"))
diff --git a/sycl/test-e2e/lit.site.cfg.py.in b/sycl/test-e2e/lit.site.cfg.py.in
index f11f9e8356371..ae7dd6dfdbe7e 100644
--- a/sycl/test-e2e/lit.site.cfg.py.in
+++ b/sycl/test-e2e/lit.site.cfg.py.in
@@ -43,6 +43,7 @@ config.vulkan_lib = "@Vulkan_LIBRARY@"
 config.vulkan_found = "@Vulkan_FOUND@"
 
 config.run_launcher = lit_config.params.get('run_launcher', "@SYCL_E2E_RUN_LAUNCHER@")
+config.allow_unknown_arch = "@SYCL_E2E_LIT_ALLOW_UNKNOWN_ARCH@"
 
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_arith.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_arith.cpp
index 30a8853aed05e..a2a4dcf187de0 100644
--- a/sycl/test-e2e/syclcompat/atomic/atomic_arith.cpp
+++ b/sycl/test-e2e/syclcompat/atomic/atomic_arith.cpp
@@ -32,9 +32,7 @@
 
 // UNSUPPORTED: hip
 
-// FIXME: Remove "-fsycl-device-code-split=per_kernel" option after fixing
-// https://github.com/intel/llvm/issues/12743.
-// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} -fsycl-device-code-split=per_kernel %s -o %t.out
+// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
 // RUN: %{run} %t.out
 
 #include <cstddef>
diff --git a/sycl/test-e2e/syclcompat/atomic/atomic_comp_exchange.cpp b/sycl/test-e2e/syclcompat/atomic/atomic_comp_exchange.cpp
index ce66db044ee8a..5378fffb7724b 100644
--- a/sycl/test-e2e/syclcompat/atomic/atomic_comp_exchange.cpp
+++ b/sycl/test-e2e/syclcompat/atomic/atomic_comp_exchange.cpp
@@ -32,9 +32,7 @@
 
 // UNSUPPORTED: hip
 
-// FIXME: Remove "-fsycl-device-code-split=per_kernel" option after fixing
-// https://github.com/intel/llvm/issues/12743.
-// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} -fsycl-device-code-split=per_kernel %s -o %t.out
+// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
 // RUN: %{run} %t.out
 
 #include <type_traits>
diff --git a/sycl/test-e2e/syclcompat/device/device.cpp b/sycl/test-e2e/syclcompat/device/device.cpp
index 9e4c8edcd91c9..0ba1ce6d1e729 100644
--- a/sycl/test-e2e/syclcompat/device/device.cpp
+++ b/sycl/test-e2e/syclcompat/device/device.cpp
@@ -359,6 +359,39 @@ void test_max_nd_range() {
 #endif
 }
 
+void test_list_devices() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+  DeviceTestsFixt dtf;
+
+  // Redirect std::cout to count new lines
+  CountingStream countingBuf(std::cout.rdbuf());
+  std::streambuf *orig_buf = std::cout.rdbuf();
+  std::cout.rdbuf(&countingBuf);
+
+  syclcompat::list_devices();
+
+  // Restore back std::cout
+  std::cout.rdbuf(orig_buf);
+
+  // Expected one line per device
+  assert(countingBuf.get_line_count() == dtf.get_n_devices());
+}
+
+void test_device_count() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+
+  unsigned int count = syclcompat::device_count();
+  assert(count > 0);
+}
+
+void test_get_device_id() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+
+  sycl::device dev = syclcompat::get_device(0);
+  unsigned int id = syclcompat::get_device_id(dev);
+  assert(id == 0);
+}
+
 int main() {
   test_at_least_one_device();
   test_matches_id();
@@ -377,6 +410,9 @@ int main() {
   test_version_parsing();
   test_image_max_attrs();
   test_max_nd_range();
+  test_list_devices();
+  test_device_count();
+  test_get_device_id();
 
   return 0;
 }
diff --git a/sycl/test-e2e/syclcompat/device/device_filter.cpp b/sycl/test-e2e/syclcompat/device/device_filter.cpp
new file mode 100644
index 0000000000000..3f03432401b0a
--- /dev/null
+++ b/sycl/test-e2e/syclcompat/device/device_filter.cpp
@@ -0,0 +1,78 @@
+/***************************************************************************
+ *
+ *  Copyright (C) Codeplay Software Ltd.
+ *
+ *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
+ *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCLcompat API
+ *
+ *  device_filter.cpp
+ *
+ *  Description:
+ *    Device filtering tests
+ **************************************************************************/
+
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %{run} %t.out
+
+#include <syclcompat/device.hpp>
+
+void test_filtering_existing_device() {
+  auto &dev = syclcompat::get_current_device();
+  std::string dev_name = dev.get_info<sycl::info::device::name>();
+
+  syclcompat::filter_device({dev_name});
+  try {
+    syclcompat::get_device_id(dev);
+  } catch (std::runtime_error const &e) {
+    std::cout << "  Unexpected SYCL exception caught: " << e.what()
+              << std::endl;
+    assert(0);
+  }
+
+  // Checks for a substring of the device as well
+  std::string dev_substr = dev_name.substr(1, dev_name.find(" ") + 2);
+  syclcompat::filter_device({dev_substr});
+  try {
+    syclcompat::get_device_id(dev);
+  } catch (std::runtime_error const &e) {
+    std::cout << "  Unexpected SYCL exception caught: " << e.what()
+              << std::endl;
+    assert(0);
+  }
+}
+
+void test_filter_devices() {
+  auto &dev = syclcompat::get_current_device();
+
+  assert(syclcompat::detail::dev_mgr::instance().device_count() > 0);
+
+  syclcompat::filter_device({"NON-EXISTENT DEVICE"});
+  assert(syclcompat::detail::dev_mgr::instance().device_count() == 0);
+
+  try {
+    syclcompat::get_device_id(dev);
+    assert(0);
+  } catch (std::runtime_error const &e) {
+    std::cout << "  Expected SYCL exception caught: " << e.what() << std::endl;
+  }
+}
+
+int main() {
+  // syclcompat::dev_mgr is a singleton, so any changes to the device list are
+  // permanent between tests. Run these in isolation instead of relying on them
+  // being the last tests in a different test suite.
+  test_filtering_existing_device();
+
+  test_filter_devices();
+
+  return 0;
+}
diff --git a/sycl/test-e2e/syclcompat/device/device_fixt.hpp b/sycl/test-e2e/syclcompat/device/device_fixt.hpp
index 3a68eaf2317f1..7588eb71dd5b9 100644
--- a/sycl/test-e2e/syclcompat/device/device_fixt.hpp
+++ b/sycl/test-e2e/syclcompat/device/device_fixt.hpp
@@ -32,7 +32,7 @@ class DeviceTestsFixt {
 
 public:
   DeviceTestsFixt()
-      : n_devices{syclcompat::detail::dev_mgr::instance().device_count()},
+      : n_devices{syclcompat::device_count()},
         def_q_{syclcompat::get_default_queue()} {}
 
   unsigned int get_n_devices() { return n_devices; }
@@ -50,3 +50,32 @@ class DeviceExtFixt {
 
   syclcompat::device_ext &get_dev_ext() { return dev_; }
 };
+
+// Stream buffer that forwards all output to `buf` while counting newlines.
+// Used to wrap std::cout and count lines printed by syclcompat::list_devices.
+class CountingStream : public std::streambuf {
+public:
+  CountingStream(std::streambuf *buf) : buf(buf), line_count(0) {}
+
+  // Single-character path. EOF means "nothing to write": never forward it.
+  int overflow(int c) override {
+    if (traits_type::eq_int_type(c, traits_type::eof()))
+      return traits_type::not_eof(c);
+    if (c == '\n')
+      ++line_count;
+    return buf->sputc(traits_type::to_char_type(c));
+  }
+
+  // Bulk path: count the newlines in s[0, count), then forward the block.
+  std::streamsize xsputn(const char_type *s, std::streamsize count) override {
+    for (std::streamsize i = 0; i < count; ++i)
+      line_count += (s[i] == '\n');
+    return buf->sputn(s, count);
+  }
+
+  int get_line_count() const { return line_count; }
+
+private:
+  std::streambuf *buf; // Wrapped buffer (not owned).
+  int line_count;      // Number of '\n' characters seen so far.
+};
diff --git a/sycl/test-e2e/syclcompat/math/math_bfe.cpp b/sycl/test-e2e/syclcompat/math/math_bfe.cpp
new file mode 100644
index 0000000000000..b8f59ead1e277
--- /dev/null
+++ b/sycl/test-e2e/syclcompat/math/math_bfe.cpp
@@ -0,0 +1,182 @@
+/***************************************************************************
+ *
+ *  Copyright (C) Codeplay Software Ltd.
+ *
+ *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
+ *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCLcompat API
+ *
+ *  math_bfe.cpp
+ *
+ *  Description:
+ *    math bitfield extract tests
+ **************************************************************************/
+
+// ===----------- math_bfe.cpp ------------------ -*- C++ -* --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===---------------------------------------------------------------------===//
+
+// RUN: %clangxx -std=c++17 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %{run} %t.out
+
+#include <bitset>
+#include <chrono>
+#include <iostream>
+#include <limits.h>
+#include <random>
+#include <stdint.h>
+#include <sycl/detail/core.hpp>
+#include <syclcompat/math.hpp>
+#include <type_traits>
+#include <vector>
+
+template <typename T>
+inline std::enable_if_t<std::is_integral_v<T>, T>
+bfe_slow(const T source, const uint32_t bit_start, const uint32_t num_bits) {
+  // Reference bit field extract: take num_bits bits of source starting at
+  // bit_start, padding with the field's sign bit (always 0 for unsigned T).
+  constexpr uint32_t width = CHAR_BIT * sizeof(T);
+  constexpr uint32_t top = width - 1;
+
+  // An empty field always extracts to zero.
+  if (num_bits == 0)
+    return 0ULL;
+
+  const std::bitset<width> in_bits(source);
+  // The sign bit is the field's highest bit, clamped to the top bit of T.
+  T fill = 0;
+  if (!std::is_unsigned_v<T>)
+    fill = in_bits[std::min(bit_start + num_bits - 1, top)];
+
+  // A field starting past the top bit consists solely of the replicated sign
+  // bit; negating 1 yields the all-ones pattern.
+  if (bit_start > top)
+    return -fill;
+
+  std::bitset<width> out_bits;
+  for (uint32_t bit = 0; bit < width; ++bit) {
+    const bool in_field = bit < num_bits && bit_start + bit <= top;
+    out_bits[bit] = in_field ? in_bits[bit_start + bit] : fill;
+  }
+  return out_bits.to_ullong();
+}
+
+template <typename T> bool test(const char *Msg, int N) {
+  uint32_t bit_width = CHAR_BIT * sizeof(T);
+  T min_value = std::numeric_limits<T>::min();
+  T max_value = std::numeric_limits<T>::max();
+  std::random_device rd;
+  std::mt19937::result_type seed =
+      rd() ^
+      ((std::mt19937::result_type)
+           std::chrono::duration_cast<std::chrono::seconds>(
+               std::chrono::system_clock::now().time_since_epoch())
+               .count() +
+       (std::mt19937::result_type)
+           std::chrono::duration_cast<std::chrono::microseconds>(
+               std::chrono::high_resolution_clock::now().time_since_epoch())
+               .count());
+
+  std::mt19937 gen(seed);
+  std::uniform_int_distribution<T> rd_source(min_value, max_value);
+
+  // Define a small overshoot so that we adequately test out-of-range cases
+  // without sacrificing depth of testing of valid start+length combinations
+  constexpr uint32_t overshoot = 2;
+  std::uniform_int_distribution<uint32_t> rd_start(0, bit_width + overshoot);
+  std::uniform_int_distribution<uint32_t> rd_length(0, bit_width + overshoot);
+
+  std::vector<T> sources(N, 0);
+  std::vector<T> compat_results(N, 0);
+  std::vector<T> slow_results(N, 0);
+  std::vector<uint32_t> starts(N, 0);
+  std::vector<uint32_t> lengths(N, 0);
+  for (int i = 0; i < N; ++i) {
+    sources[i] = rd_source(gen);
+    starts[i] = rd_start(gen);
+    lengths[i] = rd_length(gen);
+  }
+
+  sycl::buffer<T, 1> source_buffer(sources.data(), N);
+  sycl::buffer<T, 1> compat_results_buffer(compat_results.data(), N);
+  sycl::buffer<T, 1> slow_results_buffer(slow_results.data(), N);
+  sycl::buffer<uint32_t, 1> starts_buffer(starts.data(), N);
+  sycl::buffer<uint32_t, 1> lengths_buffer(lengths.data(), N);
+
+  sycl::queue que;
+  que.submit([&](sycl::handler &handler) {
+    sycl::accessor source_accessor(source_buffer, handler, sycl::read_only);
+    sycl::accessor start_accessor(starts_buffer, handler, sycl::read_only);
+    sycl::accessor length_accessor(lengths_buffer, handler, sycl::read_only);
+    sycl::accessor compat_result_accessor(compat_results_buffer, handler,
+                                          sycl::write_only);
+    handler.parallel_for(N, [=](sycl::id<1> i) {
+      compat_result_accessor[i] = syclcompat::bfe_safe<T>(
+          source_accessor[i], start_accessor[i], length_accessor[i]);
+    });
+  });
+
+  que.submit([&](sycl::handler &handler) {
+    sycl::accessor source_accessor(source_buffer, handler, sycl::read_only);
+    sycl::accessor start_accessor(starts_buffer, handler, sycl::read_only);
+    sycl::accessor length_accessor(lengths_buffer, handler, sycl::read_only);
+    sycl::accessor slow_result_accessor(slow_results_buffer, handler,
+                                        sycl::write_only);
+    handler.parallel_for(N, [=](sycl::id<1> i) {
+      slow_result_accessor[i] = bfe_slow<T>(
+          source_accessor[i], start_accessor[i], length_accessor[i]);
+    });
+  });
+
+  que.wait_and_throw();
+  sycl::host_accessor source_accessor(source_buffer, sycl::read_only);
+  sycl::host_accessor start_accessor(starts_buffer, sycl::read_only);
+  sycl::host_accessor length_accessor(lengths_buffer, sycl::read_only);
+  sycl::host_accessor compat_result_accessor(compat_results_buffer,
+                                             sycl::read_only);
+  sycl::host_accessor slow_result_accessor(slow_results_buffer,
+                                           sycl::read_only);
+
+  int failed = 0;
+  for (int i = 0; i < N; ++i) {
+    if (compat_result_accessor[i] != slow_result_accessor[i]) {
+      failed++;
+      std::cout << "[source = " << source_accessor[i]
+                << ", bit_start = " << start_accessor[i]
+                << ", num_bits = " << length_accessor[i] << "] failed, expect "
+                << slow_result_accessor[i] << " but got "
+                << compat_result_accessor[i] << std::endl;
+    }
+  }
+  std::cout << "===============" << std::endl;
+  std::cout << "Test: " << Msg << std::endl;
+  std::cout << "Total: " << N << std::endl;
+  std::cout << "Success: " << N - failed << std::endl;
+  std::cout << "Failed: " << failed << std::endl;
+  std::cout << "===============" << std::endl;
+  return !failed;
+}
+
+int main() {
+  const int N = 1000;
+  assert(test<int16_t>("int16", N));
+  assert(test<uint16_t>("uint16", N));
+  assert(test<int32_t>("int32", N));
+  assert(test<uint32_t>("uint32", N));
+  assert(test<int64_t>("int64", N));
+  assert(test<uint64_t>("uint64", N));
+  return 0;
+}
diff --git a/sycl/test-e2e/syclcompat/math/math_bfi.cpp b/sycl/test-e2e/syclcompat/math/math_bfi.cpp
new file mode 100644
index 0000000000000..7f39bd8508ac4
--- /dev/null
+++ b/sycl/test-e2e/syclcompat/math/math_bfi.cpp
@@ -0,0 +1,167 @@
+/***************************************************************************
+ *
+ *  Copyright (C) Codeplay Software Ltd.
+ *
+ *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
+ *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCLcompat API
+ *
+ *  math_bfi.cpp
+ *
+ *  Description:
+ *    math bitfield insert tests
+ **************************************************************************/
+
+// ===----------- math_bfi.cpp ------------------ -*- C++ -*- -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===---------------------------------------------------------------------===//
+
+// RUN: %clangxx -std=c++17 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %{run} %t.out
+
+#include <bitset>
+#include <chrono>
+#include <iostream>
+#include <limits.h>
+#include <random>
+#include <stdint.h>
+#include <sycl/detail/core.hpp>
+#include <syclcompat/math.hpp>
+#include <type_traits>
+#include <vector>
+
+// Bit-by-bit reference bitfield insert; the oracle bfi_safe is compared against.
+template <typename T>
+inline std::enable_if_t<std::is_unsigned_v<T>, T>
+bfi_slow(const T x, const T y, const uint32_t bit_start,
+         const uint32_t num_bits) {
+  constexpr uint32_t width = CHAR_BIT * sizeof(T);
+  const uint32_t first = bit_start & 0xff; // only the low byte is honoured
+  const uint32_t count = num_bits & 0xff;  // only the low byte is honoured
+  std::bitset<width> src(x), dst(y);
+  for (uint32_t k = 0; k != count && first + k < width; ++k)
+    dst[first + k] = src[k]; // low bits of x overwrite y, clipped at the MSB
+  return dst.to_ullong();
+}
+
+// Runs bfi_safe<T> and bfi_slow<T> on N random inputs; true if all agree.
+template <typename T> bool test(const char *Msg, int N) {
+  uint32_t bit_width = CHAR_BIT * sizeof(T);
+  std::random_device rd; // seed is printed in the summary: inputs vary per run
+  std::mt19937::result_type seed =
+      rd() ^
+      ((std::mt19937::result_type)
+           std::chrono::duration_cast<std::chrono::seconds>(
+               std::chrono::system_clock::now().time_since_epoch())
+               .count() +
+       (std::mt19937::result_type)
+           std::chrono::duration_cast<std::chrono::microseconds>(
+               std::chrono::high_resolution_clock::now().time_since_epoch())
+               .count());
+
+  std::mt19937 gen(seed);
+  std::uniform_int_distribution<T> rd_source(std::numeric_limits<T>::min(),
+                                             std::numeric_limits<T>::max());
+  // Define a small overshoot so that we adequately test out-of-range cases
+  // without sacrificing depth of testing of valid start+length combinations
+  constexpr uint32_t overshoot = 2;
+  std::uniform_int_distribution<uint32_t> rd_start(0, bit_width + overshoot);
+  std::uniform_int_distribution<uint32_t> rd_length(0, bit_width + overshoot);
+
+  std::vector<T> x(N, 0);
+  std::vector<T> y(N, 0);
+  std::vector<T> compat_results(N, 0);
+  std::vector<T> slow_results(N, 0);
+  std::vector<uint32_t> starts(N, 0);
+  std::vector<uint32_t> lengths(N, 0);
+  for (int i = 0; i < N; ++i) {
+    x[i] = rd_source(gen);
+    y[i] = rd_source(gen);
+    starts[i] = rd_start(gen);
+    lengths[i] = rd_length(gen);
+  }
+
+  sycl::buffer<T, 1> x_buffer(x.data(), N);
+  sycl::buffer<T, 1> y_buffer(y.data(), N);
+  sycl::buffer<T, 1> compat_results_buffer(compat_results.data(), N);
+  sycl::buffer<T, 1> slow_results_buffer(slow_results.data(), N);
+  sycl::buffer<uint32_t, 1> starts_buffer(starts.data(), N);
+  sycl::buffer<uint32_t, 1> lengths_buffer(lengths.data(), N);
+
+  sycl::queue que;
+  que.submit([&](sycl::handler &handler) { // kernel 1: implementation under test
+    sycl::accessor x_accessor(x_buffer, handler, sycl::read_only);
+    sycl::accessor y_accessor(y_buffer, handler, sycl::read_only);
+    sycl::accessor start_accessor(starts_buffer, handler, sycl::read_only);
+    sycl::accessor length_accessor(lengths_buffer, handler, sycl::read_only);
+    sycl::accessor compat_result_accessor(compat_results_buffer, handler,
+                                          sycl::write_only);
+    handler.parallel_for(N, [=](sycl::id<1> i) {
+      compat_result_accessor[i] = syclcompat::bfi_safe<T>(
+          x_accessor[i], y_accessor[i], start_accessor[i], length_accessor[i]);
+    });
+  });
+
+  que.submit([&](sycl::handler &handler) { // kernel 2: bit-by-bit reference
+    sycl::accessor x_accessor(x_buffer, handler, sycl::read_only);
+    sycl::accessor y_accessor(y_buffer, handler, sycl::read_only);
+    sycl::accessor start_accessor(starts_buffer, handler, sycl::read_only);
+    sycl::accessor length_accessor(lengths_buffer, handler, sycl::read_only);
+    sycl::accessor slow_result_accessor(slow_results_buffer, handler,
+                                        sycl::write_only);
+    handler.parallel_for(N, [=](sycl::id<1> i) {
+      slow_result_accessor[i] = bfi_slow<T>(
+          x_accessor[i], y_accessor[i], start_accessor[i], length_accessor[i]);
+    });
+  });
+
+  que.wait_and_throw();
+  sycl::host_accessor x_accessor(x_buffer, sycl::read_only);
+  sycl::host_accessor y_accessor(y_buffer, sycl::read_only);
+  sycl::host_accessor start_accessor(starts_buffer, sycl::read_only);
+  sycl::host_accessor length_accessor(lengths_buffer, sycl::read_only);
+  sycl::host_accessor compat_result_accessor(compat_results_buffer,
+                                             sycl::read_only);
+  sycl::host_accessor slow_result_accessor(slow_results_buffer,
+                                           sycl::read_only);
+
+  int failed = 0;
+  for (int i = 0; i < N; ++i) {
+    if (compat_result_accessor[i] != slow_result_accessor[i]) {
+      failed++;
+      std::cout << "[x = " << x_accessor[i] << ", y = " << y_accessor[i]
+                << ", bit_start = " << start_accessor[i]
+                << ", num_bits = " << length_accessor[i] << "] failed, expect "
+                << slow_result_accessor[i] << " but got "
+                << compat_result_accessor[i] << std::endl;
+    }
+  }
+  std::cout << "===============" << std::endl;
+  std::cout << "Test: " << Msg << " (seed " << seed << ")" << std::endl;
+  std::cout << "Total: " << N << std::endl;
+  std::cout << "Success: " << N - failed << std::endl;
+  std::cout << "Failed: " << failed << std::endl;
+  std::cout << "===============" << std::endl;
+  return !failed;
+}
+
+int main() { // Non-zero exit on failure; does not rely on assert() (NDEBUG).
+  const int N = 1000;
+  bool passed = test<uint16_t>("uint16", N);
+  passed = test<uint32_t>("uint32", N) && passed; // call first: never skipped
+  passed = test<uint64_t>("uint64", N) && passed;
+  return passed ? 0 : 1;
+}
diff --git a/sycl/test-e2e/syclcompat/math/math_byte_dot_product.cpp b/sycl/test-e2e/syclcompat/math/math_byte_dot_product.cpp
new file mode 100644
index 0000000000000..08da3b957bf94
--- /dev/null
+++ b/sycl/test-e2e/syclcompat/math/math_byte_dot_product.cpp
@@ -0,0 +1,300 @@
+/***************************************************************************
+ *
+ *  Copyright (C) Codeplay Software Ltd.
+ *
+ *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
+ *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCLcompat API
+ *
+ *  math_byte_dot_product.cpp
+ *
+ *  Description:
+ *    Dp4a and Dp2a tests
+ **************************************************************************/
+
+// ===----------- math_dp2a_dp4a.cpp ------------ -*- C++ -*- -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===---------------------------------------------------------------------===//
+
+// RUN: %clangxx -std=c++17 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %{run} %t.out
+
+#include <sycl/detail/core.hpp>
+#include <syclcompat.hpp>
+
+// Element count of a built-in array, deduced at compile time.
+template <typename Elem, size_t Count>
+constexpr size_t array_size(Elem (&)[Count]) { return Count; }
+
+template <typename T1, typename T2> struct TestCaseStorage {
+  T1 a; // first argument of the tested call
+  T2 b; // second argument of the tested call
+  syclcompat::dot_product_acc_t<T1, T2> c; // third argument of the tested call
+  syclcompat::dot_product_acc_t<T1, T2> d; // expected result
+};
+
+enum TestType { dp2a_lo = 0, dp2a_hi = 1, dp4a = 2 }; // selects the tested API
+
+template <TestType Kind, typename A, typename B> struct TestCase; // data table
+
+template <> struct TestCase<dp2a_lo, int32_t, int32_t> {
+  static constexpr TestCaseStorage<int32_t, int32_t> data[] = {
+      {930681129, 370772529, 2010968336, 2009507875},
+      {-182801821, 2018321974, -1344607006, -1345896544},
+      {-1405866995, -56456331, 2028627921, 2032457214},
+      {-2067420235, 667032387, -1549633870, -1551931432},
+      {150264517, 1499579728, 1168148523, 1167250815},
+      {-1488693248, 590983308, -1132841811, -1133637779},
+      {1952352829, 1541328881, -867130079, -868137584},
+      {1402917188, -396551268, 682657336, 684431698},
+      {1060076168, 2095822351, 266994190, 266267760},
+      {-597525506, 329411575, -760256038, -761517342},
+  }; // rows: {a, b, c, expected dp2a_lo(a, b, c)}
+};
+
+template <> struct TestCase<dp2a_lo, int32_t, uint32_t> {
+  static constexpr TestCaseStorage<int32_t, uint32_t> data[] = {
+      {-1784870143, 3550903701, 929114859, 926130217},
+      {-906522442, 2115573780, -1285980330, -1286882122},
+      {1391650851, 4107608479, 273580150, 273309541},
+      {-1501013502, 3932674350, -905231285, -909141521},
+      {-304683280, 2104603303, -790552087, -792451259},
+      {-1341822015, 615507964, -1323598253, -1321376558},
+      {351927836, 264881689, -495668280, -494617318},
+      {-151229742, 3617293176, 628248961, 631228133},
+      {302881625, 4164956791, -1904446304, -1907281527},
+      {2037447091, 4048192261, -200189002, -196124539},
+  }; // rows: {a, b, c, expected dp2a_lo(a, b, c)}
+};
+
+template <> struct TestCase<dp2a_lo, uint32_t, int32_t> {
+  static constexpr TestCaseStorage<uint32_t, int32_t> data[] = {
+      {3526794897, 1440743042, 370074542, 364852196},
+      {262513653, 298144108, 1265851732, 1270709221},
+      {1130955292, -963349034, -2078791855, -2076795466},
+      {2514054142, -1350622828, 257209474, 255489619},
+      {2734618833, -2039216829, 1170234974, 1174711303},
+      {2679502652, -552107997, 1516795981, 1513777921},
+      {2178722429, 1706794257, -1207356382, -1209905573},
+      {2938336684, 1853682464, 1478700448, 1479081561},
+      {4131007422, 88852262, 949301283, 946133869},
+      {1426380125, 1310424908, 2110346787, 2117262011},
+  }; // rows: {a, b, c, expected dp2a_lo(a, b, c)}
+};
+
+template <> struct TestCase<dp2a_lo, uint32_t, uint32_t> {
+  static constexpr TestCaseStorage<uint32_t, uint32_t> data[] = {
+      {261879580, 462533001, 1244651601, 1254025336},
+      {3613440709, 39532914, 3612331201, 3620924635},
+      {2613678921, 3074075559, 2197617435, 2210733821},
+      {3858700825, 2932114399, 651043516, 660246528},
+      {3641490311, 1203902590, 1264123439, 1271505857},
+      {620567, 198432492, 1750593890, 1757851164},
+      {1924357490, 2672674441, 363874491, 372965679},
+      {575741870, 365675828, 4077327301, 4079479666},
+      {779333090, 1461441270, 3936527378, 3949974932},
+      {3047663397, 3117692984, 3095767416, 3100767768},
+  }; // rows: {a, b, c, expected dp2a_lo(a, b, c)}
+};
+
+template <> struct TestCase<dp2a_hi, int32_t, int32_t> {
+  static constexpr TestCaseStorage<int32_t, int32_t> data[] = {
+      {2033148131, 1987852344, 1836738289, 1843474575},
+      {1854766635, -847369228, 570647947, 573274270},
+      {1221789280, -1504599082, 2039564501, 2038018823},
+      {1815893957, 522593320, -1194398972, -1192686202},
+      {-942058619, -1694947839, -1791401709, -1790085126},
+      {1261876252, -722935661, -401441440, -401822344},
+      {-1276948036, -2045446196, 883626458, 886422108},
+      {-1043904041, 1660095151, 924853314, 923046533},
+      {1873342481, -183952166, 1422494064, 1422142929},
+      {1548579097, 388816020, 1306723060, 1308540459},
+  }; // rows: {a, b, c, expected dp2a_hi(a, b, c)}
+};
+template <> struct TestCase<dp2a_hi, int32_t, uint32_t> {
+  static constexpr TestCaseStorage<int32_t, uint32_t> data[] = {
+      {925779231, 2297216285, -2134129287, -2128032131},
+      {1226362493, 592978070, 1394319934, 1393859454},
+      {-820606485, 3315032306, -1946036979, -1953068392},
+      {865550467, 2594266420, 684086152, 688778945},
+      {2042373655, 2279820469, 330650825, 337071442},
+      {-803475029, 3557524416, 570180628, 567540937},
+      {-1920282536, 4207418946, -179074286, -188839786},
+      {-1611807508, 2012850000, -45410323, -52103004},
+      {-209217908, 3249694139, -1047805020, -1053226557},
+      {-938134420, 4023147013, -1637223186, -1637906791},
+  }; // rows: {a, b, c, expected dp2a_hi(a, b, c)}
+};
+
+template <> struct TestCase<dp2a_hi, uint32_t, int32_t> {
+  static constexpr TestCaseStorage<uint32_t, int32_t> data[] = {
+      {1465064346, -987065627, 511196861, 510174688},
+      {423752047, -2037616892, 1367127780, 1359169438},
+      {1732089906, 1660637927, 835046327, 837441559},
+      {3240032526, -687279473, 314878829, 313361935},
+      {2028889232, 453690876, -1579929106, -1578835800},
+      {636106821, 1932111966, -1143803023, -1142096228},
+      {1744753942, 2120462197, 543738507, 552493329},
+      {1952094085, 75134480, -1870017090, -1865165688},
+      {1238028676, -368589994, 400410492, 400370364},
+      {1678354325, 1520837888, 900538674, 898982394},
+  }; // rows: {a, b, c, expected dp2a_hi(a, b, c)}
+};
+
+template <> struct TestCase<dp2a_hi, uint32_t, uint32_t> {
+  static constexpr TestCaseStorage<uint32_t, uint32_t> data[] = {
+      {3407045239, 1034879260, 1566081712, 1573664144},
+      {1019854071, 319089899, 2048645673, 2049134832},
+      {3484748932, 23066577, 2279969923, 2280327476},
+      {772761490, 593919853, 110217101, 113334214},
+      {3040024654, 3302072533, 3503588845, 3513981095},
+      {247428909, 1708258743, 3414468907, 3421226563},
+      {3214691207, 2264421274, 2096321799, 2107689847},
+      {1978412244, 3523914401, 3482699206, 3489153446},
+      {845968593, 3600665955, 3398632658, 3406090055},
+      {2655885278, 642147090, 953440990, 957702400},
+  }; // rows: {a, b, c, expected dp2a_hi(a, b, c)}
+};
+
+template <> struct TestCase<dp4a, int32_t, int32_t> {
+  static constexpr TestCaseStorage<int32_t, int32_t> data[] = {
+      {-1190208646, 231822748, 1361188354, 1361171428},
+      {-1897923580, -1660380472, -882257438, -882246232},
+      {-579619596, 1428550082, -686850248, -686847084},
+      {1276672648, 1193117464, 963222686, 963211136},
+      {-1511270552, 346453515, 539470060, 539466436},
+      {-1731107400, 30416897, 1116161329, 1116166641},
+      {314175584, 917356905, 1924209306, 1924227259},
+      {601261287, 461003584, -332185426, -332202489},
+      {451422378, 1069445579, 2077503598, 2077515898},
+      {1601425114, -1009494442, -12279717, -12298140},
+  }; // rows: {a, b, c, expected dp4a(a, b, c)}
+};
+
+template <> struct TestCase<dp4a, int32_t, uint32_t> {
+  static constexpr TestCaseStorage<int32_t, uint32_t> data[] = {
+      {851192907, 4159889898, -1560201465, -1560178121},
+      {-383662874, 94554831, -1699007777, -1699020048},
+      {319925525, 3224159406, -1636209897, -1636218115},
+      {390273202, 3538403320, 1599902512, 1599908059},
+      {-2133436013, 2204709798, -745513793, -745548526},
+      {-1365042624, 302260610, 1683641121, 1683648451},
+      {839091651, 3945553885, 18130274, 18116990},
+      {-92392216, 2135215000, -886668361, -886653647},
+      {-968453153, 2050948958, 1992996892, 1992963259},
+      {-234768205, 3930595068, -2067724845, -2067749613},
+  }; // rows: {a, b, c, expected dp4a(a, b, c)}
+};
+
+template <> struct TestCase<dp4a, uint32_t, int32_t> {
+  static constexpr TestCaseStorage<uint32_t, int32_t> data[] = {
+      {908604347, 1279608234, -1450969803, -1450975502},
+      {1784598592, 892171050, -824564831, -824528375},
+      {3414325281, 110856089, 1344013863, 1343984032},
+      {3589641407, 1110466407, 269001016, 269060567},
+      {3064317481, -1629226109, -733249792, -733278528},
+      {3599941523, 2112627078, 1626729914, 1626742113},
+      {1503610658, 885664480, 1900050896, 1900048832},
+      {2314829379, -2127096242, 1568300547, 1568304841},
+      {2817858008, -384307221, 307309401, 307306234},
+      {1408389703, 1080046077, -535563057, -535530708},
+  }; // rows: {a, b, c, expected dp4a(a, b, c)}
+};
+
+template <> struct TestCase<dp4a, uint32_t, uint32_t> {
+  static constexpr TestCaseStorage<uint32_t, uint32_t> data[] = {
+      {3065883002, 1618319527, 3160878852, 3160964499},
+      {750408200, 2617984089, 2072985277, 2073000475},
+      {1703570544, 1174656448, 1981665359, 1981717351},
+      {2526801072, 968400189, 821887370, 821972228},
+      {4033238565, 2506370972, 1177018849, 1177074623},
+      {2340922922, 2952738658, 316397016, 316469012},
+      {2559339202, 800262553, 1317311402, 1317374242},
+      {991496487, 2323953615, 2007618737, 2007639899},
+      {3918465905, 1041229499, 2826819834, 2826860086},
+      {4028147698, 2068172524, 482675182, 482797872}}; // {a, b, c, expected}
+};
+
+// Runs each TestCase<Type, T1, T2> row on the device; true if all match `d`.
+template <TestType Type, typename T1, typename T2> bool test() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+  using Table = TestCase<Type, T1, T2>;
+  using Row = std::remove_cv_t<std::remove_extent_t<decltype(Table::data)>>;
+  using Acc = syclcompat::dot_product_acc_t<T1, T2>;
+  constexpr size_t N = array_size(Table::data);
+
+  std::vector<Row> rows(std::begin(Table::data), std::end(Table::data));
+  std::vector<Acc> outputs(N);
+  sycl::buffer<Row, 1> row_buf(rows.data(), N);
+  sycl::buffer<Acc, 1> out_buf(outputs.data(), N);
+
+  sycl::queue q;
+  q.submit([&](sycl::handler &cgh) {
+    sycl::accessor in(row_buf, cgh, sycl::read_only);
+    sycl::accessor out(out_buf, cgh, sycl::write_only);
+    cgh.parallel_for(N, [=](sycl::id<1> idx) {
+      const auto &row = in[idx];
+      if constexpr (Type == dp2a_lo)
+        out[idx] = syclcompat::dp2a_lo<T1, T2>(row.a, row.b, row.c);
+      else if constexpr (Type == dp2a_hi)
+        out[idx] = syclcompat::dp2a_hi<T1, T2>(row.a, row.b, row.c);
+      else
+        out[idx] = syclcompat::dp4a<T1, T2>(row.a, row.b, row.c);
+    });
+  });
+  q.wait_and_throw();
+
+  sycl::host_accessor rows_host(row_buf, sycl::read_only);
+  sycl::host_accessor out_host(out_buf, sycl::read_only);
+  int failed = 0;
+  for (size_t idx = 0; idx != N; ++idx) {
+    const auto &row = rows_host[idx];
+    if (row.d == out_host[idx])
+      continue;
+    ++failed;
+    std::cout << "  [a = " << row.a << ", b = " << row.b << ", c = " << row.c
+              << "] failed, expect " << row.d << " but got " << out_host[idx]
+              << std::endl;
+  }
+
+  if (failed != 0) { // the summary is printed only when something went wrong
+    std::cout << "  Total: " << N << std::endl;
+    std::cout << "  Success: " << N - failed << std::endl;
+    std::cout << "  Failed: " << failed << std::endl;
+  }
+  return failed == 0;
+}
+
+int main() { // Runs every variant, then reports the outcome via exit status.
+  bool passed = true;
+  passed = test<dp2a_lo, int32_t, int32_t>() && passed;
+  passed = test<dp2a_lo, int32_t, uint32_t>() && passed;
+  passed = test<dp2a_lo, uint32_t, int32_t>() && passed;
+  passed = test<dp2a_lo, uint32_t, uint32_t>() && passed;
+
+  passed = test<dp2a_hi, int32_t, int32_t>() && passed;
+  passed = test<dp2a_hi, int32_t, uint32_t>() && passed;
+  passed = test<dp2a_hi, uint32_t, int32_t>() && passed;
+  passed = test<dp2a_hi, uint32_t, uint32_t>() && passed;
+
+  passed = test<dp4a, int32_t, int32_t>() && passed;
+  passed = test<dp4a, int32_t, uint32_t>() && passed;
+  passed = test<dp4a, uint32_t, int32_t>() && passed;
+  passed = test<dp4a, uint32_t, uint32_t>() && passed;
+
+  // assert() is compiled out under NDEBUG, so fail through the exit status.
+  return passed ? 0 : 1;
+}
diff --git a/sycl/test-e2e/syclcompat/math/math_extend_v_2.cpp b/sycl/test-e2e/syclcompat/math/math_extend_v_2.cpp
new file mode 100644
index 0000000000000..256cedc4602f2
--- /dev/null
+++ b/sycl/test-e2e/syclcompat/math/math_extend_v_2.cpp
@@ -0,0 +1,464 @@
+/***************************************************************************
+ *
+ *  Copyright (C) Codeplay Software Ltd.
+ *
+ *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
+ *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCLcompat API
+ *
+ *  math_extend_v_2.cpp
+ *
+ *  Description:
+ *    math extend 2-vectorized helpers tests
+ **************************************************************************/
+
+// ===------------- math_extend_vfunc_2.cpp ----------------*- C++ -*-----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===---------------------------------------------------------------------===//
+
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %{run} %t.out
+
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <stdio.h>
+#include <sycl/detail/core.hpp>
+
+#include <syclcompat/device.hpp>
+#include <syclcompat/math.hpp>
+#include <syclcompat/memory.hpp>
+
+#define CHECK(S, REF)                                                          \
+  do { /* do-while(0): expands to one statement */                            \
+    auto ret = (S); /* evaluate S exactly once */                              \
+    if (ret != (REF)) {                                                        \
+      return {#S, (REF)}; /* expression text, expected value */                \
+    }                                                                          \
+  } while (0)
+
+std::pair<const char *, int> vadd2() { // first failed check, else {nullptr, 0}
+  CHECK(syclcompat::extend_vadd2<int32_t>(0x0001FFFF, 0x00010005, 0),
+        0x00020004);
+  CHECK(syclcompat::extend_vadd2<int32_t>(0x7FFF7FFF, 0x00010001, 0),
+        0x80008000);
+  CHECK(syclcompat::extend_vadd2_sat<int32_t>(0x7FFF7FFF, 0x00010001, 0),
+        0x7FFF7FFF);
+
+  CHECK(syclcompat::extend_vadd2<uint32_t>(0x00010002, 0x00020003, 0),
+        0x00030005);
+  CHECK(syclcompat::extend_vadd2<uint32_t>(0xFFFEFFFF, 0x00030003, 0),
+        0x00010002);
+  CHECK(syclcompat::extend_vadd2_sat<uint32_t>((uint32_t)0xFFFEFFFF,
+                                               (uint32_t)0x00030003, 0),
+        0xFFFFFFFF);
+  return {nullptr, 0}; // every CHECK passed
+}
+
+std::pair<const char *, int> vsub2() {
+  // extend_vsub2 / extend_vsub2_sat cases; CHECK returns at the first mismatch.
+  CHECK(syclcompat::extend_vsub2<int32_t>(0x0001FFFF, 0xFFFF0001, 0),
+        0x0002FFFE);
+  // Testing API & Saturated API with mixed types
+  CHECK(syclcompat::extend_vsub2<int32_t>((int32_t)0x7FFFFFFD,
+                                          (int32_t)0xFFFA7FFF, 0),
+        0x80057FFE);
+  CHECK(syclcompat::extend_vsub2<int32_t>((uint32_t)0x7FFFFFFD,
+                                          (uint32_t)0xFFFA7FFF, 0),
+        0x80057FFE);
+  CHECK(syclcompat::extend_vsub2<int32_t>((uint32_t)0x7FFFFFFD,
+                                          (int32_t)0xFFFA7FFF, 0),
+        0x80057FFE);
+  CHECK(syclcompat::extend_vsub2<int32_t>((int32_t)0x7FFFFFFD,
+                                          (uint32_t)0xFFFA7FFF, 0),
+        0x80057FFE);
+  CHECK(syclcompat::extend_vsub2_sat<int32_t>((int32_t)0x7FFFFFFD,
+                                              (int32_t)0xFFFA7FFF, 0),
+        0x7FFF8000);
+  CHECK(syclcompat::extend_vsub2_sat<int32_t>((uint32_t)0x7FFFFFFD,
+                                              (uint32_t)0xFFFA7FFF, 0),
+        0x80057FFE);
+  CHECK(syclcompat::extend_vsub2_sat<int32_t>((int32_t)0x7FFFFFFD,
+                                              (uint32_t)0xFFFA7FFF, 0),
+        0x80058000);
+  CHECK(syclcompat::extend_vsub2_sat<int32_t>((uint32_t)0x7FFFFFFD,
+                                              (int32_t)0xFFFA7FFF, 0),
+        0x7FFF7FFE);
+
+  CHECK(syclcompat::extend_vsub2<uint32_t>(0x0002000B, 0x0001000A, 0),
+        0x00010001);
+  CHECK(syclcompat::extend_vsub2<uint32_t>((uint32_t)0x00010001,
+                                           (uint32_t)0x0002FFFF, 0),
+        0xFFFF0002);
+  CHECK(syclcompat::extend_vsub2<uint32_t>((int32_t)0x00010001,
+                                           (int32_t)0x0002FFFF, 0),
+        0xFFFF0002);
+  CHECK(syclcompat::extend_vsub2_sat<uint32_t>((uint32_t)0x00010001,
+                                               (uint32_t)0x0002FFFF, 0),
+        0x00000000);
+  CHECK(syclcompat::extend_vsub2_sat<uint32_t>((int32_t)0x00010001,
+                                               (int32_t)0x0002FFFF, 0),
+        0x00000002);
+
+  return {nullptr, 0}; // every CHECK passed
+}
+
+std::pair<const char *, int> vadd2_add() {
+  // extend_vadd2_add cases; CHECK returns at the first mismatch.
+  CHECK(syclcompat::extend_vadd2_add<int32_t>(0x00010002, 0x00030004, 1),
+        0x0000000B);
+  CHECK(syclcompat::extend_vadd2_add<int32_t>(0x0001FFFF, 0x0002FFFE, -1),
+        0xFFFFFFFF);
+  CHECK(syclcompat::extend_vadd2_add<int32_t>(0x00017FFF, 0x00017FFF, 1),
+        0x00010001);
+
+  CHECK(syclcompat::extend_vadd2_add<uint32_t>(0x00010002, 0x00030004, 1),
+        0x0000000B);
+  CHECK(syclcompat::extend_vadd2_add<uint32_t>((uint32_t)0x0001FFFF,
+                                               (uint32_t)0x0002FFFF, 1),
+        0x00020002);
+  CHECK(syclcompat::extend_vadd2_add<uint32_t>(0x0001FFFF, 0x0002FFFF, 1),
+        0x00000002);
+
+  return {nullptr, 0}; // every CHECK passed
+}
+
+std::pair<const char *, int> vsub2_add() {
+  // extend_vsub2_add cases; CHECK returns at the first mismatch.
+  // Testing API with mixed types
+  CHECK(syclcompat::extend_vsub2_add<int32_t>((int32_t)0x0001FFFF,
+                                              (int32_t)0xFFFF0001, 1),
+        1);
+  CHECK(syclcompat::extend_vsub2_add<int32_t>((uint32_t)0x7FFFFFFD,
+                                              (uint32_t)0xFFFA7FFF, -1),
+        0x00000002);
+  CHECK(syclcompat::extend_vsub2_add<int32_t>((int32_t)0x7FFFFFFD,
+                                              (int32_t)0xFFFA7FFF, -1),
+        0x00000002);
+  CHECK(syclcompat::extend_vsub2_add<int32_t>((int32_t)0x7FFFFFFD,
+                                              (uint32_t)0xFFFA7FFF, -1),
+        0xFFFF0002);
+  CHECK(syclcompat::extend_vsub2_add<int32_t>((uint32_t)0x7FFFFFFD,
+                                              (int32_t)0xFFFA7FFF, -1),
+        0x00010002);
+
+  CHECK(syclcompat::extend_vsub2_add<uint32_t>(0x0002000B, 0x0001000A, 1),
+        0x00000003);
+  CHECK(syclcompat::extend_vsub2_add<uint32_t>(0x00010001, 0x0002FFFF, 3),
+        0x00000004);
+
+  return {nullptr, 0}; // every CHECK passed
+}
+
+std::pair<const char *, int> vabsdiff2() {
+  // extend_vabsdiff2 / _sat cases; CHECK returns at the first mismatch.
+  CHECK(syclcompat::extend_vabsdiff2<int32_t>((int32_t)0xFFFF0001,
+                                              (int32_t)0x0003FFFF, 0),
+        0x00040002);
+  CHECK(syclcompat::extend_vabsdiff2<int32_t>((int32_t)0x80000002,
+                                              (int32_t)0x00010001, 0),
+        0x80010001);
+  CHECK(syclcompat::extend_vabsdiff2_sat<int32_t>((int32_t)0x80000002,
+                                                  (int32_t)0x00010001, 0),
+        0x7FFF0001);
+
+  CHECK(syclcompat::extend_vabsdiff2<uint32_t>(0x00010004, 0x00030002, 0),
+        0x00020002);
+  CHECK(syclcompat::extend_vabsdiff2<uint32_t>((uint32_t)0xFFFF0001,
+                                               (int32_t)0xFFFE0003, 0),
+        0x00010002);
+  CHECK(syclcompat::extend_vabsdiff2_sat<uint32_t>((uint32_t)0xFFFF0001,
+                                                   (int32_t)0xFFFE0003, 0),
+        0xFFFF0002);
+
+  return {nullptr, 0}; // every CHECK passed
+}
+
+std::pair<const char *, int> vabsdiff2_add() {
+  // extend_vabsdiff2_add cases; CHECK returns at the first mismatch.
+  CHECK(syclcompat::extend_vabsdiff2_add<int32_t>((int32_t)0xFFFF0001,
+                                                  (int32_t)0x0003FFFF, -2),
+        0x00000004);
+
+  CHECK(syclcompat::extend_vabsdiff2_add<uint32_t>(0x000A000C, 0x000B000A, 1),
+        0x00000004);
+
+  return {nullptr, 0}; // every CHECK passed
+}
+
+std::pair<const char *, int> vmin2() {
+  // extend_vmin2 / extend_vmin2_sat cases; CHECK returns at the first mismatch.
+  CHECK(syclcompat::extend_vmin2<int32_t>((int32_t)0xFFFF0002, 0x00010001, 0),
+        (int32_t)0xFFFF0001);
+  CHECK(syclcompat::extend_vmin2_sat<int32_t>(0x0002FFF1, 0x0001FFF2, 0),
+        0x0001FFF1);
+
+  CHECK(syclcompat::extend_vmin2<uint32_t>(0x000A000D, 0x000B000C, 0),
+        0x000A000C);
+  CHECK(syclcompat::extend_vmin2_sat<uint32_t>(0x0002FFF1, 0x0001FFF2, 0),
+        0x00010000);
+
+  return {nullptr, 0}; // every CHECK passed
+}
+
+std::pair<const char *, int> vmax2() {
+  // extend_vmax2 / extend_vmax2_sat cases; CHECK returns at the first mismatch.
+  CHECK(syclcompat::extend_vmax2<int32_t>((int32_t)0xFFFF0002, 0x00010001, 0),
+        0x00010002);
+  CHECK(syclcompat::extend_vmax2_sat<int32_t>(0x80008000, 0x00018001, 0),
+        0x7FFF7FFF);
+
+  CHECK(syclcompat::extend_vmax2<uint32_t>(0x000A000D, 0x000B000C, 0),
+        0x000B000D);
+  CHECK(syclcompat::extend_vmax2_sat<uint32_t>(0x0002FFF1, 0x0001FFF2, 0),
+        0x00020000);
+
+  return {nullptr, 0}; // every CHECK passed
+}
+
+std::pair<const char *, int> vmin2_vmax2_add() {
+  // extend_vmin2_add / extend_vmax2_add cases; CHECK returns at first mismatch.
+  CHECK(
+      syclcompat::extend_vmin2_add<int32_t>((int32_t)0xFFFF0002, 0x00010001, 2),
+      0x00000002);
+  CHECK(syclcompat::extend_vmin2_add<uint32_t>(0x000A000D, 0x000B000C, 2),
+        0x00000018);
+
+  CHECK(syclcompat::extend_vmax2_add<int32_t>((int32_t)0xFFFF0002, 0x00010001,
+                                              -2),
+        0x00000001);
+  CHECK(syclcompat::extend_vmax2_add<uint32_t>(0x000A000D, 0x000B000C, 2),
+        0x0000001A);
+
+  return {nullptr, 0}; // every CHECK passed
+}
+
+std::pair<const char *, int> vavrg2() {
+  // extend_vavrg2 / extend_vavrg2_sat cases; CHECK returns at first mismatch.
+  CHECK(syclcompat::extend_vavrg2<int32_t>((int32_t)0xFFFFFFF6, 0x0005FFFA, 0),
+        0x0002FFF8);
+  CHECK(syclcompat::extend_vavrg2_sat<int32_t>((int32_t)0xFFFFFFF6, 0x0005FFFA,
+                                               0),
+        0x0002FFF8);
+
+  CHECK(syclcompat::extend_vavrg2<uint32_t>(0x00010006, 0x00030001, 0),
+        0x00020004);
+  CHECK(syclcompat::extend_vavrg2_sat<uint32_t>(0x00010006, 0x00030001, 0),
+        0x00020004);
+
+  return {nullptr, 0}; // every CHECK passed
+}
+
+std::pair<const char *, int> vavrg2_add() {
+  // extend_vavrg2_add cases; CHECK returns at the first mismatch.
+  CHECK(syclcompat::extend_vavrg2_add<int32_t>((int32_t)0xFFFFFFF6, 0x0005FFFA,
+                                               -2),
+        0xFFFFFFF8);
+
+  CHECK(syclcompat::extend_vavrg2_add<uint32_t>(0x00010006, 0x00030002, 2),
+        0x00000008);
+
+  return {nullptr, 0}; // every CHECK passed
+}
+
+std::pair<const char *, int> vcompare2() {
+  // extend_vcompare2 with each std comparison functor; stops at first mismatch.
+  CHECK(syclcompat::extend_vcompare2(0x0002FFFF, 0x0001FFFF, std::greater<>()),
+        (unsigned)0x00010000);
+  CHECK(syclcompat::extend_vcompare2((uint32_t)0x0002FFFF, (int32_t)0x0001FFFF,
+                                     std::greater<>()),
+        (unsigned)0x00010001);
+  CHECK(syclcompat::extend_vcompare2((int32_t)0x0002FFFF, (uint32_t)0x0001FFFF,
+                                     std::greater<>()),
+        (unsigned)0x00010000);
+
+  CHECK(syclcompat::extend_vcompare2(0x0002FFFF, 0x0001FFFF, std::less<>()),
+        (unsigned)0x00000000);
+  CHECK(syclcompat::extend_vcompare2(0x0002FFFF, 0x0002FFFF,
+                                     std::greater_equal<>()),
+        (unsigned)0x00010001);
+  CHECK(
+      syclcompat::extend_vcompare2(0x0002FFFF, 0x0001FFFF, std::less_equal<>()),
+      (unsigned)0x00000001);
+  CHECK(syclcompat::extend_vcompare2(0xFFFE0002, 0xFFFF0002, std::equal_to<>()),
+        (unsigned)0x00000001);
+  CHECK(syclcompat::extend_vcompare2(0xFFFE0002, 0xFFFF0002,
+                                     std::not_equal_to<>()),
+        (unsigned)0x00010000);
+
+  return {nullptr, 0}; // every CHECK passed
+}
+
+std::pair<const char *, int> vcompare2_add() {
+  // extend_vcompare2_add with each std comparison functor; stops at mismatch.
+  CHECK(syclcompat::extend_vcompare2_add(0x0002FFFF, 0x0001FFFF, 1,
+                                         std::greater<>()),
+        (unsigned)0x00000002);
+  CHECK(syclcompat::extend_vcompare2_add(0x0002FFFF, 0x0001FFFF, 2,
+                                         std::less<>()),
+        (unsigned)0x00000002);
+  CHECK(syclcompat::extend_vcompare2_add(0x0002FFFF, 0x0002FFFF, 1,
+                                         std::greater_equal<>()),
+        (unsigned)0x00000003);
+  CHECK(syclcompat::extend_vcompare2_add(0x0002FFFF, 0x0001FFFF, 2,
+                                         std::less_equal<>()),
+        (unsigned)0x00000003);
+  CHECK(syclcompat::extend_vcompare2_add(0xFFFE0002, 0xFFFF0002, 0xFFFF,
+                                         std::equal_to<>()),
+        (unsigned)0x00010000);
+  CHECK(syclcompat::extend_vcompare2_add(0xFFFE0002, 0xFFFF0002, 0xFF,
+                                         std::not_equal_to<>()),
+        (unsigned)0x00000100);
+
+  return {nullptr, 0}; // every CHECK passed
+}
+
+void test(const sycl::stream &s, int *ec) {
+  { // On failure each group logs to s, sets *ec to its index and returns.
+    auto res = vadd2();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 1;
+      return;
+    }
+    s << "vadd2 check passed!\n";
+  }
+  {
+    auto res = vsub2();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 2;
+      return;
+    }
+    s << "vsub2 check passed!\n";
+  }
+  {
+    auto res = vadd2_add();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 3;
+      return;
+    }
+    s << "vadd2_add check passed!\n";
+  }
+  {
+    auto res = vsub2_add();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 4;
+      return;
+    }
+    s << "vsub2_add check passed!\n";
+  }
+  {
+    auto res = vabsdiff2();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 5;
+      return;
+    }
+    s << "vabsdiff2 check passed!\n";
+  }
+  {
+    auto res = vmin2();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 6;
+      return;
+    }
+    s << "vmin2 check passed!\n";
+  }
+  {
+    auto res = vmax2();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 7;
+      return;
+    }
+    s << "vmax2 check passed!\n";
+  }
+  {
+    auto res = vmin2_vmax2_add();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 8;
+      return;
+    }
+    s << "vmin2_add/vmax2_add check passed!\n";
+  }
+  {
+    auto res = vavrg2();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 9;
+      return;
+    }
+    s << "vavrg2 check passed!\n";
+  }
+  {
+    auto res = vavrg2_add();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 10;
+      return;
+    }
+    s << "vavrg2_add check passed!\n";
+  }
+  {
+    auto res = vabsdiff2_add();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 11;
+      return;
+    }
+    s << "vabsdiff2_add check passed!\n";
+  }
+  {
+    auto res = vcompare2();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 12;
+      return;
+    }
+    s << "vcompare2 check passed!\n";
+  }
+  {
+    auto res = vcompare2_add();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 13;
+      return;
+    }
+    s << "vcompare2_add check passed!\n";
+  }
+  *ec = 0; // every group passed
+}
+
+int main() {
+  sycl::queue q = syclcompat::get_default_queue();
+  int *ec = syclcompat::malloc<int>(1);
+  syclcompat::fill<int>(ec, 0, 1);
+  q.submit([&](sycl::handler &cgh) {
+    sycl::stream out(1024, 256, cgh);
+    cgh.parallel_for(1, [=](sycl::item<1> it) { test(out, ec); });
+  });
+  q.wait_and_throw();
+
+  int ec_h;
+  syclcompat::memcpy<int>(&ec_h, ec, 1);
+
+  return ec_h;
+}
diff --git a/sycl/test-e2e/syclcompat/math/math_extend_v_4.cpp b/sycl/test-e2e/syclcompat/math/math_extend_v_4.cpp
new file mode 100644
index 0000000000000..a2ac657000fbb
--- /dev/null
+++ b/sycl/test-e2e/syclcompat/math/math_extend_v_4.cpp
@@ -0,0 +1,479 @@
+/***************************************************************************
+ *
+ *  Copyright (C) Codeplay Software Ltd.
+ *
+ *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
+ *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCLcompat API
+ *
+ *  math_extend_v_4.cpp
+ *
+ *  Description:
+ *    math extend 4-vectorized helpers tests
+ **************************************************************************/
+
+// ===------------- math_extend_vfunc_4.cpp ----------------*- C++ -*-----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===---------------------------------------------------------------------===//
+
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %{run} %t.out
+
+#include <cmath>
+#include <cstdint>
+#include <limits>
+#include <stdio.h>
+#include <sycl/detail/core.hpp>
+
+#include <syclcompat/device.hpp>
+#include <syclcompat/math.hpp>
+#include <syclcompat/memory.hpp>
+
+#define CHECK(S, REF)                                                          \
+  {                                                                            \
+    auto ret = S;                                                              \
+    if (ret != REF) {                                                          \
+      return {#S, REF};                                                        \
+    }                                                                          \
+  }
+
+std::pair<const char *, int> vadd4() { // extend_vadd4 / extend_vadd4_sat cases
+  CHECK(syclcompat::extend_vadd4<int32_t>(0x0102FFFE, 0x01FF02FF, 0),
+        0x020101FD);
+  CHECK(syclcompat::extend_vadd4<int32_t>((int32_t)0x7E81FEFF,
+                                          (int32_t)0x02FD03FF, 0),
+        0x807E01FE);
+  CHECK(syclcompat::extend_vadd4<int32_t>((uint32_t)0x7E81FEFF,
+                                          (uint32_t)0x02FD03FF, 0),
+        0x807E01FE);
+  CHECK(syclcompat::extend_vadd4<int32_t>((uint32_t)0x7E81FEFF,
+                                          (int32_t)0x02FD03FF, 0),
+        0x807E01FE);
+  CHECK(syclcompat::extend_vadd4<int32_t>((int32_t)0x7E81FEFF,
+                                          (uint32_t)0x02FD03FF, 0),
+        0x807E01FE);
+  CHECK(syclcompat::extend_vadd4_sat<int32_t>((int32_t)0x7E81FEFF,
+                                              (int32_t)0x02FD03FF, 0),
+        0x7F8001FE);
+  CHECK(syclcompat::extend_vadd4_sat<int32_t>((uint32_t)0x7E81FEFF,
+                                              (uint32_t)0x02FD03FF, 0),
+        0x7F7F7F7F);
+  CHECK(syclcompat::extend_vadd4_sat<int32_t>((uint32_t)0x7E81FEFF,
+                                              (int32_t)0x02FD03FF, 0),
+        0x7F7E7F7F);
+  CHECK(syclcompat::extend_vadd4_sat<int32_t>((int32_t)0x7E81FEFF,
+                                              (uint32_t)0x02FD03FF, 0),
+        0x7F7E017F);
+  // Same operations with an unsigned (uint32_t) result type.
+  CHECK(syclcompat::extend_vadd4<uint32_t>(0x01020304, 0x0A0B0C0D, 0),
+        0x0B0D0F11);
+  CHECK(syclcompat::extend_vadd4<uint32_t>((uint32_t)0x000100FF,
+                                           (uint32_t)0x00FE0001, 0),
+        0x00FF0000);
+  CHECK(syclcompat::extend_vadd4_sat<uint32_t>((uint32_t)0x000100FF,
+                                               (uint32_t)0x00FE0001, 0),
+        0x00FF00FF);
+
+  return {nullptr, 0};
+}
+
+std::pair<const char *, int> vadd4_add() {
+  // extend_vadd4_add cases for int32_t and uint32_t result types.
+  CHECK(syclcompat::extend_vadd4_add<int32_t>(0x0102FFFE, 0x01FF02FF, 1),
+        0x00000002);
+  CHECK(syclcompat::extend_vadd4_add<int32_t>((int32_t)0x7E81FEFF,
+                                              (int32_t)0x02FD03FF, -1),
+        0xFFFFFFFC);
+  CHECK(syclcompat::extend_vadd4_add<int32_t>((uint32_t)0x7E81FEFF,
+                                              (uint32_t)0x02FD03FF, -1),
+        0x000004FC);
+  CHECK(syclcompat::extend_vadd4_add<int32_t>((uint32_t)0x7E81FEFF,
+                                              (int32_t)0x02FD03FF, -1),
+        0x000002FC);
+  CHECK(syclcompat::extend_vadd4_add<int32_t>((int32_t)0x7E81FEFF,
+                                              (uint32_t)0x02FD03FF, -1),
+        0x000001FC);
+
+  CHECK(syclcompat::extend_vadd4_add<uint32_t>(0x01020304, 0x01000100, 1),
+        0x0000000D);
+  CHECK(syclcompat::extend_vadd4_add<uint32_t>((uint32_t)0x000100FF,
+                                               (uint32_t)0x00FE0001, 1),
+        0x00000200);
+
+  return {nullptr, 0};
+}
+
+std::pair<const char *, int> vsub4() {
+  // extend_vsub4 / extend_vsub4_sat cases for int32_t and uint32_t results.
+  CHECK(syclcompat::extend_vsub4<int32_t>((int32_t)0x0102FFFF,
+                                          (int32_t)0x020101FE, 0),
+        0xFF01FE01);
+  CHECK(syclcompat::extend_vsub4<int32_t>((int32_t)0x01807F10, 0x0102FE10, 0),
+        0x007E8100);
+  CHECK(
+      syclcompat::extend_vsub4_sat<int32_t>((int32_t)0x01807F10, 0x0102FE10, 0),
+      0x00807F00);
+
+  CHECK(syclcompat::extend_vsub4<uint32_t>(0x02020C0B, 0x02010A0A, 0),
+        0x00010201);
+  CHECK(syclcompat::extend_vsub4<uint32_t>(0x01020304, 0x02040608, 0),
+        0xFFFEFDFC);
+  CHECK(syclcompat::extend_vsub4_sat<uint32_t>(0x01020304, 0x02040608, 0),
+        0x00000000);
+
+  return {nullptr, 0};
+}
+
+std::pair<const char *, int> vsub4_add() {
+  // extend_vsub4_add cases for int32_t and uint32_t result types.
+  CHECK(syclcompat::extend_vsub4_add<int32_t>((int32_t)0x0102FFFF,
+                                              (int32_t)0x020101FE, -1),
+        0xFFFFFFFE);
+  CHECK(
+      syclcompat::extend_vsub4_add<int32_t>((int32_t)0x01807F10, 0x0102FE10, 2),
+      0x00000001);
+
+  CHECK(syclcompat::extend_vsub4_add<uint32_t>(0x02020C0B, 0x02010A0A, 2),
+        0x00000006);
+  CHECK(syclcompat::extend_vsub4_add<uint32_t>(0x01020304, 0x02040608, 1),
+        0xFFFFFFF7);
+  // Same inputs as the previous case, passed explicitly as uint32_t.
+  CHECK(syclcompat::extend_vsub4_add<uint32_t>((uint32_t)0x01020304,
+                                               (uint32_t)0x02040608, 1),
+        0xFFFFFFF7);
+
+  return {nullptr, 0};
+}
+
+std::pair<const char *, int> vabsdiff4() {
+  // extend_vabsdiff4 / extend_vabsdiff4_sat cases, incl. mixed-sign operands.
+  CHECK(
+      syclcompat::extend_vabsdiff4<int32_t>((int32_t)0xFF01FF02, 0x01FF02FF, 0),
+      0x02020303);
+  CHECK(syclcompat::extend_vabsdiff4<int32_t>((int32_t)0x8002007F,
+                                              (int32_t)0x01010080, 0),
+        0x810100FF);
+  CHECK(syclcompat::extend_vabsdiff4_sat<int32_t>((int32_t)0x8002007F,
+                                                  (int32_t)0x01010080, 0),
+        0x7F01007F);
+
+  CHECK(syclcompat::extend_vabsdiff4<uint32_t>(0x01020304, 0x04030201, 0),
+        0x03010103);
+  CHECK(syclcompat::extend_vabsdiff4<uint32_t>((uint32_t)0xFEFF0001,
+                                               (int32_t)0xF0FE0003, 0),
+        0x0E010002);
+  CHECK(syclcompat::extend_vabsdiff4_sat<uint32_t>((uint32_t)0xFEFF0001,
+                                                   (int32_t)0xF0FE0003, 0),
+        0xFFFF0002);
+
+  return {nullptr, 0};
+}
+
+std::pair<const char *, int> vabsdiff4_add() {
+  // extend_vabsdiff4_add cases for int32_t and uint32_t result types.
+  CHECK(syclcompat::extend_vabsdiff4_add<int32_t>((int32_t)0xFF01FF02,
+                                                  0x01FF02FF, 1),
+        0x0000000B);
+  CHECK(syclcompat::extend_vabsdiff4_add<int32_t>((int32_t)0x8002007F,
+                                                  (int32_t)0x01010080, -1),
+        0x00000180);
+
+  CHECK(syclcompat::extend_vabsdiff4_add<uint32_t>(0x01020304, 0x04030201, 2),
+        0x0000000A);
+  CHECK(syclcompat::extend_vabsdiff4_add<uint32_t>((uint32_t)0xFEFF0001,
+                                                   (int32_t)0xF0FE0003, 1),
+        0x00000212);
+
+  return {nullptr, 0};
+}
+
+std::pair<const char *, int> vmin4() {
+  // extend_vmin4 / extend_vmin4_sat cases for int32_t and uint32_t results.
+  CHECK(syclcompat::extend_vmin4<int32_t>((int32_t)0xFFFF0102,
+                                          (int32_t)0xFE010201, 0),
+        0xFEFF0101);
+
+  CHECK(syclcompat::extend_vmin4_sat<int32_t>(0x0102FF00, 0x0201FE00, 0),
+        0x0101FE00);
+
+  CHECK(syclcompat::extend_vmin4<uint32_t>(0x010A020D, 0x000B020C, 0),
+        0x000A020C);
+
+  CHECK(syclcompat::extend_vmin4_sat<uint32_t>(0x020201FF, 0x0201FFFE, 0),
+        0x02010000);
+
+  return {nullptr, 0};
+}
+
+std::pair<const char *, int> vmax4() {
+  // extend_vmax4 / extend_vmax4_sat cases for int32_t and uint32_t results.
+  CHECK(syclcompat::extend_vmax4<int32_t>((int32_t)0xFFFF0102,
+                                          (int32_t)0xFE010201, 0),
+        0xFF010202);
+  CHECK(syclcompat::extend_vmax4_sat<int32_t>(0x0102FF00, 0x0201FE00, 0),
+        0x0202FF00);
+
+  CHECK(syclcompat::extend_vmax4<uint32_t>(0x010A020D, 0x000B020C, 0),
+        0x010B020D);
+  CHECK(syclcompat::extend_vmax4_sat<uint32_t>(0x020201FF, 0x0201FFFE, 0),
+        0x02020100);
+
+  return {nullptr, 0};
+}
+
+std::pair<const char *, int> vmin4_vmax4_add() {
+  // extend_vmin4_add and extend_vmax4_add cases (signed and unsigned results).
+  CHECK(syclcompat::extend_vmin4_add<int32_t>((int32_t)0xFFFF0102,
+                                              (int32_t)0xFE010201, -1),
+        0xFFFFFFFE);
+
+  CHECK(syclcompat::extend_vmin4_add<uint32_t>(0x010A020D, 0x000B020C, 1),
+        0x00000019);
+
+  CHECK(syclcompat::extend_vmax4_add<int32_t>((int32_t)0xFFFF0102,
+                                              (int32_t)0xFE010201, 2),
+        0x00000006);
+  CHECK(syclcompat::extend_vmax4_add<uint32_t>(0x010A020D, 0x000B020C, -1),
+        0x0000001A);
+
+  return {nullptr, 0};
+}
+
+std::pair<const char *, int> vavrg4() {
+  // extend_vavrg4 / extend_vavrg4_sat cases for int32_t and uint32_t results.
+  CHECK(syclcompat::extend_vavrg4<int32_t>((int32_t)0xFF01FF01, 0x0505FF00, 0),
+        0x0203FF01);
+  CHECK(syclcompat::extend_vavrg4_sat<int32_t>((int32_t)0xFF01FF01, 0x0505FF00,
+                                               0),
+        0x0203FF01);
+
+  CHECK(syclcompat::extend_vavrg4<uint32_t>(0x00010106, (int32_t)0xFC050101, 0),
+        (int32_t)0xFE030104);
+  CHECK(syclcompat::extend_vavrg4_sat<uint32_t>(0x00010106, (int32_t)0xFC050101,
+                                                0),
+        (int32_t)0x00030104);
+
+  return {nullptr, 0};
+}
+
+std::pair<const char *, int> vavrg4_add() {
+  // extend_vavrg4_add cases for int32_t and uint32_t result types.
+  CHECK(syclcompat::extend_vavrg4_add<int32_t>((int32_t)0xFF01FF01, 0x0505FF00,
+                                               1),
+        0x00000006);
+  CHECK(syclcompat::extend_vavrg4_add<int32_t>((int32_t)0xFF01FF01, 0x0505FF00,
+                                               -6),
+        0xFFFFFFFF);
+
+  CHECK(syclcompat::extend_vavrg4_add<uint32_t>(0x00010106, (int32_t)0xFC050101,
+                                                1),
+        (int32_t)0x00000007);
+
+  CHECK(syclcompat::extend_vavrg4_add<uint32_t>(0x00010106, (int32_t)0xFC050101,
+                                                -1),
+        (int32_t)0x00000005);
+
+  return {nullptr, 0};
+}
+
+std::pair<const char *, int> vcompare4() {
+  // extend_vcompare4 with each std comparison functor, incl. mixed signedness.
+  CHECK(syclcompat::extend_vcompare4(0x0102FEFF, 0x01FFFFFE, std::greater<>()),
+        (unsigned)0x00010001);
+  CHECK(syclcompat::extend_vcompare4((uint32_t)0x0102FEFF, (int32_t)0x01FFFFFE,
+                                     std::greater<>()),
+        (unsigned)0x00010101);
+  CHECK(syclcompat::extend_vcompare4((int32_t)0x0102FEFF, (uint32_t)0x01FFFFFE,
+                                     std::greater<>()),
+        (unsigned)0x00000000);
+
+  CHECK(syclcompat::extend_vcompare4(0x0102FEFF, 0x01FFFFFE, std::less<>()),
+        (unsigned)0x00000100);
+  CHECK(syclcompat::extend_vcompare4(0x0102FEFF, 0x01FFFFFE,
+                                     std::greater_equal<>()),
+        (unsigned)0x01010001);
+  CHECK(
+      syclcompat::extend_vcompare4(0x0102FEFF, 0x01FFFFFE, std::less_equal<>()),
+      (unsigned)0x01000100);
+  CHECK(syclcompat::extend_vcompare4(0xFFFE0102, 0xFFFF0202, std::equal_to<>()),
+        (unsigned)0x01000001);
+  CHECK(syclcompat::extend_vcompare4(0xFFFE0102, 0xFFFF0202,
+                                     std::not_equal_to<>()),
+        (unsigned)0x00010100);
+
+  return {nullptr, 0};
+}
+
+std::pair<const char *, int> vcompare4_add() {
+  // extend_vcompare4_add cases, one per std comparison functor.
+  CHECK(syclcompat::extend_vcompare4_add(0x0102FEFF, 0x01FFFFFE, 1,
+                                         std::greater<>()),
+        (unsigned)0x00000003);
+  CHECK(syclcompat::extend_vcompare4_add(0x0102FEFF, 0x01FFFFFE, 1,
+                                         std::less<>()),
+        (unsigned)0x00000002);
+  CHECK(syclcompat::extend_vcompare4_add(0x0102FEFF, 0x01FFFFFE, 2,
+                                         std::greater_equal<>()),
+        (unsigned)0x00000005);
+  CHECK(syclcompat::extend_vcompare4_add(0x0102FEFF, 0x01FFFFFE, 2,
+                                         std::less_equal<>()),
+        (unsigned)0x00000004);
+  CHECK(syclcompat::extend_vcompare4_add(0xFFFE0102, 0xFFFF0202, 0xFF,
+                                         std::equal_to<>()),
+        (unsigned)0x00000101);
+  CHECK(syclcompat::extend_vcompare4_add(0xFFFE0102, 0xFFFF0202, 0xFFFF,
+                                         std::not_equal_to<>()),
+        (unsigned)0x00010001);
+
+  return {nullptr, 0};
+}
+
+void test(const sycl::stream &s, int *ec) {
+  { // On failure each group logs to s, sets *ec to its index and returns.
+    auto res = vadd4();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 1;
+      return;
+    }
+    s << "vadd4 check passed!\n";
+  }
+  {
+    auto res = vsub4();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 2;
+      return;
+    }
+    s << "vsub4 check passed!\n";
+  }
+  {
+    auto res = vadd4_add();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 3;
+      return;
+    }
+    s << "vadd4_add check passed!\n";
+  }
+  {
+    auto res = vsub4_add();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 4;
+      return;
+    }
+    s << "vsub4_add check passed!\n";
+  }
+  {
+    auto res = vabsdiff4();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 5;
+      return;
+    }
+    s << "vabsdiff4 check passed!\n";
+  }
+  {
+    auto res = vabsdiff4_add();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 6;
+      return;
+    }
+    s << "vabsdiff4_add check passed!\n";
+  }
+  {
+    auto res = vmin4();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 7;
+      return;
+    }
+    s << "vmin4 check passed!\n";
+  }
+  {
+    auto res = vmax4();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 8;
+      return;
+    }
+    s << "vmax4 check passed!\n";
+  }
+  {
+    auto res = vmin4_vmax4_add();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 9;
+      return;
+    }
+    s << "vmin4_add/vmax4_add check passed!\n";
+  }
+  {
+    auto res = vavrg4();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 10;
+      return;
+    }
+    s << "vavrg4 check passed!\n";
+  }
+  {
+    auto res = vavrg4_add();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 11;
+      return;
+    }
+    s << "vavrg4_add check passed!\n";
+  }
+  {
+    auto res = vcompare4();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 12;
+      return;
+    }
+    s << "vcompare4 check passed!\n";
+  }
+  {
+    auto res = vcompare4_add();
+    if (res.first) {
+      s << res.first << " = " << res.second << " check failed!\n";
+      *ec = 13;
+      return;
+    }
+    s << "vcompare4_add check passed!\n";
+  }
+  *ec = 0; // every group passed
+}
+
+int main() {
+  sycl::queue q = syclcompat::get_default_queue();
+  int *ec = syclcompat::malloc<int>(1);
+  syclcompat::fill<int>(ec, 0, 1);
+  q.submit([&](sycl::handler &cgh) {
+    sycl::stream out(1024, 256, cgh);
+    cgh.parallel_for(1, [=](sycl::item<1> it) { test(out, ec); });
+  });
+  q.wait_and_throw();
+
+  int ec_h;
+  syclcompat::memcpy<int>(&ec_h, ec, 1);
+
+  return ec_h;
+}
diff --git a/sycl/test-e2e/syclcompat/math/math_ops.cpp b/sycl/test-e2e/syclcompat/math/math_ops.cpp
index 93c62f5f6ee50..c4d11547271af 100644
--- a/sycl/test-e2e/syclcompat/math/math_ops.cpp
+++ b/sycl/test-e2e/syclcompat/math/math_ops.cpp
@@ -20,9 +20,7 @@
  *    tests for non-vectorized math helper functions
  **************************************************************************/
 
-// FIXME: Remove "-fsycl-device-code-split=per_kernel" option after fixing
-// https://github.com/intel/llvm/issues/12743.
-// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} -fsycl-device-code-split=per_kernel %s -o %t.out
+// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
 // RUN: %{run} %t.out
 
 #include <syclcompat/dims.hpp>
diff --git a/sycl/test-e2e/syclcompat/memory/memory_async.cpp b/sycl/test-e2e/syclcompat/memory/memory_async.cpp
index 3eb4123014497..b2ce1f9c8304f 100644
--- a/sycl/test-e2e/syclcompat/memory/memory_async.cpp
+++ b/sycl/test-e2e/syclcompat/memory/memory_async.cpp
@@ -43,14 +43,15 @@
 
 #include "memory_fixt.hpp"
 
-// free_async is a host task, so we are really testing the event dependency here
+// enqueue_free is just a host task, so we are really testing the event
+// dependency here
 void test_free_async() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
   AsyncTest atest;
 
   float *d_D = (float *)syclcompat::malloc(sizeof(float));
   sycl::event kernel_ev = atest.launch_kernel();
-  sycl::event free_ev = syclcompat::free_async({d_D}, {kernel_ev});
+  sycl::event free_ev = syclcompat::enqueue_free({d_D}, {kernel_ev});
 
   atest.check_events(kernel_ev, free_ev);
 }
diff --git a/sycl/test-e2e/syclcompat/memory/memory_image.cpp b/sycl/test-e2e/syclcompat/memory/memory_image.cpp
new file mode 100644
index 0000000000000..5dc2ac8d5ed8a
--- /dev/null
+++ b/sycl/test-e2e/syclcompat/memory/memory_image.cpp
@@ -0,0 +1,257 @@
+/***************************************************************************
+ *
+ *  Copyright (C) Codeplay Software Ltd.
+ *
+ *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
+ *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCLcompat API
+ *
+ *  memory_image.cpp
+ *
+ *  Description:
+ *    3D memory copy tests for new image/memcpy_parameter API
+ **************************************************************************/
+
+// RUN: %clangxx -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %{run} %t.out
+
+// FIXME(@intel/syclcompat-lib-reviewers): These are some limited tests for the
+// new syclcompat::experimental::memcpy API. These aren't officially supported
+// at present, but we can test the pitched_data variants easily. Once this
+// moves out of experimental, let's test these APIs thoroughly
+
+#include <malloc.h>
+#include <stdio.h>
+#include <sycl/detail/core.hpp>
+
+#include <syclcompat/memory.hpp>
+
+#include "memory_common.hpp"
+
+void test_memcpy3D_parameter_offset() { // non-async memcpy(params) round trip
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+  size_t width = 4;
+  size_t height = 4;
+  size_t depth = 5;
+  float *h_data;
+
+  syclcompat::pitched_data d_data;
+  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
+  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
+  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
+  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
+
+  h_data =
+      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
+  /* Input volume: `depth` slices of `height` rows by `width` columns.
+    0.000000        1.000000        2.000000        3.000000
+    4.000000        5.000000        6.000000        7.000000
+    8.000000        9.000000        10.000000       11.000000
+    12.000000       13.000000       14.000000       15.000000
+
+    16.000000       17.000000       18.000000       19.000000
+    20.000000       21.000000       22.000000       23.000000
+    24.000000       25.000000       26.000000       27.000000
+    28.000000       29.000000       30.000000       31.000000
+
+    32.000000       33.000000       34.000000       35.000000
+    36.000000       37.000000       38.000000       39.000000
+    40.000000       41.000000       42.000000       43.000000
+    44.000000       45.000000       46.000000       47.000000
+
+    48.000000       49.000000       50.000000       51.000000
+    52.000000       53.000000       54.000000       55.000000
+    56.000000       57.000000       58.000000       59.000000
+    60.000000       61.000000       62.000000       63.000000
+
+    64.000000       65.000000       66.000000       67.000000
+    68.000000       69.000000       70.000000       71.000000
+    72.000000       73.000000       74.000000       75.000000
+    76.000000       77.000000       78.000000       79.000000
+  */
+  for (size_t i = 0; i < width * height * depth; i++)
+    h_data[i] = static_cast<float>(i);
+
+  /* Expected 2x2x3 sub-volume read back from offset (x=1, y=1, z=0).
+    5.000000        6.000000
+    9.000000        10.000000
+
+    21.000000       22.000000
+    25.000000       26.000000
+
+    37.000000       38.000000
+    41.000000       42.000000
+  */
+  float Ref[12] = {5, 6, 9, 10, 21, 22, 25, 26, 37, 38, 41, 42};
+
+  size_t out_width = 2;
+  size_t out_height = 2;
+  size_t out_depth = 3;
+
+  // Allocate the pitched volume; the x extent is expressed in bytes.
+  extent = sycl::range<3>(sizeof(float) * width, height, depth);
+  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent);
+
+  // copy to Device.
+  cpyParm_from_data_ct1 = syclcompat::pitched_data(
+      (void *)h_data, sizeof(float) * width, width, height);
+  cpyParm_to_data_ct1 = d_data;
+  cpyParm_size_ct1 = extent;
+
+  {
+    syclcompat::experimental::memcpy_parameter params{};
+    params.to.pitched = cpyParm_to_data_ct1;
+    params.to.pos = cpyParm_to_pos_ct1;
+    params.from.pitched = cpyParm_from_data_ct1;
+    params.from.pos = cpyParm_from_pos_ct1;
+    params.size = cpyParm_size_ct1;
+    syclcompat::experimental::memcpy(params);
+  }
+
+  cpyParm_from_pos_ct1 = {1 * sizeof(float), 1, 0}; // set offset on x/y/z.
+  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
+
+  for (size_t i = 0; i < out_width * out_height * out_depth; i++)
+    h_data[i] = -1; // overwrite the part of h_data that receives the result
+  // copy back to host.
+  cpyParm_from_data_ct1 = d_data;
+  cpyParm_to_data_ct1 = syclcompat::pitched_data(
+      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
+
+  {
+    syclcompat::experimental::memcpy_parameter params{};
+    params.to.pitched = cpyParm_to_data_ct1;
+    params.to.pos = cpyParm_to_pos_ct1;
+    params.from.pitched = cpyParm_from_data_ct1;
+    params.from.pos = cpyParm_from_pos_ct1;
+    params.size = cpyParm_size_ct1;
+    syclcompat::experimental::memcpy(params);
+  }
+
+  // Compare the copied-back sub-volume with Ref, then release both buffers.
+  check(h_data, Ref, out_width * out_height * out_depth);
+  syclcompat::free(h_data);
+  sycl::free(d_data.get_data_ptr(), syclcompat::get_default_context());
+}
+
+void test_memcpy3D_async_parameter_offset() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+  // Round trip through memcpy_async(params), waiting on the default queue.
+  size_t width = 4;
+  size_t height = 4;
+  size_t depth = 5;
+  float *h_data;
+
+  syclcompat::pitched_data d_data;
+  sycl::range<3> extent = sycl::range<3>(sizeof(float) * 1, 1, 1);
+  syclcompat::pitched_data cpyParm_from_data_ct1, cpyParm_to_data_ct1;
+  sycl::id<3> cpyParm_from_pos_ct1(0, 0, 0), cpyParm_to_pos_ct1(0, 0, 0);
+  sycl::range<3> cpyParm_size_ct1(0, 0, 0);
+
+  h_data =
+      (float *)syclcompat::malloc_host(sizeof(float) * width * height * depth);
+  /* Input volume: `depth` slices of `height` rows by `width` columns.
+    0.000000        1.000000        2.000000        3.000000
+    4.000000        5.000000        6.000000        7.000000
+    8.000000        9.000000        10.000000       11.000000
+    12.000000       13.000000       14.000000       15.000000
+
+    16.000000       17.000000       18.000000       19.000000
+    20.000000       21.000000       22.000000       23.000000
+    24.000000       25.000000       26.000000       27.000000
+    28.000000       29.000000       30.000000       31.000000
+
+    32.000000       33.000000       34.000000       35.000000
+    36.000000       37.000000       38.000000       39.000000
+    40.000000       41.000000       42.000000       43.000000
+    44.000000       45.000000       46.000000       47.000000
+
+    48.000000       49.000000       50.000000       51.000000
+    52.000000       53.000000       54.000000       55.000000
+    56.000000       57.000000       58.000000       59.000000
+    60.000000       61.000000       62.000000       63.000000
+
+    64.000000       65.000000       66.000000       67.000000
+    68.000000       69.000000       70.000000       71.000000
+    72.000000       73.000000       74.000000       75.000000
+    76.000000       77.000000       78.000000       79.000000
+  */
+  for (size_t i = 0; i < width * height * depth; i++)
+    h_data[i] = static_cast<float>(i);
+
+  /* Expected 2x2x3 sub-volume read back from offset (x=1, y=1, z=0).
+    5.000000        6.000000
+    9.000000        10.000000
+
+    21.000000       22.000000
+    25.000000       26.000000
+
+    37.000000       38.000000
+    41.000000       42.000000
+  */
+  float Ref[12] = {5, 6, 9, 10, 21, 22, 25, 26, 37, 38, 41, 42};
+
+  size_t out_width = 2;
+  size_t out_height = 2;
+  size_t out_depth = 3;
+
+  // Allocate the pitched volume; the x extent is expressed in bytes.
+  extent = sycl::range<3>(sizeof(float) * width, height, depth);
+  // test_feature:malloc
+  d_data = (syclcompat::pitched_data)syclcompat::malloc(extent);
+
+  // copy to Device.
+  cpyParm_from_data_ct1 = syclcompat::pitched_data(
+      (void *)h_data, sizeof(float) * width, width, height);
+  cpyParm_to_data_ct1 = d_data;
+  cpyParm_size_ct1 = extent;
+
+  {
+    syclcompat::experimental::memcpy_parameter params{};
+    params.to.pitched = cpyParm_to_data_ct1;
+    params.to.pos = cpyParm_to_pos_ct1;
+    params.from.pitched = cpyParm_from_data_ct1;
+    params.from.pos = cpyParm_from_pos_ct1;
+    params.size = cpyParm_size_ct1;
+    syclcompat::experimental::memcpy_async(params);
+  }
+  syclcompat::get_default_queue().wait_and_throw();
+  cpyParm_from_pos_ct1 = {1 * sizeof(float), 1, 0}; // set offset on x/y/z.
+  cpyParm_size_ct1 = {out_width * sizeof(float), out_height, out_depth};
+
+  for (size_t i = 0; i < out_width * out_height * out_depth; i++)
+    h_data[i] = -1; // overwrite the part of h_data that receives the result
+  // copy back to host.
+  cpyParm_from_data_ct1 = d_data;
+  cpyParm_to_data_ct1 = syclcompat::pitched_data(
+      (void *)h_data, sizeof(float) * out_width, out_width, out_height);
+  {
+    syclcompat::experimental::memcpy_parameter params{};
+    params.to.pitched = cpyParm_to_data_ct1;
+    params.to.pos = cpyParm_to_pos_ct1;
+    params.from.pitched = cpyParm_from_data_ct1;
+    params.from.pos = cpyParm_from_pos_ct1;
+    params.size = cpyParm_size_ct1;
+    syclcompat::experimental::memcpy_async(params);
+  }
+  syclcompat::get_default_queue().wait_and_throw();
+  // Compare the copied-back sub-volume with Ref, then release both buffers.
+  check(h_data, Ref, out_width * out_height * out_depth);
+  syclcompat::free(h_data);
+  sycl::free(d_data.get_data_ptr(), syclcompat::get_default_context());
+}
+
+int main() {
+  // Adapted from test_memcpy3D_offset() in memcpy_3d.cpp.
+  test_memcpy3D_parameter_offset();
+  test_memcpy3D_async_parameter_offset();
+  return 0;
+}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_image_xfails.cpp b/sycl/test-e2e/syclcompat/memory/memory_image_xfails.cpp
new file mode 100644
index 0000000000000..a011e1da9e407
--- /dev/null
+++ b/sycl/test-e2e/syclcompat/memory/memory_image_xfails.cpp
@@ -0,0 +1,142 @@
+/***************************************************************************
+ *
+ *  Copyright (C) Codeplay Software Ltd.
+ *
+ *  Part of the LLVM Project, under the Apache License v2.0 with LLVM
+ *  Exceptions. See https://llvm.org/LICENSE.txt for license information.
+ *  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ *
+ *  SYCLcompat API
+ *
+ *  memory_image_xfails.cpp
+ *
+ *  Description:
+ *    Checks that image-based memcpy_parameter copies raise std::runtime_error
+ **************************************************************************/
+
+// The original source was under the license below:
+// ====------ memory_async.cpp------------------- -*- C++ -* ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===---------------------------------------------------------------------===//
+
+// RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
+// RUN: %{run} %t.out
+
+// Tests that memcpy/memcpy_async reject image-based memcpy_parameter arguments
+
+#include "sycl/exception.hpp"
+#include <stdexcept>
+#include <stdio.h>
+
+#include <sycl/detail/core.hpp>
+
+#include <syclcompat/memory.hpp>
+
+void test_memcpy_parameter_async(
+    syclcompat::experimental::memcpy_parameter param, bool xpass) {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+  // Expect success when xpass is true, std::runtime_error when it is false.
+  try {
+    syclcompat::experimental::memcpy_async(param);
+    assert(xpass); // No exception: only acceptable if a pass was expected.
+  } catch (std::runtime_error &) { // Other exception types propagate.
+    assert(!xpass); // Threw: only acceptable if a failure was expected.
+  }
+}
+
+void test_memcpy_parameter(syclcompat::experimental::memcpy_parameter param,
+                           bool xpass) {
+  std::cout << __PRETTY_FUNCTION__ << std::endl; // Log which test is running.
+  try {
+    syclcompat::experimental::memcpy(param);
+    assert(xpass); // No exception: only acceptable if a pass was expected.
+  } catch (std::runtime_error &) { // Other exception types propagate.
+    assert(!xpass); // Threw: only acceptable if a failure was expected.
+  }
+}
+
+// Check that (most) memcpy_parameter copies raise std::runtime_error.
+void test_memcpy_parameter_xfails() {
+  // Pointers cast from 1 are fake non-null handles, not real image objects.
+  {
+    // A default-constructed memcpy_parameter passes in no bindless_image
+    // or image pointers. This is the code path that ought to pass.
+    syclcompat::experimental::memcpy_parameter params;
+    test_memcpy_parameter(params, true);
+    test_memcpy_parameter_async(params, true);
+  }
+
+  {
+    // Mimic passing a bindless image for source
+    syclcompat::experimental::memcpy_parameter params;
+    params.from.image_bindless =
+        reinterpret_cast<syclcompat::experimental::image_mem_wrapper *>(1);
+    test_memcpy_parameter(params, false);
+    test_memcpy_parameter_async(params, false);
+  }
+
+  {
+    // Mimic passing a bindless image for dest
+    syclcompat::experimental::memcpy_parameter params;
+    params.to.image_bindless =
+        reinterpret_cast<syclcompat::experimental::image_mem_wrapper *>(1);
+    test_memcpy_parameter(params, false);
+    test_memcpy_parameter_async(params, false);
+  }
+
+  {
+    // Mimic passing a bindless image for source & dest
+    syclcompat::experimental::memcpy_parameter params;
+    params.from.image_bindless =
+        reinterpret_cast<syclcompat::experimental::image_mem_wrapper *>(1);
+    params.to.image_bindless =
+        reinterpret_cast<syclcompat::experimental::image_mem_wrapper *>(1);
+    test_memcpy_parameter(params, false);
+    test_memcpy_parameter_async(params, false);
+  }
+
+  {
+    // Mimic passing an image for source
+    syclcompat::experimental::memcpy_parameter params;
+    params.from.image =
+        reinterpret_cast<syclcompat::experimental::image_matrix *>(1);
+    test_memcpy_parameter(params, false);
+    test_memcpy_parameter_async(params, false);
+  }
+
+  {
+    // Mimic passing an image for dest
+    syclcompat::experimental::memcpy_parameter params;
+    params.to.image =
+        reinterpret_cast<syclcompat::experimental::image_matrix *>(1);
+    test_memcpy_parameter(params, false);
+    test_memcpy_parameter_async(params, false);
+  }
+
+  {
+    // Mimic passing an image for source & dest
+    syclcompat::experimental::memcpy_parameter params;
+    params.from.image =
+        reinterpret_cast<syclcompat::experimental::image_matrix *>(1);
+    params.to.image =
+        reinterpret_cast<syclcompat::experimental::image_matrix *>(1);
+    test_memcpy_parameter(params, false);
+    test_memcpy_parameter_async(params, false);
+  }
+}
+
+int main() { // Entry point for the image-parameter xfail test.
+  test_memcpy_parameter_xfails(); // Helpers assert on every outcome.
+  return 0;
+}
diff --git a/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp b/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp
index ee0c5fc146d59..c456cda333fed 100644
--- a/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp
+++ b/sycl/test-e2e/syclcompat/memory/memory_management_test3.cpp
@@ -29,7 +29,6 @@
 //
 //
 // ===----------------------------------------------------------------------===//
-
 // RUN: %clangxx -std=c++20 -fsycl -fsycl-targets=%{sycl_triple} %s -o %t.out
 // RUN: %{run} %t.out
 
@@ -65,6 +64,29 @@ void test_free_memory_q() {
   syclcompat::free(nullptr, q);
 }
 
+void test_wait_and_free_memory() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+  // Release a live allocation through wait_and_free (no explicit queue).
+  float *d_A = (float *)syclcompat::malloc(sizeof(float));
+  syclcompat::wait_and_free((void *)d_A);
+  // Also call it with each spelling of a null pointer: 0, NULL, nullptr.
+  syclcompat::wait_and_free(0);
+  syclcompat::wait_and_free(NULL);
+  syclcompat::wait_and_free(nullptr);
+}
+
+void test_wait_and_free_memory_q() {
+  std::cout << __PRETTY_FUNCTION__ << std::endl;
+  // Same sequence as the queue-less test, on an explicit in-order queue.
+  sycl::queue q{{sycl::property::queue::in_order()}};
+  float *d_A = (float *)syclcompat::malloc(sizeof(float), q);
+  syclcompat::wait_and_free((void *)d_A, q);
+  // Also call it with each spelling of a null pointer: 0, NULL, nullptr.
+  syclcompat::wait_and_free(0, q);
+  syclcompat::wait_and_free(NULL, q);
+  syclcompat::wait_and_free(nullptr, q);
+}
+
 void test_memcpy_async() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
 
@@ -662,6 +684,8 @@ void test_constant_memcpy_async_q() {
 int main() {
   test_free_memory();
   test_free_memory_q();
+  test_wait_and_free_memory();
+  test_wait_and_free_memory_q();
   test_memcpy_async();
   test_memcpy_async_q();
   test_memcpy_async_pitched();
diff --git a/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp b/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp
index 532163dda263a..2de67aeebb251 100644
--- a/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp
+++ b/sycl/test-e2e/syclcompat/memory/usm_allocations.cpp
@@ -88,7 +88,7 @@ void test_non_templated_host() {
 void test_deduce() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
 
-  using memcpy_direction = syclcompat::detail::memcpy_direction;
+  using namespace syclcompat::experimental; // for memcpy_direction
   auto default_queue = syclcompat::get_default_queue();
   if (!default_queue.get_device().has(sycl::aspect::usm_host_allocations))
     return; // Skip unsupported
diff --git a/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp b/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp
index 6737614549863..22c45a1e874a4 100644
--- a/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp
+++ b/sycl/test-e2e/syclcompat/memory/usm_shared_allocations.cpp
@@ -63,7 +63,7 @@ void test_non_templated_shared() {
 void test_deduce_shared() {
   std::cout << __PRETTY_FUNCTION__ << std::endl;
 
-  using memcpy_direction = syclcompat::detail::memcpy_direction;
+  using namespace syclcompat::experimental;
   auto default_queue = syclcompat::get_default_queue();
 
   int *h_ptr = (int *)syclcompat::malloc_host(sizeof(int));
diff --git a/sycl/test/abi/layout_vec.cpp b/sycl/test/abi/layout_vec.cpp
index 8f70f2835bf72..af33ca24fa111 100644
--- a/sycl/test/abi/layout_vec.cpp
+++ b/sycl/test/abi/layout_vec.cpp
@@ -1,5 +1,8 @@
 // RUN: %clangxx -fsycl -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s
 // RUN: %clangxx -fsycl -fsycl-device-only -c -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s
+// RUN: %if preview-breaking-changes-supported %{ %clangxx -fsycl -c -fpreview-breaking-changes -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s -check-prefix FSYCL-PREVIEW-BREAKING-CHANGES-CHECK %}
+// RUN: %if preview-breaking-changes-supported %{ %clangxx -fsycl -fsycl-device-only -c -fpreview-breaking-changes -fno-color-diagnostics -Xclang -fdump-record-layouts %s -o %t.out | FileCheck %s -check-prefix FSYCL-PREVIEW-BREAKING-CHANGES-CHECK %}
+
 // REQUIRES: linux
 // UNSUPPORTED: libcxx
 
@@ -14,6 +17,14 @@ SYCL_EXTERNAL void foo(sycl::vec<int, 4>) {}
 // CHECK-NEXT: | [sizeof=16, dsize=16, align=16,
 // CHECK-NEXT: |  nvsize=16, nvalign=16]
 
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK: 0 | class sycl::vec<int, 4>
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK: 0 |   class sycl::detail::vec_arith<int, 4> (base) (empty)
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: 0 |     class sycl::detail::vec_arith_common<int, 4> (base) (empty)
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: 0 |   struct std::array<int, 4> m_Data
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: 0 |     typename _AT_Type::_Type _M_elems
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: | [sizeof=16, dsize=16, align=16,
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: |  nvsize=16, nvalign=16]
+
 //--------------------------------------
 
 SYCL_EXTERNAL void foo(sycl::vec<bool, 16>) {}
@@ -22,3 +33,11 @@ SYCL_EXTERNAL void foo(sycl::vec<bool, 16>) {}
 // CHECK-NEXT: 0 |   DataType m_Data
 // CHECK-NEXT: | [sizeof=16, dsize=16, align=16,
 // CHECK-NEXT: |  nvsize=16, nvalign=16]
+
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK: 0 | class sycl::vec<_Bool, 16>
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK: 0 |   class sycl::detail::vec_arith<_Bool, 16> (base) (empty)
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: 0 |     class sycl::detail::vec_arith_common<_Bool, 16> (base) (empty)
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: 0 |   struct std::array<_Bool, 16> m_Data
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: 0 |     typename _AT_Type::_Type _M_elems
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: | [sizeof=16, dsize=16, align=16,
+// FSYCL-PREVIEW-BREAKING-CHANGES-CHECK-NEXT: |  nvsize=16, nvalign=16]
diff --git a/sycl/test/abi/pi_cuda_symbol_check.dump b/sycl/test/abi/pi_cuda_symbol_check.dump
index 58f44604021f6..d3047c6bb1cd0 100644
--- a/sycl/test/abi/pi_cuda_symbol_check.dump
+++ b/sycl/test/abi/pi_cuda_symbol_check.dump
@@ -120,6 +120,8 @@ piextEventCreateWithNativeHandle
 piextEventGetNativeHandle
 piextGetDeviceFunctionPointer
 piextGetGlobalVariablePointer
+piextImportExternalMemory
+piextImportExternalSemaphore
 piextImportExternalSemaphoreOpaqueFD
 piextKernelCreateWithNativeHandle
 piextKernelGetNativeHandle
diff --git a/sycl/test/abi/pi_hip_symbol_check.dump b/sycl/test/abi/pi_hip_symbol_check.dump
index e82ad83166652..c83b4a4ba6122 100644
--- a/sycl/test/abi/pi_hip_symbol_check.dump
+++ b/sycl/test/abi/pi_hip_symbol_check.dump
@@ -120,6 +120,8 @@ piextEventCreateWithNativeHandle
 piextEventGetNativeHandle
 piextGetDeviceFunctionPointer
 piextGetGlobalVariablePointer
+piextImportExternalMemory
+piextImportExternalSemaphore
 piextImportExternalSemaphoreOpaqueFD
 piextKernelCreateWithNativeHandle
 piextKernelGetNativeHandle
diff --git a/sycl/test/abi/pi_level_zero_symbol_check.dump b/sycl/test/abi/pi_level_zero_symbol_check.dump
index 762aa089d18d0..d6cc82870c669 100644
--- a/sycl/test/abi/pi_level_zero_symbol_check.dump
+++ b/sycl/test/abi/pi_level_zero_symbol_check.dump
@@ -119,6 +119,8 @@ piextEventCreateWithNativeHandle
 piextEventGetNativeHandle
 piextGetDeviceFunctionPointer
 piextGetGlobalVariablePointer
+piextImportExternalMemory
+piextImportExternalSemaphore
 piextImportExternalSemaphoreOpaqueFD
 piextKernelCreateWithNativeHandle
 piextKernelGetNativeHandle
diff --git a/sycl/test/abi/pi_nativecpu_symbol_check.dump b/sycl/test/abi/pi_nativecpu_symbol_check.dump
index 659db9ed05365..850e6d22fdb72 100644
--- a/sycl/test/abi/pi_nativecpu_symbol_check.dump
+++ b/sycl/test/abi/pi_nativecpu_symbol_check.dump
@@ -120,6 +120,8 @@ piextEventCreateWithNativeHandle
 piextEventGetNativeHandle
 piextGetDeviceFunctionPointer
 piextGetGlobalVariablePointer
+piextImportExternalMemory
+piextImportExternalSemaphore
 piextImportExternalSemaphoreOpaqueFD
 piextKernelCreateWithNativeHandle
 piextKernelGetNativeHandle
diff --git a/sycl/test/abi/pi_opencl_symbol_check.dump b/sycl/test/abi/pi_opencl_symbol_check.dump
index 972a577a3037e..daaf7bbee5de5 100644
--- a/sycl/test/abi/pi_opencl_symbol_check.dump
+++ b/sycl/test/abi/pi_opencl_symbol_check.dump
@@ -119,6 +119,8 @@ piextEventCreateWithNativeHandle
 piextEventGetNativeHandle
 piextGetDeviceFunctionPointer
 piextGetGlobalVariablePointer
+piextImportExternalMemory
+piextImportExternalSemaphore
 piextImportExternalSemaphoreOpaqueFD
 piextKernelCreateWithNativeHandle
 piextKernelGetNativeHandle
diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump
index 0edaaa25b4ba1..dec17d9f11fe8 100644
--- a/sycl/test/abi/sycl_symbols_linux.dump
+++ b/sycl/test/abi/sycl_symbols_linux.dump
@@ -3032,6 +3032,8 @@ _ZN4sycl3_V13ext6oneapi12experimental22import_external_memoryINS3_11resource_fdE
 _ZN4sycl3_V13ext6oneapi12experimental22import_external_memoryINS3_11resource_fdEEENS3_18interop_mem_handleENS3_23external_mem_descriptorIT_EERKNS0_6deviceERKNS0_7contextE
 _ZN4sycl3_V13ext6oneapi12experimental22import_external_memoryINS3_15external_mem_fdEEENS3_18interop_mem_handleENS3_23external_mem_descriptorIT_EERKNS0_5queueE
 _ZN4sycl3_V13ext6oneapi12experimental22import_external_memoryINS3_15external_mem_fdEEENS3_18interop_mem_handleENS3_23external_mem_descriptorIT_EERKNS0_6deviceERKNS0_7contextE
+_ZN4sycl3_V13ext6oneapi12experimental22import_external_memoryINS3_21resource_win32_handleEEENS3_18interop_mem_handleENS3_23external_mem_descriptorIT_EERKNS0_5queueE
+_ZN4sycl3_V13ext6oneapi12experimental22import_external_memoryINS3_21resource_win32_handleEEENS3_18interop_mem_handleENS3_23external_mem_descriptorIT_EERKNS0_6deviceERKNS0_7contextE
 _ZN4sycl3_V13ext6oneapi12experimental23prepare_for_device_copyEPKvmRKNS0_5queueE
 _ZN4sycl3_V13ext6oneapi12experimental23prepare_for_device_copyEPKvmRKNS0_7contextE
 _ZN4sycl3_V13ext6oneapi12experimental23release_external_memoryENS3_18interop_mem_handleERKNS0_5queueE
@@ -3044,6 +3046,8 @@ _ZN4sycl3_V13ext6oneapi12experimental25import_external_semaphoreINS3_11resource_
 _ZN4sycl3_V13ext6oneapi12experimental25import_external_semaphoreINS3_11resource_fdEEENS3_24interop_semaphore_handleENS3_29external_semaphore_descriptorIT_EERKNS0_6deviceERKNS0_7contextE
 _ZN4sycl3_V13ext6oneapi12experimental25import_external_semaphoreINS3_21external_semaphore_fdEEENS3_24interop_semaphore_handleENS3_29external_semaphore_descriptorIT_EERKNS0_5queueE
 _ZN4sycl3_V13ext6oneapi12experimental25import_external_semaphoreINS3_21external_semaphore_fdEEENS3_24interop_semaphore_handleENS3_29external_semaphore_descriptorIT_EERKNS0_6deviceERKNS0_7contextE
+_ZN4sycl3_V13ext6oneapi12experimental25import_external_semaphoreINS3_21resource_win32_handleEEENS3_24interop_semaphore_handleENS3_29external_semaphore_descriptorIT_EERKNS0_5queueE
+_ZN4sycl3_V13ext6oneapi12experimental25import_external_semaphoreINS3_21resource_win32_handleEEENS3_24interop_semaphore_handleENS3_29external_semaphore_descriptorIT_EERKNS0_6deviceERKNS0_7contextE
 _ZN4sycl3_V13ext6oneapi12experimental25map_external_image_memoryENS3_18interop_mem_handleERKNS3_16image_descriptorERKNS0_5queueE
 _ZN4sycl3_V13ext6oneapi12experimental25map_external_image_memoryENS3_18interop_mem_handleERKNS3_16image_descriptorERKNS0_6deviceERKNS0_7contextE
 _ZN4sycl3_V13ext6oneapi12experimental25map_external_memory_arrayENS3_18interop_mem_handleERKNS3_16image_descriptorERKNS0_5queueE
@@ -3119,6 +3123,7 @@ _ZN4sycl3_V15queue10mem_adviseEPKvmiRKSt6vectorINS0_5eventESaIS5_EERKNS0_6detail
 _ZN4sycl3_V15queue10wait_proxyERKNS0_6detail13code_locationE
 _ZN4sycl3_V15queue11submit_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationE
 _ZN4sycl3_V15queue11submit_implESt8functionIFvRNS0_7handlerEEES1_RKNS0_6detail13code_locationE
+_ZN4sycl3_V15queue15ext_oneapi_prodEv
 _ZN4sycl3_V15queue17discard_or_returnERKNS0_5eventE
 _ZN4sycl3_V15queue18throw_asynchronousEv
 _ZN4sycl3_V15queue20memcpyToDeviceGlobalEPvPKvbmmRKSt6vectorINS0_5eventESaIS6_EE
@@ -3621,8 +3626,11 @@ _ZN4sycl3_V17handler28memcpyToHostOnlyDeviceGlobalEPKvS3_mbmm
 _ZN4sycl3_V17handler28setStateExplicitKernelBundleEv
 _ZN4sycl3_V17handler30memcpyFromHostOnlyDeviceGlobalEPvPKvbmm
 _ZN4sycl3_V17handler30verifyUsedKernelBundleInternalENS0_6detail11string_viewE
+_ZN4sycl3_V17handler32verifyDeviceHasProgressGuaranteeENS0_3ext6oneapi12experimental26forward_progress_guaranteeENS4_15execution_scopeES6_
 _ZN4sycl3_V17handler34ext_oneapi_wait_external_semaphoreENS0_3ext6oneapi12experimental24interop_semaphore_handleE
+_ZN4sycl3_V17handler34ext_oneapi_wait_external_semaphoreENS0_3ext6oneapi12experimental24interop_semaphore_handleEm
 _ZN4sycl3_V17handler36ext_oneapi_signal_external_semaphoreENS0_3ext6oneapi12experimental24interop_semaphore_handleE
+_ZN4sycl3_V17handler36ext_oneapi_signal_external_semaphoreENS0_3ext6oneapi12experimental24interop_semaphore_handleEm
 _ZN4sycl3_V17handler6memcpyEPvPKvm
 _ZN4sycl3_V17handler6memsetEPvim
 _ZN4sycl3_V17handler8finalizeEv
@@ -3633,7 +3641,6 @@ _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEEb
 _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE
 _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b
 _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEEb
-_ZN4sycl3_V17handler32verifyDeviceHasProgressGuaranteeENS0_3ext6oneapi12experimental26forward_progress_guaranteeENS4_15execution_scopeES6_
 _ZN4sycl3_V17samplerC1ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE
 _ZN4sycl3_V17samplerC1EP11_cl_samplerRKNS0_7contextE
 _ZN4sycl3_V17samplerC2ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE
@@ -3748,7 +3755,6 @@ _ZNK4sycl3_V15queue12has_propertyINS0_8property5queue16enable_profilingEEEbv
 _ZNK4sycl3_V15queue12has_propertyINS0_8property5queue4cuda18use_default_streamEEEbv
 _ZNK4sycl3_V15queue12has_propertyINS0_8property5queue8in_orderEEEbv
 _ZNK4sycl3_V15queue16ext_oneapi_emptyEv
-_ZN4sycl3_V15queue15ext_oneapi_prodEv
 _ZNK4sycl3_V15queue16get_backend_infoINS0_4info6device15backend_versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv
 _ZNK4sycl3_V15queue16get_backend_infoINS0_4info6device7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv
 _ZNK4sycl3_V15queue16get_backend_infoINS0_4info8platform7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv
@@ -3973,6 +3979,12 @@ _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device22m
 _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device22max_image_linear_widthEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device23max_image_linear_heightEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device26max_image_linear_row_pitchEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv
+_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS5_15execution_scopeE2EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv
+_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv
+_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE1EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv
+_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE2EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv
+_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv
+_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device32work_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13get_info_implINS0_3ext8codeplay12experimental4info6device15supports_fusionEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13get_info_implINS0_3ext8codeplay12experimental4info6device28max_registers_per_work_groupEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13get_info_implINS0_4info6device10extensionsEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv
@@ -4084,12 +4096,6 @@ _ZNK4sycl3_V16device13get_info_implINS0_4info6device7versionEEENS0_6detail11ABIN
 _ZNK4sycl3_V16device13get_info_implINS0_4info6device8atomic64EEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13get_info_implINS0_4info6device8platformEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13get_info_implINS0_4info6device9vendor_idEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv
-_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device32work_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv
-_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv
-_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS5_15execution_scopeE2EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv
-_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE2EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv
-_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv
-_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE1EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv
 _ZNK4sycl3_V16device13has_extensionERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE
 _ZNK4sycl3_V16device14is_acceleratorEv
 _ZNK4sycl3_V16device16get_backend_infoINS0_4info6device15backend_versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv
diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump
index 41c0b7bfa2e89..e8610211e8572 100644
--- a/sycl/test/abi/sycl_symbols_windows.dump
+++ b/sycl/test/abi/sycl_symbols_windows.dump
@@ -449,10 +449,14 @@
 ??$import_external_memory@Uexternal_mem_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_mem_handle@01234@U?$external_mem_descriptor@Uexternal_mem_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVqueue@34@@Z
 ??$import_external_memory@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_mem_handle@01234@U?$external_mem_descriptor@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVdevice@34@AEBVcontext@34@@Z
 ??$import_external_memory@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_mem_handle@01234@U?$external_mem_descriptor@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVqueue@34@@Z
+??$import_external_memory@Uresource_win32_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_mem_handle@01234@U?$external_mem_descriptor@Uresource_win32_handle@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVdevice@34@AEBVcontext@34@@Z
+??$import_external_memory@Uresource_win32_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_mem_handle@01234@U?$external_mem_descriptor@Uresource_win32_handle@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVqueue@34@@Z
 ??$import_external_semaphore@Uexternal_semaphore_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uexternal_semaphore_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVdevice@34@AEBVcontext@34@@Z
 ??$import_external_semaphore@Uexternal_semaphore_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uexternal_semaphore_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVqueue@34@@Z
 ??$import_external_semaphore@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVdevice@34@AEBVcontext@34@@Z
 ??$import_external_semaphore@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uresource_fd@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVqueue@34@@Z
+??$import_external_semaphore@Uresource_win32_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uresource_win32_handle@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVdevice@34@AEBVcontext@34@@Z
+??$import_external_semaphore@Uresource_win32_handle@experimental@oneapi@ext@_V1@sycl@@@experimental@oneapi@ext@_V1@sycl@@YA?AUinterop_semaphore_handle@01234@U?$external_semaphore_descriptor@Uresource_win32_handle@experimental@oneapi@ext@_V1@sycl@@@01234@AEBVqueue@34@@Z
 ??$update_nd_range@$00@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$00@45@@Z
 ??$update_nd_range@$01@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$01@45@@Z
 ??$update_nd_range@$02@node@experimental@oneapi@ext@_V1@sycl@@QEAAXV?$nd_range@$02@45@@Z
@@ -4156,18 +4160,26 @@
 ?ext_oneapi_prefetch_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAX_KV?$vector@IV?$allocator@I@std@@@6@PEAI@Z
 ?ext_oneapi_set_external_event@queue@_V1@sycl@@QEAAXAEBVevent@23@@Z
 ?ext_oneapi_signal_external_semaphore@handler@_V1@sycl@@QEAAXUinterop_semaphore_handle@experimental@oneapi@ext@23@@Z
+?ext_oneapi_signal_external_semaphore@handler@_V1@sycl@@QEAAXUinterop_semaphore_handle@experimental@oneapi@ext@23@_K@Z
 ?ext_oneapi_signal_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@AEBUcode_location@detail@23@@Z
 ?ext_oneapi_signal_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@AEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z
 ?ext_oneapi_signal_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@V423@AEBUcode_location@detail@23@@Z
+?ext_oneapi_signal_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@_KAEBUcode_location@detail@23@@Z
+?ext_oneapi_signal_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z
+?ext_oneapi_signal_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@_KV423@AEBUcode_location@detail@23@@Z
 ?ext_oneapi_submit_barrier@queue@_V1@sycl@@QEAA?AVevent@23@AEBUcode_location@detail@23@@Z
 ?ext_oneapi_submit_barrier@queue@_V1@sycl@@QEAA?AVevent@23@AEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z
 ?ext_oneapi_supports_cl_c_feature@device@_V1@sycl@@QEAA_NAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z
 ?ext_oneapi_supports_cl_c_version@device@_V1@sycl@@QEBA_NAEBUcl_version@experimental@oneapi@ext@23@@Z
 ?ext_oneapi_supports_cl_extension@device@_V1@sycl@@QEBA_NAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAUcl_version@experimental@oneapi@ext@23@@Z
 ?ext_oneapi_wait_external_semaphore@handler@_V1@sycl@@QEAAXUinterop_semaphore_handle@experimental@oneapi@ext@23@@Z
+?ext_oneapi_wait_external_semaphore@handler@_V1@sycl@@QEAAXUinterop_semaphore_handle@experimental@oneapi@ext@23@_K@Z
 ?ext_oneapi_wait_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@AEBUcode_location@detail@23@@Z
 ?ext_oneapi_wait_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@AEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z
 ?ext_oneapi_wait_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@V423@AEBUcode_location@detail@23@@Z
+?ext_oneapi_wait_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@_KAEBUcode_location@detail@23@@Z
+?ext_oneapi_wait_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z
+?ext_oneapi_wait_external_semaphore@queue@_V1@sycl@@QEAA?AVevent@23@Uinterop_semaphore_handle@experimental@oneapi@ext@23@_KV423@AEBUcode_location@detail@23@@Z
 ?extractArgsAndReqs@handler@_V1@sycl@@AEAAXXZ
 ?extractArgsAndReqsFromLambda@handler@_V1@sycl@@AEAAXPEAD_KPEBUkernel_param_desc_t@detail@23@_N@Z
 ?fill@MemoryManager@detail@_V1@sycl@@SAXPEAVSYCLMemObjI@234@PEAXV?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_KPEBDIV?$range@$02@34@5V?$id@$02@34@IV?$vector@PEAU_pi_event@@V?$allocator@PEAU_pi_event@@@std@@@7@AEAPEAU_pi_event@@@Z
diff --git a/sycl/test/basic_tests/accessor/no_offset_error.cpp b/sycl/test/basic_tests/accessor/no_offset_error.cpp
new file mode 100644
index 0000000000000..39ced3dc00a73
--- /dev/null
+++ b/sycl/test/basic_tests/accessor/no_offset_error.cpp
@@ -0,0 +1,22 @@
+// RUN:  %clangxx -fsycl-device-only -Xclang -verify -Xclang -verify-ignore-unexpected=note -emit-llvm -o - %s
+
+#include <sycl/sycl.hpp>
+
+inline constexpr int size = 100;
+
+int main() {
+  // get_offset() on an accessor created with no_offset must be rejected.
+  sycl::buffer<int> a{sycl::range{size}};
+  sycl::queue q;
+
+  q.submit([&](sycl::handler &cgh) {
+    sycl::ext::oneapi::accessor_property_list PL{sycl::ext::oneapi::no_offset,
+                                                 sycl::no_init};
+    sycl::accessor acc_a(a, cgh, sycl::write_only, PL);
+    // expected-error@sycl/accessor.hpp:* {{static assertion failed due to requirement '!(accessor_property_list<sycl::ext::oneapi::property::no_offset::instance<true>, sycl::property::no_init>::has_property())': Accessor has no_offset property, get_offset() can not be used}}
+    auto b = acc_a.get_offset();
+  });
+
+  q.wait();
+  return 0;
+}
diff --git a/sycl/test/basic_tests/min_max_test.cpp b/sycl/test/basic_tests/min_max_test.cpp
index fde42cd4ce710..e91b3f18102e5 100644
--- a/sycl/test/basic_tests/min_max_test.cpp
+++ b/sycl/test/basic_tests/min_max_test.cpp
@@ -1,5 +1,6 @@
 // REQUIRES: windows
 // RUN: %clangxx -fsycl -fsycl-device-only -fsyntax-only -Xclang -verify %s -I %sycl_include
+// RUN: %clangxx -fsycl -fpreview-breaking-changes -fsycl-device-only -fsyntax-only -Xclang -verify %s -I %sycl_include
 // expected-no-diagnostics
 
 #include "windows.h"
diff --git a/sycl/test/basic_tests/vectors/vectors.cpp b/sycl/test/basic_tests/vectors/vectors.cpp
index 7e558544564ec..2b288b01c0bb5 100644
--- a/sycl/test/basic_tests/vectors/vectors.cpp
+++ b/sycl/test/basic_tests/vectors/vectors.cpp
@@ -168,6 +168,26 @@ int main() {
   assert((!inputVec4.lo().as<sycl::vec<bool, 2>>()[0]));
   assert((inputVec4.lo().as<sycl::vec<bool, 2>>()[1]));
 
+  // Swizzle assignment must write through to the underlying vec.
+  {
+    sycl::vec<int8_t, 2> dstVec(0, 1);
+    sycl::vec<int8_t, 2> srcVec(2, 3);
+    auto reversedDst = dstVec.swizzle<sycl::elem::s1, sycl::elem::s0>();
+    auto orderedSrc = srcVec.swizzle<sycl::elem::s0, sycl::elem::s1>();
+
+    // Swizzle <- swizzle: the reversed view stores (2, 3) as (3, 2).
+    reversedDst = orderedSrc;
+    assert(dstVec[0] == 3 && dstVec[1] == 2);
+
+    // Swizzle <- vec: (0, 1) lands reversed as (1, 0).
+    reversedDst = sycl::vec<int8_t, 2>(0, 1);
+    assert(dstVec[0] == 1 && dstVec[1] == 0);
+
+    // Swizzle <- scalar: every selected element takes the value.
+    reversedDst = static_cast<int8_t>(5);
+    assert(dstVec[0] == 5 && dstVec[1] == 5);
+  }
+
   // Check that [u]long[n] type aliases match vec<[u]int64_t, n> types.
   assert((std::is_same<sycl::vec<std::int64_t, 2>, sycl::long2>::value));
   assert((std::is_same<sycl::vec<std::int64_t, 3>, sycl::long3>::value));
diff --git a/sycl/test/check_device_code/accessor_index.cpp b/sycl/test/check_device_code/accessor_index.cpp
index 04d248e27ccc3..e35be7d97670c 100644
--- a/sycl/test/check_device_code/accessor_index.cpp
+++ b/sycl/test/check_device_code/accessor_index.cpp
@@ -5,13 +5,8 @@
 // CHECK-NOT: llvm.loop
 // CHECK-NOT: br i1
 using namespace sycl;
-int main() {
-  queue Q;
-  range<3> Range{8, 8, 8};
-  buffer<int, 3> Buf(Range);
-  Q.submit([&](handler &Cgh) {
-    auto Acc = Buf.get_access<access::mode::write>(Cgh);
-    local_accessor<int, 3> LocAcc(Range, Cgh);
-    Cgh.parallel_for(Range, [=](item<3> It) { LocAcc[It] = Acc[It]; });
-  });
+// Indexes both accessors with the same item; no queue or buffer setup needed.
+SYCL_EXTERNAL void accessor_index(accessor<int, 3, access::mode::write> Acc,
+                                  local_accessor<int, 3> LocAcc, item<3> It) {
+  LocAcc[It] = Acc[It];
 }
diff --git a/sycl/test/check_device_code/ap_fixed.cpp b/sycl/test/check_device_code/ap_fixed.cpp
index a2c38b827ced5..a9e29a8e16cf0 100644
--- a/sycl/test/check_device_code/ap_fixed.cpp
+++ b/sycl/test/check_device_code/ap_fixed.cpp
@@ -10,101 +10,57 @@
 
 #include "CL/__spirv/spirv_ops.hpp"
 
-template <int W, int rW, bool S, int I, int rI>
-void sqrt() {
-  sycl::detail::ap_int<W> a;
-  auto ap_fixed_Sqrt = __spirv_FixedSqrtINTEL<W, rW>(a, S, I, rI);
+SYCL_EXTERNAL auto test_sqrt(sycl::detail::ap_int<13> a) {
+  return __spirv_FixedSqrtINTEL<13, 5>(a, false, 2, 2);
   // CHECK: %{{.*}} = call spir_func signext i5 @_Z[[#]]__spirv_FixedSqrtINTEL{{.*}}(i13 signext  %[[#]], i1 zeroext false, i32 2, i32 2, i32 0, i32 0)
 }
 
-template <int W, int rW, bool S, int I, int rI>
-void recip() {
-  sycl::detail::ap_int<W> a;
-  auto ap_fixed_Recip = __spirv_FixedRecipINTEL<W, rW>(a, S, I, rI);
+SYCL_EXTERNAL auto test_recip(sycl::detail::ap_int<3> a) {
+  return __spirv_FixedRecipINTEL<3, 8>(a, true, 4, 4);
   // CHECK: %{{.*}} = call spir_func signext i8 @_Z[[#]]__spirv_FixedRecipINTEL{{.*}}(i3 signext %[[#]], i1 zeroext true, i32 4, i32 4, i32 0, i32 0)
 }
 
-template <int W, int rW, bool S, int I, int rI>
-void rsqrt() {
-  sycl::detail::ap_int<W> a;
-  auto ap_fixed_Rsqrt = __spirv_FixedRsqrtINTEL<W, rW>(a, S, I, rI);
+SYCL_EXTERNAL auto test_rsqrt(sycl::detail::ap_int<11> a) {
+  return __spirv_FixedRsqrtINTEL<11, 10>(a, false, 8, 6);
   // CHECK: %{{.*}} = call spir_func signext i10 @_Z[[#]]__spirv_FixedRsqrtINTEL{{.*}}(i11 signext %[[#]], i1 zeroext false, i32 8, i32 6, i32 0, i32 0)
 }
 
-template <int W, int rW, bool S, int I, int rI>
-void sin() {
-  sycl::detail::ap_int<W> a;
-  auto ap_fixed_Sin = __spirv_FixedSinINTEL<W, rW>(a, S, I, rI);
+SYCL_EXTERNAL auto test_sin(sycl::detail::ap_int<17> a) {
+  return __spirv_FixedSinINTEL<17, 11>(a, true, 7, 5);
   // CHECK: %{{.*}} = call spir_func signext i11 @_Z[[#]]__spirv_FixedSinINTEL{{.*}}(i17 signext %[[#]], i1 zeroext true, i32 7, i32 5, i32 0, i32 0)
 }
 
-template <int W, int rW, bool S, int I, int rI>
-void cos() {
-  sycl::detail::ap_int<W> a;
-  auto ap_fixed_Cos = __spirv_FixedCosINTEL<W, rW>(a, S, I, rI);
+SYCL_EXTERNAL auto test_cos(sycl::detail::ap_int<35> a) {
+  return __spirv_FixedCosINTEL<35, 28>(a, false, 9, 3);
   // CHECK: %{{.*}} = call spir_func signext i28 @_Z[[#]]__spirv_FixedCosINTEL{{.*}}(i35 %[[#]], i1 zeroext false, i32 9, i32 3, i32 0, i32 0)
 }
 
-template <int W, int rW, bool S, int I, int rI>
-void sin_cos() {
-  sycl::detail::ap_int<W> a;
-  auto ap_fixed_SinCos = __spirv_FixedSinCosINTEL<W, rW>(a, S, I, rI);
+SYCL_EXTERNAL auto test_sin_cos(sycl::detail::ap_int<31> a) {
+  return __spirv_FixedSinCosINTEL<31, 20>(a, true, 10, 12);
   // CHECK: %{{.*}} = call spir_func i40 @_Z[[#]]__spirv_FixedSinCosINTEL{{.*}}(i31 signext %[[#]], i1 zeroext true, i32 10, i32 12, i32 0, i32 0)
 }
 
-template <int W, int rW, bool S, int I, int rI>
-void sin_pi() {
-  sycl::detail::ap_int<W> a;
-  auto ap_fixed_SinPi = __spirv_FixedSinPiINTEL<W, rW>(a, S, I, rI);
+SYCL_EXTERNAL auto test_sin_pi(sycl::detail::ap_int<60> a) {
+  return __spirv_FixedSinPiINTEL<60, 5>(a, false, 2, 2);
   // CHECK: %{{.*}} = call spir_func signext i5 @_Z[[#]]__spirv_FixedSinPiINTEL{{.*}}(i60 %[[#]], i1 zeroext false, i32 2, i32 2, i32 0, i32 0)
 }
 
-template <int W, int rW, bool S, int I, int rI>
-void cos_pi() {
-  sycl::detail::ap_int<W> a;
-  auto ap_fixed_CosPi = __spirv_FixedCosPiINTEL<W, rW>(a, S, I, rI);
+SYCL_EXTERNAL auto test_cos_pi(sycl::detail::ap_int<28> a) {
+  return __spirv_FixedCosPiINTEL<28, 16>(a, false, 8, 5);
   // CHECK: %{{.*}} = call spir_func signext i16 @_Z[[#]]__spirv_FixedCosPiINTEL{{.*}}(i28 signext %[[#]], i1 zeroext false, i32 8, i32 5, i32 0, i32 0)
 }
 
-template <int W, int rW, bool S, int I, int rI>
-void sin_cos_pi() {
-  sycl::detail::ap_int<W> a;
-  auto ap_fixed_SinCosPi = __spirv_FixedSinCosPiINTEL<W, rW>(a, S, I, rI);
+SYCL_EXTERNAL auto test_sin_cos_pi(sycl::detail::ap_int<13> a) {
+  return __spirv_FixedSinCosPiINTEL<13, 5>(a, false, 2, 2);
   // CHECK: %{{.*}} = call spir_func signext i10 @_Z[[#]]__spirv_FixedSinCosPiINTEL{{.*}}(i13 signext %[[#]], i1 zeroext false, i32 2, i32 2, i32 0, i32 0)
 }
 
-template <int W, int rW, bool S, int I, int rI>
-void log() {
-  sycl::detail::ap_int<W> a;
-  auto ap_fixed_Log = __spirv_FixedLogINTEL<W, rW>(a, S, I, rI);
+SYCL_EXTERNAL auto test_log(sycl::detail::ap_int<64> a) {
+  return __spirv_FixedLogINTEL<64, 44>(a, true, 24, 22);
   // CHECK: %{{.*}} = call spir_func i44 @_Z[[#]]__spirv_FixedLogINTEL{{.*}}(i64 %[[#]], i1 zeroext true, i32 24, i32 22, i32 0, i32 0)
 }
 
-template <int W, int rW, bool S, int I, int rI>
-void exp() {
-  sycl::detail::ap_int<W> a;
-  auto ap_fixed_Exp = __spirv_FixedExpINTEL<W, rW>(a, S, I, rI);
+SYCL_EXTERNAL auto test_exp(sycl::detail::ap_int<44> a) {
+  return __spirv_FixedExpINTEL<44, 34>(a, false, 20, 20);
   // CHECK: %{{.*}} = call spir_func i34 @_Z[[#]]__spirv_FixedExpINTEL{{.*}}(i44 %[[#]], i1 zeroext false, i32 20, i32 20, i32 0, i32 0)
-}
-
-template <typename name, typename Func>
-__attribute__((sycl_kernel)) void kernel_single_task(Func kernelFunc) {
-  kernelFunc();
-}
-
-int main() {
-  kernel_single_task<class kernel_function>([]() {
-    sqrt<13, 5, false, 2, 2>();
-    recip<3, 8, true, 4, 4>();
-    rsqrt<11, 10, false, 8, 6>();
-    sin<17, 11, true, 7, 5>();
-    cos<35, 28, false, 9, 3>();
-    sin_cos<31, 20, true, 10, 12>();
-    sin_pi<60, 5, false, 2, 2>();
-    cos_pi<28, 16, false, 8, 5>();
-    sin_cos_pi<13, 5, false, 2, 2>();
-    log<64, 44, true, 24, 22>();
-    exp<44, 34, false, 20, 20>();
-  });
-  return 0;
-}
+}
diff --git a/sycl/test/check_device_code/bf16_vector_conversion.cpp b/sycl/test/check_device_code/bf16_vector_conversion.cpp
new file mode 100644
index 0000000000000..e423150d7bf88
--- /dev/null
+++ b/sycl/test/check_device_code/bf16_vector_conversion.cpp
@@ -0,0 +1,130 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals none --version 4
+// NOTE: The generated assertions were subsequently cleaned up by hand.
+
+// RUN: %clangxx -I %sycl_include -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -Xclang -disable-lifetime-markers -fsycl-device-only %s -o - | FileCheck %s
+
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
+
+using namespace sycl;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
+
+// CHECK-LABEL: define dso_local spir_func void @_Z10TestBFtoF1PN4sycl3_V13ext6oneapi8bfloat16EPf(
+// CHECK-SAME: ptr addrspace(4) noundef [[A:%.*]], ptr addrspace(4) noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !srcloc [[META5:![0-9]+]] !sycl_fixed_targets [[META6:![0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call spir_func void @__devicelib_ConvertBF16ToFINTELVec1(ptr addrspace(4) noundef [[A]], ptr addrspace(4) noundef [[B]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT:    ret void
+// Thin wrapper over BF16VecToFloatVec<1>; expected to lower to one tail call.
+SYCL_EXTERNAL auto TestBFtoF1(bfloat16 *a, float *b) {
+  ext::oneapi::detail::BF16VecToFloatVec<1>(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z10TestFtoBF1PfPN4sycl3_V13ext6oneapi8bfloat16Ei(
+// CHECK-SAME: ptr addrspace(4) noundef [[A:%.*]], ptr addrspace(4) noundef [[B:%.*]], i32 noundef [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META7:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call spir_func void @__devicelib_ConvertFToBF16INTELVec1(ptr addrspace(4) noundef [[A]], ptr addrspace(4) noundef [[B]]) #[[ATTR2]]
+// CHECK-NEXT:    ret void
+// Wraps FloatVecToBF16Vec<1>; `size` is unused but appears in the checks.
+SYCL_EXTERNAL auto TestFtoBF1(float *a, bfloat16 *b, int size) {
+  ext::oneapi::detail::FloatVecToBF16Vec<1>(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z10TestBFtoF2PN4sycl3_V13ext6oneapi8bfloat16EPf(
+// CHECK-SAME: ptr addrspace(4) noundef [[A:%.*]], ptr addrspace(4) noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META8:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call spir_func void @__devicelib_ConvertBF16ToFINTELVec2(ptr addrspace(4) noundef [[A]], ptr addrspace(4) noundef [[B]]) #[[ATTR2]]
+// CHECK-NEXT:    ret void
+// Thin wrapper over BF16VecToFloatVec<2>; expected to lower to one tail call.
+SYCL_EXTERNAL auto TestBFtoF2(bfloat16 *a, float *b) {
+  ext::oneapi::detail::BF16VecToFloatVec<2>(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z10TestFtoBF2PfPN4sycl3_V13ext6oneapi8bfloat16Ei(
+// CHECK-SAME: ptr addrspace(4) noundef [[A:%.*]], ptr addrspace(4) noundef [[B:%.*]], i32 noundef [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META9:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call spir_func void @__devicelib_ConvertFToBF16INTELVec2(ptr addrspace(4) noundef [[A]], ptr addrspace(4) noundef [[B]]) #[[ATTR2]]
+// CHECK-NEXT:    ret void
+// Wraps FloatVecToBF16Vec<2>; `size` is unused but appears in the checks.
+SYCL_EXTERNAL auto TestFtoBF2(float *a, bfloat16 *b, int size) {
+  ext::oneapi::detail::FloatVecToBF16Vec<2>(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z10TestBFtoF3PN4sycl3_V13ext6oneapi8bfloat16EPf(
+// CHECK-SAME: ptr addrspace(4) noundef [[A:%.*]], ptr addrspace(4) noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META10:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call spir_func void @__devicelib_ConvertBF16ToFINTELVec3(ptr addrspace(4) noundef [[A]], ptr addrspace(4) noundef [[B]]) #[[ATTR2]]
+// CHECK-NEXT:    ret void
+// Thin wrapper over BF16VecToFloatVec<3>; expected to lower to one tail call.
+SYCL_EXTERNAL auto TestBFtoF3(bfloat16 *a, float *b) {
+  ext::oneapi::detail::BF16VecToFloatVec<3>(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z10TestFtoBF3PfPN4sycl3_V13ext6oneapi8bfloat16Ei(
+// CHECK-SAME: ptr addrspace(4) noundef [[A:%.*]], ptr addrspace(4) noundef [[B:%.*]], i32 noundef [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META11:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call spir_func void @__devicelib_ConvertFToBF16INTELVec3(ptr addrspace(4) noundef [[A]], ptr addrspace(4) noundef [[B]]) #[[ATTR2]]
+// CHECK-NEXT:    ret void
+// Wraps FloatVecToBF16Vec<3>; `size` is unused but appears in the checks.
+SYCL_EXTERNAL auto TestFtoBF3(float *a, bfloat16 *b, int size) {
+  ext::oneapi::detail::FloatVecToBF16Vec<3>(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z10TestBFtoF4PN4sycl3_V13ext6oneapi8bfloat16EPf(
+// CHECK-SAME: ptr addrspace(4) noundef [[A:%.*]], ptr addrspace(4) noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META12:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call spir_func void @__devicelib_ConvertBF16ToFINTELVec4(ptr addrspace(4) noundef [[A]], ptr addrspace(4) noundef [[B]]) #[[ATTR2]]
+// CHECK-NEXT:    ret void
+// Thin wrapper over BF16VecToFloatVec<4>; expected to lower to one tail call.
+SYCL_EXTERNAL auto TestBFtoF4(bfloat16 *a, float *b) {
+  ext::oneapi::detail::BF16VecToFloatVec<4>(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z10TestFtoBF4PfPN4sycl3_V13ext6oneapi8bfloat16Ei(
+// CHECK-SAME: ptr addrspace(4) noundef [[A:%.*]], ptr addrspace(4) noundef [[B:%.*]], i32 noundef [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META13:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call spir_func void @__devicelib_ConvertFToBF16INTELVec4(ptr addrspace(4) noundef [[A]], ptr addrspace(4) noundef [[B]]) #[[ATTR2]]
+// CHECK-NEXT:    ret void
+// Wraps FloatVecToBF16Vec<4>; `size` is unused but appears in the checks.
+SYCL_EXTERNAL auto TestFtoBF4(float *a, bfloat16 *b, int size) {
+  ext::oneapi::detail::FloatVecToBF16Vec<4>(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z10TestBFtoF8PN4sycl3_V13ext6oneapi8bfloat16EPf(
+// CHECK-SAME: ptr addrspace(4) noundef [[A:%.*]], ptr addrspace(4) noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META14:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call spir_func void @__devicelib_ConvertBF16ToFINTELVec8(ptr addrspace(4) noundef [[A]], ptr addrspace(4) noundef [[B]]) #[[ATTR2]]
+// CHECK-NEXT:    ret void
+// Thin wrapper over BF16VecToFloatVec<8>; expected to lower to one tail call.
+SYCL_EXTERNAL auto TestBFtoF8(bfloat16 *a, float *b) {
+  ext::oneapi::detail::BF16VecToFloatVec<8>(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z10TestFtoBF8PfPN4sycl3_V13ext6oneapi8bfloat16Ei(
+// CHECK-SAME: ptr addrspace(4) noundef [[A:%.*]], ptr addrspace(4) noundef [[B:%.*]], i32 noundef [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META15:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call spir_func void @__devicelib_ConvertFToBF16INTELVec8(ptr addrspace(4) noundef [[A]], ptr addrspace(4) noundef [[B]]) #[[ATTR2]]
+// CHECK-NEXT:    ret void
+// Wraps FloatVecToBF16Vec<8>; `size` is unused but appears in the checks.
+SYCL_EXTERNAL auto TestFtoBF8(float *a, bfloat16 *b, int size) {
+  ext::oneapi::detail::FloatVecToBF16Vec<8>(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z11TestBFtoF16PN4sycl3_V13ext6oneapi8bfloat16EPf(
+// CHECK-SAME: ptr addrspace(4) noundef [[A:%.*]], ptr addrspace(4) noundef [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META16:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call spir_func void @__devicelib_ConvertBF16ToFINTELVec16(ptr addrspace(4) noundef [[A]], ptr addrspace(4) noundef [[B]]) #[[ATTR2]]
+// CHECK-NEXT:    ret void
+// Thin wrapper over BF16VecToFloatVec<16>; expected to lower to one tail call.
+SYCL_EXTERNAL auto TestBFtoF16(bfloat16 *a, float *b) {
+  ext::oneapi::detail::BF16VecToFloatVec<16>(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z11TestFtoBF16PfPN4sycl3_V13ext6oneapi8bfloat16Ei(
+// CHECK-SAME: ptr addrspace(4) noundef [[A:%.*]], ptr addrspace(4) noundef [[B:%.*]], i32 noundef [[SIZE:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META17:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call spir_func void @__devicelib_ConvertFToBF16INTELVec16(ptr addrspace(4) noundef [[A]], ptr addrspace(4) noundef [[B]]) #[[ATTR2]]
+// CHECK-NEXT:    ret void
+// Wraps FloatVecToBF16Vec<16>; `size` is unused but appears in the checks.
+SYCL_EXTERNAL auto TestFtoBF16(float *a, bfloat16 *b, int size) {
+  ext::oneapi::detail::FloatVecToBF16Vec<16>(a, b);
+}
diff --git a/sycl/test/check_device_code/cuda/ldg.cpp b/sycl/test/check_device_code/cuda/ldg.cpp
index f0fd4ac9deef8..e9ed4ba8a51ca 100644
--- a/sycl/test/check_device_code/cuda/ldg.cpp
+++ b/sycl/test/check_device_code/cuda/ldg.cpp
@@ -1,3 +1,4 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --functions "ldg" --include-generated-funcs --version 4
 // REQUIRES: cuda
 
 // RUN: %clangxx -fsycl-device-only -fsycl-targets=nvptx64-nvidia-cuda -Xclang -fnative-half-type -S -Xclang -emit-llvm %s -o -| FileCheck %s --check-prefixes=CHECK-OPAQUE
@@ -9,279 +10,208 @@ using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::cuda;
 using namespace sycl::ext::oneapi::experimental;
 
-int main() {
-
-  sycl::queue q;
-
-  auto *in_c = sycl::malloc_device<char>(1, q);
-  auto *in_sc = sycl::malloc_device<signed char>(1, q);
-  auto *in_s = sycl::malloc_device<short>(1, q);
-  auto *in_i = sycl::malloc_device<int>(1, q);
-  auto *in_l = sycl::malloc_device<long>(1, q);
-  auto *in_ll = sycl::malloc_device<long long>(1, q);
-
-  auto *in_uc = sycl::malloc_device<unsigned char>(1, q);
-  auto *in_us = sycl::malloc_device<unsigned short>(1, q);
-  auto *in_ui = sycl::malloc_device<unsigned int>(1, q);
-  auto *in_ul = sycl::malloc_device<unsigned long>(1, q);
-  auto *in_ull = sycl::malloc_device<unsigned long long>(1, q);
-
-  auto *in_c2 = sycl::malloc_device<sycl::vec<char, 2>>(1, q);
-  auto *in_c3 = sycl::malloc_device<sycl::vec<char, 3>>(1, q);
-  auto *in_sc2 = sycl::malloc_device<sycl::vec<signed char, 2>>(1, q);
-  auto *in_sc3 = sycl::malloc_device<sycl::vec<signed char, 3>>(1, q);
-  auto *in_s2 = sycl::malloc_device<sycl::vec<short, 2>>(1, q);
-  auto *in_s3 = sycl::malloc_device<sycl::vec<short, 3>>(1, q);
-  auto *in_i2 = sycl::malloc_device<sycl::vec<int, 2>>(1, q);
-  auto *in_i3 = sycl::malloc_device<sycl::vec<int, 3>>(1, q);
-  auto *in_l2 = sycl::malloc_device<sycl::vec<long, 2>>(1, q);
-  auto *in_l3 = sycl::malloc_device<sycl::vec<long, 3>>(1, q);
-  auto *in_ll2 = sycl::malloc_device<sycl::vec<long long, 2>>(1, q);
-  auto *in_ll3 = sycl::malloc_device<sycl::vec<long long, 3>>(1, q);
-  auto *in_l4 = sycl::malloc_device<sycl::vec<long, 4>>(1, q);
-  auto *in_ll4 = sycl::malloc_device<sycl::vec<long long, 4>>(1, q);
-
-  auto *in_c4 = sycl::malloc_device<sycl::vec<char, 4>>(1, q);
-  auto *in_sc4 = sycl::malloc_device<sycl::vec<signed char, 4>>(1, q);
-  auto *in_s4 = sycl::malloc_device<sycl::vec<short, 4>>(1, q);
-  auto *in_i4 = sycl::malloc_device<sycl::vec<int, 4>>(1, q);
-
-  auto *in_uc2 = sycl::malloc_device<sycl::vec<unsigned char, 2>>(1, q);
-  auto *in_uc3 = sycl::malloc_device<sycl::vec<unsigned char, 3>>(1, q);
-  auto *in_us2 = sycl::malloc_device<sycl::vec<unsigned short, 2>>(1, q);
-  auto *in_us3 = sycl::malloc_device<sycl::vec<unsigned short, 3>>(1, q);
-  auto *in_ui2 = sycl::malloc_device<sycl::vec<unsigned int, 2>>(1, q);
-  auto *in_ui3 = sycl::malloc_device<sycl::vec<unsigned int, 3>>(1, q);
-  auto *in_ul2 = sycl::malloc_device<sycl::vec<unsigned long, 2>>(1, q);
-  auto *in_ul3 = sycl::malloc_device<sycl::vec<unsigned long, 3>>(1, q);
-  auto *in_ull2 = sycl::malloc_device<sycl::vec<unsigned long long, 2>>(1, q);
-  auto *in_ull3 = sycl::malloc_device<sycl::vec<unsigned long long, 3>>(1, q);
-  auto *in_ul4 = sycl::malloc_device<sycl::vec<unsigned long, 4>>(1, q);
-  auto *in_ull4 = sycl::malloc_device<sycl::vec<unsigned long long, 4>>(1, q);
-
-  auto *in_uc4 = sycl::malloc_device<sycl::vec<unsigned char, 4>>(1, q);
-  auto *in_us4 = sycl::malloc_device<sycl::vec<unsigned short, 4>>(1, q);
-  auto *in_ui4 = sycl::malloc_device<sycl::vec<unsigned int, 4>>(1, q);
-
-  auto *in_h = sycl::malloc_device<half>(1, q);
-  auto *in_f = sycl::malloc_device<float>(1, q);
-  auto *in_d = sycl::malloc_device<double>(1, q);
-
-  auto *in_h2 = sycl::malloc_device<sycl::vec<half, 2>>(1, q);
-  auto *in_h3 = sycl::malloc_device<sycl::vec<half, 3>>(1, q);
-  auto *in_h4 = sycl::malloc_device<sycl::vec<half, 4>>(1, q);
-  auto *in_f2 = sycl::malloc_device<sycl::vec<float, 2>>(1, q);
-  auto *in_f3 = sycl::malloc_device<sycl::vec<float, 3>>(1, q);
-  auto *in_f4 = sycl::malloc_device<sycl::vec<float, 4>>(1, q);
-  auto *in_d2 = sycl::malloc_device<sycl::vec<double, 2>>(1, q);
-  auto *in_d3 = sycl::malloc_device<sycl::vec<double, 3>>(1, q);
-  auto *in_d4 = sycl::malloc_device<sycl::vec<double, 4>>(1, q);
-
-  q.wait();
-
-  q.submit([=](sycl::handler &h) {
-    h.single_task<class check>([=] {
-      //CHECK-OPAQUE: tail call half @llvm.nvvm.ldg.global.f.f16.p0(ptr %{{.*}}, i32 2)
-      auto cached_h = ldg(&in_h[0]);
-      //CHECK-OPAQUE: tail call noundef float @llvm.nvvm.ldg.global.f.f32.p0(ptr %{{.*}}, i32 4)
-      auto cached_f = ldg(&in_f[0]);
-      //CHECK-OPAQUE: tail call noundef double @llvm.nvvm.ldg.global.f.f64.p0(ptr %{{.*}}, i32 8)
-      auto cached_d = ldg(&in_d[0]);
-
-      //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4)
-      auto cached_h2 = ldg(&in_h2[0]);
-      //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4)
-      //CHECK-OPAQUE: tail call half @llvm.nvvm.ldg.global.f.f16.p0(ptr nonnull %{{.*}}, i32 2)
-      auto cached_h3 = ldg(&in_h3[0]);
-      //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4)
-      //CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr nonnull %{{.*}}, i32 4)
-      auto cached_h4 = ldg(&in_h4[0]);
-      //CHECK-OPAQUE: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr %{{.*}}, i32 8)
-      auto cached_f2 = ldg(&in_f2[0]);
-      //CHECK-OPAQUE: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr %{{.*}}, i32 8)
-      //CHECK-OPAQUE: tail call float @llvm.nvvm.ldg.global.f.f32.p0(ptr nonnull %{{.*}}, i32 4)
-      auto cached_f3 = ldg(&in_f3[0]);
-      //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16)
-      auto cached_d2 = ldg(&in_d2[0]);
-      //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16)
-      //CHECK-OPAQUE: tail call double @llvm.nvvm.ldg.global.f.f64.p0(ptr nonnull %{{.*}}, i32 8)
-      auto cached_d3 = ldg(&in_d3[0]);
-      //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16)
-      //CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr nonnull %{{.*}}, i32 16)
-      auto cached_d4 = ldg(&in_d4[0]);
-      //CHECK-OPAQUE: tail call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr %{{.*}}, i32 16)
-      auto cached_f4 = ldg(&in_f4[0]);
-
-      // Unsigned variants are identical to signed variants, but this leads to
-      // correct behavior.
-
-      //CHECK-OPAQUE: tail call noundef i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1)
-      auto cached_c = ldg(&in_c[0]);
-      //CHECK-OPAQUE: tail call noundef i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1)
-      auto cached_sc = ldg(&in_sc[0]);
-      //CHECK-OPAQUE: tail call noundef i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr %{{.*}}, i32 2)
-      auto cached_s = ldg(&in_s[0]);
-      //CHECK-OPAQUE: tail call noundef i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr %{{.*}}, i32 4)
-      auto cached_i = ldg(&in_i[0]);
-      //CHECK-OPAQUE: tail call noundef i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8)
-      auto cached_l = ldg(&in_l[0]);
-      //CHECK-OPAQUE: tail call noundef i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8)
-      auto cached_ll = ldg(&in_ll[0]);
-      //CHECK-OPAQUE: tail call noundef i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1)
-      auto cached_uc = ldg(&in_uc[0]);
-      //CHECK-OPAQUE: tail call noundef i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr %{{.*}}, i32 2)
-      auto cached_us = ldg(&in_us[0]);
-      //CHECK-OPAQUE: tail call noundef i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr %{{.*}}, i32 4)
-      auto cached_ui = ldg(&in_ui[0]);
-      //CHECK-OPAQUE: tail call noundef i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8)
-      auto cached_ul = ldg(&in_ul[0]);
-      //CHECK-OPAQUE: tail call noundef i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8)
-      auto cached_ull = ldg(&in_ull[0]);
-
-      //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
-      auto cached_c2 = ldg(&in_c2[0]);
-      //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
-      //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1)
-      auto cached_c3 = ldg(&in_c3[0]);
-      //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
-      auto cached_sc2 = ldg(&in_sc2[0]);
-      //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
-      //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1)
-      auto cached_sc3 = ldg(&in_sc3[0]);
-      //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4)
-      auto cached_s2 = ldg(&in_s2[0]);
-      //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4)
-      //CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr nonnull %{{.*}}, i32 2)
-      auto cached_s3 = ldg(&in_s3[0]);
-      //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8)
-      auto cached_i2 = ldg(&in_i2[0]);
-      //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8)
-      //CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr nonnull %{{.*}}, i32 4)
-      auto cached_i3 = ldg(&in_i3[0]);
-      //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr %{{.*}}, i32 {{8|16}})
-      auto cached_l2 = ldg(&in_l2[0]);
-      //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr %{{.*}}, i32 {{8|16}})
-      //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8)
-      auto cached_l3 = ldg(&in_l3[0]);
-      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
-      auto cached_ll2 = ldg(&in_ll2[0]);
-      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
-      //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8)
-      auto cached_ll3 = ldg(&in_ll3[0]);
-      //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr %{{.*}}, i32 {{8|16}})
-      //CHECK-OPAQUE: tail call <2 x i{{32|64}}> @llvm.nvvm.ldg.global.i.v2i{{32|64}}.p0(ptr nonnull %{{.*}}, i32 {{8|16}})
-      auto cached_l4 = ldg(&in_l4[0]);
-      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
-      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16)
-      auto cached_ll4 = ldg(&in_ll4[0]);
-      //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
-      auto cached_uc2 = ldg(&in_uc2[0]);
-      //CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
-      //CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1)
-      auto cached_uc3 = ldg(&in_uc3[0]);
-      //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4)
-      auto cached_us2 = ldg(&in_us2[0]);
-      //CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4)
-      //CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr nonnull %{{.*}}, i32 2)
-      auto cached_us3 = ldg(&in_us3[0]);
-      //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8)
-      auto cached_ui2 = ldg(&in_ui2[0]);
-      //CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8)
-      //CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr nonnull %{{.*}}, i32 4)
-      auto cached_ui3 = ldg(&in_ui3[0]);
-      //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr %{{.*}}, i32 {{8|16}})
-      auto cached_ul2 = ldg(&in_ul2[0]);
-      //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr %{{.*}}, i32 {{8|16}})
-      //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8)
-      auto cached_ul3 = ldg(&in_ul3[0]);
-      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
-      auto cached_ull2 = ldg(&in_ull2[0]);
-      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
-      //CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8)
-      auto cached_ull3 = ldg(&in_ull3[0]);
-      //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr %{{.*}}, i32 {{8|16}})
-      //CHECK-OPAQUE: tail call <2 x i{{64|32}}> @llvm.nvvm.ldg.global.i.v2i{{64|32}}.p0(ptr nonnull %{{.*}}, i32 {{8|16}})
-      auto cached_ul4 = ldg(&in_ul4[0]);
-      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
-      //CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16)
-      auto cached_ull4 = ldg(&in_ull4[0]);
-
-      //CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4)
-      auto cached_c4 = ldg(&in_c4[0]);
-      //CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4)
-      auto cached_sc4 = ldg(&in_sc4[0]);
-      //CHECK-OPAQUE: tail call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr %{{.*}}, i32 8)
-      auto cached_s4 = ldg(&in_s4[0]);
-      //CHECK-OPAQUE: tail call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr %{{.*}}, i32 16)
-      auto cached_i4 = ldg(&in_i4[0]);
-
-      //CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4)
-      auto cached_uc4 = ldg(&in_uc4[0]);
-      //CHECK-OPAQUE: tail call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr %{{.*}}, i32 8)
-      auto cached_us4 = ldg(&in_us4[0]);
-      //CHECK-OPAQUE: tail call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr %{{.*}}, i32 16)
-      auto cached_ui4 = ldg(&in_ui4[0]);
-    });
-  });
-
-  q.wait();
-
-  free(in_h, q);
-  free(in_f, q);
-  free(in_d, q);
-  free(in_h2, q);
-  free(in_h3, q);
-  free(in_h4, q);
-  free(in_f2, q);
-  free(in_f3, q);
-  free(in_f4, q);
-  free(in_d2, q);
-  free(in_d3, q);
-  free(in_d4, q);
-  free(in_c, q);
-  free(in_sc, q);
-  free(in_s, q);
-  free(in_i, q);
-  free(in_l, q);
-  free(in_ll, q);
-  free(in_uc, q);
-  free(in_us, q);
-  free(in_ui, q);
-  free(in_ul, q);
-  free(in_ull, q);
-  free(in_c2, q);
-  free(in_c3, q);
-  free(in_sc2, q);
-  free(in_sc3, q);
-  free(in_s2, q);
-  free(in_s3, q);
-  free(in_i2, q);
-  free(in_i3, q);
-  free(in_l2, q);
-  free(in_l3, q);
-  free(in_ll2, q);
-  free(in_ll3, q);
-  free(in_l4, q);
-  free(in_ll4, q);
-  free(in_uc2, q);
-  free(in_uc3, q);
-  free(in_us2, q);
-  free(in_us3, q);
-  free(in_ui2, q);
-  free(in_ui3, q);
-  free(in_ul2, q);
-  free(in_ul3, q);
-  free(in_ull2, q);
-  free(in_ull3, q);
-  free(in_ul4, q);
-  free(in_ull4, q);
-  free(in_c4, q);
-  free(in_sc4, q);
-  free(in_s4, q);
-  free(in_i4, q);
-  free(in_uc4, q);
-  free(in_us4, q);
-  free(in_ui4, q);
-
-  return 0;
-};
+// CHECK-OPAQUE: tail call half @llvm.nvvm.ldg.global.f.f16.p0(ptr %{{.*}}, i32 2)
+template SYCL_EXTERNAL half
+sycl::ext::oneapi::experimental::cuda::ldg(const half *);
+// CHECK-OPAQUE: tail call float @llvm.nvvm.ldg.global.f.f32.p0(ptr %{{.*}}, i32 4)
+template SYCL_EXTERNAL float
+sycl::ext::oneapi::experimental::cuda::ldg(const float *);
+// CHECK-OPAQUE: tail call double @llvm.nvvm.ldg.global.f.f64.p0(ptr %{{.*}}, i32 8)
+template SYCL_EXTERNAL double
+sycl::ext::oneapi::experimental::cuda::ldg(const double *);
+
+// CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4)
+template SYCL_EXTERNAL sycl::vec<half, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<half, 2> *);
+// CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4)
+// CHECK-OPAQUE: tail call half @llvm.nvvm.ldg.global.f.f16.p0(ptr nonnull %{{.*}}, i32 2)
+template SYCL_EXTERNAL sycl::vec<half, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<half, 3> *);
+// CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr %{{.*}}, i32 4)
+// CHECK-OPAQUE: tail call <2 x half> @llvm.nvvm.ldg.global.f.v2f16.p0(ptr nonnull %{{.*}}, i32 4)
+template SYCL_EXTERNAL sycl::vec<half, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<half, 4> *);
+// CHECK-OPAQUE: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr %{{.*}}, i32 8)
+template SYCL_EXTERNAL sycl::vec<float, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<float, 2> *);
+// CHECK-OPAQUE: tail call <2 x float> @llvm.nvvm.ldg.global.f.v2f32.p0(ptr %{{.*}}, i32 8)
+// CHECK-OPAQUE: tail call float @llvm.nvvm.ldg.global.f.f32.p0(ptr nonnull %{{.*}}, i32 4)
+template SYCL_EXTERNAL sycl::vec<float, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<float, 3> *);
+// CHECK-OPAQUE: tail call <4 x float> @llvm.nvvm.ldg.global.f.v4f32.p0(ptr %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<float, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<float, 4> *);
+// CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<double, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<double, 2> *);
+// CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16)
+// CHECK-OPAQUE: tail call double @llvm.nvvm.ldg.global.f.f64.p0(ptr nonnull %{{.*}}, i32 8)
+template SYCL_EXTERNAL sycl::vec<double, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<double, 3> *);
+// CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr %{{.*}}, i32 16)
+// CHECK-OPAQUE: tail call <2 x double> @llvm.nvvm.ldg.global.f.v2f64.p0(ptr nonnull %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<double, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<double, 4> *);
+
+// Unsigned variants are identical to signed variants, but this leads to
+// correct behavior.
+
+// CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1)
+template SYCL_EXTERNAL char
+sycl::ext::oneapi::experimental::cuda::ldg(const char *);
+// CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1)
+template SYCL_EXTERNAL signed char
+sycl::ext::oneapi::experimental::cuda::ldg(const signed char *);
+// CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr %{{.*}}, i32 2)
+template SYCL_EXTERNAL short
+sycl::ext::oneapi::experimental::cuda::ldg(const short *);
+// CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr %{{.*}}, i32 4)
+template SYCL_EXTERNAL int
+sycl::ext::oneapi::experimental::cuda::ldg(const int *);
+// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8)
+template SYCL_EXTERNAL long
+sycl::ext::oneapi::experimental::cuda::ldg(const long *);
+// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8)
+template SYCL_EXTERNAL long long
+sycl::ext::oneapi::experimental::cuda::ldg(const long long *);
+
+// CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr %{{.*}}, i32 1)
+template SYCL_EXTERNAL unsigned char
+sycl::ext::oneapi::experimental::cuda::ldg(const unsigned char *);
+// CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr %{{.*}}, i32 2)
+template SYCL_EXTERNAL unsigned short
+sycl::ext::oneapi::experimental::cuda::ldg(const unsigned short *);
+// CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr %{{.*}}, i32 4)
+template SYCL_EXTERNAL unsigned int
+sycl::ext::oneapi::experimental::cuda::ldg(const unsigned int *);
+// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8)
+template SYCL_EXTERNAL unsigned long
+sycl::ext::oneapi::experimental::cuda::ldg(const unsigned long *);
+// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr %{{.*}}, i32 8)
+template SYCL_EXTERNAL unsigned long long
+sycl::ext::oneapi::experimental::cuda::ldg(const unsigned long long *);
+
+// CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
+template SYCL_EXTERNAL sycl::vec<char, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<char, 2> *);
+// CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
+// CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1)
+template SYCL_EXTERNAL sycl::vec<char, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<char, 3> *);
+// CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
+template SYCL_EXTERNAL sycl::vec<signed char, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<signed char, 2> *);
+// CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
+// CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1)
+template SYCL_EXTERNAL sycl::vec<signed char, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<signed char, 3> *);
+// CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4)
+template SYCL_EXTERNAL sycl::vec<short, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<short, 2> *);
+// CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4)
+// CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr nonnull %{{.*}}, i32 2)
+template SYCL_EXTERNAL sycl::vec<short, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<short, 3> *);
+// CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8)
+template SYCL_EXTERNAL sycl::vec<int, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<int, 2> *);
+// CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8)
+// CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr nonnull %{{.*}}, i32 4)
+template SYCL_EXTERNAL sycl::vec<int, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<int, 3> *);
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<long, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<long, 2> *);
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8)
+template SYCL_EXTERNAL sycl::vec<long, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<long, 3> *);
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<long long, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<long long, 2> *);
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8)
+template SYCL_EXTERNAL sycl::vec<long long, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<long long, 3> *);
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<long, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<long, 4> *);
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<long long, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<long long, 4> *);
+
+// CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
+template SYCL_EXTERNAL sycl::vec<unsigned char, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<unsigned char, 2> *);
+// CHECK-OPAQUE: tail call <2 x i8> @llvm.nvvm.ldg.global.i.v2i8.p0(ptr %{{.*}}, i32 2)
+// CHECK-OPAQUE: tail call i8 @llvm.nvvm.ldg.global.i.i8.p0(ptr nonnull %{{.*}}, i32 1)
+template SYCL_EXTERNAL sycl::vec<unsigned char, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<unsigned char, 3> *);
+// CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4)
+template SYCL_EXTERNAL sycl::vec<unsigned short, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(
+    const sycl::vec<unsigned short, 2> *);
+// CHECK-OPAQUE: tail call <2 x i16> @llvm.nvvm.ldg.global.i.v2i16.p0(ptr %{{.*}}, i32 4)
+// CHECK-OPAQUE: tail call i16 @llvm.nvvm.ldg.global.i.i16.p0(ptr nonnull %{{.*}}, i32 2)
+template SYCL_EXTERNAL sycl::vec<unsigned short, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(
+    const sycl::vec<unsigned short, 3> *);
+// CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8)
+template SYCL_EXTERNAL sycl::vec<unsigned int, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<unsigned int, 2> *);
+
+// CHECK-OPAQUE: tail call <2 x i32> @llvm.nvvm.ldg.global.i.v2i32.p0(ptr %{{.*}}, i32 8)
+// CHECK-OPAQUE: tail call i32 @llvm.nvvm.ldg.global.i.i32.p0(ptr nonnull %{{.*}}, i32 4)
+template SYCL_EXTERNAL sycl::vec<unsigned int, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<unsigned int, 3> *);
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<unsigned long, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<unsigned long, 2> *);
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8)
+template SYCL_EXTERNAL sycl::vec<unsigned long, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<unsigned long, 3> *);
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<unsigned long long, 2>
+sycl::ext::oneapi::experimental::cuda::ldg(
+    const sycl::vec<unsigned long long, 2> *);
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+// CHECK-OPAQUE: tail call i64 @llvm.nvvm.ldg.global.i.i64.p0(ptr nonnull %{{.*}}, i32 8)
+template SYCL_EXTERNAL sycl::vec<unsigned long long, 3>
+sycl::ext::oneapi::experimental::cuda::ldg(
+    const sycl::vec<unsigned long long, 3> *);
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<unsigned long, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<unsigned long, 4> *);
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr %{{.*}}, i32 16)
+// CHECK-OPAQUE: tail call <2 x i64> @llvm.nvvm.ldg.global.i.v2i64.p0(ptr nonnull %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<unsigned long long, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(
+    const sycl::vec<unsigned long long, 4> *);
+
+// CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4)
+template SYCL_EXTERNAL sycl::vec<char, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<char, 4> *);
+// CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4)
+template SYCL_EXTERNAL sycl::vec<signed char, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<signed char, 4> *);
+// CHECK-OPAQUE: tail call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr %{{.*}}, i32 8)
+template SYCL_EXTERNAL sycl::vec<short, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<short, 4> *);
+// CHECK-OPAQUE: tail call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<int, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<int, 4> *);
+
+// CHECK-OPAQUE: tail call <4 x i8> @llvm.nvvm.ldg.global.i.v4i8.p0(ptr %{{.*}}, i32 4)
+template SYCL_EXTERNAL sycl::vec<unsigned char, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<unsigned char, 4> *);
+// CHECK-OPAQUE: tail call <4 x i16> @llvm.nvvm.ldg.global.i.v4i16.p0(ptr %{{.*}}, i32 8)
+template SYCL_EXTERNAL sycl::vec<unsigned short, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(
+    const sycl::vec<unsigned short, 4> *);
+// CHECK-OPAQUE: tail call <4 x i32> @llvm.nvvm.ldg.global.i.v4i32.p0(ptr %{{.*}}, i32 16)
+template SYCL_EXTERNAL sycl::vec<unsigned int, 4>
+sycl::ext::oneapi::experimental::cuda::ldg(const sycl::vec<unsigned int, 4> *);
diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-bfloat16-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-bfloat16-test.cpp
index 9f99cb6ea9457..dc6cd06270433 100644
--- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-bfloat16-test.cpp
+++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-bfloat16-test.cpp
@@ -10,215 +10,230 @@ using sycl::ext::oneapi::bfloat16;
 
 constexpr int stride = 16;
 
-int main() {
-
-  buffer<bfloat16, 1> bufA(nullptr, range<1>(1));
-  buffer<bfloat16, 1> bufB(nullptr, range<1>(1));
-  buffer<float, 1> bufC(nullptr, range<1>(1));
-  buffer<float, 1> bufD(nullptr, range<1>(1));
-
-  queue q;
-
-  q.submit([&](handler &cgh) {
-    sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accA(bufA, cgh);
-    sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accB(bufB, cgh);
-    sycl::accessor<float, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accC(bufC, cgh);
-    sycl::accessor<float, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accD(bufD, cgh);
-
-    cgh.parallel_for<class row_row_m16n16k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, bfloat16, use::a, 16, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, bfloat16, use::b, 16, 16, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.bf16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.bf16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.row.row.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m16n16k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, bfloat16, use::a, 16, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, bfloat16, use::b, 16, 16, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.bf16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.bf16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.col.col.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-
-    cgh.parallel_for<class row_row_m32n8k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 32, 8> sub_c{};
-          joint_matrix<sub_group, bfloat16, use::a, 32, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, bfloat16, use::b, 16, 8, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.bf16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.bf16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.row.row.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m32n8k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 32, 8> sub_c{};
-          joint_matrix<sub_group, bfloat16, use::a, 32, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, bfloat16, use::b, 16, 8, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.bf16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.bf16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.col.col.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-
-    cgh.parallel_for<class row_row_m8n32k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 8, 32> sub_c{};
-          joint_matrix<sub_group, bfloat16, use::a, 8, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, bfloat16, use::b, 16, 32, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.bf16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.bf16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.row.row.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m8n32k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 8, 32> sub_c{};
-          joint_matrix<sub_group, bfloat16, use::a, 8, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, bfloat16, use::b, 16, 32, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.bf16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.bf16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.col.col.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-  });
-
-  return 0;
-};
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m16n16k16(sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, bfloat16, use::a, 16, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, bfloat16, use::b, 16, 16, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.bf16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.bf16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.row.row.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m16n16k16(sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) { // Device-code check: bfloat16 A/B tiles (both col_major), 16x16 float accumulator, K = 16.
+  sycl::sub_group sg = item.get_sub_group();
+  // Sequence under test: load C, A and B, multiply-accumulate, store to accD. Each expectation comment below gives the NVVM WMMA intrinsic for the call right after it.
+  joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, bfloat16, use::a, 16, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, bfloat16, use::b, 16, 16, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.bf16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.bf16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.col.col.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); // sub_c is passed as both the first and the last matrix argument.
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m32n8k16(sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) { // Device-code check: bfloat16 A (32x16) and B (16x8) tiles, both row_major, 32x8 float accumulator.
+  sycl::sub_group sg = item.get_sub_group();
+  // Sequence under test: load C, A and B, multiply-accumulate, store to accD. Each expectation comment below gives the NVVM WMMA intrinsic for the call right after it.
+  joint_matrix<sub_group, float, use::accumulator, 32, 8> sub_c{};
+  joint_matrix<sub_group, bfloat16, use::a, 32, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, bfloat16, use::b, 16, 8, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.bf16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.bf16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.row.row.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); // sub_c is passed as both the first and the last matrix argument.
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m32n8k16(sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) { // Device-code check: bfloat16 A (32x16) and B (16x8) tiles, both col_major, 32x8 float accumulator.
+  sycl::sub_group sg = item.get_sub_group();
+  // Sequence under test: load C, A and B, multiply-accumulate, store to accD. Each expectation comment below gives the NVVM WMMA intrinsic for the call right after it.
+  joint_matrix<sub_group, float, use::accumulator, 32, 8> sub_c{};
+  joint_matrix<sub_group, bfloat16, use::a, 32, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, bfloat16, use::b, 16, 8, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.bf16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.bf16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.col.col.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); // sub_c is passed as both the first and the last matrix argument.
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m8n32k16(sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) { // Device-code check: bfloat16 A (8x16) and B (16x32) tiles, both row_major, 8x32 float accumulator.
+  sycl::sub_group sg = item.get_sub_group();
+  // Sequence under test: load C, A and B, multiply-accumulate, store to accD. Each expectation comment below gives the NVVM WMMA intrinsic for the call right after it.
+  joint_matrix<sub_group, float, use::accumulator, 8, 32> sub_c{};
+  joint_matrix<sub_group, bfloat16, use::a, 8, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, bfloat16, use::b, 16, 32, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.bf16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.bf16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.row.row.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); // sub_c is passed as both the first and the last matrix argument.
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m8n32k16(sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) { // Device-code check: bfloat16 A (8x16) and B (16x32) tiles, both col_major, 8x32 float accumulator.
+  sycl::sub_group sg = item.get_sub_group();
+  // Sequence under test: load C, A and B, multiply-accumulate, store to accD. Each expectation comment below gives the NVVM WMMA intrinsic for the call right after it.
+  joint_matrix<sub_group, float, use::accumulator, 8, 32> sub_c{};
+  joint_matrix<sub_group, bfloat16, use::a, 8, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, bfloat16, use::b, 16, 32, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.bf16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.bf16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.col.col.bf16(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); // sub_c is passed as both the first and the last matrix argument.
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-double-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-double-test.cpp
index f4a79d2756937..750632ca80243 100644
--- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-double-test.cpp
+++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-double-test.cpp
@@ -15,96 +15,78 @@ constexpr int N = 8; // number of cols of accumulator,
                      // number of rows of a.
 constexpr int K = 4; // number of cols of a/number of rows of b.
 
-double A[M * K];
-double B[K * N];
-double C[M * N];
-double D[M * N];
-
-int main() {
-
-  buffer<double, 1> bufA(A, range<1>(M * K));
-  buffer<double, 1> bufB(B, range<1>(K * N));
-  buffer<double, 1> bufC(C, range<1>(M * N));
-  buffer<double, 1> bufD(D, range<1>(M * N));
-
-  queue q;
-
-  q.submit([&](handler &cgh) {
-    sycl::accessor<double, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accA(bufA, cgh);
-    sycl::accessor<double, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accB(bufB, cgh);
-    sycl::accessor<double, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accC(bufC, cgh);
-    sycl::accessor<double, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accD(bufD, cgh);
-
-    cgh.parallel_for<class row_row>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, double, use::accumulator, M, N> sub_c{};
-          joint_matrix<sub_group, double, use::a, M, K, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, double, use::b, K, N, layout::row_major>
-              sub_b{};
-
-          //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.load.c.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              N, layout::row_major);
-          //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.a.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 4)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              K);
-          //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.b.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              N);
-          //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.mma.row.row.f64(double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n8k4.store.d.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, double {{.*}}, double {{.*}}, i32 8)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              N, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, double, use::accumulator, M, N> sub_c{};
-          joint_matrix<sub_group, double, use::a, M, K, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, double, use::b, K, N, layout::col_major>
-              sub_b{};
-
-          //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.load.c.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              M, layout::col_major);
-          //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.a.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              M);
-          //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.b.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 4)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              K);
-          //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.mma.col.col.f64(double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n8k4.store.d.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, double {{.*}}, double {{.*}}, i32 8)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              M, layout::col_major);
-        });
-  });
-
-  return 0;
-};
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m8n8k4(sycl::accessor<double, 1, sycl::access::mode::read_write,
+                              sycl::target::device>
+                   accA,
+               sycl::accessor<double, 1, sycl::access::mode::read_write,
+                              sycl::target::device>
+                   accB,
+               sycl::accessor<double, 1, sycl::access::mode::read_write,
+                              sycl::target::device>
+                   accC,
+               sycl::accessor<double, 1, sycl::access::mode::read_write,
+                              sycl::target::device>
+                   accD,
+               nd_item<2> item) { // Device-code check: double A (M x K) and B (K x N) tiles, both row_major, M x N double accumulator.
+  sycl::sub_group sg = item.get_sub_group();
+  // Sequence under test: load C (stride N), A (stride K) and B (stride N), multiply-accumulate, store to accD (stride N). Each expectation comment below gives the NVVM WMMA intrinsic for the call right after it.
+  joint_matrix<sub_group, double, use::accumulator, M, N> sub_c{};
+  joint_matrix<sub_group, double, use::a, M, K, layout::row_major> sub_a{};
+  joint_matrix<sub_group, double, use::b, K, N, layout::row_major> sub_b{};
+
+  //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.load.c.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(), N,
+                    layout::row_major);
+  //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.a.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 4)
+  joint_matrix_load(sg, sub_a,
+                    accA.template get_multi_ptr<access::decorated::yes>(), K);
+  //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.b.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8)
+  joint_matrix_load(sg, sub_b,
+                    accB.template get_multi_ptr<access::decorated::yes>(), N);
+  //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.mma.row.row.f64(double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); // sub_c is passed as both the first and the last matrix argument.
+  //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n8k4.store.d.row.stride.f64.p1(ptr addrspace(1) %{{.*}}, double {{.*}}, double {{.*}}, i32 8)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(), N,
+                     layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m8n8k4(sycl::accessor<double, 1, sycl::access::mode::read_write,
+                              sycl::target::device>
+                   accA,
+               sycl::accessor<double, 1, sycl::access::mode::read_write,
+                              sycl::target::device>
+                   accB,
+               sycl::accessor<double, 1, sycl::access::mode::read_write,
+                              sycl::target::device>
+                   accC,
+               sycl::accessor<double, 1, sycl::access::mode::read_write,
+                              sycl::target::device>
+                   accD,
+               nd_item<2> item) { // Device-code check: double A (M x K) and B (K x N) tiles, both col_major, M x N double accumulator.
+  sycl::sub_group sg = item.get_sub_group();
+  // Sequence under test: load C (stride M), A (stride M) and B (stride K), multiply-accumulate, store to accD (stride M). Each expectation comment below gives the NVVM WMMA intrinsic for the call right after it.
+  joint_matrix<sub_group, double, use::accumulator, M, N> sub_c{};
+  joint_matrix<sub_group, double, use::a, M, K, layout::col_major> sub_a{};
+  joint_matrix<sub_group, double, use::b, K, N, layout::col_major> sub_b{};
+
+  //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.load.c.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(), M,
+                    layout::col_major);
+  //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.a.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 8)
+  joint_matrix_load(sg, sub_a,
+                    accA.template get_multi_ptr<access::decorated::yes>(), M);
+  //CHECK-OPAQUE: tail call double @llvm.nvvm.wmma.m8n8k4.load.b.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, i32 4)
+  joint_matrix_load(sg, sub_b,
+                    accB.template get_multi_ptr<access::decorated::yes>(), K);
+  //CHECK-OPAQUE: tail call { double, double } @llvm.nvvm.wmma.m8n8k4.mma.col.col.f64(double {{.*}}, double {{.*}}, double {{.*}}, double {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c); // sub_c is passed as both the first and the last matrix argument.
+  //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n8k4.store.d.col.stride.f64.p1(ptr addrspace(1) %{{.*}}, double {{.*}}, double {{.*}}, i32 8)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(), M,
+                     layout::col_major);
+}
diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-float-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-float-test.cpp
index cb5b3da54b794..a3e7e61a94b20 100644
--- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-float-test.cpp
+++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-float-test.cpp
@@ -9,215 +9,230 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr int stride = 16;
 
-int main() {
-
-  buffer<half, 1> bufA(nullptr, range<1>(1));
-  buffer<half, 1> bufB(nullptr, range<1>(1));
-  buffer<float, 1> bufC(nullptr, range<1>(1));
-  buffer<float, 1> bufD(nullptr, range<1>(1));
-
-  queue q;
-
-  q.submit([&](handler &cgh) {
-    sycl::accessor<half, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accA(bufA, cgh);
-    sycl::accessor<half, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accB(bufB, cgh);
-    sycl::accessor<float, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accC(bufC, cgh);
-    sycl::accessor<float, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accD(bufD, cgh);
-
-    cgh.parallel_for<class row_row_m16n16k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, half, use::a, 16, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 16, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m16n16k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, half, use::a, 16, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 16, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-
-    cgh.parallel_for<class row_row_m32n8k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 32, 8> sub_c{};
-          joint_matrix<sub_group, half, use::a, 32, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 8, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m32n8k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 32, 8> sub_c{};
-          joint_matrix<sub_group, half, use::a, 32, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 8, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-
-    cgh.parallel_for<class row_row_m8n32k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 8, 32> sub_c{};
-          joint_matrix<sub_group, half, use::a, 8, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 32, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m8n32k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 8, 32> sub_c{};
-          joint_matrix<sub_group, half, use::a, 8, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 32, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-  });
-
-  return 0;
-};
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m16n16k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, half, use::a, 16, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 16, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m16n16k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, half, use::a, 16, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 16, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.mma.col.col.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m32n8k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, float, use::accumulator, 32, 8> sub_c{};
+  joint_matrix<sub_group, half, use::a, 32, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 8, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.row.row.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m32n8k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, float, use::accumulator, 32, 8> sub_c{};
+  joint_matrix<sub_group, half, use::a, 32, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 8, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m32n8k16.mma.col.col.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m8n32k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, float, use::accumulator, 8, 32> sub_c{};
+  joint_matrix<sub_group, half, use::a, 8, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 32, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.row.row.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m8n8k4(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                              sycl::target::device>
+                   accA,
+               sycl::accessor<half, 1, sycl::access::mode::read_write,
+                              sycl::target::device>
+                   accB,
+               sycl::accessor<float, 1, sycl::access::mode::read_write,
+                              sycl::target::device>
+                   accC,
+               sycl::accessor<float, 1, sycl::access::mode::read_write,
+                              sycl::target::device>
+                   accD,
+               nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, float, use::accumulator, 8, 32> sub_c{};
+  joint_matrix<sub_group, half, use::a, 8, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 32, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m8n32k16.mma.col.col.f32.f32(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f32.p1(ptr addrspace(1) %{{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-half-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-half-test.cpp
index feea65a79848b..602fff0a038ba 100644
--- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-half-test.cpp
+++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-half-half-test.cpp
@@ -9,215 +9,231 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr int stride = 16;
 
-int main() {
-
-  buffer<half, 1> bufA(nullptr, range<1>(1));
-  buffer<half, 1> bufB(nullptr, range<1>(1));
-  buffer<half, 1> bufC(nullptr, range<1>(1));
-  buffer<half, 1> bufD(nullptr, range<1>(1));
-
-  queue q;
-
-  q.submit([&](handler &cgh) {
-    sycl::accessor<half, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accA(bufA, cgh);
-    sycl::accessor<half, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accB(bufB, cgh);
-    sycl::accessor<half, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accC(bufC, cgh);
-    sycl::accessor<half, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accD(bufD, cgh);
-
-    cgh.parallel_for<class row_row_m16n16k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, half, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, half, use::a, 16, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 16, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m16n16k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, half, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, half, use::a, 16, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 16, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-
-    cgh.parallel_for<class row_row_m32n8k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, half, use::accumulator, 32, 8> sub_c{};
-          joint_matrix<sub_group, half, use::a, 32, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 8, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m32n8k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, half, use::accumulator, 32, 8> sub_c{};
-          joint_matrix<sub_group, half, use::a, 32, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 8, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-
-    cgh.parallel_for<class row_row_m8n32k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, half, use::accumulator, 8, 32> sub_c{};
-          joint_matrix<sub_group, half, use::a, 8, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 32, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m8n32k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, half, use::accumulator, 8, 32> sub_c{};
-          joint_matrix<sub_group, half, use::a, 8, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 32, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-  });
-
-  return 0;
-};
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m16n16k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, half, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, half, use::a, 16, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 16, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m16n16k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, half, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, half, use::a, 16, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 16, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m16n16k16.mma.col.col.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m32n8k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, half, use::accumulator, 32, 8> sub_c{};
+  joint_matrix<sub_group, half, use::a, 32, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 8, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m32n8k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, half, use::accumulator, 32, 8> sub_c{};
+  joint_matrix<sub_group, half, use::a, 32, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 8, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m32n8k16.mma.col.col.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m8n32k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, half, use::accumulator, 8, 32> sub_c{};
+  joint_matrix<sub_group, half, use::a, 8, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 32, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.mma.row.row.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m8n32k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, half, use::accumulator, 8, 32> sub_c{};
+  joint_matrix<sub_group, half, use::a, 8, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 32, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.f16.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { <2 x half>, <2 x half>, <2 x half>, <2 x half> } @llvm.nvvm.wmma.m8n32k16.mma.col.col.f16.f16(<2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.f16.p0(ptr %{{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, <2 x half> {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-int8-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-int8-test.cpp
index 492313dbaf71d..ed1d8b0c62221 100644
--- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-int8-test.cpp
+++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-int8-test.cpp
@@ -9,215 +9,251 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr int stride = 16;
 
-int main() {
-
-  buffer<int8_t, 1> bufA(nullptr, range<1>(1));
-  buffer<int8_t, 1> bufB(nullptr, range<1>(1));
-  buffer<int32_t, 1> bufC(nullptr, range<1>(1));
-  buffer<int32_t, 1> bufD(nullptr, range<1>(1));
-
-  queue q;
-
-  q.submit([&](handler &cgh) {
-    sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accA(bufA, cgh);
-    sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accB(bufB, cgh);
-    sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accC(bufC, cgh);
-    sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accD(bufD, cgh);
-
-    cgh.parallel_for<class row_row_m16n16k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, int8_t, use::a, 16, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, int8_t, use::b, 16, 16, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.s8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.s8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.row.row.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m16n16k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, int8_t, use::a, 16, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, int8_t, use::b, 16, 16, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.s8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.s8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.col.col.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-
-    cgh.parallel_for<class row_row_m32n8k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 32, 8> sub_c{};
-          joint_matrix<sub_group, int8_t, use::a, 32, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, int8_t, use::b, 16, 8, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.s8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.s8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.row.row.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m32n8k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 32, 8> sub_c{};
-          joint_matrix<sub_group, int8_t, use::a, 32, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, int8_t, use::b, 16, 8, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.s8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.s8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.col.col.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-
-    cgh.parallel_for<class row_row_m8n32k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 8, 32> sub_c{};
-          joint_matrix<sub_group, int8_t, use::a, 8, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, int8_t, use::b, 16, 32, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.s8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.s8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.row.row.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m8n32k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 8, 32> sub_c{};
-          joint_matrix<sub_group, int8_t, use::a, 8, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, int8_t, use::b, 16, 32, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.s8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.s8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.col.col.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-  });
-
-  return 0;
-};
+// The following SYCL_EXTERNAL functions (e.g. row_row_m16n16k16) each perform
+// matrix multiplication in a different way. They were originally written in
+// the following manner:
+//
+//  ...
+//  q.submit([&] (handler &cgh) {
+//      sycl::accessor<int8_t,  1, sycl::access::mode::read_write,
+//      sycl::target::device> accA(bufA, cgh); sycl::accessor<int8_t,  1,
+//      sycl::access::mode::read_write, sycl::target::device> accB(bufB, cgh);
+//      sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+//      sycl::target::device> accC(bufC, cgh); sycl::accessor<int32_t, 1,
+//      sycl::access::mode::read_write, sycl::target::device> accD(bufD, cgh);
+//
+//      cgh.parallel_for<class row_row_m16n16k16>(nd_range<2>({1, 32}, {1, 32}),
+//          [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
+//              row_row_m16n16k16(accA, accB, accC, accD, item);
+//          });
+//  });
+//
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m16n16k16(sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, int32_t, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, int8_t, use::a, 16, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, int8_t, use::b, 16, 16, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.s8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.s8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.row.row.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m16n16k16(sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, int32_t, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, int8_t, use::a, 16, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, int8_t, use::b, 16, 16, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.s8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.s8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.col.col.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m32n8k16(sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, int32_t, use::accumulator, 32, 8> sub_c{};
+  joint_matrix<sub_group, int8_t, use::a, 32, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, int8_t, use::b, 16, 8, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.s8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.s8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.row.row.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m32n8k16(sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, int32_t, use::accumulator, 32, 8> sub_c{};
+  joint_matrix<sub_group, int8_t, use::a, 32, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, int8_t, use::b, 16, 8, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.s8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.s8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.col.col.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m8n32k16(sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, int32_t, use::accumulator, 8, 32> sub_c{};
+  joint_matrix<sub_group, int8_t, use::a, 8, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, int8_t, use::b, 16, 32, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.s8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.s8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.row.row.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m8n32k16(sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, int32_t, use::accumulator, 8, 32> sub_c{};
+  joint_matrix<sub_group, int8_t, use::a, 8, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, int8_t, use::b, 16, 32, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.s8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.s8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.col.col.s8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-tf32-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-tf32-test.cpp
index e9200d930de46..d1298c6c3f862 100644
--- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-tf32-test.cpp
+++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-tf32-test.cpp
@@ -32,104 +32,105 @@ constexpr int N = 16; // number of cols of accumulator,
                       // number of rows of a.
 constexpr int K = 8;  // number of cols of a/number of rows of b.
 
-// float is used in this test as the storage type for tf32
-float A[M * K];
-float B[K * N];
-float C[M * N];
-float D[M * N];
-
-int main() {
-
-  buffer<float, 1> bufA(A, range<1>(M * K)); // will be used as tf32
-  buffer<float, 1> bufB(B, range<1>(K * N)); // will be used as tf32
-  buffer<float, 1> bufC(C, range<1>(M * N));
-  buffer<float, 1> bufD(D, range<1>(M * N));
-
-  queue q;
-
-  q.submit([&](handler &cgh) {
-    auto accA = bufA.get_access<access::mode::read_write>(cgh);
-    auto accB = bufB.get_access<access::mode::read_write>(cgh);
-    auto accC = bufC.get_access<access::mode::read_write>(cgh);
-    auto accD = bufD.get_access<access::mode::read_write>(cgh);
-
-    cgh.parallel_for<class row_row>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, precision::tf32, use::a, M, K,
-                       layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, precision::tf32, use::b, K, N,
-                       layout::row_major>
-              sub_b{};
-          joint_matrix<sub_group, float, use::accumulator, M, N> sub_c{};
-
-          //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0(ptr %{{.*}}, i32 8)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              K);
-          //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.row.stride.tf32.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              N);
-          //CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              N, layout::row_major);
-
-          auto round_lambda = [](auto &x) { x = round_to_tf32(x); };
-          //CHECK-OPAQUE: tail call i32 @llvm.nvvm.f2tf32.rna(float %{{.*}})
-          joint_matrix_apply(sg, sub_a, round_lambda);
-
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1(ptr addrspace(1) {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 {{.*}}
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              N, layout::row_major);
-        });
-  });
-
-  q.submit([&](handler &cgh) {
-    auto accA = bufA.get_access<access::mode::read_write>(cgh);
-    auto accB = bufB.get_access<access::mode::read_write>(cgh);
-    auto accC = bufC.get_access<access::mode::read_write>(cgh);
-    auto accD = bufD.get_access<access::mode::read_write>(cgh);
-
-    cgh.parallel_for<class col_col>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, precision::tf32, use::a, M, K,
-                       layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, precision::tf32, use::b, K, N,
-                       layout::col_major>
-              sub_b{};
-          joint_matrix<sub_group, float, use::accumulator, M, N> sub_c{};
-
-          //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.col.stride.tf32.p0(ptr %{{.*}}, i32 8)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              K);
-          //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.col.stride.tf32.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              N);
-          //CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1(ptr addrspace(1) {{.*}}, i32 {{.*}})
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              N, layout::col_major);
-
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1(ptr addrspace(1) {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              N, layout::col_major);
-        });
-  });
-
-  return 0;
-};
+// Float is used in this test as the storage type for tf32:
+//
+// float A[M * K];
+// float B[K * N];
+// float C[M * N];
+// float D[M * N];
+//
+// The accessor arguments of the functions below would be created like so:
+//
+// buffer<float, 1> bufA(A, range<1>(M * K)); // will be used as tf32
+// buffer<float, 1> bufB(B, range<1>(K * N)); // will be used as tf32
+// buffer<float, 1> bufC(C, range<1>(M * N));
+// buffer<float, 1> bufD(D, range<1>(M * N));
+// ...
+// auto accA = bufA.get_access<access::mode::read_write>(handler);
+// auto accB = bufB.get_access<access::mode::read_write>(handler);
+// auto accC = bufC.get_access<access::mode::read_write>(handler);
+// auto accD = bufD.get_access<access::mode::read_write>(handler);
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row(sycl::accessor<float, 1, sycl::access::mode::read_write,
+                       sycl::target::device>
+            accA,
+        sycl::accessor<float, 1, sycl::access::mode::read_write,
+                       sycl::target::device>
+            accB,
+        sycl::accessor<float, 1, sycl::access::mode::read_write,
+                       sycl::target::device>
+            accC,
+        sycl::accessor<float, 1, sycl::access::mode::read_write,
+                       sycl::target::device>
+            accD,
+        nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+  // Loads A, B, C; rounds A to tf32; adds A*B into C; stores C to accD.
+  joint_matrix<sub_group, precision::tf32, use::a, M, K, layout::row_major>
+      sub_a{};
+  joint_matrix<sub_group, precision::tf32, use::b, K, N, layout::row_major>
+      sub_b{};
+  joint_matrix<sub_group, float, use::accumulator, M, N> sub_c{};
+  // FileCheck directives match in order; do not reorder the calls below.
+  //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.row.stride.tf32.p0(ptr %{{.*}}, i32 8)
+  joint_matrix_load(sg, sub_a,
+                    accA.template get_multi_ptr<access::decorated::yes>(), K);
+  //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.row.stride.tf32.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_b,
+                    accB.template get_multi_ptr<access::decorated::yes>(), N);
+  //CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.f32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(), N,
+                    layout::row_major);
+
+  auto round_lambda = [](auto &x) { x = round_to_tf32(x); };
+  //CHECK-OPAQUE: tail call i32 @llvm.nvvm.f2tf32.rna(float %{{.*}})
+  joint_matrix_apply(sg, sub_a, round_lambda);
+  // NOTE(review): no directive checks the intrinsic for joint_matrix_mad.
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.f32.p1(ptr addrspace(1) {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 {{.*}}
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(), N,
+                     layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col(sycl::accessor<float, 1, sycl::access::mode::read_write,
+                       sycl::target::device>
+            accA,
+        sycl::accessor<float, 1, sycl::access::mode::read_write,
+                       sycl::target::device>
+            accB,
+        sycl::accessor<float, 1, sycl::access::mode::read_write,
+                       sycl::target::device>
+            accC,
+        sycl::accessor<float, 1, sycl::access::mode::read_write,
+                       sycl::target::device>
+            accD,
+        nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+  // Loads A, B, C (col-major); adds A*B into C; stores C to accD.
+  joint_matrix<sub_group, precision::tf32, use::a, M, K, layout::col_major>
+      sub_a{};
+  joint_matrix<sub_group, precision::tf32, use::b, K, N, layout::col_major>
+      sub_b{};
+  joint_matrix<sub_group, float, use::accumulator, M, N> sub_c{};
+  // FileCheck directives match in order; do not reorder the calls below.
+  //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.a.col.stride.tf32.p0(ptr %{{.*}}, i32 8)
+  joint_matrix_load(sg, sub_a,
+                    accA.template get_multi_ptr<access::decorated::yes>(), K);
+  //CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k8.load.b.col.stride.tf32.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_b,
+                    accB.template get_multi_ptr<access::decorated::yes>(), N);
+  //CHECK-OPAQUE: tail call { float, float, float, float, float, float, float, float } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.f32.p1(ptr addrspace(1) {{.*}}, i32 {{.*}})
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(), N,
+                    layout::col_major);
+  // NOTE(review): no directive checks the intrinsic for joint_matrix_mad.
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  //CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.f32.p1(ptr addrspace(1) {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, float {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(), N,
+                     layout::col_major);
+}
diff --git a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-uint8-test.cpp b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-uint8-test.cpp
index 67d0dd5ea4728..2a6dfd700fd0a 100644
--- a/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-uint8-test.cpp
+++ b/sycl/test/check_device_code/cuda/matrix/matrix-nvptx-uint8-test.cpp
@@ -9,215 +9,231 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr int stride = 16;
 
-int main() {
-
-  buffer<uint8_t, 1> bufA(nullptr, range<1>(1));
-  buffer<uint8_t, 1> bufB(nullptr, range<1>(1));
-  buffer<int32_t, 1> bufC(nullptr, range<1>(1));
-  buffer<int32_t, 1> bufD(nullptr, range<1>(1));
-
-  queue q;
-
-  q.submit([&](handler &cgh) {
-    sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accA(bufA, cgh);
-    sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accB(bufB, cgh);
-    sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accC(bufC, cgh);
-    sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accD(bufD, cgh);
-
-    cgh.parallel_for<class row_row_m16n16k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, uint8_t, use::a, 16, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, uint8_t, use::b, 16, 16, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.u8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.u8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.row.row.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m16n16k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, uint8_t, use::a, 16, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, uint8_t, use::b, 16, 16, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.u8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.u8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.col.col.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-
-    cgh.parallel_for<class row_row_m32n8k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 32, 8> sub_c{};
-          joint_matrix<sub_group, uint8_t, use::a, 32, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, uint8_t, use::b, 16, 8, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.u8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.u8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.row.row.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m32n8k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 32, 8> sub_c{};
-          joint_matrix<sub_group, uint8_t, use::a, 32, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, uint8_t, use::b, 16, 8, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.u8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.u8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.col.col.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-
-    cgh.parallel_for<class row_row_m8n32k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 8, 32> sub_c{};
-          joint_matrix<sub_group, uint8_t, use::a, 8, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, uint8_t, use::b, 16, 32, layout::row_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-          // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.u8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.u8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.row.row.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::row_major);
-        });
-
-    cgh.parallel_for<class col_col_m8n32k16>(
-        nd_range<2>({1, 32}, {1, 32}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 32)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 8, 32> sub_c{};
-          joint_matrix<sub_group, uint8_t, use::a, 8, 16, layout::col_major>
-              sub_a{};
-          joint_matrix<sub_group, uint8_t, use::b, 16, 32, layout::col_major>
-              sub_b{};
-
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_c, accC.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-          // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.u8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.u8.p0(ptr %{{.*}}, i32 16)
-          joint_matrix_load(
-              sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(),
-              stride);
-          // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.col.col.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              stride, layout::col_major);
-        });
-  });
-
-  return 0;
-};
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m16n16k16(sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+  // Loads C, A, B (row-major); adds A*B into C; stores C to accD.
+  joint_matrix<sub_group, int32_t, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, uint8_t, use::a, 16, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, uint8_t, use::b, 16, 16, layout::row_major> sub_b{};
+  // FileCheck directives match in order; do not reorder the calls below.
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.row.stride.u8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.row.stride.u8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.row.row.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m16n16k16(sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+  // Loads C, A, B (col-major); adds A*B into C; stores C to accD.
+  joint_matrix<sub_group, int32_t, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, uint8_t, use::a, 16, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, uint8_t, use::b, 16, 16, layout::col_major> sub_b{};
+  // FileCheck directives match in order; do not reorder the calls below.
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.a.col.stride.u8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32 } @llvm.nvvm.wmma.m16n16k16.load.b.col.stride.u8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m16n16k16.mma.col.col.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m16n16k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m32n8k16(sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+  // Loads C, A, B (row-major); adds A*B into C; stores C to accD.
+  joint_matrix<sub_group, int32_t, use::accumulator, 32, 8> sub_c{};
+  joint_matrix<sub_group, uint8_t, use::a, 32, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, uint8_t, use::b, 16, 8, layout::row_major> sub_b{};
+  // FileCheck directives match in order; do not reorder the calls below.
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.row.stride.u8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.row.stride.u8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.row.row.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m32n8k16(sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  // Loads C, A, B (col-major); adds A*B into C; stores C to accD.
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, int32_t, use::accumulator, 32, 8> sub_c{};
+  joint_matrix<sub_group, uint8_t, use::a, 32, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, uint8_t, use::b, 16, 8, layout::col_major> sub_b{};
+  // FileCheck directives match in order; do not reorder the calls below.
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.load.a.col.stride.u8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m32n8k16.load.b.col.stride.u8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m32n8k16.mma.col.col.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m32n8k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+row_row_m8n32k16(sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, int32_t, use::accumulator, 8, 32> sub_c{};
+  joint_matrix<sub_group, uint8_t, use::a, 8, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, uint8_t, use::b, 16, 32, layout::row_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::row_major);
+  // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.row.stride.u8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.row.stride.u8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.row.row.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.row.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 32)]] void
+col_col_m8n32k16(sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<uint8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, int32_t, use::accumulator, 8, 32> sub_c{};
+  joint_matrix<sub_group, uint8_t, use::a, 8, 16, layout::col_major> sub_a{};
+  joint_matrix<sub_group, uint8_t, use::b, 16, 32, layout::col_major> sub_b{};
+
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.c.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 16)
+  joint_matrix_load(sg, sub_c,
+                    accC.template get_multi_ptr<access::decorated::yes>(),
+                    stride, layout::col_major);
+  // CHECK-OPAQUE: tail call i32 @llvm.nvvm.wmma.m8n32k16.load.a.col.stride.u8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_a, accA.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.load.b.col.stride.u8.p0(ptr %{{.*}}, i32 16)
+  joint_matrix_load(
+      sg, sub_b, accB.template get_multi_ptr<access::decorated::yes>(), stride);
+  // CHECK-OPAQUE: tail call { i32, i32, i32, i32, i32, i32, i32, i32 } @llvm.nvvm.wmma.m8n32k16.mma.col.col.u8(i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}})
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  // CHECK-OPAQUE: tail call void @llvm.nvvm.wmma.m8n32k16.store.d.col.stride.s32.p1(ptr addrspace(1) %{{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 {{.*}}, i32 16)
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(),
+                     stride, layout::col_major);
+}
diff --git a/sycl/test/check_device_code/device_global_const_eval_use.cpp b/sycl/test/check_device_code/device_global_const_eval_use.cpp
index f7b29ae6f9938..f2b9d0e6b652c 100644
--- a/sycl/test/check_device_code/device_global_const_eval_use.cpp
+++ b/sycl/test/check_device_code/device_global_const_eval_use.cpp
@@ -80,42 +80,38 @@ constexpr device_global<TestStruct2, decltype(properties(device_image_scope))>
     dg_constexpr_constructor_struct{TS4};
 // CHECK: @{{[A-Za-z0-9_]*}}dg_constexpr_constructor_struct = internal addrspace(1) constant { %struct.TestStruct2 } { %struct.TestStruct2 { i32 4 } }, align 4, !spirv.Decorations
 
-int main() {
-  sycl::queue Q;
-  Q.submit([&](sycl::handler &h) {
-    // Simple kernel that just copies over the values from the device_globals so
-    // that we can observe the GlobalVariables that are created to represent
-    // them in the IR
-    h.single_task([=] {
-      // Int and array of ints
-      std::ignore = dg_int;
-      std::ignore = dg_int_arr[0];
-
-      // Char and array of chars
-      std::ignore = dg_char;
-      std::ignore = dg_char_arr[0];
-
-      // Multidimensional array of integers
-      std::ignore = dg_multi_dim_arr[1][1];
-
-      // Float and array of floats
-      std::ignore = dg_float;
-      std::ignore = dg_float_arr[0];
-
-      // Double and array of doubles
-      std::ignore = dg_double;
-      std::ignore = dg_double_arr[0];
-
-      // Bool and array of bools
-      std::ignore = dg_bool;
-      std::ignore = dg_bool_arr[0];
-
-      // Struct and array of structs
-      std::ignore = dg_struct.get().field1;
-      std::ignore = dg_struct_arr[0];
-
-      // Struct with constexpr constructor
-      std::ignore = dg_constexpr_constructor_struct.get().value;
-    });
-  });
+SYCL_EXTERNAL void device_global_const_eval_use() {
+  // Simple device function that just reads the values of the device_globals so
+  // that we can observe the GlobalVariables that are created to represent
+  // them in the IR
+
+  // Int and array of ints
+  std::ignore = dg_int;
+  std::ignore = dg_int_arr[0];
+
+  // Char and array of chars
+  std::ignore = dg_char;
+  std::ignore = dg_char_arr[0];
+
+  // Multidimensional array of integers
+  std::ignore = dg_multi_dim_arr[1][1];
+
+  // Float and array of floats
+  std::ignore = dg_float;
+  std::ignore = dg_float_arr[0];
+
+  // Double and array of doubles
+  std::ignore = dg_double;
+  std::ignore = dg_double_arr[0];
+
+  // Bool and array of bools
+  std::ignore = dg_bool;
+  std::ignore = dg_bool_arr[0];
+
+  // Struct and array of structs
+  std::ignore = dg_struct.get().field1;
+  std::ignore = dg_struct_arr[0];
+
+  // Struct with constexpr constructor
+  std::ignore = dg_constexpr_constructor_struct.get().value;
 }
diff --git a/sycl/test/check_device_code/device_has_func.cpp b/sycl/test/check_device_code/device_has_func.cpp
new file mode 100644
index 0000000000000..00f7ad6d042bb
--- /dev/null
+++ b/sycl/test/check_device_code/device_has_func.cpp
@@ -0,0 +1,76 @@
+// RUN: %clangxx -fsycl -Xclang -fsycl-is-device -fsycl-device-only -Xclang -fno-sycl-early-optimizations -S -emit-llvm %s -o %t.ll
+// RUN: FileCheck %s --input-file %t.ll --check-prefix=CHECK-ASPECTS
+// RUN: FileCheck %s --input-file %t.ll --check-prefix=CHECK-SRCLOC
+
+// Tests for IR of device_has(aspect, ...) attribute and
+// !sycl_used_aspects metadata.
+// We run FileCheck twice to break the metadata order dependency, since the
+// compiler gives no guarantee about metadata order.
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+// CHECK-ASPECTS: define dso_local spir_func void @{{.*}}kernel_name_1{{.*}} !sycl_declared_aspects ![[ASPECTS1:[0-9]+]] {{.*}}
+// CHECK-SRCLOC: define dso_local spir_func void @{{.*}}kernel_name_1{{.*}} !srcloc ![[SRCLOC1:[0-9]+]] {{.*}}
+
+// CHECK-ASPECTS: define {{.*}}spir_func void @{{.*}}func1{{.*}} !sycl_declared_aspects ![[ASPECTS1]]
+// CHECK-ASPECTS-SAME: !sycl_used_aspects ![[ASPECTS1]]
+// CHECK-SRCLOC: define {{.*}}spir_func void @{{.*}}func1{{.*}} !srcloc ![[SRCLOC2:[0-9]+]]
+[[sycl::device_has(sycl::aspect::cpu)]] void func1() {}
+
+// CHECK-ASPECTS: define {{.*}}spir_func void @{{.*}}func2{{.*}} !sycl_declared_aspects ![[ASPECTS2:[0-9]+]]
+// CHECK-ASPECTS-SAME: !sycl_used_aspects ![[ASPECTS2]]
+// CHECK-SRCLOC: define {{.*}}spir_func void @{{.*}}func2{{.*}} !srcloc ![[SRCLOC3:[0-9]+]]
+[[sycl::device_has(sycl::aspect::fp16, sycl::aspect::gpu)]] void func2() {}
+
+// CHECK-ASPECTS: define {{.*}}spir_func void @{{.*}}func3{{.*}} !sycl_declared_aspects ![[EMPTYASPECTS:[0-9]+]]
+// CHECK-SRCLOC: define {{.*}}spir_func void @{{.*}}func3{{.*}} !srcloc ![[SRCLOC4:[0-9]+]]
+[[sycl::device_has()]] void func3() {}
+
+// CHECK-ASPECTS: define {{.*}}spir_func void @{{.*}}func4{{.*}} !sycl_declared_aspects ![[ASPECTS3:[0-9]+]]
+// CHECK-ASPECTS-SAME: !sycl_used_aspects ![[ASPECTS3]]
+// CHECK-SRCLOC: define {{.*}}spir_func void @{{.*}}func4{{.*}} !srcloc ![[SRCLOC5:[0-9]+]]
+template <sycl::aspect Aspect> [[sycl::device_has(Aspect)]] void func4() {}
+
+// CHECK-ASPECTS: define {{.*}}spir_func void @{{.*}}func5{{.*}} !sycl_declared_aspects ![[ASPECTS1]]
+// CHECK-ASPECTS-SAME: !sycl_used_aspects ![[ASPECTS1]]
+// CHECK-SRCLOC: define {{.*}}spir_func void @{{.*}}func5{{.*}} !srcloc ![[SRCLOC6:[0-9]+]]
+[[sycl::device_has(sycl::aspect::cpu)]] void func5();
+void func5() {}
+
+constexpr sycl::aspect getAspect() { return sycl::aspect::cpu; }
+// CHECK-ASPECTS: define {{.*}}spir_func void @{{.*}}func6{{.*}} !sycl_declared_aspects ![[ASPECTS1]]
+// CHECK-ASPECTS-SAME: !sycl_used_aspects ![[ASPECTS1]]
+// CHECK-SRCLOC: define {{.*}}spir_func void @{{.*}}func6{{.*}} !srcloc ![[SRCLOC7:[0-9]+]]
+[[sycl::device_has(getAspect())]] void func6() {}
+
+SYCL_EXTERNAL [[sycl::device_has(sycl::aspect::cpu)]] void kernel_name_1() {
+  func1();
+  func2();
+  func3();
+  func4<sycl::aspect::host>();
+  func5();
+  func6();
+}
+
+// CHECK-ASPECTS: define dso_local spir_func void @{{.*}}kernel_name_2{{.*}} !sycl_declared_aspects ![[ASPECTS4:[0-9]+]]
+// CHECK-SRCLOC: define dso_local spir_func void @{{.*}}kernel_name_2{{.*}} !srcloc ![[SRCLOC8:[0-9]+]] {{.*}}
+SYCL_EXTERNAL [[sycl::device_has(sycl::aspect::gpu)]] void kernel_name_2() {}
+
+// CHECK-ASPECTS-DAG: [[ASPECTS1]] = !{![[ASPECTCPU:[0-9]+]]}
+// CHECK-ASPECTS-DAG: [[ASPECTCPU]] = !{!"cpu", i32 1}
+// CHECK-SRCLOC-DAG: [[SRCLOC1]] = !{i32 {{[0-9]+}}}
+// CHECK-ASPECTS-DAG: [[EMPTYASPECTS]] = !{}
+// CHECK-SRCLOC-DAG: [[SRCLOC2]] = !{i32 {{[0-9]+}}}
+// CHECK-ASPECTS-DAG: [[ASPECTS2]] = !{![[ASPECTFP16:[0-9]+]], ![[ASPECTGPU:[0-9]+]]}
+// CHECK-ASPECTS-DAG: [[ASPECTFP16]] = !{!"fp16", i32 5}
+// CHECK-ASPECTS-DAG: [[ASPECTGPU]] = !{!"gpu", i32 2}
+// CHECK-SRCLOC-DAG: [[SRCLOC3]] = !{i32 {{[0-9]+}}}
+// CHECK-SRCLOC-DAG: [[SRCLOC4]] = !{i32 {{[0-9]+}}}
+// CHECK-ASPECTS-DAG: [[ASPECTS3]] = !{![[ASPECTHOST:[0-9]+]]}
+// CHECK-ASPECTS-DAG: [[ASPECTHOST]] = !{!"host", i32 0}
+// CHECK-SRCLOC-DAG: [[SRCLOC5]] = !{i32 {{[0-9]+}}}
+// CHECK-SRCLOC-DAG: [[SRCLOC6]] = !{i32 {{[0-9]+}}}
+// CHECK-SRCLOC-DAG: [[SRCLOC7]] = !{i32 {{[0-9]+}}}
+// CHECK-ASPECTS-DAG: [[ASPECTS4]] = !{![[ASPECTGPU]]}
+// CHECK-SRCLOC-DAG: [[SRCLOC8]] = !{i32 {{[0-9]+}}}
diff --git a/sycl/test/check_device_code/device_has.cpp b/sycl/test/check_device_code/device_has_kernel.cpp
similarity index 100%
rename from sycl/test/check_device_code/device_has.cpp
rename to sycl/test/check_device_code/device_has_kernel.cpp
diff --git a/sycl/test/esimd/NBarrierAttr.cpp b/sycl/test/check_device_code/esimd/NBarrierAttr.cpp
similarity index 100%
rename from sycl/test/esimd/NBarrierAttr.cpp
rename to sycl/test/check_device_code/esimd/NBarrierAttr.cpp
diff --git a/sycl/test/esimd/dae.cpp b/sycl/test/check_device_code/esimd/dae.cpp
similarity index 100%
rename from sycl/test/esimd/dae.cpp
rename to sycl/test/check_device_code/esimd/dae.cpp
diff --git a/sycl/test/esimd/dpas.cpp b/sycl/test/check_device_code/esimd/dpas.cpp
similarity index 88%
rename from sycl/test/esimd/dpas.cpp
rename to sycl/test/check_device_code/esimd/dpas.cpp
index ecb994513e46b..0892200641bb6 100644
--- a/sycl/test/esimd/dpas.cpp
+++ b/sycl/test/check_device_code/esimd/dpas.cpp
@@ -56,14 +56,14 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void xmx_func() {
   // CHECK-LABEL: define dso_local spir_func void @_Z8xmx_funcv()
 
   { // ======= DPAS BF16 =======================================================
-    simd<bfloat16, M_one *N_pvc> R_bf = 0;
-    simd<float, M_one *N_pvc> R_f = 0;
+    simd<bfloat16, M_one * N_pvc> R_bf = 0;
+    simd<float, M_one * N_pvc> R_f = 0;
 
-    simd<bfloat16, M_one *N_pvc> C_bf = 0;
-    simd<float, M_one *N_pvc> C_f = 0;
+    simd<bfloat16, M_one * N_pvc> C_bf = 0;
+    simd<float, M_one * N_pvc> C_f = 0;
 
-    simd<bfloat16, K_bf16 *N_pvc> B_bf = 0;
-    simd<bfloat16, M_one *K_bf16> A_bf = 0;
+    simd<bfloat16, K_bf16 * N_pvc> B_bf = 0;
+    simd<bfloat16, M_one * K_bf16> A_bf = 0;
 
     R_f = xmx::dpas<8, 1, float>(C_f, B_bf, A_bf);
     zoo(R_f);
@@ -91,14 +91,14 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void xmx_func() {
   }
 
   { // ======= DPAS FP16 =======================================================
-    simd<half, M_one *N_pvc> R_hf = 0;
-    simd<float, M_one *N_pvc> R_f = 0;
+    simd<half, M_one * N_pvc> R_hf = 0;
+    simd<float, M_one * N_pvc> R_f = 0;
 
-    simd<half, M_one *N_pvc> C_hf = 0;
-    simd<float, M_one *N_pvc> C_f = 0;
+    simd<half, M_one * N_pvc> C_hf = 0;
+    simd<float, M_one * N_pvc> C_f = 0;
 
-    simd<half, K_half *N_pvc> B_hf = 0;
-    simd<half, M_one *K_half> A_hf = 0;
+    simd<half, K_half * N_pvc> B_hf = 0;
+    simd<half, M_one * K_half> A_hf = 0;
 
     // ------------------- FP16: WITH ACC OPERAND -----------------------
     R_f = xmx::dpas<8, 1, float>(C_f, B_hf, A_hf);
@@ -128,10 +128,10 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void xmx_func() {
   }
 
   { // ======= DPAS 8-BIT x 2-BIT INT ==========================================
-    simd<int, M_one *N_pvc> R_d = 0;
-    simd<int, M_one *N_pvc> C_d = 0;
-    simd<int, K_int8x2 *N_pvc / 16> B_int2 = 0; // 16 2-bit integers per int32
-    simd<signed char, M_one *K_int8x2> A_int8 = 0;
+    simd<int, M_one * N_pvc> R_d = 0;
+    simd<int, M_one * N_pvc> C_d = 0;
+    simd<int, K_int8x2 * N_pvc / 16> B_int2 = 0; // 16 2-bit integers per int32
+    simd<signed char, M_one * K_int8x2> A_int8 = 0;
 
     // ------------ DPAS s8 x s2: WITH THE ACCUMULATOR OPERAND -----------------
     R_d = xmx::dpas<8, 1, int, int, int, signed char, s2, s8>(C_d, B_int2,
@@ -146,11 +146,11 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void xmx_func() {
   }
 
   { // ======= DPASW BF16 ======================================================
-    simd<float, M_one *N_dg2> R_f = 0;
-    simd<float, M_one *N_dg2> C_f = 0;
+    simd<float, M_one * N_dg2> R_f = 0;
+    simd<float, M_one * N_dg2> C_f = 0;
 
-    simd<bfloat16, K_bf16 *N_dg2> B_bf = 0;
-    simd<bfloat16, M_one *K_bf16 / 2> A_bf = 0;
+    simd<bfloat16, K_bf16 * N_dg2> B_bf = 0;
+    simd<bfloat16, M_one * K_bf16 / 2> A_bf = 0;
 
     // ------------ DPASW BF16: WITH THE ACCUMULATOR OPERAND -------------------
     R_f = xmx::dpasw<8, 1, float>(C_f, B_bf, A_bf);
@@ -164,8 +164,8 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void xmx_func() {
   }
 
   { // ======= DPASW FP16 ======================================================
-    simd<float, M_one *N_dg2> R_f = 0;
-    simd<float, M_one *N_dg2> C_f = 0;
+    simd<float, M_one * N_dg2> R_f = 0;
+    simd<float, M_one * N_dg2> C_f = 0;
 
     simd<half, K_half * N_dg2> B_hf = 0;
     simd<half, M_one * K_half / 2> A_hf = 0;
@@ -182,12 +182,12 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void xmx_func() {
   }
 
   { // ======= DPAS TFLOAT32 ===================================================
-    simd<float, M_one *N_pvc> R_f = 0;
-    simd<float, M_one *N_pvc> C_f = 0;
+    simd<float, M_one * N_pvc> R_f = 0;
+    simd<float, M_one * N_pvc> C_f = 0;
 
-    simd<sycl::ext::intel::experimental::esimd::tfloat32, K_tf32 *N_pvc> B_tf =
+    simd<sycl::ext::intel::experimental::esimd::tfloat32, K_tf32 * N_pvc> B_tf =
         0;
-    simd<sycl::ext::intel::experimental::esimd::tfloat32, M_one *K_tf32> A_tf =
+    simd<sycl::ext::intel::experimental::esimd::tfloat32, M_one * K_tf32> A_tf =
         0;
 
     // ------------------- TFLOAT32: WITH ACC OPERAND --------------------------
diff --git a/sycl/test/esimd/fp16_converts.cpp b/sycl/test/check_device_code/esimd/fp16_converts.cpp
similarity index 100%
rename from sycl/test/esimd/fp16_converts.cpp
rename to sycl/test/check_device_code/esimd/fp16_converts.cpp
diff --git a/sycl/test/esimd/genx_func_attr.cpp b/sycl/test/check_device_code/esimd/genx_func_attr.cpp
similarity index 100%
rename from sycl/test/esimd/genx_func_attr.cpp
rename to sycl/test/check_device_code/esimd/genx_func_attr.cpp
diff --git a/sycl/test/esimd/glob.cpp b/sycl/test/check_device_code/esimd/glob.cpp
similarity index 100%
rename from sycl/test/esimd/glob.cpp
rename to sycl/test/check_device_code/esimd/glob.cpp
diff --git a/sycl/test/esimd/intrins_trans.cpp b/sycl/test/check_device_code/esimd/intrins_trans.cpp
similarity index 100%
rename from sycl/test/esimd/intrins_trans.cpp
rename to sycl/test/check_device_code/esimd/intrins_trans.cpp
diff --git a/sycl/test/esimd/lane_id.cpp b/sycl/test/check_device_code/esimd/lane_id.cpp
similarity index 88%
rename from sycl/test/esimd/lane_id.cpp
rename to sycl/test/check_device_code/esimd/lane_id.cpp
index 1c49b0baa2307..19c75e2263d56 100644
--- a/sycl/test/esimd/lane_id.cpp
+++ b/sycl/test/check_device_code/esimd/lane_id.cpp
@@ -12,8 +12,7 @@ using namespace sycl::ext::intel::esimd;
 // Wrapper for designating a scalar region of code that will be
 // vectorized by the backend compiler.
 #define SIMT_BEGIN(N, lane)                                                    \
-  [&]() SYCL_ESIMD_FUNCTION ESIMD_NOINLINE                                     \
-      [[intel::sycl_esimd_vectorize(N)]] {                                     \
+  [&]() SYCL_ESIMD_FUNCTION ESIMD_NOINLINE [[intel::sycl_esimd_vectorize(N)]] {                                     \
     int lane = __esimd_lane_id();
 #define SIMT_END                                                               \
   }                                                                            \
diff --git a/sycl/test/check_device_code/esimd/lit.local.cfg b/sycl/test/check_device_code/esimd/lit.local.cfg
new file mode 100644
index 0000000000000..ba736d6369c39
--- /dev/null
+++ b/sycl/test/check_device_code/esimd/lit.local.cfg
@@ -0,0 +1,3 @@
+import platform
+
+config.substitutions.append(("%clang_O0", "-O0 -mllvm -esimd-allow-optnone-noinline"))
\ No newline at end of file
diff --git a/sycl/test/esimd/lower-external-funcs.cpp b/sycl/test/check_device_code/esimd/lower-external-funcs.cpp
similarity index 100%
rename from sycl/test/esimd/lower-external-funcs.cpp
rename to sycl/test/check_device_code/esimd/lower-external-funcs.cpp
diff --git a/sycl/test/esimd/lsc.cpp b/sycl/test/check_device_code/esimd/lsc.cpp
similarity index 100%
rename from sycl/test/esimd/lsc.cpp
rename to sycl/test/check_device_code/esimd/lsc.cpp
diff --git a/sycl/test/esimd/math_impl.cpp b/sycl/test/check_device_code/esimd/math_impl.cpp
similarity index 100%
rename from sycl/test/esimd/math_impl.cpp
rename to sycl/test/check_device_code/esimd/math_impl.cpp
diff --git a/sycl/test/esimd/memory_properties_atomic_update.cpp b/sycl/test/check_device_code/esimd/memory_properties_atomic_update.cpp
similarity index 65%
rename from sycl/test/esimd/memory_properties_atomic_update.cpp
rename to sycl/test/check_device_code/esimd/memory_properties_atomic_update.cpp
index 594f52303e1ff..403448e3677f0 100644
--- a/sycl/test/esimd/memory_properties_atomic_update.cpp
+++ b/sycl/test/check_device_code/esimd/memory_properties_atomic_update.cpp
@@ -87,17 +87,22 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     auto res_atomic_2 =
         atomic_update<atomic_op::inc, int>(ptr, offsets, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}} i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}} i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef)
     auto res_atomic_3 =
         atomic_update<atomic_op::inc, int>(ptr, offsets_view, pred, props_a);
+    res_atomic_3 =
+        atomic_update<atomic_op::inc>(ptr, offsets_view, pred, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}} i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}} i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef)
     auto res_atomic_4 =
         atomic_update<atomic_op::inc, int, VL>(ptr, offsets_view, props_a);
+    res_atomic_4 = atomic_update<atomic_op::inc>(ptr, offsets_view, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}} i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}} i8 8, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> undef, <4 x i32> undef, i32 0, <4 x i32> undef)
     auto res_atomic_5 = atomic_update<atomic_op::inc, int, VL>(
         ptr, offsets_view.select<VL, 1>(), props_a);
+    res_atomic_5 = atomic_update<atomic_op::inc>(
+        ptr, offsets_view.select<VL, 1>(), props_a);
 
     // atomic_upate without cache hints:
     // CHECK: call <4 x i32> @llvm.genx.svm.atomic.inc.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> undef)
@@ -212,41 +217,59 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     auto res_atomic_1 =
         atomic_update<atomic_op::add, int>(ptr, offsets, add, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
     auto res_atomic_2 = atomic_update<atomic_op::add, int, VL>(
         ptr, offsets, add_view, pred, props_a);
+    res_atomic_2 =
+        atomic_update<atomic_op::add>(ptr, offsets, add_view, pred, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
     auto res_atomic_3 =
         atomic_update<atomic_op::add, int, VL>(ptr, offsets, add_view, props_a);
+    res_atomic_3 =
+        atomic_update<atomic_op::add>(ptr, offsets, add_view, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
     res_atomic_3 = atomic_update<atomic_op::add, int, VL>(
         ptr, offsets, add_view.select<VL, 1>(), props_a);
+    res_atomic_3 = atomic_update<atomic_op::add>(
+        ptr, offsets, add_view.select<VL, 1>(), props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
     auto res_atomic_4 = atomic_update<atomic_op::add, int, VL>(
         ptr, offsets_view, add, pred, props_a);
+    res_atomic_4 =
+        atomic_update<atomic_op::add>(ptr, offsets_view, add, pred, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
     auto res_atomic_5 =
         atomic_update<atomic_op::add, int, VL>(ptr, offsets_view, add, props_a);
+    res_atomic_5 =
+        atomic_update<atomic_op::add>(ptr, offsets_view, add, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
     res_atomic_5 = atomic_update<atomic_op::add, int, VL>(
         ptr, offsets_view.select<VL, 1>(), add, props_a);
+    res_atomic_5 = atomic_update<atomic_op::add>(
+        ptr, offsets_view.select<VL, 1>(), add, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
     auto res_atomic_6 = atomic_update<atomic_op::add, int, VL>(
         ptr, offsets_view, add_view, pred, props_a);
+    res_atomic_6 = atomic_update<atomic_op::add>(ptr, offsets_view, add_view,
+                                                 pred, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
     auto res_atomic_7 = atomic_update<atomic_op::add, int, VL>(
         ptr, offsets_view, add_view, props_a);
+    res_atomic_7 =
+        atomic_update<atomic_op::add>(ptr, offsets_view, add_view, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
     res_atomic_7 = atomic_update<atomic_op::add, int, VL>(
         ptr, offsets_view.select<VL, 1>(), add_view.select<VL, 1>(), props_a);
+    res_atomic_7 = atomic_update<atomic_op::add>(
+        ptr, offsets_view.select<VL, 1>(), add_view.select<VL, 1>(), props_a);
 
     // atomic_update without cache hints:
     // CHECK: call <4 x i32> @llvm.genx.svm.atomic.add.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> undef)
@@ -267,8 +290,8 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
 
     // Accessors
 
-    // CHECK-STATEFUL-COUNT-14:  call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
-    // CHECK-STATELESS-COUNT-14: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
+    // CHECK-STATEFUL-COUNT-26:  call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 {{[^)]+}}, <4 x i32> undef)
+    // CHECK-STATELESS-COUNT-26: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 12, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef, i32 0, <4 x i32> undef)
     auto res_atomic_9 =
         atomic_update<atomic_op::add, int>(acc, offsets, add, pred, props_a);
 
@@ -311,6 +334,42 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     res_atomic_16 = atomic_update<atomic_op::add, int, VL>(
         acc, offsets_view.select<VL, 1>(), add_view.select<VL, 1>(), props_a);
 
+    res_atomic_11 =
+        atomic_update<atomic_op::add>(acc, offsets, add_view, pred, props_a);
+
+    res_atomic_11 = atomic_update<atomic_op::add>(
+        acc, offsets, add_view.select<VL, 1>(), pred, props_a);
+
+    res_atomic_12 =
+        atomic_update<atomic_op::add>(acc, offsets, add_view, props_a);
+
+    res_atomic_12 = atomic_update<atomic_op::add>(
+        acc, offsets, add_view.select<VL, 1>(), props_a);
+
+    res_atomic_13 =
+        atomic_update<atomic_op::add>(acc, offsets_view, add, pred, props_a);
+
+    res_atomic_13 = atomic_update<atomic_op::add>(
+        acc, offsets_view.select<VL, 1>(), add, pred, props_a);
+
+    res_atomic_14 =
+        atomic_update<atomic_op::add>(acc, offsets_view, add, props_a);
+    res_atomic_14 = atomic_update<atomic_op::add>(
+        acc, offsets_view.select<VL, 1>(), add, props_a);
+
+    res_atomic_15 = atomic_update<atomic_op::add>(acc, offsets_view, add_view,
+                                                  pred, props_a);
+
+    res_atomic_15 =
+        atomic_update<atomic_op::add>(acc, offsets_view.select<VL, 1>(),
+                                      add_view.select<VL, 1>(), pred, props_a);
+
+    res_atomic_16 =
+        atomic_update<atomic_op::add>(acc, offsets_view, add_view, props_a);
+
+    res_atomic_16 = atomic_update<atomic_op::add>(
+        acc, offsets_view.select<VL, 1>(), add_view.select<VL, 1>(), props_a);
+
     // atomic_update without cache hints:
     // CHECK-STATEFUL:  call <4 x i32> @llvm.genx.dword.atomic.sub.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
     // CHECK-STATELESS: call <4 x i32> @llvm.genx.svm.atomic.sub.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
@@ -345,67 +404,97 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     auto res_atomic_2 = atomic_update<atomic_op::cmpxchg, int>(
         ptr, offsets, swap, compare, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_3 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets, swap, compare_view, pred, props_a);
+    res_atomic_3 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets, swap, compare_view, pred, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     res_atomic_3 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets, swap, compare_view.select<VL, 1>(), pred, props_a);
+    res_atomic_3 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets, swap, compare_view.select<VL, 1>(), pred, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_4 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets, swap, compare_view, props_a);
+    res_atomic_4 = atomic_update<atomic_op::cmpxchg>(ptr, offsets, swap,
+                                                     compare_view, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_5 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets, swap_view, compare, pred, props_a);
+    res_atomic_5 = atomic_update<atomic_op::cmpxchg>(ptr, offsets, swap_view,
+                                                     compare, pred, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_6 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets, swap_view, compare, props_a);
+    res_atomic_6 = atomic_update<atomic_op::cmpxchg>(ptr, offsets, swap_view,
+                                                     compare, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_7 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets, swap_view, compare_view, pred, props_a);
+    res_atomic_7 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets, swap_view, compare_view, pred, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_8 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets, swap_view, compare_view, props_a);
+    res_atomic_8 = atomic_update<atomic_op::cmpxchg>(ptr, offsets, swap_view,
+                                                     compare_view, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_9 = atomic_update<atomic_op::cmpxchg, int>(
         ptr, offsets_view, swap, compare, pred, props_a);
+    res_atomic_9 = atomic_update<atomic_op::cmpxchg>(ptr, offsets_view, swap,
+                                                     compare, pred, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_10 = atomic_update<atomic_op::cmpxchg, int>(
         ptr, offsets_view, swap, compare, props_a);
+    res_atomic_10 = atomic_update<atomic_op::cmpxchg>(ptr, offsets_view, swap,
+                                                      compare, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_11 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets_view, swap, compare_view, pred, props_a);
+    res_atomic_11 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view, swap, compare_view, pred, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_12 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets_view, swap, compare_view, props_a);
+    res_atomic_12 = atomic_update<atomic_op::cmpxchg>(ptr, offsets_view, swap,
+                                                      compare_view, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_13 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets_view, swap_view, compare, pred, props_a);
+    res_atomic_13 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view, swap_view, compare, pred, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_14 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets_view, swap_view, compare, props_a);
+    res_atomic_14 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view, swap_view, compare, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_15 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets_view, swap_view, compare_view, pred, props_a);
+    res_atomic_15 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view, swap_view, compare_view, pred, props_a);
 
-    // CHECK: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-2: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_16 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets_view, swap_view, compare_view, props_a);
+    res_atomic_16 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view, swap_view, compare_view, props_a);
 
-    // CHECK-COUNT-13: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-COUNT-26: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     res_atomic_4 = atomic_update<atomic_op::cmpxchg, int, VL>(
         ptr, offsets, swap, compare_view.select<VL, 1>(), props_a);
 
@@ -453,6 +542,53 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
         ptr, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
         compare_view.select<VL, 1>(), props_a);
 
+    res_atomic_4 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets, swap, compare_view.select<VL, 1>(), props_a);
+
+    res_atomic_5 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets, swap_view.select<VL, 1>(), compare, pred, props_a);
+
+    res_atomic_6 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets, swap_view.select<VL, 1>(), compare, props_a);
+
+    res_atomic_7 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets, swap_view.select<VL, 1>(), compare_view.select<VL, 1>(),
+        pred, props_a);
+
+    res_atomic_8 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets, swap_view.select<VL, 1>(), compare_view.select<VL, 1>(),
+        props_a);
+
+    res_atomic_9 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view.select<VL, 1>(), swap, compare, pred, props_a);
+
+    res_atomic_10 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view.select<VL, 1>(), swap, compare, props_a);
+
+    res_atomic_11 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view.select<VL, 1>(), swap, compare_view.select<VL, 1>(),
+        pred, props_a);
+
+    res_atomic_12 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view.select<VL, 1>(), swap, compare_view.select<VL, 1>(),
+        props_a);
+
+    res_atomic_13 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(), compare,
+        pred, props_a);
+
+    res_atomic_14 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(), compare,
+        props_a);
+
+    res_atomic_15 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
+        compare_view.select<VL, 1>(), pred, props_a);
+
+    res_atomic_16 = atomic_update<atomic_op::cmpxchg>(
+        ptr, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
+        compare_view.select<VL, 1>(), props_a);
+
     {
       constexpr int VL = 8;
       simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int);
@@ -489,8 +625,8 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
 
     // Accessors
 
-    // CHECK-STATEFUL-COUNT-30:  call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef)
-    // CHECK-STATELESS-COUNT-30: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
+    // CHECK-STATEFUL-COUNT-58:  call <4 x i32> @llvm.genx.lsc.xatomic.bti.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> undef)
+    // CHECK-STATELESS-COUNT-58: call <4 x i32> @llvm.genx.lsc.xatomic.stateless.v4i32.v4i1.v4i64(<4 x i1> {{[^)]+}}, i8 18, i8 1, i8 3, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <4 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0, <4 x i32> undef)
     auto res_atomic_17 = atomic_update<atomic_op::cmpxchg>(
         acc, offsets, swap, compare, pred, props_a);
 
@@ -589,6 +725,98 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
         acc, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
         compare_view.select<VL, 1>(), props_a);
 
+    res_atomic_19 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets, swap, compare_view, pred, props_a);
+
+    res_atomic_20 = atomic_update<atomic_op::cmpxchg>(acc, offsets, swap,
+                                                      compare_view, props_a);
+
+    res_atomic_21 = atomic_update<atomic_op::cmpxchg>(acc, offsets, swap_view,
+                                                      compare, pred, props_a);
+
+    res_atomic_22 = atomic_update<atomic_op::cmpxchg>(acc, offsets, swap_view,
+                                                      compare, props_a);
+
+    res_atomic_23 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets, swap_view, compare_view, pred, props_a);
+
+    res_atomic_24 = atomic_update<atomic_op::cmpxchg>(acc, offsets, swap_view,
+                                                      compare_view, props_a);
+
+    res_atomic_25 = atomic_update<atomic_op::cmpxchg>(acc, offsets_view, swap,
+                                                      compare, pred, props_a);
+
+    res_atomic_26 = atomic_update<atomic_op::cmpxchg>(acc, offsets_view, swap,
+                                                      compare, props_a);
+
+    res_atomic_27 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view, swap, compare_view, pred, props_a);
+
+    res_atomic_28 = atomic_update<atomic_op::cmpxchg>(acc, offsets_view, swap,
+                                                      compare_view, props_a);
+
+    res_atomic_29 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view, swap_view, compare, pred, props_a);
+
+    res_atomic_30 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view, swap_view, compare, props_a);
+
+    res_atomic_31 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view, swap_view, compare_view, pred, props_a);
+
+    res_atomic_32 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view, swap_view, compare_view, props_a);
+
+    res_atomic_19 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets, swap, compare_view.select<VL, 1>(), pred, props_a);
+
+    res_atomic_20 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets, swap, compare_view.select<VL, 1>(), props_a);
+
+    res_atomic_21 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets, swap_view.select<VL, 1>(), compare, pred, props_a);
+
+    res_atomic_22 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets, swap_view.select<VL, 1>(), compare, props_a);
+
+    res_atomic_23 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets, swap_view.select<VL, 1>(), compare_view.select<VL, 1>(),
+        pred, props_a);
+
+    res_atomic_24 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets, swap_view.select<VL, 1>(), compare_view.select<VL, 1>(),
+        props_a);
+
+    res_atomic_25 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view.select<VL, 1>(), swap, compare, pred, props_a);
+
+    res_atomic_26 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view.select<VL, 1>(), swap, compare, props_a);
+
+    res_atomic_27 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view.select<VL, 1>(), swap, compare_view.select<VL, 1>(),
+        pred, props_a);
+
+    res_atomic_28 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view.select<VL, 1>(), swap, compare_view.select<VL, 1>(),
+        props_a);
+
+    res_atomic_29 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(), compare,
+        pred, props_a);
+
+    res_atomic_30 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(), compare,
+        props_a);
+
+    res_atomic_31 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
+        compare_view.select<VL, 1>(), pred, props_a);
+
+    res_atomic_32 = atomic_update<atomic_op::cmpxchg>(
+        acc, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
+        compare_view.select<VL, 1>(), props_a);
+
     {
       constexpr int VL = 8;
       simd<uint32_t, VL> offsets = simd<uint32_t, VL>(1) * sizeof(int);
@@ -666,7 +894,7 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
 
   // Test slm_atomic_update with one operand.
   {
-    // CHECK-COUNT-14: call <4 x i32> @llvm.genx.dword.atomic.add.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
+    // CHECK-COUNT-26: call <4 x i32> @llvm.genx.dword.atomic.add.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
     {
       auto res_slm_atomic_1 =
           slm_atomic_update<atomic_op::add>(offsets, add, pred);
@@ -695,6 +923,28 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
           offsets_view.select<VL, 1>(), add_view.select<VL, 1>(), pred);
       res_slm_atomic_8 = slm_atomic_update<atomic_op::add, int, VL>(
           offsets_view.select<VL, 1>(), add_view.select<VL, 1>());
+      res_slm_atomic_3 =
+          slm_atomic_update<atomic_op::add>(offsets, add_view, pred);
+      res_slm_atomic_4 = slm_atomic_update<atomic_op::add>(offsets, add_view);
+      res_slm_atomic_5 =
+          slm_atomic_update<atomic_op::add>(offsets_view, add, pred);
+      res_slm_atomic_6 = slm_atomic_update<atomic_op::add>(offsets_view, add);
+      res_slm_atomic_7 =
+          slm_atomic_update<atomic_op::add>(offsets_view, add_view, pred);
+      res_slm_atomic_8 =
+          slm_atomic_update<atomic_op::add>(offsets_view, add_view);
+      res_slm_atomic_3 = slm_atomic_update<atomic_op::add>(
+          offsets, add_view.select<VL, 1>(), pred);
+      res_slm_atomic_4 =
+          slm_atomic_update<atomic_op::add>(offsets, add_view.select<VL, 1>());
+      res_slm_atomic_5 = slm_atomic_update<atomic_op::add>(
+          offsets_view.select<VL, 1>(), add, pred);
+      res_slm_atomic_6 =
+          slm_atomic_update<atomic_op::add>(offsets_view.select<VL, 1>(), add);
+      res_slm_atomic_7 = slm_atomic_update<atomic_op::add>(
+          offsets_view.select<VL, 1>(), add_view.select<VL, 1>(), pred);
+      res_slm_atomic_8 = slm_atomic_update<atomic_op::add>(
+          offsets_view.select<VL, 1>(), add_view.select<VL, 1>());
     }
 
     // Expect LSC for short.
@@ -733,7 +983,7 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
 
   // Test slm_atomic_update with two operands.
   {
-    // CHECK-COUNT-30: call <4 x i32> @llvm.genx.dword.atomic.cmpxchg.v4i32.v4i1(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
+    // CHECK-COUNT-58: call <4 x i32> @llvm.genx.dword.atomic.cmpxchg.v4i32.v4i1(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
     auto res_atomic_1 =
         slm_atomic_update<atomic_op::cmpxchg>(offsets, swap, compare, pred);
     auto res_atomic_2 =
@@ -810,6 +1060,77 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
         offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
         compare_view.select<VL, 1>());
 
+    res_atomic_3 = slm_atomic_update<atomic_op::cmpxchg>(offsets, swap,
+                                                         compare_view, pred);
+    res_atomic_4 =
+        slm_atomic_update<atomic_op::cmpxchg>(offsets, swap, compare_view);
+
+    res_atomic_5 = slm_atomic_update<atomic_op::cmpxchg>(offsets, swap_view,
+                                                         compare, pred);
+    res_atomic_6 =
+        slm_atomic_update<atomic_op::cmpxchg>(offsets, swap_view, compare);
+
+    res_atomic_7 = slm_atomic_update<atomic_op::cmpxchg>(offsets, swap_view,
+                                                         compare_view, pred);
+    res_atomic_8 =
+        slm_atomic_update<atomic_op::cmpxchg>(offsets, swap_view, compare_view);
+
+    res_atomic_9 = slm_atomic_update<atomic_op::cmpxchg>(offsets_view, swap,
+                                                         compare, pred);
+    res_atomic_10 =
+        slm_atomic_update<atomic_op::cmpxchg>(offsets_view, swap, compare);
+
+    res_atomic_11 = slm_atomic_update<atomic_op::cmpxchg>(offsets_view, swap,
+                                                          compare_view, pred);
+    res_atomic_12 =
+        slm_atomic_update<atomic_op::cmpxchg>(offsets_view, swap, compare_view);
+
+    res_atomic_13 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets_view, swap_view, compare, pred);
+    res_atomic_14 =
+        slm_atomic_update<atomic_op::cmpxchg>(offsets_view, swap_view, compare);
+
+    res_atomic_15 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets_view, swap_view, compare_view, pred);
+    res_atomic_16 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets_view, swap_view, compare_view);
+    res_atomic_3 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets, swap, compare_view.select<VL, 1>(), pred);
+    res_atomic_4 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets, swap, compare_view.select<VL, 1>());
+
+    res_atomic_5 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets, swap_view.select<VL, 1>(), compare, pred);
+    res_atomic_6 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets, swap_view.select<VL, 1>(), compare);
+
+    res_atomic_7 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets, swap_view.select<VL, 1>(), compare_view.select<VL, 1>(), pred);
+    res_atomic_8 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets, swap_view.select<VL, 1>(), compare_view.select<VL, 1>());
+
+    res_atomic_9 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets_view.select<VL, 1>(), swap, compare, pred);
+    res_atomic_10 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets_view.select<VL, 1>(), swap, compare);
+
+    res_atomic_11 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets_view.select<VL, 1>(), swap, compare_view.select<VL, 1>(), pred);
+    res_atomic_12 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets_view.select<VL, 1>(), swap, compare_view.select<VL, 1>());
+
+    res_atomic_13 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(), compare, pred);
+    res_atomic_14 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(), compare);
+
+    res_atomic_15 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
+        compare_view.select<VL, 1>(), pred);
+    res_atomic_16 = slm_atomic_update<atomic_op::cmpxchg>(
+        offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
+        compare_view.select<VL, 1>());
+
     // Expect LSC for short.
     {
       constexpr int VL = 16;
@@ -878,7 +1199,7 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   }
   // One operand atomic.
   {
-    // CHECK-COUNT-14: call <4 x i32> @llvm.genx.dword.atomic.add.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
+    // CHECK-COUNT-26: call <4 x i32> @llvm.genx.dword.atomic.add.v4i32.v4i1.v4i32(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
     auto res_slm_atomic_1 =
         atomic_update<atomic_op::add>(local_acc, offsets, add, pred);
     auto res_slm_atomic_2 =
@@ -909,6 +1230,32 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
     res_slm_atomic_8 = atomic_update<atomic_op::add, int, VL>(
         local_acc, offsets_view.select<VL, 1>(), add_view.select<VL, 1>());
 
+    res_slm_atomic_3 =
+        atomic_update<atomic_op::add>(local_acc, offsets, add_view, pred);
+    res_slm_atomic_4 =
+        atomic_update<atomic_op::add>(local_acc, offsets, add_view);
+    res_slm_atomic_5 =
+        atomic_update<atomic_op::add>(local_acc, offsets_view, add, pred);
+    res_slm_atomic_6 =
+        atomic_update<atomic_op::add>(local_acc, offsets_view, add);
+    res_slm_atomic_7 =
+        atomic_update<atomic_op::add>(local_acc, offsets_view, add_view, pred);
+    res_slm_atomic_8 =
+        atomic_update<atomic_op::add>(local_acc, offsets_view, add_view);
+    res_slm_atomic_3 = atomic_update<atomic_op::add>(
+        local_acc, offsets, add_view.select<VL, 1>(), pred);
+    res_slm_atomic_4 = atomic_update<atomic_op::add>(local_acc, offsets,
+                                                     add_view.select<VL, 1>());
+    res_slm_atomic_5 = atomic_update<atomic_op::add>(
+        local_acc, offsets_view.select<VL, 1>(), add, pred);
+    res_slm_atomic_6 = atomic_update<atomic_op::add>(
+        local_acc, offsets_view.select<VL, 1>(), add);
+    res_slm_atomic_7 =
+        atomic_update<atomic_op::add>(local_acc, offsets_view.select<VL, 1>(),
+                                      add_view.select<VL, 1>(), pred);
+    res_slm_atomic_8 = atomic_update<atomic_op::add>(
+        local_acc, offsets_view.select<VL, 1>(), add_view.select<VL, 1>());
+
     // Expect LSC for short.
     {
       using LocalAccType = sycl::local_accessor<int16_t, 1>;
@@ -921,7 +1268,7 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
   }
   // Two operand atomic.
   {
-    // CHECK-COUNT-30: call <4 x i32> @llvm.genx.dword.atomic.cmpxchg.v4i32.v4i1(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
+    // CHECK-COUNT-58: call <4 x i32> @llvm.genx.dword.atomic.cmpxchg.v4i32.v4i1(<4 x i1> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, <4 x i32> undef)
     auto res_slm_atomic_1 = atomic_update<atomic_op::cmpxchg>(
         local_acc, offsets, swap, compare, pred);
     auto res_slm_atomic_2 =
@@ -991,6 +1338,71 @@ test_atomic_update(AccType &acc, LocalAccTypeInt local_acc, float *ptrf,
         local_acc, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
         compare_view.select<VL, 1>());
 
+    res_slm_atomic_3 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets, swap, compare_view, pred);
+    res_slm_atomic_4 = atomic_update<atomic_op::cmpxchg>(local_acc, offsets,
+                                                         swap, compare_view);
+    res_slm_atomic_5 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets, swap_view, compare, pred);
+    res_slm_atomic_6 = atomic_update<atomic_op::cmpxchg>(local_acc, offsets,
+                                                         swap_view, compare);
+    res_slm_atomic_7 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets, swap_view, compare_view, pred);
+    res_slm_atomic_8 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets, swap_view, compare_view);
+    res_slm_atomic_9 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view, swap, compare, pred);
+    res_slm_atomic_10 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view, swap, compare);
+    res_slm_atomic_11 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view, swap, compare_view, pred);
+    res_slm_atomic_12 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view, swap, compare_view);
+    res_slm_atomic_13 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view, swap_view, compare, pred);
+    res_slm_atomic_14 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view, swap_view, compare);
+    res_slm_atomic_15 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view, swap_view, compare_view, pred);
+    res_slm_atomic_16 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view, swap_view, compare_view);
+    res_slm_atomic_3 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets, swap, compare_view.select<VL, 1>(), pred);
+    res_slm_atomic_4 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets, swap, compare_view.select<VL, 1>());
+    res_slm_atomic_5 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets, swap_view.select<VL, 1>(), compare, pred);
+    res_slm_atomic_6 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets, swap_view.select<VL, 1>(), compare);
+    res_slm_atomic_7 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets, swap_view.select<VL, 1>(),
+        compare_view.select<VL, 1>(), pred);
+    res_slm_atomic_8 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets, swap_view.select<VL, 1>(),
+        compare_view.select<VL, 1>());
+    res_slm_atomic_9 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view.select<VL, 1>(), swap, compare, pred);
+    res_slm_atomic_10 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view.select<VL, 1>(), swap, compare);
+    res_slm_atomic_11 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view.select<VL, 1>(), swap,
+        compare_view.select<VL, 1>(), pred);
+    res_slm_atomic_12 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view.select<VL, 1>(), swap,
+        compare_view.select<VL, 1>());
+    res_slm_atomic_13 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
+        compare, pred);
+    res_slm_atomic_14 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
+        compare);
+    res_slm_atomic_15 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
+        compare_view.select<VL, 1>(), pred);
+    res_slm_atomic_16 = atomic_update<atomic_op::cmpxchg>(
+        local_acc, offsets_view.select<VL, 1>(), swap_view.select<VL, 1>(),
+        compare_view.select<VL, 1>());
+
     // Expect LSC for short.
     {
       using LocalAccType = sycl::local_accessor<int16_t, 1>;
diff --git a/sycl/test/esimd/memory_properties_copytocopyfrom.cpp b/sycl/test/check_device_code/esimd/memory_properties_copytocopyfrom.cpp
similarity index 100%
rename from sycl/test/esimd/memory_properties_copytocopyfrom.cpp
rename to sycl/test/check_device_code/esimd/memory_properties_copytocopyfrom.cpp
diff --git a/sycl/test/esimd/memory_properties_gather.cpp b/sycl/test/check_device_code/esimd/memory_properties_gather.cpp
similarity index 100%
rename from sycl/test/esimd/memory_properties_gather.cpp
rename to sycl/test/check_device_code/esimd/memory_properties_gather.cpp
diff --git a/sycl/test/esimd/memory_properties_load_store.cpp b/sycl/test/check_device_code/esimd/memory_properties_load_store.cpp
similarity index 64%
rename from sycl/test/esimd/memory_properties_load_store.cpp
rename to sycl/test/check_device_code/esimd/memory_properties_load_store.cpp
index 4691824920018..c1e465536268f 100644
--- a/sycl/test/esimd/memory_properties_load_store.cpp
+++ b/sycl/test/check_device_code/esimd/memory_properties_load_store.cpp
@@ -85,7 +85,11 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf,
 
   constexpr int N = 4;
   simd<float, N> pass_thru = 1;
+  auto pass_thru_view = pass_thru.select<N, 1>();
+
   simd<int, N> pass_thrui = 1;
+  auto pass_thrui_view = pass_thrui.select<N, 1>();
+
   const int *ptri = reinterpret_cast<const int *>(ptrf);
   const int8_t *ptrb = reinterpret_cast<const int8_t *>(ptrf);
 
@@ -102,8 +106,10 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf,
   simd_mask<1> mask = 1;
   auto d4 = block_load<float, N>(ptrf, mask, props_a);
 
-  // CHECK: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}})
+  // CHECK-COUNT-3: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}})
   auto d5 = block_load<float, N>(ptrf, mask, pass_thru, props_b);
+  d5 = block_load(ptrf, mask, pass_thru, props_b);
+  d5 = block_load(ptrf, mask, pass_thru_view, props_b);
 
   // CHECK: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}})
   auto d6 = block_load<float, N>(ptrf, byte_offset32, mask, props_a);
@@ -111,8 +117,10 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf,
   // CHECK: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}})
   auto d7 = block_load<int, N>(ptri, byte_offset64, mask, props_b);
 
-  // CHECK: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}})
+  // CHECK-COUNT-3: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}})
   auto d8 = block_load<int, N>(ptri, byte_offset32, mask, pass_thrui, props_a);
+  d8 = block_load(ptri, byte_offset32, mask, pass_thrui, props_a);
+  d8 = block_load(ptri, byte_offset32, mask, pass_thrui_view, props_a);
 
   // CHECK: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}})
   auto d9 = block_load<int, N>(ptri, byte_offset64, mask, pass_thru, props_b);
@@ -149,9 +157,10 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf,
   // CHECK-STATELESS: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}})
   auto a4 = block_load<float, N>(acc, mask, props_a);
 
-  // CHECK-STATEFUL:  call <4 x float> @llvm.genx.lsc.load.merge.bti.v4f32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x float> {{[^)]+}})
-  // CHECK-STATELESS: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}})
+  // CHECK-STATEFUL-COUNT-2:  call <4 x float> @llvm.genx.lsc.load.merge.bti.v4f32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x float> {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}})
   auto a5 = block_load<float, N>(acc, mask, pass_thru, props_b);
+  a5 = block_load<float, N>(acc, mask, pass_thru_view, props_b);
 
   // CHECK-STATEFUL:  call <4 x float> @llvm.genx.lsc.load.merge.bti.v4f32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x float> {{[^)]+}})
   // CHECK-STATELESS: call <4 x float> @llvm.genx.lsc.load.merge.stateless.v4f32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x float> {{[^)]+}})
@@ -161,13 +170,15 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf,
   // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}})
   auto a7 = block_load<int, N>(acc, byte_offset64, mask, props_b);
 
-  // CHECK-STATEFUL:  call <4 x i32> @llvm.genx.lsc.load.merge.bti.v4i32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}})
-  // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}})
+  // CHECK-STATEFUL-COUNT-2:  call <4 x i32> @llvm.genx.lsc.load.merge.bti.v4i32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 5, i8 2, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}})
   auto a8 = block_load<int, N>(acc, byte_offset32, mask, pass_thru, props_a);
+  a8 = block_load<int, N>(acc, byte_offset32, mask, pass_thru_view, props_a);
 
-  // CHECK-STATEFUL:  call <4 x i32> @llvm.genx.lsc.load.merge.bti.v4i32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}})
-  // CHECK-STATELESS: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}})
+  // CHECK-STATEFUL-COUNT-2:  call <4 x i32> @llvm.genx.lsc.load.merge.bti.v4i32.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call <4 x i32> @llvm.genx.lsc.load.merge.stateless.v4i32.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0, <4 x i32> {{[^)]+}})
   auto a9 = block_load<int, N>(acc, byte_offset64, mask, pass_thrui, props_b);
+  a9 = block_load<int, N>(acc, byte_offset64, mask, pass_thrui_view, props_b);
 
   // Now try block_load without cache hints and using the mask to verify
   // svm/legacy code-gen. Also, intentially use vector lengths that are
@@ -195,9 +206,11 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf,
   auto slm_bl2 = slm_block_load<double, 8>(byte_offset32, mask, props_c16);
 
   simd<double, 8> pass_thrud = 2.0;
-  // CHECK: call <8 x double> @llvm.genx.lsc.load.merge.slm.v8f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 5, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0, <8 x double> {{[^)]+}})
+  auto pass_thrud_view = pass_thrud.select<8, 1>();
+  // CHECK-COUNT-2: call <8 x double> @llvm.genx.lsc.load.merge.slm.v8f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 5, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0, <8 x double> {{[^)]+}})
   auto slm_bl3 =
       slm_block_load<double, 8>(byte_offset32, mask, pass_thrud, props_c16);
+  slm_bl3 = slm_block_load(byte_offset32, mask, pass_thrud_view, props_c16);
 
   // Now try block_load() accepting local accessor.
 
@@ -210,19 +223,24 @@ test_block_load(AccType &acc, LocalAccType &local_acc, float *ptrf,
   // CHECK: call <8 x double> @llvm.genx.lsc.load.slm.v8f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 5, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0)
   auto lacc_bl3 = block_load<double, 8>(local_acc, mask, props_a);
 
-  // CHECK: call <16 x double> @llvm.genx.lsc.load.merge.slm.v16f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 6, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0, <16 x double> {{[^)]+}})
+  // CHECK-COUNT-2: call <16 x double> @llvm.genx.lsc.load.merge.slm.v16f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 6, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0, <16 x double> {{[^)]+}})
   simd<double, 16> pass_thrud16 = 2.0;
+  auto pass_thrud16_view = pass_thrud16.select<16, 1>();
   auto lacc_bl4 =
       block_load<double, 16>(local_acc, mask, pass_thrud16, props_b);
+  lacc_bl4 = block_load(local_acc, mask, pass_thrud16_view, props_b);
 
   // CHECK: call <32 x double> @llvm.genx.lsc.load.slm.v32f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 7, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0)
   auto lacc_bl5 =
       block_load<double, 32>(local_acc, byte_offset32, mask, props_a);
 
-  // CHECK: call <4 x double> @llvm.genx.lsc.load.merge.slm.v4f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0, <4 x double> {{[^)]+}})
+  // CHECK-COUNT-2: call <4 x double> @llvm.genx.lsc.load.merge.slm.v4f64.v1i1.v1i32(<1 x i1> {{[^)]+}}, i8 0, i8 0, i8 0, i16 1, i32 0, i8 4, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, i32 0, <4 x double> {{[^)]+}})
   simd<double, 4> pass_thrud4 = 2.0;
+  auto pass_thrud4_view = pass_thrud4.select<4, 1>();
   auto lacc_bl6 = block_load<double, 4>(local_acc, byte_offset32, mask,
                                         pass_thrud4, props_a);
+  lacc_bl6 =
+      block_load(local_acc, byte_offset32, mask, pass_thrud4_view, props_a);
 
   // Check the default/assumed alignment when the alignment property is
   // not specified explicitly.
@@ -255,105 +273,135 @@ test_block_store(AccType &acc, LocalAccType &local_acc, float *ptrf,
   simd_mask<1> mask = 1;
   auto view = vals.select<N, 1>();
   auto viewi = valsi.select<N, 1>();
-  // CHECK: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
+  // CHECK-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
   block_store(ptrf, vals, store_props_a);
+  block_store(ptrf, view, store_props_a);
 
-  // CHECK: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0)
+  // CHECK-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0)
   block_store(ptri, byte_offset32, valsi, store_props_a);
+  block_store(ptri, byte_offset32, viewi, store_props_a);
 
-  // CHECK: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
+  // CHECK-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
   block_store(ptrf, byte_offset64, vals, store_props_c);
+  block_store(ptrf, byte_offset64, view, store_props_c);
 
-  // CHECK: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
+  // CHECK-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
   block_store(ptrf, vals, mask, store_props_a);
+  block_store(ptrf, view, mask, store_props_a);
 
-  // CHECK: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0)
+  // CHECK-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0)
   block_store(ptri, byte_offset64, valsi, mask, store_props_c);
+  block_store(ptri, byte_offset64, viewi, mask, store_props_c);
 
   // Test SVM/legacy USM block store
 
-  // CHECK: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16
+  // CHECK-COUNT-2: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16
   block_store(ptrf, vals, store_props_b);
+  block_store(ptrf, view, store_props_b);
 
-  // CHECK: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 8
+  // CHECK-COUNT-2: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 8
   block_store(ptrf, vals, store_props_d);
+  block_store(ptrf, view, store_props_d);
 
-  // CHECK: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16
+  // CHECK-COUNT-2: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16
   block_store(ptrf, byte_offset32, vals, store_props_b);
+  block_store(ptrf, byte_offset32, view, store_props_b);
 
   // Test accessor block store
 
-  // CHECK-STATEFUL:  call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 {{[^)]+}})
-  // CHECK-STATELESS: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
+  // CHECK-STATEFUL-COUNT-2:  call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
   block_store(acc, vals, store_props_a);
+  block_store(acc, view, store_props_a);
 
-  // CHECK-STATEFUL:  call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}})
-  // CHECK-STATELESS: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0)
+  // CHECK-STATEFUL-COUNT-2:  call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0)
   block_store(acc, byte_offset32, valsi, store_props_a);
+  block_store(acc, byte_offset32, viewi, store_props_a);
 
-  // CHECK-STATEFUL:  call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 {{[^)]+}})
-  // CHECK-STATELESS: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
+  // CHECK-STATEFUL-COUNT-2:  call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
   block_store(acc, byte_offset64, vals, store_props_c);
+  block_store(acc, byte_offset64, view, store_props_c);
 
-  // CHECK-STATEFUL:  call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 {{[^)]+}})
-  // CHECK-STATELESS: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
+  // CHECK-STATEFUL-COUNT-2:  call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 1, i8 1, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
   block_store(acc, vals, mask, store_props_a);
+  block_store(acc, view, mask, store_props_a);
 
-  // CHECK-STATEFUL:  call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}})
-  // CHECK-STATELESS: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0)
+  // CHECK-STATEFUL-COUNT-2:  call void @llvm.genx.lsc.store.bti.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: call void @llvm.genx.lsc.store.stateless.v1i1.v1i64.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 3, i8 3, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i64> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0)
   block_store(acc, byte_offset64, valsi, mask, store_props_c);
+  block_store(acc, byte_offset64, viewi, mask, store_props_c);
 
   // Test accessor SVM/legacy block store
 
-  // CHECK-STATEFUL:  call void @llvm.genx.oword.st.v4f32(i32 {{[^)]+}}, i32 {{[^)]+}}, <4 x float> {{[^)]+}})
-  // CHECK-STATELESS: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16
+  // CHECK-STATEFUL-COUNT-2:  call void @llvm.genx.oword.st.v4f32(i32 {{[^)]+}}, i32 {{[^)]+}}, <4 x float> {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: store <4 x float> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16
   block_store(acc, vals, store_props_b);
+  block_store(acc, view, store_props_b);
 
-  // CHECK-STATEFUL:  call void @llvm.genx.oword.st.v4i32(i32 {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}})
-  // CHECK-STATELESS: store <4 x i32> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16
+  // CHECK-STATEFUL-COUNT-2:  call void @llvm.genx.oword.st.v4i32(i32 {{[^)]+}}, i32 {{[^)]+}}, <4 x i32> {{[^)]+}})
+  // CHECK-STATELESS-COUNT-2: store <4 x i32> {{[^)]+}}, ptr addrspace(4) {{[^)]+}}, align 16
   block_store(acc, byte_offset32, valsi, store_props_b);
+  block_store(acc, byte_offset32, viewi, store_props_b);
 
   // Now try SLM block_store() with and without cache hints that are ignored.
 
-  // CHECK-COUNT-3: store <4 x float> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 16
+  // CHECK-COUNT-5: store <4 x float> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 16
   slm_block_store<float, N>(byte_offset32, vals, store_props_b);
   slm_block_store<float, N>(byte_offset32, view, store_props_b);
   slm_block_store<float, N>(byte_offset32, view.select<N, 1>(), store_props_b);
+  slm_block_store(byte_offset32, view, store_props_b);
+  slm_block_store(byte_offset32, view.select<N, 1>(), store_props_b);
 
-  // CHECK-COUNT-3: store <4 x float> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 16
+  // CHECK-COUNT-5: store <4 x float> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 16
   slm_block_store<float, N>(byte_offset32, vals, store_props_a);
   slm_block_store<float, N>(byte_offset32, view, store_props_a);
   slm_block_store<float, N>(byte_offset32, view.select<N, 1>(), store_props_a);
+  slm_block_store(byte_offset32, view, store_props_a);
+  slm_block_store(byte_offset32, view.select<N, 1>(), store_props_a);
 
   // Now try SLM block_store() with a predicate.
 
-  // CHECK-COUNT-3: call void @llvm.genx.lsc.store.slm.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0)
+  // CHECK-COUNT-5: call void @llvm.genx.lsc.store.slm.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0)
   slm_block_store<int, N>(byte_offset32, valsi, mask, store_props_b);
   slm_block_store<int, N>(byte_offset32, viewi, mask, store_props_b);
   slm_block_store<int, N>(byte_offset32, viewi.select<N, 1>(), mask,
                           store_props_b);
+  slm_block_store(byte_offset32, viewi, mask, store_props_b);
+  slm_block_store(byte_offset32, viewi.select<N, 1>(), mask, store_props_b);
 
   // Now try block_store() accepting local accessor.
 
-  // CHECK-COUNT-3: store <4 x float> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 8
+  // CHECK-COUNT-5: store <4 x float> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 8
   block_store<float, N>(local_acc, vals, store_props_d);
   block_store<float, N>(local_acc, view, store_props_d);
   block_store<float, N>(local_acc, view.select<N, 1>(), store_props_d);
+  block_store(local_acc, view, store_props_d);
+  block_store(local_acc, view.select<N, 1>(), store_props_d);
 
-  // CHECK-COUNT-3: store <4 x i32> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 8
+  // CHECK-COUNT-5: store <4 x i32> {{[^)]+}}, ptr addrspace(3) {{[^)]+}}, align 8
   block_store<int, N>(local_acc, byte_offset32, valsi, store_props_d);
   block_store<int, N>(local_acc, byte_offset32, viewi, store_props_d);
   block_store<int, N>(local_acc, byte_offset32, viewi.select<N, 1>(),
                       store_props_d);
+  block_store(local_acc, byte_offset32, viewi, store_props_d);
+  block_store(local_acc, byte_offset32, viewi.select<N, 1>(), store_props_d);
 
-  // CHECK-COUNT-3: call void @llvm.genx.lsc.store.slm.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
+  // CHECK-COUNT-5: call void @llvm.genx.lsc.store.slm.v1i1.v1i32.v4f32(<1 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x float> {{[^)]+}}, i32 0)
   block_store<float, N>(local_acc, vals, mask, store_props_a);
   block_store<float, N>(local_acc, view, mask, store_props_a);
   block_store<float, N>(local_acc, view.select<N, 1>(), mask, store_props_a);
+  block_store(local_acc, view, mask, store_props_a);
+  block_store(local_acc, view.select<N, 1>(), mask, store_props_a);
 
-  // CHECK-COUNT-3: call void @llvm.genx.lsc.store.slm.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0)
+  // CHECK-COUNT-5: call void @llvm.genx.lsc.store.slm.v1i1.v1i32.v4i32(<1 x i1> {{[^)]+}}, i8 4, i8 0, i8 0, i16 1, i32 0, i8 3, i8 4, i8 2, i8 0, <1 x i32> {{[^)]+}}, <4 x i32> {{[^)]+}}, i32 0)
   block_store<int, N>(local_acc, byte_offset32, valsi, mask, store_props_c);
   block_store<int, N>(local_acc, byte_offset32, viewi, mask, store_props_c);
   block_store<int, N>(local_acc, byte_offset32, viewi.select<N, 1>(), mask,
                       store_props_c);
+  block_store(local_acc, byte_offset32, viewi, mask, store_props_c);
+  block_store(local_acc, byte_offset32, viewi.select<N, 1>(), mask,
+              store_props_c);
 }
\ No newline at end of file
diff --git a/sycl/test/esimd/memory_properties_prefetch_2d.cpp b/sycl/test/check_device_code/esimd/memory_properties_prefetch_2d.cpp
similarity index 90%
rename from sycl/test/esimd/memory_properties_prefetch_2d.cpp
rename to sycl/test/check_device_code/esimd/memory_properties_prefetch_2d.cpp
index 1d98e9be4a582..c9c5f33854057 100644
--- a/sycl/test/esimd/memory_properties_prefetch_2d.cpp
+++ b/sycl/test/check_device_code/esimd/memory_properties_prefetch_2d.cpp
@@ -82,26 +82,34 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_prefetch(AccType &acc, float *ptrf,
 
   // 1) prefetch(usm, offsets): offsets is simd or simd_view
 
-  // CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
+  // CHECK-COUNT-10: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
   prefetch(ptrf, ioffset_n32, props_cache_load);
   prefetch<float, 32>(ptrf, ioffset_n32_view, props_cache_load);
   prefetch<float, 32>(ptrf, ioffset_n32_view.select<32, 1>(), props_cache_load);
+  prefetch(ptrf, ioffset_n32_view, props_cache_load);
+  prefetch(ptrf, ioffset_n32_view.select<32, 1>(), props_cache_load);
 
   prefetch(ptrf, loffset_n32, props_cache_load);
   prefetch<float, 32>(ptrf, loffset_n32_view, props_cache_load);
   prefetch<float, 32>(ptrf, loffset_n32_view.select<32, 1>(), props_cache_load);
+  prefetch(ptrf, loffset_n32_view, props_cache_load);
+  prefetch(ptrf, loffset_n32_view.select<32, 1>(), props_cache_load);
 
   // 2) prefetch(usm, offsets, mask): offsets is simd or simd_view
-  // CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
+  // CHECK-COUNT-10: call void @llvm.genx.lsc.prefetch.stateless.v32i1.v32i64(<32 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 1, i8 0, <32 x i64> {{[^)]+}}, i32 0)
   prefetch(ptrf, ioffset_n32, mask_n32, props_cache_load);
   prefetch<float, 32>(ptrf, ioffset_n32_view, mask_n32, props_cache_load);
   prefetch<float, 32>(ptrf, ioffset_n32_view.select<32, 1>(), mask_n32,
                       props_cache_load);
+  prefetch(ptrf, ioffset_n32_view, mask_n32, props_cache_load);
+  prefetch(ptrf, ioffset_n32_view.select<32, 1>(), mask_n32, props_cache_load);
 
   prefetch(ptrf, loffset_n32, mask_n32, props_cache_load);
   prefetch<float, 32>(ptrf, loffset_n32_view, mask_n32, props_cache_load);
   prefetch<float, 32>(ptrf, loffset_n32_view.select<32, 1>(), mask_n32,
                       props_cache_load);
+  prefetch(ptrf, loffset_n32_view, mask_n32, props_cache_load);
+  prefetch(ptrf, loffset_n32_view.select<32, 1>(), mask_n32, props_cache_load);
 
   // 3) prefetch(usm, offset): offset is scalar
   // CHECK-COUNT-16: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 1, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
@@ -128,27 +136,37 @@ SYCL_ESIMD_FUNCTION SYCL_EXTERNAL void test_prefetch(AccType &acc, float *ptrf,
                                    props_cache_load_align);
 
   // 4) prefetch(usm, ...): same as (1), (2) above, but with VS > 1.
-  // CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
+  // CHECK-COUNT-10: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
   prefetch<float, 32, 2>(ptrf, ioffset_n16, props_cache_load);
   prefetch<float, 32, 2>(ptrf, ioffset_n16_view, props_cache_load);
   prefetch<float, 32, 2>(ptrf, ioffset_n16_view.select<16, 1>(),
                          props_cache_load);
+  prefetch<2>(ptrf, ioffset_n16_view, props_cache_load);
+  prefetch<2>(ptrf, ioffset_n16_view.select<16, 1>(), props_cache_load);
 
   prefetch<float, 32, 2>(ptrf, loffset_n16, props_cache_load);
   prefetch<float, 32, 2>(ptrf, loffset_n16_view, props_cache_load);
   prefetch<float, 32, 2>(ptrf, loffset_n16_view.select<16, 1>(),
                          props_cache_load);
+  prefetch<2>(ptrf, loffset_n16_view, props_cache_load);
+  prefetch<2>(ptrf, loffset_n16_view.select<16, 1>(), props_cache_load);
 
-  // CHECK-COUNT-6: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
+  // CHECK-COUNT-10: call void @llvm.genx.lsc.prefetch.stateless.v16i1.v16i64(<16 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 2, i8 1, i8 0, <16 x i64> {{[^)]+}}, i32 0)
   prefetch<float, 32, 2>(ptrf, ioffset_n16, mask_n16, props_cache_load);
   prefetch<float, 32, 2>(ptrf, ioffset_n16_view, mask_n16, props_cache_load);
   prefetch<float, 32, 2>(ptrf, ioffset_n16_view.select<16, 1>(), mask_n16,
                          props_cache_load);
+  prefetch<2>(ptrf, ioffset_n16_view, mask_n16, props_cache_load);
+  prefetch<2>(ptrf, ioffset_n16_view.select<16, 1>(), mask_n16,
+              props_cache_load);
 
   prefetch<float, 32, 2>(ptrf, loffset_n16, mask_n16, props_cache_load);
   prefetch<float, 32, 2>(ptrf, loffset_n16_view, mask_n16, props_cache_load);
   prefetch<float, 32, 2>(ptrf, loffset_n16_view.select<16, 1>(), mask_n16,
                          props_cache_load);
+  prefetch<2>(ptrf, loffset_n16_view, mask_n16, props_cache_load);
+  prefetch<2>(ptrf, loffset_n16_view.select<16, 1>(), mask_n16,
+              props_cache_load);
 
   // CHECK-COUNT-2: call void @llvm.genx.lsc.prefetch.stateless.v1i1.v1i64(<1 x i1> {{[^)]+}}, i8 0, i8 2, i8 1, i16 1, i32 0, i8 3, i8 7, i8 2, i8 0, <1 x i64> {{[^)]+}}, i32 0)
   __ESIMD_NS::prefetch<float, 32>(ptrf, 0, props_cache_load);
diff --git a/sycl/test/esimd/memory_properties_scatter.cpp b/sycl/test/check_device_code/esimd/memory_properties_scatter.cpp
similarity index 100%
rename from sycl/test/esimd/memory_properties_scatter.cpp
rename to sycl/test/check_device_code/esimd/memory_properties_scatter.cpp
diff --git a/sycl/test/check_device_code/esimd/nbarriers.cpp b/sycl/test/check_device_code/esimd/nbarriers.cpp
new file mode 100644
index 0000000000000..4257b321127d7
--- /dev/null
+++ b/sycl/test/check_device_code/esimd/nbarriers.cpp
@@ -0,0 +1,15 @@
+// RUN: %clangxx -fsycl -c -fsycl-device-only -Xclang -emit-llvm %s -o - 2>&1 | FileCheck %s
+
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+using namespace sycl::ext::intel::esimd;
+using namespace sycl::ext::intel::experimental::esimd;
+
+SYCL_ESIMD_KERNEL SYCL_EXTERNAL void kernel_esimd() {
+  __ESIMD_NS::named_barrier_init<7>();
+  __ESIMD_NS::named_barrier_wait(2);
+  // CHECK: call spir_func void @_Z13__esimd_fenceh(i8 noundef zeroext 33)
+  // CHECK-NEXT: call spir_func void @_Z23__esimd_nbarrier_arrive{{.*}}
+  __ESIMD_NS::named_barrier_signal(0, 0, 4, 4);
+}
diff --git a/sycl/test/esimd/simd_view_bin_op.cpp b/sycl/test/check_device_code/esimd/simd_view_bin_op.cpp
similarity index 100%
rename from sycl/test/esimd/simd_view_bin_op.cpp
rename to sycl/test/check_device_code/esimd/simd_view_bin_op.cpp
diff --git a/sycl/test/esimd/slm_init_no_inline.cpp b/sycl/test/check_device_code/esimd/slm_init_no_inline.cpp
similarity index 100%
rename from sycl/test/esimd/slm_init_no_inline.cpp
rename to sycl/test/check_device_code/esimd/slm_init_no_inline.cpp
diff --git a/sycl/test/esimd/slm_init_specconst_size.cpp b/sycl/test/check_device_code/esimd/slm_init_specconst_size.cpp
similarity index 100%
rename from sycl/test/esimd/slm_init_specconst_size.cpp
rename to sycl/test/check_device_code/esimd/slm_init_specconst_size.cpp
diff --git a/sycl/test/check_device_code/esimd/spirv_intrins_trans.cpp b/sycl/test/check_device_code/esimd/spirv_intrins_trans.cpp
new file mode 100644
index 0000000000000..a5e6e59d21c04
--- /dev/null
+++ b/sycl/test/check_device_code/esimd/spirv_intrins_trans.cpp
@@ -0,0 +1,41 @@
+// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm -x c++ %s -o %t
+// RUN: sycl-post-link -split-esimd -lower-esimd -O0 -S %t -o %t.table
+// RUN: FileCheck %s -input-file=%t_esimd_0.ll
+
+// This test checks that all LLVM-IR instructions that work with SPIR-V builtins
+// are correctly translated into GenX counterparts (implemented in
+// LowerESIMD.cpp)
+
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+SYCL_ESIMD_KERNEL SYCL_EXTERNAL void
+kernel_SubgroupLocalInvocationId(size_t *DoNotOptimize,
+                                 uint32_t *DoNotOptimize32) {
+  DoNotOptimize[0] = __spirv_SubgroupLocalInvocationId();
+  DoNotOptimize32[0] = __spirv_SubgroupLocalInvocationId() + 3;
+  // CHECK-LABEL: @{{.*}}kernel_SubgroupLocalInvocationId
+  // CHECK: [[ZEXT0:%.*]] = zext i32 0 to i64
+  // CHECK: store i64 [[ZEXT0]]
+  // CHECK: add i32 0, 3
+}
+
+SYCL_ESIMD_KERNEL SYCL_EXTERNAL void
+kernel_SubgroupSize(size_t *DoNotOptimize, uint32_t *DoNotOptimize32) {
+  DoNotOptimize[0] = __spirv_SubgroupSize();
+  DoNotOptimize32[0] = __spirv_SubgroupSize() + 7;
+  // CHECK-LABEL: @{{.*}}kernel_SubgroupSize
+  // CHECK: [[ZEXT0:%.*]] = zext i32 1 to i64
+  // CHECK: store i64 [[ZEXT0]]
+  // CHECK: add i32 1, 7
+}
+
+SYCL_ESIMD_KERNEL SYCL_EXTERNAL void
+kernel_SubgroupMaxSize(size_t *DoNotOptimize, uint32_t *DoNotOptimize32) {
+  DoNotOptimize[0] = __spirv_SubgroupMaxSize();
+  DoNotOptimize32[0] = __spirv_SubgroupMaxSize() + 9;
+  // CHECK-LABEL: @{{.*}}kernel_SubgroupMaxSize
+  // CHECK: [[ZEXT0:%.*]] = zext i32 1 to i64
+  // CHECK: store i64 [[ZEXT0]]
+  // CHECK: add i32 1, 9
+}
diff --git a/sycl/test/esimd/vec_arg_call_conv_ext.cpp b/sycl/test/check_device_code/esimd/vec_arg_call_conv_ext.cpp
similarity index 100%
rename from sycl/test/esimd/vec_arg_call_conv_ext.cpp
rename to sycl/test/check_device_code/esimd/vec_arg_call_conv_ext.cpp
diff --git a/sycl/test/esimd/vec_arg_call_conv_smoke.cpp b/sycl/test/check_device_code/esimd/vec_arg_call_conv_smoke.cpp
similarity index 100%
rename from sycl/test/esimd/vec_arg_call_conv_smoke.cpp
rename to sycl/test/check_device_code/esimd/vec_arg_call_conv_smoke.cpp
diff --git a/sycl/test/esimd/wait.cpp b/sycl/test/check_device_code/esimd/wait.cpp
similarity index 100%
rename from sycl/test/esimd/wait.cpp
rename to sycl/test/check_device_code/esimd/wait.cpp
diff --git a/sycl/test/check_device_code/fpga_ihs_float.cpp b/sycl/test/check_device_code/fpga_ihs_float.cpp
index a9d2a75d1d936..777bbbe812ebf 100644
--- a/sycl/test/check_device_code/fpga_ihs_float.cpp
+++ b/sycl/test/check_device_code/fpga_ihs_float.cpp
@@ -1,4 +1,5 @@
-//==- fpga_ihs_float.cpp - SYCL FPGA arbitrary precision floating point test -==//
+//==- fpga_ihs_float.cpp - SYCL FPGA arbitrary precision floating point test
+//-==//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -9,7 +10,7 @@
 // RUN: %clangxx -I %sycl_include -S -emit-llvm -fsycl -fsycl-device-only -Xclang -no-enable-noundef-analysis %s -o - | FileCheck %s
 // RUN: %clangxx -I %sycl_include -S -emit-llvm -fsycl -fno-sycl-early-optimizations -fsycl-device-only -Xclang -no-enable-noundef-analysis %s -o - | FileCheck %s
 
-#include "CL/__spirv/spirv_ops.hpp"
+#include <sycl/sycl.hpp>
 
 constexpr int32_t Subnorm = 0;
 constexpr int32_t RndMode = 2;
@@ -19,431 +20,379 @@ constexpr bool ToSign = true;
 constexpr bool SignOfB = false;
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_cast() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> float_cast_res =
-      __spirv_ArbitraryFloatCastINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i40 @_Z{{[0-9]+}}__spirv_ArbitraryFloatCastINTEL{{.*}}(i40 {{[%a-z0-9.]+}}, i32 28, i32 30, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_cast(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatCastINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
 }
+// CHECK: call spir_func i40 @_Z{{[0-9]+}}__spirv_ArbitraryFloatCastINTEL{{.*}}(i40 {{[%A-Za-z0-9.]+}}, i32 28, i32 30, i32 0, i32 2, i32 1)
+template auto ap_float_cast<11, 28, 9, 30>(sycl::detail::ap_int<1 + 11 + 28> A);
 
 template <int WA, int Eout, int Mout>
-void ap_float_cast_from_int() {
-  sycl::detail::ap_int<WA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> cast_from_int_res =
-      __spirv_ArbitraryFloatCastFromIntINTEL<WA, 1 + Eout + Mout>(
-          A, Mout, FromSign, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i25 @_Z{{[0-9]+}}__spirv_ArbitraryFloatCastFromIntINTEL{{.*}}(i43 {{[%a-z0-9.]+}}, i32 16, i1 zeroext false, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_cast_from_int(sycl::detail::ap_int<WA> A) {
+  return __spirv_ArbitraryFloatCastFromIntINTEL<WA, 1 + Eout + Mout>(
+      A, Mout, FromSign, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i25 @_Z{{[0-9]+}}__spirv_ArbitraryFloatCastFromIntINTEL{{.*}}(i43 {{[%A-Za-z0-9.]+}}, i32 16, i1 zeroext false, i32 0, i32 2, i32 1)
 }
+template auto ap_float_cast_from_int<43, 8, 16>(sycl::detail::ap_int<43> A);
 
 template <int EA, int MA, int Wout>
-void ap_float_cast_to_int() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<Wout> cast_to_int_res =
-      __spirv_ArbitraryFloatCastToIntINTEL<1 + EA + MA, Wout>(
-          A, MA, ToSign, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i30 @_Z{{[0-9]+}}__spirv_ArbitraryFloatCastToIntINTEL{{.*}}(i23 signext {{[%a-z0-9.]+}}, i32 15, i1 zeroext true, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_cast_to_int(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatCastToIntINTEL<1 + EA + MA, Wout>(
+      A, MA, ToSign, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i30 @_Z{{[0-9]+}}__spirv_ArbitraryFloatCastToIntINTEL{{.*}}(i23 signext {{[%A-Za-z0-9.]+}}, i32 15, i1 zeroext true, i32 0, i32 2, i32 1)
 }
+template auto ap_float_cast_to_int<7, 15, 30>(sycl::detail::ap_int<1 + 7 + 15>);
 
 template <int EA, int MA, int EB, int MB, int Eout, int Mout>
-void ap_float_add() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  sycl::detail::ap_int<1 + Eout + Mout> add_res =
-      __spirv_ArbitraryFloatAddINTEL<1 + EA + MA, 1 + EB + MB, 1 + Eout + Mout>(
-          A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i14 @_Z{{[0-9]+}}__spirv_ArbitraryFloatAddINTEL{{.*}}(i13 signext {{[%a-z0-9.]+}}, i32 7, i15 signext {{[%a-z0-9.]+}}, i32 8, i32 9, i32 0, i32 2, i32 1)
-  // CHECK: call spir_func signext i13 @_Z{{[0-9]+}}__spirv_ArbitraryFloatAddINTEL{{.*}}(i15 signext {{[%a-z0-9.]+}}, i32 8, i14 signext {{[%a-z0-9.]+}}, i32 9, i32 7, i32 0, i32 2, i32 1)
-}
+SYCL_EXTERNAL auto ap_float_add(sycl::detail::ap_int<1 + EA + MA> A,
+                                sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatAddINTEL<1 + EA + MA, 1 + EB + MB,
+                                        1 + Eout + Mout>(
+      A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i14 @_Z{{[0-9]+}}__spirv_ArbitraryFloatAddINTEL{{.*}}(i13 signext {{[%A-Za-z0-9.]+}}, i32 7, i15 signext {{[%A-Za-z0-9.]+}}, i32 8, i32 9, i32 0, i32 2, i32 1)
+  // CHECK: call spir_func signext i13 @_Z{{[0-9]+}}__spirv_ArbitraryFloatAddINTEL{{.*}}(i15 signext {{[%A-Za-z0-9.]+}}, i32 8, i14 signext {{[%A-Za-z0-9.]+}}, i32 9, i32 7, i32 0, i32 2, i32 1)
+}
+template auto ap_float_add<5, 7, 6, 8, 4, 9>(sycl::detail::ap_int<1 + 5 + 7> A,
+                                             sycl::detail::ap_int<1 + 6 + 8> B);
+template auto ap_float_add<6, 8, 4, 9, 5, 7>(sycl::detail::ap_int<1 + 6 + 8> A,
+                                             sycl::detail::ap_int<1 + 4 + 9> B);
 
 template <int EA, int MA, int EB, int MB, int Eout, int Mout>
-void ap_float_sub() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  sycl::detail::ap_int<1 + Eout + Mout> sub_res =
-      __spirv_ArbitraryFloatSubINTEL<1 + EA + MA, 1 + EB + MB, 1 + Eout + Mout>(
-          A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i13 @_Z{{[0-9]+}}__spirv_ArbitraryFloatSubINTEL{{.*}}(i9 signext {{[%a-z0-9.]+}}, i32 4, i11 signext {{[%a-z0-9.]+}}, i32 5, i32 6, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_sub(sycl::detail::ap_int<1 + EA + MA> A,
+                                sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatSubINTEL<1 + EA + MA, 1 + EB + MB,
+                                        1 + Eout + Mout>(
+      A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i13 @_Z{{[0-9]+}}__spirv_ArbitraryFloatSubINTEL{{.*}}(i9 signext {{[%A-Za-z0-9.]+}}, i32 4, i11 signext {{[%A-Za-z0-9.]+}}, i32 5, i32 6, i32 0, i32 2, i32 1)
 }
+template auto ap_float_sub<4, 4, 5, 5, 6, 6>(sycl::detail::ap_int<1 + 4 + 4> A,
+                                             sycl::detail::ap_int<1 + 5 + 5> B);
 
 template <int EA, int MA, int EB, int MB, int Eout, int Mout>
-void ap_float_mul() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  sycl::detail::ap_int<1 + Eout + Mout> mul_res =
-      __spirv_ArbitraryFloatMulINTEL<1 + EA + MA, 1 + EB + MB, 1 + Eout + Mout>(
-          A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i51 @_Z{{[0-9]+}}__spirv_ArbitraryFloatMulINTEL{{.*}}(i51 {{[%a-z0-9.]+}}, i32 34, i51 {{[%a-z0-9.]+}}, i32 34, i32 34, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_mul(sycl::detail::ap_int<1 + EA + MA> A,
+                                sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatMulINTEL<1 + EA + MA, 1 + EB + MB,
+                                        1 + Eout + Mout>(
+      A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i51 @_Z{{[0-9]+}}__spirv_ArbitraryFloatMulINTEL{{.*}}(i51 {{[%A-Za-z0-9.]+}}, i32 34, i51 {{[%A-Za-z0-9.]+}}, i32 34, i32 34, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_mul<16, 34, 16, 34, 16, 34>(sycl::detail::ap_int<1 + 16 + 34> A,
+                                     sycl::detail::ap_int<1 + 16 + 34> B);
 
 template <int EA, int MA, int EB, int MB, int Eout, int Mout>
-void ap_float_div() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  sycl::detail::ap_int<1 + Eout + Mout> div_res =
-      __spirv_ArbitraryFloatDivINTEL<1 + EA + MA, 1 + EB + MB, 1 + Eout + Mout>(
-          A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i18 @_Z{{[0-9]+}}__spirv_ArbitraryFloatDivINTEL{{.*}}(i16 signext {{[%a-z0-9.]+}}, i32 11, i16 signext {{[%a-z0-9.]+}}, i32 11, i32 12, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_div(sycl::detail::ap_int<1 + EA + MA> A,
+                                sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatDivINTEL<1 + EA + MA, 1 + EB + MB,
+                                        1 + Eout + Mout>(
+      A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i18 @_Z{{[0-9]+}}__spirv_ArbitraryFloatDivINTEL{{.*}}(i16 signext {{[%A-Za-z0-9.]+}}, i32 11, i16 signext {{[%A-Za-z0-9.]+}}, i32 11, i32 12, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_div<4, 11, 4, 11, 5, 12>(sycl::detail::ap_int<1 + 4 + 11> A,
+                                  sycl::detail::ap_int<1 + 4 + 11> B);
 
 template <int EA, int MA, int EB, int MB>
-void ap_float_gt() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  bool gt_res = __spirv_ArbitraryFloatGTINTEL<1 + EA + MA, 1 + EB + MB>(A, MA, B, MB);
-  // CHECK: call spir_func zeroext i1 @_Z{{[0-9]+}}__spirv_ArbitraryFloatGTINTEL{{.*}}(i63 {{[%a-z0-9.]+}}, i32 42, i63 {{[%a-z0-9.]+}}, i32 41)
+SYCL_EXTERNAL auto ap_float_gt(sycl::detail::ap_int<1 + EA + MA> A,
+                               sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatGTINTEL<1 + EA + MA, 1 + EB + MB>(A, MA, B, MB);
+  // CHECK: call spir_func zeroext i1 @_Z{{[0-9]+}}__spirv_ArbitraryFloatGTINTEL{{.*}}(i63 {{[%A-Za-z0-9.]+}}, i32 42, i63 {{[%A-Za-z0-9.]+}}, i32 41)
 }
+template auto ap_float_gt<20, 42, 21, 41>(sycl::detail::ap_int<1 + 20 + 42> A,
+                                          sycl::detail::ap_int<1 + 21 + 41> B);
 
 template <int EA, int MA, int EB, int MB>
-void ap_float_ge() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  bool ge_res = __spirv_ArbitraryFloatGEINTEL<1 + EA + MA, 1 + EB + MB>(A, MA, B, MB);
-  // CHECK: call spir_func zeroext i1 @_Z{{[0-9]+}}__spirv_ArbitraryFloatGEINTEL{{.*}}(i47 {{[%a-z0-9.]+}}, i32 27, i47 {{[%a-z0-9.]+}}, i32 27)
+SYCL_EXTERNAL auto ap_float_ge(sycl::detail::ap_int<1 + EA + MA> A,
+                               sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatGEINTEL<1 + EA + MA, 1 + EB + MB>(A, MA, B, MB);
+  // CHECK: call spir_func zeroext i1 @_Z{{[0-9]+}}__spirv_ArbitraryFloatGEINTEL{{.*}}(i47 {{[%A-Za-z0-9.]+}}, i32 27, i47 {{[%A-Za-z0-9.]+}}, i32 27)
 }
+template auto ap_float_ge<19, 27, 19, 27>(sycl::detail::ap_int<1 + 19 + 27> A,
+                                          sycl::detail::ap_int<1 + 19 + 27> B);
 
 template <int EA, int MA, int EB, int MB>
-void ap_float_lt() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  bool lt_res = __spirv_ArbitraryFloatLTINTEL<1 + EA + MA, 1 + EB + MB>(A, MA, B, MB);
-  // CHECK: call spir_func zeroext i1 @_Z{{[0-9]+}}__spirv_ArbitraryFloatLTINTEL{{.*}}(i5 signext {{[%a-z0-9.]+}}, i32 2, i7 signext {{[%a-z0-9.]+}}, i32 3)
+SYCL_EXTERNAL auto ap_float_lt(sycl::detail::ap_int<1 + EA + MA> A,
+                               sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatLTINTEL<1 + EA + MA, 1 + EB + MB>(A, MA, B, MB);
+  // CHECK: call spir_func zeroext i1 @_Z{{[0-9]+}}__spirv_ArbitraryFloatLTINTEL{{.*}}(i5 signext {{[%A-Za-z0-9.]+}}, i32 2, i7 signext {{[%A-Za-z0-9.]+}}, i32 3)
 }
+template auto ap_float_lt<2, 2, 3, 3>(sycl::detail::ap_int<1 + 2 + 2> A,
+                                      sycl::detail::ap_int<1 + 3 + 3> B);
 
 template <int EA, int MA, int EB, int MB>
-void ap_float_le() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  bool le_res = __spirv_ArbitraryFloatLEINTEL<1 + EA + MA, 1 + EB + MB>(A, MA, B, MB);
-  // CHECK: call spir_func zeroext i1 @_Z{{[0-9]+}}__spirv_ArbitraryFloatLEINTEL{{.*}}(i55 {{[%a-z0-9.]+}}, i32 27, i55 {{[%a-z0-9.]+}}, i32 28)
+SYCL_EXTERNAL auto ap_float_le(sycl::detail::ap_int<1 + EA + MA> A,
+                               sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatLEINTEL<1 + EA + MA, 1 + EB + MB>(A, MA, B, MB);
+  // CHECK: call spir_func zeroext i1 @_Z{{[0-9]+}}__spirv_ArbitraryFloatLEINTEL{{.*}}(i55 {{[%A-Za-z0-9.]+}}, i32 27, i55 {{[%A-Za-z0-9.]+}}, i32 28)
 }
+template auto ap_float_le<27, 27, 26, 28>(sycl::detail::ap_int<1 + 27 + 27> A,
+                                          sycl::detail::ap_int<1 + 26 + 28> B);
 
 template <int EA, int MA, int EB, int MB>
-void ap_float_eq() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  bool eq_res = __spirv_ArbitraryFloatEQINTEL<1 + EA + MA, 1 + EB + MB>(A, MA, B, MB);
-  // CHECK: call spir_func zeroext i1 @_Z{{[0-9]+}}__spirv_ArbitraryFloatEQINTEL{{.*}}(i20 signext {{[%a-z0-9.]+}}, i32 12, i15 signext {{[%a-z0-9.]+}}, i32 7)
+SYCL_EXTERNAL auto ap_float_eq(sycl::detail::ap_int<1 + EA + MA> A,
+                               sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatEQINTEL<1 + EA + MA, 1 + EB + MB>(A, MA, B, MB);
+  // CHECK: call spir_func zeroext i1 @_Z{{[0-9]+}}__spirv_ArbitraryFloatEQINTEL{{.*}}(i20 signext {{[%A-Za-z0-9.]+}}, i32 12, i15 signext {{[%A-Za-z0-9.]+}}, i32 7)
 }
+template auto ap_float_eq<7, 12, 7, 7>(sycl::detail::ap_int<1 + 7 + 12> A,
+                                       sycl::detail::ap_int<1 + 7 + 7> B);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_recip() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> recip_res =
-      __spirv_ArbitraryFloatRecipINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i39 @_Z{{[0-9]+}}__spirv_ArbitraryFloatRecipINTEL{{.*}}(i39 {{[%a-z0-9.]+}}, i32 29, i32 29, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_recip(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatRecipINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i39 @_Z{{[0-9]+}}__spirv_ArbitraryFloatRecipINTEL{{.*}}(i39 {{[%A-Za-z0-9.]+}}, i32 29, i32 29, i32 0, i32 2, i32 1)
 }
+template auto ap_float_recip<9, 29, 9, 29>(sycl::detail::ap_int<1 + 9 + 29> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_rsqrt() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> rsqrt_res =
-      __spirv_ArbitraryFloatRSqrtINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i34 @_Z{{[0-9]+}}__spirv_ArbitraryFloatRSqrtINTEL{{.*}}(i32 {{[%a-z0-9.]+}}, i32 19, i32 20, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_rsqrt(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatRSqrtINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i34 @_Z{{[0-9]+}}__spirv_ArbitraryFloatRSqrtINTEL{{.*}}(i32 {{[%A-Za-z0-9.]+}}, i32 19, i32 20, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_rsqrt<12, 19, 13, 20>(sycl::detail::ap_int<1 + 12 + 19> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_cbrt() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> cbrt_res =
-      __spirv_ArbitraryFloatCbrtINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i2 @_Z{{[0-9]+}}__spirv_ArbitraryFloatCbrtINTEL{{.*}}(i2 signext {{[%a-z0-9.]+}}, i32 1, i32 1, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_cbrt(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatCbrtINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i2 @_Z{{[0-9]+}}__spirv_ArbitraryFloatCbrtINTEL{{.*}}(i2 signext {{[%A-Za-z0-9.]+}}, i32 1, i32 1, i32 0, i32 2, i32 1)
 }
+template auto ap_float_cbrt<0, 1, 0, 1>(sycl::detail::ap_int<1 + 0 + 1> A);
 
 template <int EA, int MA, int EB, int MB, int Eout, int Mout>
-void ap_float_hypot() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  sycl::detail::ap_int<1 + Eout + Mout> hypot_res =
-      __spirv_ArbitraryFloatHypotINTEL<1 + EA + MA, 1 + EB + MB,
-                                       1 + Eout + Mout>(
-          A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i42 @_Z{{[0-9]+}}__spirv_ArbitraryFloatHypotINTEL{{.*}}(i41 {{[%a-z0-9.]+}}, i32 20, i43 {{[%a-z0-9.]+}}, i32 21, i32 22, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_hypot(sycl::detail::ap_int<1 + EA + MA> A,
+                                  sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatHypotINTEL<1 + EA + MA, 1 + EB + MB,
+                                          1 + Eout + Mout>(
+      A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i42 @_Z{{[0-9]+}}__spirv_ArbitraryFloatHypotINTEL{{.*}}(i41 {{[%A-Za-z0-9.]+}}, i32 20, i43 {{[%A-Za-z0-9.]+}}, i32 21, i32 22, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_hypot<20, 20, 21, 21, 19, 22>(sycl::detail::ap_int<1 + 20 + 20> A,
+                                       sycl::detail::ap_int<1 + 21 + 21> B);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_sqrt() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> sqrt_res =
-      __spirv_ArbitraryFloatSqrtINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i17 @_Z{{[0-9]+}}__spirv_ArbitraryFloatSqrtINTEL{{.*}}(i15 signext {{[%a-z0-9.]+}}, i32 7, i32 8, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_sqrt(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatSqrtINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i17 @_Z{{[0-9]+}}__spirv_ArbitraryFloatSqrtINTEL{{.*}}(i15 signext {{[%A-Za-z0-9.]+}}, i32 7, i32 8, i32 0, i32 2, i32 1)
 }
+template auto ap_float_sqrt<7, 7, 8, 8>(sycl::detail::ap_int<1 + 7 + 7> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_log() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> log_res =
-      __spirv_ArbitraryFloatLogINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i50 @_Z{{[0-9]+}}__spirv_ArbitraryFloatLogINTEL{{.*}}(i50 {{[%a-z0-9.]+}}, i32 19, i32 30, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_log(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatLogINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i50 @_Z{{[0-9]+}}__spirv_ArbitraryFloatLogINTEL{{.*}}(i50 {{[%A-Za-z0-9.]+}}, i32 19, i32 30, i32 0, i32 2, i32 1)
 }
+template auto ap_float_log<30, 19, 19, 30>(sycl::detail::ap_int<1 + 30 + 19> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_log2() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> log2_res =
-      __spirv_ArbitraryFloatLog2INTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i38 @_Z{{[0-9]+}}__spirv_ArbitraryFloatLog2INTEL{{.*}}(i38 {{[%a-z0-9.]+}}, i32 20, i32 19, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_log2(sycl::detail::ap_int<1 + EA + MA> A) {
+  // Return the intrinsic result so `auto` deduces the ap_int output type.
+  return __spirv_ArbitraryFloatLog2INTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i38 @_Z{{[0-9]+}}__spirv_ArbitraryFloatLog2INTEL{{.*}}(i38 {{[%A-Za-z0-9.]+}}, i32 20, i32 19, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_log2<17, 20, 18, 19>(sycl::detail::ap_int<1 + 17 + 20> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_log10() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> log10_res =
-      __spirv_ArbitraryFloatLog10INTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i10 @_Z{{[0-9]+}}__spirv_ArbitraryFloatLog10INTEL{{.*}}(i8 signext {{[%a-z0-9.]+}}, i32 3, i32 5, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_log10(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatLog10INTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i10 @_Z{{[0-9]+}}__spirv_ArbitraryFloatLog10INTEL{{.*}}(i8 signext {{[%A-Za-z0-9.]+}}, i32 3, i32 5, i32 0, i32 2, i32 1)
 }
+template auto ap_float_log10<4, 3, 4, 5>(sycl::detail::ap_int<1 + 4 + 3> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_log1p() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> log1p_res =
-      __spirv_ArbitraryFloatLog1pINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i49 @_Z{{[0-9]+}}__spirv_ArbitraryFloatLog1pINTEL{{.*}}(i48 {{[%a-z0-9.]+}}, i32 30, i32 30, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_log1p(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatLog1pINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i49 @_Z{{[0-9]+}}__spirv_ArbitraryFloatLog1pINTEL{{.*}}(i48 {{[%A-Za-z0-9.]+}}, i32 30, i32 30, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_log1p<17, 30, 18, 30>(sycl::detail::ap_int<1 + 17 + 30> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_exp() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> exp_res =
-      __spirv_ArbitraryFloatExpINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i42 @_Z{{[0-9]+}}__spirv_ArbitraryFloatExpINTEL{{.*}}(i42 {{[%a-z0-9.]+}}, i32 25, i32 25, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_exp(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatExpINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i42 @_Z{{[0-9]+}}__spirv_ArbitraryFloatExpINTEL{{.*}}(i42 {{[%A-Za-z0-9.]+}}, i32 25, i32 25, i32 0, i32 2, i32 1)
 }
+template auto ap_float_exp<16, 25, 16, 25>(sycl::detail::ap_int<1 + 16 + 25> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_exp2() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> exp2_res =
-      __spirv_ArbitraryFloatExp2INTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i5 @_Z{{[0-9]+}}__spirv_ArbitraryFloatExp2INTEL{{.*}}(i3 signext {{[%a-z0-9.]+}}, i32 1, i32 2, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_exp2(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatExp2INTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i5 @_Z{{[0-9]+}}__spirv_ArbitraryFloatExp2INTEL{{.*}}(i3 signext {{[%A-Za-z0-9.]+}}, i32 1, i32 2, i32 0, i32 2, i32 1)
 }
+template auto ap_float_exp2<1, 1, 2, 2>(sycl::detail::ap_int<1 + 1 + 1> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_exp10() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> exp10_res =
-      __spirv_ArbitraryFloatExp10INTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i25 @_Z{{[0-9]+}}__spirv_ArbitraryFloatExp10INTEL{{.*}}(i25 signext {{[%a-z0-9.]+}}, i32 16, i32 16, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_exp10(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatExp10INTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i25 @_Z{{[0-9]+}}__spirv_ArbitraryFloatExp10INTEL{{.*}}(i25 signext {{[%A-Za-z0-9.]+}}, i32 16, i32 16, i32 0, i32 2, i32 1)
 }
+template auto ap_float_exp10<8, 16, 8, 16>(sycl::detail::ap_int<1 + 8 + 16> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_expm1() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> expm1_res =
-      __spirv_ArbitraryFloatExpm1INTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i62 @_Z{{[0-9]+}}__spirv_ArbitraryFloatExpm1INTEL{{.*}}(i64 {{[%a-z0-9.]+}}, i32 42, i32 41, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_expm1(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatExpm1INTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i62 @_Z{{[0-9]+}}__spirv_ArbitraryFloatExpm1INTEL{{.*}}(i64 {{[%A-Za-z0-9.]+}}, i32 42, i32 41, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_expm1<21, 42, 20, 41>(sycl::detail::ap_int<1 + 21 + 42> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_sin() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> sin_res =
-      __spirv_ArbitraryFloatSinINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i34 @_Z{{[0-9]+}}__spirv_ArbitraryFloatSinINTEL{{.*}}(i30 signext {{[%a-z0-9.]+}}, i32 15, i32 17, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_sin(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatSinINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i34 @_Z{{[0-9]+}}__spirv_ArbitraryFloatSinINTEL{{.*}}(i30 signext {{[%A-Za-z0-9.]+}}, i32 15, i32 17, i32 0, i32 2, i32 1)
 }
+template auto ap_float_sin<14, 15, 16, 17>(sycl::detail::ap_int<1 + 14 + 15> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_cos() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> cos_res =
-      __spirv_ArbitraryFloatCosINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i4 @_Z{{[0-9]+}}__spirv_ArbitraryFloatCosINTEL{{.*}}(i4 signext {{[%a-z0-9.]+}}, i32 2, i32 1, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_cos(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatCosINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i4 @_Z{{[0-9]+}}__spirv_ArbitraryFloatCosINTEL{{.*}}(i4 signext {{[%A-Za-z0-9.]+}}, i32 2, i32 1, i32 0, i32 2, i32 1)
 }
+template auto ap_float_cos<1, 2, 2, 1>(sycl::detail::ap_int<1 + 1 + 2> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_sincos() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<2 * (1 + Eout + Mout)> sincos_res =
-      __spirv_ArbitraryFloatSinCosINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i62 @_Z{{[0-9]+}}__spirv_ArbitraryFloatSinCosINTEL{{.*}}(i27 signext {{[%a-z0-9.]+}}, i32 18, i32 20, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_sincos(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatSinCosINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i62 @_Z{{[0-9]+}}__spirv_ArbitraryFloatSinCosINTEL{{.*}}(i27 signext {{[%A-Za-z0-9.]+}}, i32 18, i32 20, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_sincos<8, 18, 10, 20>(sycl::detail::ap_int<1 + 8 + 18> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_sinpi() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> sinpi_res =
-      __spirv_ArbitraryFloatSinPiINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i13 @_Z{{[0-9]+}}__spirv_ArbitraryFloatSinPiINTEL{{.*}}(i10 signext {{[%a-z0-9.]+}}, i32 6, i32 6, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_sinpi(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatSinPiINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i13 @_Z{{[0-9]+}}__spirv_ArbitraryFloatSinPiINTEL{{.*}}(i10 signext {{[%A-Za-z0-9.]+}}, i32 6, i32 6, i32 0, i32 2, i32 1)
 }
+template auto ap_float_sinpi<3, 6, 6, 6>(sycl::detail::ap_int<1 + 3 + 6> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_cospi() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> cospi_res =
-      __spirv_ArbitraryFloatCosPiINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i59 @_Z{{[0-9]+}}__spirv_ArbitraryFloatCosPiINTEL{{.*}}(i59 {{[%a-z0-9.]+}}, i32 40, i32 40, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_cospi(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatCosPiINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i59 @_Z{{[0-9]+}}__spirv_ArbitraryFloatCosPiINTEL{{.*}}(i59 {{[%A-Za-z0-9.]+}}, i32 40, i32 40, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_cospi<18, 40, 18, 40>(sycl::detail::ap_int<1 + 18 + 40> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_sincospi() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<2 * (1 + Eout + Mout)> sincos_res =
-      __spirv_ArbitraryFloatSinCosPiINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i64 @_Z{{[0-9]+}}__spirv_ArbitraryFloatSinCosPiINTEL{{.*}}(i30 signext {{[%a-z0-9.]+}}, i32 20, i32 20, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_sincospi(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatSinCosPiINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i64 @_Z{{[0-9]+}}__spirv_ArbitraryFloatSinCosPiINTEL{{.*}}(i30 signext {{[%A-Za-z0-9.]+}}, i32 20, i32 20, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_sincospi<9, 20, 11, 20>(sycl::detail::ap_int<1 + 9 + 20> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_asin() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> asin_res =
-      __spirv_ArbitraryFloatASinINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i11 @_Z{{[0-9]+}}__spirv_ArbitraryFloatASinINTEL{{.*}}(i7 signext {{[%a-z0-9.]+}}, i32 4, i32 8, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_asin(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatASinINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i11 @_Z{{[0-9]+}}__spirv_ArbitraryFloatASinINTEL{{.*}}(i7 signext {{[%A-Za-z0-9.]+}}, i32 4, i32 8, i32 0, i32 2, i32 1)
 }
+template auto ap_float_asin<2, 4, 2, 8>(sycl::detail::ap_int<1 + 2 + 4> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_asinpi() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> asinpi_res =
-      __spirv_ArbitraryFloatASinPiINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i35 @_Z{{[0-9]+}}__spirv_ArbitraryFloatASinPiINTEL{{.*}}(i35 {{[%a-z0-9.]+}}, i32 23, i32 23, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_asinpi(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatASinPiINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i35 @_Z{{[0-9]+}}__spirv_ArbitraryFloatASinPiINTEL{{.*}}(i35 {{[%A-Za-z0-9.]+}}, i32 23, i32 23, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_asinpi<11, 23, 11, 23>(sycl::detail::ap_int<1 + 11 + 23> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_acos() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> acos_res =
-      __spirv_ArbitraryFloatACosINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i14 @_Z{{[0-9]+}}__spirv_ArbitraryFloatACosINTEL{{.*}}(i14 signext {{[%a-z0-9.]+}}, i32 9, i32 10, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_acos(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatACosINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i14 @_Z{{[0-9]+}}__spirv_ArbitraryFloatACosINTEL{{.*}}(i14 signext {{[%A-Za-z0-9.]+}}, i32 9, i32 10, i32 0, i32 2, i32 1)
 }
+template auto ap_float_acos<4, 9, 3, 10>(sycl::detail::ap_int<1 + 4 + 9> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_acospi() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> acospi_res =
-      __spirv_ArbitraryFloatACosPiINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i8 @_Z{{[0-9]+}}__spirv_ArbitraryFloatACosPiINTEL{{.*}}(i8 signext {{[%a-z0-9.]+}}, i32 5, i32 4, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_acospi(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatACosPiINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i8 @_Z{{[0-9]+}}__spirv_ArbitraryFloatACosPiINTEL{{.*}}(i8 signext {{[%A-Za-z0-9.]+}}, i32 5, i32 4, i32 0, i32 2, i32 1)
 }
+template auto ap_float_acospi<2, 5, 3, 4>(sycl::detail::ap_int<1 + 2 + 5> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_atan() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> atan_res =
-      __spirv_ArbitraryFloatATanINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i44 @_Z{{[0-9]+}}__spirv_ArbitraryFloatATanINTEL{{.*}}(i44 {{[%a-z0-9.]+}}, i32 31, i32 31, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_atan(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatATanINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i44 @_Z{{[0-9]+}}__spirv_ArbitraryFloatATanINTEL{{.*}}(i44 {{[%A-Za-z0-9.]+}}, i32 31, i32 31, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_atan<12, 31, 12, 31>(sycl::detail::ap_int<1 + 12 + 31> A);
 
 template <int EA, int MA, int Eout, int Mout>
-void ap_float_atapin() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + Eout + Mout> atanpi_res =
-      __spirv_ArbitraryFloatATanPiINTEL<1 + EA + MA, 1 + Eout + Mout>(
-          A, MA, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i34 @_Z{{[0-9]+}}__spirv_ArbitraryFloatATanPiINTEL{{.*}}(i40 {{[%a-z0-9.]+}}, i32 38, i32 32, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_atapin(sycl::detail::ap_int<1 + EA + MA> A) {
+  return __spirv_ArbitraryFloatATanPiINTEL<1 + EA + MA, 1 + Eout + Mout>(
+      A, MA, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i34 @_Z{{[0-9]+}}__spirv_ArbitraryFloatATanPiINTEL{{.*}}(i40 {{[%A-Za-z0-9.]+}}, i32 38, i32 32, i32 0, i32 2, i32 1)
 }
+template auto ap_float_atapin<1, 38, 1, 32>(sycl::detail::ap_int<1 + 1 + 38> A);
 
 template <int EA, int MA, int EB, int MB, int Eout, int Mout>
-void ap_float_atan2() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  sycl::detail::ap_int<1 + Eout + Mout> atan2_res =
-      __spirv_ArbitraryFloatATan2INTEL<1 + EA + MA, 1 + EB + MB,
-                                       1 + Eout + Mout>(
-          A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i27 @_Z{{[0-9]+}}__spirv_ArbitraryFloatATan2INTEL{{.*}}(i24 signext {{[%a-z0-9.]+}}, i32 16, i25 signext {{[%a-z0-9.]+}}, i32 17, i32 18, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_atan2(sycl::detail::ap_int<1 + EA + MA> A,
+                                  sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatATan2INTEL<1 + EA + MA, 1 + EB + MB,
+                                          1 + Eout + Mout>(
+      A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i27 @_Z{{[0-9]+}}__spirv_ArbitraryFloatATan2INTEL{{.*}}(i24 signext {{[%A-Za-z0-9.]+}}, i32 16, i25 signext {{[%A-Za-z0-9.]+}}, i32 17, i32 18, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_atan2<7, 16, 7, 17, 8, 18>(sycl::detail::ap_int<1 + 7 + 16> A,
+                                    sycl::detail::ap_int<1 + 7 + 17> B);
 
 template <int EA, int MA, int EB, int MB, int Eout, int Mout>
-void ap_float_pow() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  sycl::detail::ap_int<1 + Eout + Mout> pow_res =
-      __spirv_ArbitraryFloatPowINTEL<1 + EA + MA, 1 + EB + MB, 1 + Eout + Mout>(
-          A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i21 @_Z{{[0-9]+}}__spirv_ArbitraryFloatPowINTEL{{.*}}(i17 signext {{[%a-z0-9.]+}}, i32 8, i19 signext {{[%a-z0-9.]+}}, i32 9, i32 10, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_pow(sycl::detail::ap_int<1 + EA + MA> A,
+                                sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatPowINTEL<1 + EA + MA, 1 + EB + MB,
+                                        1 + Eout + Mout>(
+      A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i21 @_Z{{[0-9]+}}__spirv_ArbitraryFloatPowINTEL{{.*}}(i17 signext {{[%A-Za-z0-9.]+}}, i32 8, i19 signext {{[%A-Za-z0-9.]+}}, i32 9, i32 10, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_pow<8, 8, 9, 9, 10, 10>(sycl::detail::ap_int<1 + 8 + 8> A,
+                                 sycl::detail::ap_int<1 + 9 + 9> B);
 
 template <int EA, int MA, int EB, int MB, int Eout, int Mout>
-void ap_float_powr() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<1 + EB + MB> B;
-  sycl::detail::ap_int<1 + Eout + Mout> powr_res =
-      __spirv_ArbitraryFloatPowRINTEL<1 + EA + MA, 1 + EB + MB,
-                                      1 + Eout + Mout>(
-          A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func i56 @_Z{{[0-9]+}}__spirv_ArbitraryFloatPowRINTEL{{.*}}(i54 {{[%a-z0-9.]+}}, i32 35, i55 {{[%a-z0-9.]+}}, i32 35, i32 35, i32 0, i32 2, i32 1)
+SYCL_EXTERNAL auto ap_float_powr(sycl::detail::ap_int<1 + EA + MA> A,
+                                 sycl::detail::ap_int<1 + EB + MB> B) {
+  return __spirv_ArbitraryFloatPowRINTEL<1 + EA + MA, 1 + EB + MB,
+                                         1 + Eout + Mout>(
+      A, MA, B, MB, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func i56 @_Z{{[0-9]+}}__spirv_ArbitraryFloatPowRINTEL{{.*}}(i54 {{[%A-Za-z0-9.]+}}, i32 35, i55 {{[%A-Za-z0-9.]+}}, i32 35, i32 35, i32 0, i32 2, i32 1)
 }
+template auto
+ap_float_powr<18, 35, 19, 35, 20, 35>(sycl::detail::ap_int<1 + 18 + 35> A,
+                                      sycl::detail::ap_int<1 + 19 + 35> B);
 
 template <int EA, int MA, int WB, int Eout, int Mout>
-void ap_float_pown() {
-  sycl::detail::ap_int<1 + EA + MA> A;
-  sycl::detail::ap_int<WB> B;
-  sycl::detail::ap_int<1 + Eout + Mout> pown_res =
-      __spirv_ArbitraryFloatPowNINTEL<1 + EA + MA, WB, 1 + Eout + Mout>(
-          A, MA, B, SignOfB, Mout, Subnorm, RndMode, RndAcc);
-  // CHECK: call spir_func signext i15 @_Z{{[0-9]+}}__spirv_ArbitraryFloatPowNINTEL{{.*}}(i12 signext {{[%a-z0-9.]+}}, i32 7, i10 signext {{[%a-z0-9.]+}}, i1 zeroext false, i32 9, i32 0, i32 2, i32 1)
-}
-
-template <typename name, typename Func>
-__attribute__((sycl_kernel)) void kernel_single_task(Func kernelFunc) {
-  kernelFunc();
-}
-
-int main() {
-  kernel_single_task<class kernel_function>([]() {
-    ap_float_cast<11, 28, 9, 30>();
-    ap_float_cast_from_int<43, 8, 16>();
-    ap_float_cast_to_int<7, 15, 30>();
-    ap_float_add<5, 7, 6, 8, 4, 9>();
-    ap_float_add<6, 8, 4, 9, 5, 7>();
-    ap_float_sub<4, 4, 5, 5, 6, 6>();
-    ap_float_mul<16, 34, 16, 34, 16, 34>();
-    ap_float_div<4, 11, 4, 11, 5, 12>();
-    ap_float_gt<20, 42, 21, 41>();
-    ap_float_ge<19, 27, 19, 27>();
-    ap_float_lt<2, 2, 3, 3>();
-    ap_float_le<27, 27, 26, 28>();
-    ap_float_eq<7, 12, 7, 7>();
-    ap_float_recip<9, 29, 9, 29>();
-    ap_float_rsqrt<12, 19, 13, 20>();
-    ap_float_cbrt<0, 1, 0, 1>();
-    ap_float_hypot<20, 20, 21, 21, 19, 22>();
-    ap_float_sqrt<7, 7, 8, 8>();
-    ap_float_log<30, 19, 19, 30>();
-    ap_float_log2<17, 20, 18, 19>();
-    ap_float_log10<4, 3, 4, 5>();
-    ap_float_log1p<17, 30, 18, 30>();
-    ap_float_exp<16, 25, 16, 25>();
-    ap_float_exp2<1, 1, 2, 2>();
-    ap_float_exp10<8, 16, 8, 16>();
-    ap_float_expm1<21, 42, 20, 41>();
-    ap_float_sin<14, 15, 16, 17>();
-    ap_float_cos<1, 2, 2, 1>();
-    ap_float_sincos<8, 18, 10, 20>();
-    ap_float_sinpi<3, 6, 6, 6>();
-    ap_float_cospi<18, 40, 18, 40>();
-    ap_float_sincospi<9, 20, 11, 20>();
-    ap_float_asin<2, 4, 2, 8>();
-    ap_float_asinpi<11, 23, 11, 23>();
-    ap_float_acos<4, 9, 3, 10>();
-    ap_float_acospi<2, 5, 3, 4>();
-    ap_float_atan<12, 31, 12, 31>();
-    ap_float_atapin<1, 38, 1, 32>();
-    ap_float_atan2<7, 16, 7, 17, 8, 18>();
-    ap_float_pow<8, 8, 9, 9, 10, 10>();
-    ap_float_powr<18, 35, 19, 35, 20, 35>();
-    ap_float_pown<4, 7, 10, 5, 9>();
-  });
-  return 0;
-}
+SYCL_EXTERNAL auto ap_float_pown(sycl::detail::ap_int<1 + EA + MA> A,
+                                 sycl::detail::ap_int<WB> B) {
+  return __spirv_ArbitraryFloatPowNINTEL<1 + EA + MA, WB, 1 + Eout + Mout>(
+      A, MA, B, SignOfB, Mout, Subnorm, RndMode, RndAcc);
+  // CHECK: call spir_func signext i15 @_Z{{[0-9]+}}__spirv_ArbitraryFloatPowNINTEL{{.*}}(i12 signext {{[%A-Za-z0-9.]+}}, i32 7, i10 signext {{[%A-Za-z0-9.]+}}, i1 zeroext false, i32 9, i32 0, i32 2, i32 1)
+}
+template auto ap_float_pown<4, 7, 10, 5, 9>(sycl::detail::ap_int<1 + 4 + 7> A,
+                                            sycl::detail::ap_int<10> B);
\ No newline at end of file
diff --git a/sycl/test/check_device_code/group_barrier.cpp b/sycl/test/check_device_code/group_barrier.cpp
index bf9069eea8558..d3d96a99ae8ce 100644
--- a/sycl/test/check_device_code/group_barrier.cpp
+++ b/sycl/test/check_device_code/group_barrier.cpp
@@ -10,34 +10,23 @@ const auto TestLambda = [](auto G) {
   sycl::group_barrier(G, sycl::memory_scope_system);
 };
 
-int main() {
-  sycl::queue Q;
+SYCL_EXTERNAL void test_1d(sycl::nd_item<1> item) {
+  auto G = item.get_group();
+  auto SG = item.get_sub_group();
+  TestLambda(G);
+  TestLambda(SG);
+}
+
+SYCL_EXTERNAL void test_2d(sycl::nd_item<2> item) {
+  auto G = item.get_group();
+  TestLambda(G);
+}
 
-  Q.submit([](sycl::handler &CGH) {
-    CGH.parallel_for(sycl::nd_range{sycl::range{1}, sycl::range{1}},
-                     [](sycl::nd_item<1> item) {
-                       auto G = item.get_group();
-                       auto SG = item.get_sub_group();
-                       TestLambda(G);
-                       TestLambda(SG);
-                     });
-  });
-  Q.submit([](sycl::handler &CGH) {
-    CGH.parallel_for(sycl::nd_range{sycl::range{1, 1}, sycl::range{1, 1}},
-                     [](sycl::nd_item<2> item) {
-                       auto G = item.get_group();
-                       TestLambda(G);
-                     });
-  });
-  Q.submit([](sycl::handler &CGH) {
-    CGH.parallel_for(sycl::nd_range{sycl::range{1, 1, 1}, sycl::range{1, 1, 1}},
-                     [](sycl::nd_item<3> item) {
-                       auto G = item.get_group();
-                       TestLambda(G);
-                     });
-  });
-  return 0;
+SYCL_EXTERNAL void test_3d(sycl::nd_item<3> item) {
+  auto G = item.get_group();
+  TestLambda(G);
 }
+
 // CHECK: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 2, i32 912)
 // CHECK: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 4, i32 912)
 // CHECK: tail call spir_func void @_Z22__spirv_ControlBarrierjjj(i32 2, i32 3, i32 912)
diff --git a/sycl/test/check_device_code/hip/atomic/amdgpu_unsafe_atomics.cpp b/sycl/test/check_device_code/hip/atomic/amdgpu_unsafe_atomics.cpp
index 0b78738a01514..b045fdb99c8d5 100644
--- a/sycl/test/check_device_code/hip/atomic/amdgpu_unsafe_atomics.cpp
+++ b/sycl/test/check_device_code/hip/atomic/amdgpu_unsafe_atomics.cpp
@@ -5,45 +5,37 @@
 
 #include <sycl/sycl.hpp>
 
-class intKernel;
-class fpKernel;
-
-int main() {
-  int *i;
-  float *f;
-  double *d;
-  sycl::queue{}.single_task<intKernel>([=] {
-    sycl::atomic_ref<int, sycl::memory_order_relaxed, sycl::memory_scope_device>
-        atomicInt(*i);
-    atomicInt.fetch_xor(1);
-    atomicInt.fetch_and(1);
-    atomicInt.fetch_or(1);
-    // CHECK: amdgpu_kernel void{{.*}}intKernel
-    // CHECK-SAFE: cmpxchg volatile
-    // CHECK-SAFE-NOT: atomicrmw
-    // CHECK-UNSAFE: atomicrmw volatile xor
-    // CHECK-UNSAFE: atomicrmw volatile and
-    // CHECK-UNSAFE: atomicrmw volatile or
-    // CHECK-UNSAFE-NOT: cmpxchg
-  });
-  sycl::queue{}.single_task<fpKernel>([=] {
-    sycl::atomic_ref<float, sycl::memory_order_relaxed,
-                     sycl::memory_scope_device,
-                     sycl::access::address_space::global_space>(*f)
-        .fetch_add(1.0f);
-    // CHECK: amdgpu_kernel void{{.*}}fpKernel
-    // CHECK-SAFE: atomicrmw volatile fadd
-    // CHECK-SAFE-NOT: llvm.amdgcn.global.atomic.fadd.f32
-    // CHECK-UNSAFE-FP: llvm.amdgcn.global.atomic.fadd.f32
-    // CHECK-UNSAFE-FP-NOT: atomicrmw volatile fadd
-    sycl::atomic_ref<double, sycl::memory_order_relaxed,
-                     sycl::memory_scope_device,
-                     sycl::access::address_space::global_space>(*d)
-        .fetch_add(1.0);
-    // CHECK-SAFE: cmpxchg
-    // CHECK-SAFE-NOT: llvm.amdgcn.global.atomic.fadd.f64
-    // CHECK-UNSAFE-FP: llvm.amdgcn.global.atomic.fadd.f64
-    // CHECK-UNSAFE-FP-NOT: cmpxchg
-    // CHECK: __CLANG_OFFLOAD_BUNDLE____END__ sycl-amdgcn-amd-amdhsa-
-  });
+SYCL_EXTERNAL void intAtomicFunc(int *i) {
+  sycl::atomic_ref<int, sycl::memory_order_relaxed, sycl::memory_scope_device>
+      atomicInt(*i);
+  atomicInt.fetch_xor(1);
+  atomicInt.fetch_and(1);
+  atomicInt.fetch_or(1);
+  // CHECK: void{{.*}}intAtomicFunc
+  // CHECK-SAFE: cmpxchg volatile
+  // CHECK-SAFE-NOT: atomicrmw
+  // CHECK-UNSAFE: atomicrmw volatile xor
+  // CHECK-UNSAFE: atomicrmw volatile and
+  // CHECK-UNSAFE: atomicrmw volatile or
+  // CHECK-UNSAFE-NOT: cmpxchg
 }
+
+SYCL_EXTERNAL void fpAtomicFunc(float *f, double *d) {
+  sycl::atomic_ref<float, sycl::memory_order_relaxed, sycl::memory_scope_device,
+                   sycl::access::address_space::global_space>(*f)
+      .fetch_add(1.0f);
+  // CHECK: void{{.*}}fpAtomicFunc
+  // CHECK-SAFE: atomicrmw volatile fadd
+  // CHECK-SAFE-NOT: llvm.amdgcn.global.atomic.fadd.f32
+  // CHECK-UNSAFE-FP: llvm.amdgcn.global.atomic.fadd.f32
+  // CHECK-UNSAFE-FP-NOT: atomicrmw volatile fadd
+  sycl::atomic_ref<double, sycl::memory_order_relaxed,
+                   sycl::memory_scope_device,
+                   sycl::access::address_space::global_space>(*d)
+      .fetch_add(1.0);
+  // CHECK-SAFE: cmpxchg
+  // CHECK-SAFE-NOT: llvm.amdgcn.global.atomic.fadd.f64
+  // CHECK-UNSAFE-FP: llvm.amdgcn.global.atomic.fadd.f64
+  // CHECK-UNSAFE-FP-NOT: cmpxchg
+  // CHECK: __CLANG_OFFLOAD_BUNDLE____END__ sycl-amdgcn-amd-amdhsa-
+}
\ No newline at end of file
diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp
index 29843ac50f114..2fabef77ee86a 100644
--- a/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp
+++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-bfloat16-float-test.cpp
@@ -7,62 +7,55 @@ using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 using sycl::ext::oneapi::bfloat16;
 
-int main() {
-  buffer<bfloat16, 1> bufA(nullptr, range<1>(1));
-  buffer<bfloat16, 1> bufB(nullptr, range<1>(1));
-  buffer<float, 1> bufC(nullptr, range<1>(1));
-  buffer<float, 1> bufD(nullptr, range<1>(1));
-  queue q;
-
-  q.submit([&](handler &cgh) {
-    sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accA(bufA, cgh);
-    sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accB(bufB, cgh);
-    sycl::accessor<float, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accC(bufC, cgh);
-    sycl::accessor<float, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accD(bufD, cgh);
-
-    cgh.parallel_for<class row_row_m16n16k16>(
-        nd_range<2>({1, 64}, {1, 64}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, bfloat16, use::a, 16, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, bfloat16, use::b, 16, 16, layout::row_major>
-              sub_b{};
-          // CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              16, layout::row_major);
-        });
-
-    cgh.parallel_for<class row_col_m32n32k8>(
-        nd_range<2>({1, 64}, {1, 64}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 32, 32> sub_c{};
-          joint_matrix<sub_group, bfloat16, use::a, 32, 8, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, bfloat16, use::b, 8, 32, layout::col_major>
-              sub_b{};
-
-          // CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              32, layout::row_major);
-        });
-  });
-
-  return 0;
-};
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 64)]] void
+row_row_m16n16k16(sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, bfloat16, use::a, 16, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, bfloat16, use::b, 16, 16, layout::row_major> sub_b{};
+  // CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16bf16.1k(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(), 16,
+                     layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 64)]] void
+row_col_m32n32k8(sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<bfloat16, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, float, use::accumulator, 32, 32> sub_c{};
+  joint_matrix<sub_group, bfloat16, use::a, 32, 8, layout::row_major> sub_a{};
+  joint_matrix<sub_group, bfloat16, use::b, 8, 32, layout::col_major> sub_b{};
+
+  // CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8bf16.1k(<4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(), 32,
+                     layout::row_major);
+}
\ No newline at end of file
diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp
index e82e6fd0337db..9c0b16ec82a41 100644
--- a/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp
+++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-double-double-test.cpp
@@ -6,45 +6,29 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-int main() {
-  buffer<double, 1> bufA(nullptr, range<1>(1));
-  buffer<double, 1> bufB(nullptr, range<1>(1));
-  buffer<double, 1> bufC(nullptr, range<1>(1));
-  buffer<double, 1> bufD(nullptr, range<1>(1));
-  queue q;
-
-  q.submit([&](handler &cgh) {
-    sycl::accessor<double, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accA(bufA, cgh);
-    sycl::accessor<double, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accB(bufB, cgh);
-    sycl::accessor<double, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accC(bufC, cgh);
-    sycl::accessor<double, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accD(bufD, cgh);
-
-    cgh.parallel_for<class row_row_m16n16k4>(
-        nd_range<2>({1, 64}, {1, 64}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, double, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, double, use::a, 16, 4, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, double, use::b, 4, 16, layout::row_major>
-              sub_b{};
-
-          // CHECK: tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double {{.*}}, double {{.*}}, <4 x double> zeroinitializer, i32 0, i32 0, i32 0)
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              16, layout::row_major);
-        });
-  });
-
-  return 0;
-};
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 64)]] void
+row_row_m16n16k4(sycl::accessor<double, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<double, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<double, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<double, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, double, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, double, use::a, 16, 4, layout::row_major> sub_a{};
+  joint_matrix<sub_group, double, use::b, 4, 16, layout::row_major> sub_b{};
+
+  // CHECK: tail call <4 x double> @llvm.amdgcn.mfma.f64.16x16x4f64(double {{.*}}, double {{.*}}, <4 x double> zeroinitializer, i32 0, i32 0, i32 0)
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(), 16,
+                     layout::row_major);
+}
\ No newline at end of file
diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp
index 2afe666034bf5..7e5ff71b9b5d9 100644
--- a/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp
+++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-half-float-test.cpp
@@ -6,63 +6,56 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-int main() {
-  buffer<half, 1> bufA(nullptr, range<1>(1));
-  buffer<half, 1> bufB(nullptr, range<1>(1));
-  buffer<float, 1> bufC(nullptr, range<1>(1));
-  buffer<float, 1> bufD(nullptr, range<1>(1));
-  queue q;
-
-  q.submit([&](handler &cgh) {
-    sycl::accessor<half, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accA(bufA, cgh);
-    sycl::accessor<half, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accB(bufB, cgh);
-    sycl::accessor<float, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accC(bufC, cgh);
-    sycl::accessor<float, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accD(bufD, cgh);
-
-    cgh.parallel_for<class row_row_m16n16k16>(
-        nd_range<2>({1, 64}, {1, 64}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, half, use::a, 16, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 16, 16, layout::row_major>
-              sub_b{};
-
-          // CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              16, layout::row_major);
-        });
-
-    cgh.parallel_for<class row_col_m32n32k8>(
-        nd_range<2>({1, 64}, {1, 64}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, float, use::accumulator, 32, 32> sub_c{};
-          joint_matrix<sub_group, half, use::a, 32, 8, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, half, use::b, 8, 32, layout::col_major>
-              sub_b{};
-
-          // CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              32, layout::row_major);
-        });
-  });
-
-  return 0;
-};
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 64)]] void
+row_row_m16n16k16(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, float, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, half, use::a, 16, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 16, 16, layout::row_major> sub_b{};
+
+  // CHECK: tail call <4 x float> @llvm.amdgcn.mfma.f32.16x16x16f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <4 x float> zeroinitializer, i32 0, i32 0, i32 0)
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(), 16,
+                     layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 64)]] void
+row_col_m32n32k8(sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<half, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<float, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, float, use::accumulator, 32, 32> sub_c{};
+  joint_matrix<sub_group, half, use::a, 32, 8, layout::row_major> sub_a{};
+  joint_matrix<sub_group, half, use::b, 8, 32, layout::col_major> sub_b{};
+
+  // CHECK: tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x8f16(<4 x half> zeroinitializer, <4 x half> zeroinitializer, <16 x float> zeroinitializer, i32 0, i32 0, i32 0)
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(), 32,
+                     layout::row_major);
+}
\ No newline at end of file
diff --git a/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp b/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp
index d39f7a8772717..98c74f54be794 100644
--- a/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp
+++ b/sycl/test/check_device_code/hip/matrix/matrix-hip-int8-int32-test.cpp
@@ -6,63 +6,56 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-int main() {
-  buffer<int8_t, 1> bufA(nullptr, range<1>(1));
-  buffer<int8_t, 1> bufB(nullptr, range<1>(1));
-  buffer<int32_t, 1> bufC(nullptr, range<1>(1));
-  buffer<int32_t, 1> bufD(nullptr, range<1>(1));
-  queue q;
-
-  q.submit([&](handler &cgh) {
-    sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accA(bufA, cgh);
-    sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accB(bufB, cgh);
-    sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accC(bufC, cgh);
-    sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
-                   sycl::target::device>
-        accD(bufD, cgh);
-
-    cgh.parallel_for<class row_row_m16n16k16>(
-        nd_range<2>({1, 64}, {1, 64}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 16, 16> sub_c{};
-          joint_matrix<sub_group, int8_t, use::a, 16, 16, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, int8_t, use::b, 16, 16, layout::row_major>
-              sub_b{};
-
-          // CHECK: tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 {{.*}}, i32 {{.*}}, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              16, layout::row_major);
-        });
-
-    cgh.parallel_for<class row_col_m32n32k8>(
-        nd_range<2>({1, 64}, {1, 64}),
-        [=](nd_item<2> item) [[sycl::reqd_work_group_size(1, 1, 64)]] {
-          sycl::sub_group sg = item.get_sub_group();
-
-          joint_matrix<sub_group, int32_t, use::accumulator, 32, 32> sub_c{};
-          joint_matrix<sub_group, int8_t, use::a, 32, 8, layout::row_major>
-              sub_a{};
-          joint_matrix<sub_group, int8_t, use::b, 8, 32, layout::col_major>
-              sub_b{};
-
-          // CHECK: tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 {{.*}}, i32 {{.*}}, <16 x i32> zeroinitializer, i32 0, i32 0, i32 0)
-          joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-          joint_matrix_store(
-              sg, sub_c, accD.template get_multi_ptr<access::decorated::yes>(),
-              32, layout::row_major);
-        });
-  });
-
-  return 0;
-};
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 64)]] void
+row_row_m16n16k16(sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accA,
+                  sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accB,
+                  sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accC,
+                  sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                 sycl::target::device>
+                      accD,
+                  nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, int32_t, use::accumulator, 16, 16> sub_c{};
+  joint_matrix<sub_group, int8_t, use::a, 16, 16, layout::row_major> sub_a{};
+  joint_matrix<sub_group, int8_t, use::b, 16, 16, layout::row_major> sub_b{};
+
+  // CHECK: tail call <4 x i32> @llvm.amdgcn.mfma.i32.16x16x16i8(i32 {{.*}}, i32 {{.*}}, <4 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(), 16,
+                     layout::row_major);
+}
+
+SYCL_EXTERNAL [[sycl::reqd_work_group_size(1, 1, 64)]] void
+row_col_m32n32k8(sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accA,
+                 sycl::accessor<int8_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accB,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accC,
+                 sycl::accessor<int32_t, 1, sycl::access::mode::read_write,
+                                sycl::target::device>
+                     accD,
+                 nd_item<2> item) {
+  sycl::sub_group sg = item.get_sub_group();
+
+  joint_matrix<sub_group, int32_t, use::accumulator, 32, 32> sub_c{};
+  joint_matrix<sub_group, int8_t, use::a, 32, 8, layout::row_major> sub_a{};
+  joint_matrix<sub_group, int8_t, use::b, 8, 32, layout::col_major> sub_b{};
+
+  // CHECK: tail call <16 x i32> @llvm.amdgcn.mfma.i32.32x32x8i8(i32 {{.*}}, i32 {{.*}}, <16 x i32> zeroinitializer, i32 0, i32 0, i32 0)
+  joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  joint_matrix_store(sg, sub_c,
+                     accD.template get_multi_ptr<access::decorated::yes>(), 32,
+                     layout::row_major);
+}
\ No newline at end of file
diff --git a/sycl/test/check_device_code/math-builtins/native-math-cuda.cpp b/sycl/test/check_device_code/math-builtins/native-math-cuda.cpp
index 37253e48e6554..cb6f09e201d92 100644
--- a/sycl/test/check_device_code/math-builtins/native-math-cuda.cpp
+++ b/sycl/test/check_device_code/math-builtins/native-math-cuda.cpp
@@ -6,64 +6,47 @@
 
 using namespace sycl;
 
-int main() {
-
-  queue q;
-
-  float input[2];
-  float res[13];
-  {
-    buffer<float, 1> input_buff(&input[0], range<1>(2));
-    buffer<float, 1> res_buff(&res[0], range<1>(13));
-    q.submit([&](handler &cgh) {
-      accessor<float, 1, access::mode::write, target::device> res_acc(res_buff,
-                                                                      cgh);
-      accessor<float, 1, access::mode::read, target::device> input_acc(
-          input_buff, cgh);
-      cgh.single_task([=]() {
-        // CHECK: tail call noundef float @llvm.nvvm.cos.approx.f
-        res_acc[0] = sycl::native::cos(input_acc[0]);
-        // CHECK: tail call noundef float @llvm.nvvm.sin.approx.f
-        res_acc[1] = sycl::native::sin(input_acc[0]);
-        // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f
-        res_acc[2] = sycl::native::exp2(input_acc[0]);
-        // CHECK: tail call noundef float @llvm.nvvm.lg2.approx.f
-        res_acc[3] = sycl::native::log2(input_acc[0]);
-        // CHECK: tail call noundef float @llvm.nvvm.rsqrt.approx.f
-        res_acc[4] = sycl::native::rsqrt(input_acc[0]);
-        // CHECK: tail call noundef float @llvm.nvvm.sqrt.approx.f
-        res_acc[5] = sycl::native::sqrt(input_acc[0]);
-        // CHECK: tail call noundef float @llvm.nvvm.rcp.approx.f
-        res_acc[6] = sycl::native::recip(input_acc[0]);
-        // CHECK: tail call noundef float @llvm.nvvm.div.approx.f
-        res_acc[7] = sycl::native::divide(input_acc[0], input_acc[1]);
-
-        // Functions that use the above builtins:
-
-        // CHECK: tail call float @llvm.nvvm.sin.approx.f
-        // CHECK: tail call float @llvm.nvvm.cos.approx.f
-        // CHECK: tail call noundef float @llvm.nvvm.div.approx.f
-        res_acc[8] = sycl::native::tan(input_acc[0]);
-        // CHECK: fmul float {{.*}}, 0x3FF7154760000000
-        // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f
-        res_acc[9] = sycl::native::exp(input_acc[0]);
-        // CHECK: fmul float {{.*}}, 0x400A934F00000000
-        // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f
-        res_acc[10] = sycl::native::exp10(input_acc[0]);
-        // CHECK: tail call float @llvm.nvvm.lg2.approx.f
-        // CHECK: fmul float {{.*}}, 0x3FE62E4300000000
-        res_acc[11] = sycl::native::log(input_acc[0]);
-        // CHECK: tail call float @llvm.nvvm.lg2.approx.f
-        // CHECK: fmul float {{.*}}, 0x3FD3441360000000
-        res_acc[12] = sycl::native::log10(input_acc[0]);
-
-        // CHECK: tail call float @llvm.nvvm.lg2.approx.f
-        // CHECK: fmul float {{.*}}, {{.*}}
-        // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f
-        res_acc[13] = sycl::native::powr(input_acc[0], input_acc[1]);
-      });
-    });
-  }
-
-  return 0;
+SYCL_EXTERNAL void native_math_cuda(
+    accessor<float, 1, access::mode::write, target::device> res_acc,
+    accessor<float, 1, access::mode::read, target::device> input_acc) {
+  // CHECK: tail call noundef float @llvm.nvvm.cos.approx.f
+  res_acc[0] = sycl::native::cos(input_acc[0]);
+  // CHECK: tail call noundef float @llvm.nvvm.sin.approx.f
+  res_acc[1] = sycl::native::sin(input_acc[0]);
+  // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f
+  res_acc[2] = sycl::native::exp2(input_acc[0]);
+  // CHECK: tail call noundef float @llvm.nvvm.lg2.approx.f
+  res_acc[3] = sycl::native::log2(input_acc[0]);
+  // CHECK: tail call noundef float @llvm.nvvm.rsqrt.approx.f
+  res_acc[4] = sycl::native::rsqrt(input_acc[0]);
+  // CHECK: tail call noundef float @llvm.nvvm.sqrt.approx.f
+  res_acc[5] = sycl::native::sqrt(input_acc[0]);
+  // CHECK: tail call noundef float @llvm.nvvm.rcp.approx.f
+  res_acc[6] = sycl::native::recip(input_acc[0]);
+  // CHECK: tail call noundef float @llvm.nvvm.div.approx.f
+  res_acc[7] = sycl::native::divide(input_acc[0], input_acc[1]);
+
+  // Functions that use the above builtins:
+
+  // CHECK: tail call float @llvm.nvvm.sin.approx.f
+  // CHECK: tail call float @llvm.nvvm.cos.approx.f
+  // CHECK: tail call noundef float @llvm.nvvm.div.approx.f
+  res_acc[8] = sycl::native::tan(input_acc[0]);
+  // CHECK: fmul float {{.*}}, 0x3FF7154760000000
+  // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f
+  res_acc[9] = sycl::native::exp(input_acc[0]);
+  // CHECK: fmul float {{.*}}, 0x400A934F00000000
+  // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f
+  res_acc[10] = sycl::native::exp10(input_acc[0]);
+  // CHECK: tail call float @llvm.nvvm.lg2.approx.f
+  // CHECK: fmul float {{.*}}, 0x3FE62E4300000000
+  res_acc[11] = sycl::native::log(input_acc[0]);
+  // CHECK: tail call float @llvm.nvvm.lg2.approx.f
+  // CHECK: fmul float {{.*}}, 0x3FD3441360000000
+  res_acc[12] = sycl::native::log10(input_acc[0]);
+
+  // CHECK: tail call float @llvm.nvvm.lg2.approx.f
+  // CHECK: fmul float {{.*}}, {{.*}}
+  // CHECK: tail call noundef float @llvm.nvvm.ex2.approx.f
+  res_acc[13] = sycl::native::powr(input_acc[0], input_acc[1]);
 };
diff --git a/sycl/test/matrix/matrix-check-types-in-attributes.cpp b/sycl/test/check_device_code/matrix/matrix-check-types-in-attributes.cpp
similarity index 54%
rename from sycl/test/matrix/matrix-check-types-in-attributes.cpp
rename to sycl/test/check_device_code/matrix/matrix-check-types-in-attributes.cpp
index f7a0223adb24d..48322fe4fa66d 100644
--- a/sycl/test/matrix/matrix-check-types-in-attributes.cpp
+++ b/sycl/test/check_device_code/matrix/matrix-check-types-in-attributes.cpp
@@ -23,30 +23,21 @@ using namespace sycl::ext::oneapi::experimental::matrix;
 
 constexpr size_t Size = 12;
 
-template <typename T> void test(sycl::queue &q) {
-  q.submit([&](sycl::handler &cgh) {
-    cgh.single_task([]() {
-      joint_matrix<sycl::sub_group, T, use::a, Size, Size, layout::row_major> m;
-    });
-  });
+template <typename T> SYCL_EXTERNAL void test() {
+  joint_matrix<sycl::sub_group, T, use::a, Size, Size, layout::row_major> m;
 }
 
-int main() {
-  sycl::queue q;
-
-  test<sycl::ext::oneapi::bfloat16>(q);
-  test<sycl::half>(q);
-  test<sycl::ext::oneapi::experimental::matrix::precision::tf32>(q);
-  test<float>(q);
-  test<double>(q);
-  test<int8_t>(q);
-  test<int16_t>(q);
-  test<int32_t>(q);
-  test<int64_t>(q);
-  test<uint8_t>(q);
-  test<uint16_t>(q);
-  test<uint32_t>(q);
-  test<uint64_t>(q);
-
-  return 0;
-}
+template SYCL_EXTERNAL void test<sycl::ext::oneapi::bfloat16>();
+template SYCL_EXTERNAL void test<sycl::half>();
+template SYCL_EXTERNAL void
+test<sycl::ext::oneapi::experimental::matrix::precision::tf32>();
+template SYCL_EXTERNAL void test<float>();
+template SYCL_EXTERNAL void test<double>();
+template SYCL_EXTERNAL void test<int8_t>();
+template SYCL_EXTERNAL void test<int16_t>();
+template SYCL_EXTERNAL void test<int32_t>();
+template SYCL_EXTERNAL void test<int64_t>();
+template SYCL_EXTERNAL void test<uint8_t>();
+template SYCL_EXTERNAL void test<uint16_t>();
+template SYCL_EXTERNAL void test<uint32_t>();
+template SYCL_EXTERNAL void test<uint64_t>();
diff --git a/sycl/test/check_device_code/matrix/matrix-int8-test.cpp b/sycl/test/check_device_code/matrix/matrix-int8-test.cpp
new file mode 100644
index 0000000000000..30008cf5b99fb
--- /dev/null
+++ b/sycl/test/check_device_code/matrix/matrix-int8-test.cpp
@@ -0,0 +1,82 @@
+// RUN: %clangxx -fsycl -fsycl-device-only -O2 -S -emit-llvm -o - %s | FileCheck %s
+
+// CHECK-DAG: target("spirv.JointMatrixINTEL", i8, 12, 48, 0, 3, 0)
+// CHECK-DAG: target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2)
+// CHECK-DAG: target("spirv.JointMatrixINTEL", i8, 48, 12, 2, 3, 1)
+
+// CHECK: !{!"matrix_type::sint32,use::accumulator,12,12;matrix_type::sint8,use::a,12,48;matrix_type::sint8,use::b,48,12"}
+// CHECK: !{!"matrix_type::sint8,matrix_type::sint8,matrix_type::sint32,matrix_type::sint32,12,48,12"}
+
+#include <iostream>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi::experimental::matrix;
+
+#define TILE_SZ 16
+#define TM (TILE_SZ - 4)
+#define TN (TILE_SZ - 4)
+#define TK (4 * TILE_SZ - 16)
+
+#define SG_SZ 16
+
+// static constexpr size_t MATRIX_M = TM * 2;
+// static constexpr size_t MATRIX_N = TN * 2;
+// static constexpr size_t MATRIX_K = TK * 2;
+// int8_t A[MATRIX_M][MATRIX_K];
+// int8_t B[MATRIX_K / 4][MATRIX_N * 4];
+// int32_t C[MATRIX_M][MATRIX_N];
+
+SYCL_EXTERNAL [[intel::reqd_sub_group_size(SG_SZ)]] void
+matrix_multiply(size_t NUM_COLS_C, size_t NUM_COLS_A,
+                sycl::accessor<int8_t, 2, access::mode::read_write> accA,
+                sycl::accessor<int8_t, 2, access::mode::read_write> accB,
+                sycl::accessor<int32_t, 2, access::mode::read_write> accC,
+                nd_item<2> spmd_item) {
+
+  size_t N = NUM_COLS_C;
+  size_t K = NUM_COLS_A;
+
+  // The submatrix API has to be accessed by all the work-items in a
+  // sub-group: these functions are called once by the sub-group, with no
+  // code divergence between the work-items.
+  const auto global_idx = spmd_item.get_global_id(0);
+  const auto global_idy = spmd_item.get_global_id(1);
+  const auto sg_startx = global_idx - spmd_item.get_local_id(0);
+  const auto sg_starty = global_idy - spmd_item.get_local_id(1);
+
+  sycl::sub_group sg = spmd_item.get_sub_group();
+  joint_matrix<sycl::sub_group, int8_t, use::a, TM, TK, layout::row_major>
+      sub_a;
+  // For B, since the current implementation does not support a non-packed
+  // layout, users need to specify the updated VNNI sizes along with the
+  // ext_intel_packed layout. By default, the layout is row_major and the
+  // size is (TK, TN).
+  joint_matrix<sycl::sub_group, int8_t, use::b, TK, TN,
+               layout::ext_intel_packed>
+      sub_b;
+  joint_matrix<sycl::sub_group, int32_t, use::accumulator, TM, TN> sub_c;
+
+  // AMX: 8 register tiles, each 1 KB in size; SMmax x SKmax = 16x64.
+  // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4
+  joint_matrix_fill(sg, sub_c, 0);
+  for (int k = 0; k < K / TK; k += 1) {
+    joint_matrix_load(
+        sg, sub_a,
+        accA.template get_multi_ptr<sycl::access::decorated::no>() +
+            (sg_startx * TM) * K + k * TK,
+        K);
+    // Assuming B data is already in VNNI format.
+    joint_matrix_load(
+        sg, sub_b,
+        accB.template get_multi_ptr<sycl::access::decorated::no>() +
+            (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4,
+        N * 4);
+    joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
+  }
+  joint_matrix_store(
+      sg, sub_c,
+      accC.template get_multi_ptr<sycl::access::decorated::no>() +
+          (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
+      N, layout::row_major);
+}
diff --git a/sycl/test/check_device_code/matrix/matrix_load_store_as.cpp b/sycl/test/check_device_code/matrix/matrix_load_store_as.cpp
index 36dbf89dc2661..34fae66a8f09a 100644
--- a/sycl/test/check_device_code/matrix/matrix_load_store_as.cpp
+++ b/sycl/test/check_device_code/matrix/matrix_load_store_as.cpp
@@ -9,55 +9,32 @@
 using namespace sycl;
 using namespace sycl::ext::oneapi::experimental::matrix;
 
-int main(void) {
-  queue q;
-  unsigned short *A = malloc_shared<unsigned short>(8 * 16, q);
-  unsigned short *B = malloc_shared<unsigned short>(16 * 16, q);
-  float *C = malloc_shared<float>(8 * 16, q);
-
-  auto pA = multi_ptr<unsigned short, access::address_space::global_space>(A);
-  auto pB = multi_ptr<unsigned short, access::address_space::global_space>(B);
-  auto pC = multi_ptr<float, access::address_space::global_space>(C);
-
-  q.submit([&](handler &h) {
-    local_accessor<unsigned short, 2> tileA{{8, 16}, h};
-
-    h.parallel_for(
-        nd_range<2>({1, 16}, {1, 16}),
-        [=](nd_item<2> it) [[intel::reqd_sub_group_size(16)]] {
-          joint_matrix<sub_group, unsigned short, use::a, 8, 16,
-                       layout::row_major>
-              tA;
-          joint_matrix<sub_group, unsigned short, use::b, 16, 16,
-                       layout::ext_intel_packed>
-              tB;
-          joint_matrix<sub_group, float, use::accumulator, 8, 16> tC;
-
-          sub_group sg = it.get_sub_group();
-          vec<unsigned short, 8> slmvec = sg.load<8>(pA);
-          sg.store<8>(
-              tileA.template get_multi_ptr<sycl::access::decorated::yes>(),
+SYCL_EXTERNAL [[intel::reqd_sub_group_size(16)]] void matrix_store_as(
+    multi_ptr<unsigned short, access::address_space::global_space> pA,
+    multi_ptr<unsigned short, access::address_space::global_space> pB,
+    multi_ptr<float, access::address_space::global_space> pC,
+    local_accessor<unsigned short, 2> tileA, nd_item<2> it) {
+  joint_matrix<sub_group, unsigned short, use::a, 8, 16, layout::row_major> tA;
+  joint_matrix<sub_group, unsigned short, use::b, 16, 16,
+               layout::ext_intel_packed>
+      tB;
+  joint_matrix<sub_group, float, use::accumulator, 8, 16> tC;
+
+  sub_group sg = it.get_sub_group();
+  vec<unsigned short, 8> slmvec = sg.load<8>(pA);
+  sg.store<8>(tileA.template get_multi_ptr<sycl::access::decorated::yes>(),
               slmvec);
-          it.barrier(access::fence_space::local_space);
-
-          // A should load from local address space
-          // CHECK: %{{.*}} = tail call spir_func noundef target("spirv.JointMatrixINTEL", i16, 8, 16, 0, 3, 0) @_Z[[#]]__spirv_JointMatrixLoadINTEL{{.*}}(ptr addrspace(3) noundef %{{.*}}, i64 noundef 16, i32 noundef 0, i32 noundef 3, i32 noundef 0) #{{.*}}
-          joint_matrix_load(
-              sg, tA,
-              tileA.template get_multi_ptr<sycl::access::decorated::yes>(), 16);
-          // B should load from global address space
-          // CHECK: %{{.*}} = tail call spir_func noundef target("spirv.JointMatrixINTEL", i16, 16, 16, 2, 3, 1) @_Z[[#]]__spirv_JointMatrixLoadINTEL{{.*}}(ptr addrspace(1) noundef %{{.*}}, i64 noundef 32, i32 noundef 2, i32 noundef 3, i32 noundef 0) #{{.*}}
-          joint_matrix_load(sg, tB, pB, 32);
-          joint_matrix_mad(sg, tC, tA, tB, tC);
-          // C should store to global address space
-          // CHECK: tail call spir_func void @_Z[[#]]__spirv_JointMatrixStoreINTEL{{.*}}(ptr addrspace(1) noundef %{{.*}}, target("spirv.JointMatrixINTEL", float, 8, 16, 3, 3, 2) noundef %{{.*}}, i64 noundef 16, i32 noundef 0, i32 noundef 3, i32 noundef 0) #{{.*}}
-          joint_matrix_store(sg, tC, pC, 16, layout::row_major);
-        });
-  });
-
-  free(A, q);
-  free(B, q);
-  free(C, q);
-
-  return 0;
+  it.barrier(access::fence_space::local_space);
+
+  // A should load from local address space
+  // CHECK: %{{.*}} = tail call spir_func noundef target("spirv.JointMatrixINTEL", i16, 8, 16, 0, 3, 0) @_Z[[#]]__spirv_JointMatrixLoadINTEL{{.*}}(ptr addrspace(3) noundef %{{.*}}, i64 noundef 16, i32 noundef 0, i32 noundef 3, i32 noundef 0) #{{.*}}
+  joint_matrix_load(
+      sg, tA, tileA.template get_multi_ptr<sycl::access::decorated::yes>(), 16);
+  // B should load from global address space
+  // CHECK: %{{.*}} = tail call spir_func noundef target("spirv.JointMatrixINTEL", i16, 16, 16, 2, 3, 1) @_Z[[#]]__spirv_JointMatrixLoadINTEL{{.*}}(ptr addrspace(1) noundef %{{.*}}, i64 noundef 32, i32 noundef 2, i32 noundef 3, i32 noundef 0) #{{.*}}
+  joint_matrix_load(sg, tB, pB, 32);
+  joint_matrix_mad(sg, tC, tA, tB, tC);
+  // C should store to global address space
+  // CHECK: tail call spir_func void @_Z[[#]]__spirv_JointMatrixStoreINTEL{{.*}}(ptr addrspace(1) noundef %{{.*}}, target("spirv.JointMatrixINTEL", float, 8, 16, 3, 3, 2) noundef %{{.*}}, i64 noundef 16, i32 noundef 0, i32 noundef 3, i32 noundef 0) #{{.*}}
+  joint_matrix_store(sg, tC, pC, 16, layout::row_major);
 }
diff --git a/sycl/test/check_device_code/no_offset_error.cpp b/sycl/test/check_device_code/no_offset_error.cpp
deleted file mode 100644
index dd7c005d33d5f..0000000000000
--- a/sycl/test/check_device_code/no_offset_error.cpp
+++ /dev/null
@@ -1,21 +0,0 @@
-// RUN:  %clangxx -fsycl-device-only -Xclang -verify -Xclang -verify-ignore-unexpected=note -emit-llvm -o - %s
-
-#include <sycl/sycl.hpp>
-
-inline constexpr int size = 100;
-
-int main() {
-
-    sycl::buffer<int> a{sycl::range{size}};
-    sycl::queue q;
-
-    q.submit([&](sycl::handler &cgh) {
-        sycl::ext::oneapi::accessor_property_list PL{sycl::ext::oneapi::no_offset, sycl::no_init};
-        sycl::accessor acc_a(a, cgh, sycl::write_only, PL);
-        // expected-error@sycl/accessor.hpp:* {{static assertion failed due to requirement '!(accessor_property_list<sycl::ext::oneapi::property::no_offset::instance<true>, sycl::property::no_init>::has_property())': Accessor has no_offset property, get_offset() can not be used}}
-        auto b = acc_a.get_offset();
-    });
-
-    q.wait();
-    return 0;
-}
diff --git a/sycl/test/check_device_code/task_sequence_intel_balanced.cpp b/sycl/test/check_device_code/task_sequence_intel_balanced.cpp
index c888ec1eed9d5..a06dc2b92755c 100644
--- a/sycl/test/check_device_code/task_sequence_intel_balanced.cpp
+++ b/sycl/test/check_device_code/task_sequence_intel_balanced.cpp
@@ -1,10 +1,5 @@
 // RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm -Xclang -no-enable-noundef-analysis %s -o - | FileCheck %s
 
-// CHECK: [[TASK_SEQUENCE:%.*]] ={{.*}} call spir_func target("spirv.TaskSequenceINTEL") @_Z31__spirv_TaskSequenceCreateINTEL{{.*}}(ptr{{.*}}@_Z8arrayAdd{{.*}}, i32 -1, i32 -1, i32 0, i32 128)
-// CHECK: call spir_func void @_Z30__spirv_TaskSequenceAsyncINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]], ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 128)
-// CHECK-COUNT-1: call spir_func i32 @_Z28__spirv_TaskSequenceGetINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]])
-// CHECK: call spir_func void @_Z32__spirv_TaskSequenceReleaseINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]])
-
 #include <sycl/sycl.hpp>
 
 using namespace sycl::ext::intel::experimental;
@@ -21,26 +16,24 @@ int arrayAdd(int *data1, int *data2, int N) {
   return ret;
 }
 
-int main() {
-  sycl::queue myQueue;
-  std::vector<int> results(kSize);
-  myQueue.submit([&](sycl::handler &cgh) {
-    sycl::buffer buffer_results(results);
-    sycl::accessor results_acc(buffer_results, sycl::write_only, sycl::no_init);
-    cgh.single_task([=]() {
-      int d1[kSize], d2[kSize];
-      task_sequence<arrayAdd,
-                    decltype(properties{balanced, invocation_capacity<kSize>})>
-          arrayAddTask;
-      for (int i = 0; i < kSize; i++) {
-        arrayAddTask.async(d1, d2, kSize);
-      }
-
-      for (int i = 0; i < kSize; i++) {
-        results_acc[i] = arrayAddTask.get();
-      }
-    });
-  });
-  myQueue.wait();
-  return 0;
+// CHECK: [[TASK_SEQUENCE:%.*]] ={{.*}} call spir_func target("spirv.TaskSequenceINTEL") @_Z31__spirv_TaskSequenceCreateINTEL{{.*}}(ptr{{.*}}@_Z8arrayAdd{{.*}}, i32 -1, i32 -1, i32 0, i32 128)
+// CHECK: call spir_func void @_Z30__spirv_TaskSequenceAsyncINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]], ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 128)
+// CHECK: call spir_func void @_Z32__spirv_TaskSequenceReleaseINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]])
+// CHECK-COUNT-1: call spir_func i32 @_Z28__spirv_TaskSequenceGetINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]])
+SYCL_EXTERNAL void task_sequence_intel_balanced(
+    sycl::accessor<
+        sycl::vec<int, 1>, 1, sycl::access::mode::write,
+        sycl::access::target::device, sycl::access::placeholder::false_t,
+        sycl::ext::oneapi::accessor_property_list<sycl::property::no_init>>
+        results_acc) {
+  int d1[kSize], d2[kSize];
+  task_sequence<arrayAdd,
+                decltype(properties{balanced, invocation_capacity<kSize>})>
+      arrayAddTask;
+  for (int i = 0; i < kSize; i++) {
+    arrayAddTask.async(d1, d2, kSize);
+  }
+  for (int i = 0; i < kSize; i++) {
+    results_acc[i] = arrayAddTask.get();
+  }
 }
\ No newline at end of file
diff --git a/sycl/test/check_device_code/task_sequence_intel_explicit_get.cpp b/sycl/test/check_device_code/task_sequence_intel_explicit_get.cpp
index 2064aa789f393..7d8f45334e99f 100644
--- a/sycl/test/check_device_code/task_sequence_intel_explicit_get.cpp
+++ b/sycl/test/check_device_code/task_sequence_intel_explicit_get.cpp
@@ -1,10 +1,5 @@
 // RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm -Xclang -no-enable-noundef-analysis %s -o - | FileCheck %s
 
-// CHECK: [[TASK_SEQUENCE:%.*]] ={{.*}} call spir_func target("spirv.TaskSequenceINTEL") @_Z31__spirv_TaskSequenceCreateINTEL{{.*}}(ptr{{.*}}@_Z8arrayAdd{{.*}}, i32 -1, i32 -1, i32 0, i32 128)
-// CHECK: call spir_func void @_Z30__spirv_TaskSequenceAsyncINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]], ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 128)
-// CHECK-COUNT-2: call spir_func i32 @_Z28__spirv_TaskSequenceGetINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]])
-// CHECK: call spir_func void @_Z32__spirv_TaskSequenceReleaseINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]])
-
 #include <sycl/sycl.hpp>
 
 using namespace sycl::ext::intel::experimental;
@@ -21,25 +16,25 @@ int arrayAdd(int *data1, int *data2, int N) {
   return ret;
 }
 
-int main() {
-  sycl::queue myQueue;
-  std::vector<int> results(kSize);
-  myQueue.submit([&](sycl::handler &cgh) {
-    sycl::buffer buffer_results(results);
-    sycl::accessor results_acc(buffer_results, sycl::write_only, sycl::no_init);
-    cgh.single_task([=]() {
-      int d1[kSize], d2[kSize];
-      task_sequence<arrayAdd, decltype(properties{invocation_capacity<kSize>})>
-          arrayAddTask;
-      for (int i = 0; i < kSize; i++) {
-        arrayAddTask.async(d1, d2, kSize);
-      }
+// CHECK: [[TASK_SEQUENCE:%.*]] ={{.*}} call spir_func target("spirv.TaskSequenceINTEL") @_Z31__spirv_TaskSequenceCreateINTEL{{.*}}(ptr{{.*}}@_Z8arrayAdd{{.*}}, i32 -1, i32 -1, i32 0, i32 128)
+// CHECK: call spir_func void @_Z30__spirv_TaskSequenceAsyncINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]], ptr addrspace(4) {{.*}}, ptr addrspace(4) {{.*}}, i32 128)
+// CHECK: call spir_func i32 @_Z28__spirv_TaskSequenceGetINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]])
+// CHECK: call spir_func void @_Z32__spirv_TaskSequenceReleaseINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]])
+// CHECK: call spir_func i32 @_Z28__spirv_TaskSequenceGetINTEL{{.*}}(target("spirv.TaskSequenceINTEL") [[TASK_SEQUENCE]])
+SYCL_EXTERNAL void task_sequence_intel_explicit_get(
+    sycl::accessor<
+        sycl::vec<int, 1>, 1, sycl::access::mode::write,
+        sycl::access::target::device, sycl::access::placeholder::false_t,
+        sycl::ext::oneapi::accessor_property_list<sycl::property::no_init>>
+        results_acc) {
+  int d1[kSize], d2[kSize];
+  task_sequence<arrayAdd, decltype(properties{invocation_capacity<kSize>})>
+      arrayAddTask;
+  for (int i = 0; i < kSize; i++) {
+    arrayAddTask.async(d1, d2, kSize);
+  }
 
-      for (int i = 0; i < kSize; i++) {
-        results_acc[i] = arrayAddTask.get();
-      }
-    });
-  });
-  myQueue.wait();
-  return 0;
+  for (int i = 0; i < kSize; i++) {
+    results_acc[i] = arrayAddTask.get();
+  }
 }
\ No newline at end of file
diff --git a/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp
new file mode 100644
index 0000000000000..6aea590b6155c
--- /dev/null
+++ b/sycl/test/check_device_code/vector/vector_bf16_builtins.cpp
@@ -0,0 +1,377 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// NOTE: ..., followed by some manual cleanup.
+
+// Had to increase inline threshold for this test to force inline of the vec<>
+// math builtins.
+// RUN: %clangxx -I %sycl_include -mllvm -inline-threshold=400 -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -Xclang -disable-lifetime-markers -O3 -fsycl-device-only %s -o - | FileCheck %s
+
+// This test checks the device code generated for vec<bfloat16> math builtins.
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/experimental/bfloat16_math.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::oneapi;
+using namespace sycl::ext::oneapi::experimental;
+
+// CHECK-LABEL: define dso_local spir_func void @_Z8TestFMinN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEES5_(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !srcloc [[META5:![0-9]+]] !sycl_fixed_targets [[META6:![0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[AGG_TMP111_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP10_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP13:%.*]] = alloca %"class.sycl::_V1::vec", align 8
+// CHECK-NEXT:    [[AGG_TMP2:%.*]] = alloca %"class.sycl::_V1::vec", align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !tbaa [[TBAA7:![0-9]+]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !tbaa [[TBAA7]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[AGG_TMP2]])
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[AGG_TMP13]])
+// CHECK-NEXT:    store i32 [[TMP1]], ptr [[AGG_TMP13]], align 1
+// CHECK-NEXT:    store i32 [[TMP0]], ptr [[AGG_TMP2]], align 1
+// CHECK-NEXT:    [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP2]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Y_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP13]] to ptr addrspace(4)
+// CHECK-NEXT:    [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP10_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Y_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP111_I]] to ptr addrspace(4)
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 2
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS0_3VECINS2_8BFLOAT16ELI2EEES7_LI2ELI2EEENST9ENABLE_IFIXAAAA24IS_VEC_OR_SWIZZLE_BF16_VIT_E24IS_VEC_OR_SWIZZLE_BF16_VIT0_EEQT1_T2_ENS5_IS6_XT1_EEEE4TYPEES9_SA__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
+// CHECK-NEXT:    [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7:[0-9]+]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10:![0-9]+]]
+// CHECK-NEXT:    [[CALL3_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) [[Y_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr addrspace(4) [[CALL3_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META12:![0-9]+]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META12]]
+// CHECK-NEXT:    store i16 [[TMP3]], ptr [[AGG_TMP111_I]], align 1, !noalias [[META12]]
+// CHECK-NEXT:    store i16 [[TMP2]], ptr [[AGG_TMP10_I]], align 1, !noalias [[META12]]
+// CHECK-NEXT:    [[CONV_I_I_I:%.*]] = zext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[AND_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I_I_I:%.*]] = icmp eq i32 [[AND_I_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I_I_I:%.*]] = icmp ne i32 [[AND2_I_I_I]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = and i1 [[CMP_I_I_I]], [[TOBOOL_I_I_I]]
+// CHECK-NEXT:    br i1 [[TMP4]], label [[LAND_LHS_TRUE_I_I:%.*]], label [[IF_END6_I_I:%.*]]
+// CHECK:       land.lhs.true.i.i:
+// CHECK-NEXT:    [[CONV_I25_I_I:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT:    [[AND_I26_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I27_I_I:%.*]] = icmp eq i32 [[AND_I26_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I28_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I29_I_I:%.*]] = icmp ne i32 [[AND2_I28_I_I]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = and i1 [[CMP_I27_I_I]], [[TOBOOL_I29_I_I]]
+// CHECK-NEXT:    [[SPEC_SELECT_I:%.*]] = select i1 [[TMP5]], i16 32704, i16 [[TMP3]]
+// CHECK-NEXT:    br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]]
+// CHECK:       if.end6.i.i:
+// CHECK-NEXT:    [[CONV_I39_I_I:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT:    [[AND_I40_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I41_I_I:%.*]] = icmp eq i32 [[AND_I40_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I42_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I43_I_I:%.*]] = icmp ne i32 [[AND2_I42_I_I]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = and i1 [[CMP_I41_I_I]], [[TOBOOL_I43_I_I]]
+// CHECK-NEXT:    br i1 [[TMP6]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END10_I_I:%.*]]
+// CHECK:       if.end10.i.i:
+// CHECK-NEXT:    [[OR_I_I:%.*]] = or i32 [[CONV_I_I_I]], [[CONV_I39_I_I]]
+// CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp eq i32 [[OR_I_I]], 32768
+// CHECK-NEXT:    [[AND_I_I:%.*]] = and i32 [[CONV_I_I_I]], [[CONV_I39_I_I]]
+// CHECK-NEXT:    [[TOBOOL_NOT_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 0
+// CHECK-NEXT:    [[OR_COND_I_I:%.*]] = and i1 [[CMP_I_I]], [[TOBOOL_NOT_I_I]]
+// CHECK-NEXT:    br i1 [[OR_COND_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END18_I_I:%.*]]
+// CHECK:       if.end18.i.i:
+// CHECK-NEXT:    [[CALL_I_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8:[0-9]+]], !noalias [[META15:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I2_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Y_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META15]]
+// CHECK-NEXT:    [[CMP_I44_I_I:%.*]] = fcmp olt float [[CALL_I_I_I_I_I]], [[CALL_I_I2_I_I_I]]
+// CHECK-NEXT:    [[X_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP10_I]], align 2, !noalias [[META18:![0-9]+]]
+// CHECK-NEXT:    [[Y_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP111_I]], align 2, !noalias [[META18]]
+// CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[CMP_I44_I_I]], i16 [[X_ASCAST_VAL_I_I]], i16 [[Y_ASCAST_VAL_I_I]]
+// CHECK-NEXT:    br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMININS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental4fminINS2_8bfloat16EEENSt9enable_ifIXsr3stdE9is_same_vIT_S5_EES7_E4typeES7_S7_.exit.i:
+// CHECK-NEXT:    [[REF_TMP_SROA_0_0_I:%.*]] = phi i16 [ [[TMP7]], [[IF_END18_I_I]] ], [ [[TMP2]], [[IF_END6_I_I]] ], [ -32768, [[IF_END10_I_I]] ], [ [[SPEC_SELECT_I]], [[LAND_LHS_TRUE_I_I]] ]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META12]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META12]]
+// CHECK-NEXT:    [[CALL5_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi2EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 4 dereferenceable_or_null(4) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    store i16 [[REF_TMP_SROA_0_0_I]], ptr addrspace(4) [[CALL5_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP19:![0-9]+]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental4fminINS0_3vecINS2_8bfloat16ELi2EEES7_Li2ELi2EEENSt9enable_ifIXaaaa24is_vec_or_swizzle_bf16_vIT_E24is_vec_or_swizzle_bf16_vIT0_EeqT1_T2_ENS5_IS6_XT1_EEEE4typeES9_SA_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[AGG_TMP2]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[AGG_TMP13]])
+// CHECK-NEXT:    ret void
+//
+SYCL_EXTERNAL auto TestFMin(vec<bfloat16, 2> a, vec<bfloat16, 2> b) {
+  return experimental::fmin(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z8TestFMaxN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEES5_(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.0") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META22:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[AGG_TMP111_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP10_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP13:%.*]] = alloca %"class.sycl::_V1::vec.0", align 8
+// CHECK-NEXT:    [[AGG_TMP2:%.*]] = alloca %"class.sycl::_V1::vec.0", align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A]], align 8, !tbaa [[TBAA7]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i64, ptr [[B]], align 8, !tbaa [[TBAA7]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP2]])
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP13]])
+// CHECK-NEXT:    store i64 [[TMP1]], ptr [[AGG_TMP13]], align 1
+// CHECK-NEXT:    store i64 [[TMP0]], ptr [[AGG_TMP2]], align 1
+// CHECK-NEXT:    [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP2]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Y_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP13]] to ptr addrspace(4)
+// CHECK-NEXT:    [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP10_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Y_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP111_I]] to ptr addrspace(4)
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 3
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I:%.*]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS0_3VECINS2_8BFLOAT16ELI3EEES7_LI3ELI3EEENST9ENABLE_IFIXAAAA24IS_VEC_OR_SWIZZLE_BF16_VIT_E24IS_VEC_OR_SWIZZLE_BF16_VIT0_EEQT1_T2_ENS5_IS6_XT1_EEEE4TYPEES9_SA__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
+// CHECK-NEXT:    [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[CALL3_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[Y_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP3:%.*]] = load i16, ptr addrspace(4) [[CALL3_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META23:![0-9]+]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META23]]
+// CHECK-NEXT:    store i16 [[TMP3]], ptr [[AGG_TMP111_I]], align 1, !noalias [[META23]]
+// CHECK-NEXT:    store i16 [[TMP2]], ptr [[AGG_TMP10_I]], align 1, !noalias [[META23]]
+// CHECK-NEXT:    [[CONV_I_I_I:%.*]] = zext i16 [[TMP2]] to i32
+// CHECK-NEXT:    [[AND_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I_I_I:%.*]] = icmp eq i32 [[AND_I_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I_I_I:%.*]] = icmp ne i32 [[AND2_I_I_I]], 0
+// CHECK-NEXT:    [[TMP4:%.*]] = and i1 [[CMP_I_I_I]], [[TOBOOL_I_I_I]]
+// CHECK-NEXT:    br i1 [[TMP4]], label [[LAND_LHS_TRUE_I_I:%.*]], label [[IF_END6_I_I:%.*]]
+// CHECK:       land.lhs.true.i.i:
+// CHECK-NEXT:    [[CONV_I25_I_I:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT:    [[AND_I26_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I27_I_I:%.*]] = icmp eq i32 [[AND_I26_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I28_I_I:%.*]] = and i32 [[CONV_I25_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I29_I_I:%.*]] = icmp ne i32 [[AND2_I28_I_I]], 0
+// CHECK-NEXT:    [[TMP5:%.*]] = and i1 [[CMP_I27_I_I]], [[TOBOOL_I29_I_I]]
+// CHECK-NEXT:    [[SPEC_SELECT_I:%.*]] = select i1 [[TMP5]], i16 32704, i16 [[TMP3]]
+// CHECK-NEXT:    br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]]
+// CHECK:       if.end6.i.i:
+// CHECK-NEXT:    [[CONV_I39_I_I:%.*]] = zext i16 [[TMP3]] to i32
+// CHECK-NEXT:    [[AND_I40_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I41_I_I:%.*]] = icmp eq i32 [[AND_I40_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I42_I_I:%.*]] = and i32 [[CONV_I39_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I43_I_I:%.*]] = icmp ne i32 [[AND2_I42_I_I]], 0
+// CHECK-NEXT:    [[TMP6:%.*]] = and i1 [[CMP_I41_I_I]], [[TOBOOL_I43_I_I]]
+// CHECK-NEXT:    br i1 [[TMP6]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END10_I_I:%.*]]
+// CHECK:       if.end10.i.i:
+// CHECK-NEXT:    [[OR_I_I:%.*]] = or i32 [[CONV_I_I_I]], [[CONV_I39_I_I]]
+// CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp eq i32 [[OR_I_I]], 32768
+// CHECK-NEXT:    [[AND_I_I:%.*]] = and i32 [[CONV_I_I_I]], [[CONV_I39_I_I]]
+// CHECK-NEXT:    [[TOBOOL_NOT_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 0
+// CHECK-NEXT:    [[OR_COND_I_I:%.*]] = and i1 [[CMP_I_I]], [[TOBOOL_NOT_I_I]]
+// CHECK-NEXT:    br i1 [[OR_COND_I_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]], label [[IF_END18_I_I:%.*]]
+// CHECK:       if.end18.i.i:
+// CHECK-NEXT:    [[CALL_I_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META26:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I2_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Y_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META26]]
+// CHECK-NEXT:    [[CMP_I44_I_I:%.*]] = fcmp ogt float [[CALL_I_I_I_I_I]], [[CALL_I_I2_I_I_I]]
+// CHECK-NEXT:    [[X_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP10_I]], align 2, !noalias [[META29:![0-9]+]]
+// CHECK-NEXT:    [[Y_ASCAST_VAL_I_I:%.*]] = load i16, ptr [[AGG_TMP111_I]], align 2, !noalias [[META29]]
+// CHECK-NEXT:    [[TMP7:%.*]] = select i1 [[CMP_I44_I_I]], i16 [[X_ASCAST_VAL_I_I]], i16 [[Y_ASCAST_VAL_I_I]]
+// CHECK-NEXT:    br label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FMAXINS2_8BFLOAT16EEENST9ENABLE_IFIXSR3STDE9IS_SAME_VIT_S5_EES7_E4TYPEES7_S7__EXIT_I]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental4fmaxINS2_8bfloat16EEENSt9enable_ifIXsr3stdE9is_same_vIT_S5_EES7_E4typeES7_S7_.exit.i:
+// CHECK-NEXT:    [[REF_TMP_SROA_0_0_I:%.*]] = phi i16 [ [[TMP7]], [[IF_END18_I_I]] ], [ [[TMP2]], [[IF_END6_I_I]] ], [ 0, [[IF_END10_I_I]] ], [ [[SPEC_SELECT_I]], [[LAND_LHS_TRUE_I_I]] ]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP10_I]]), !noalias [[META23]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP111_I]]), !noalias [[META23]]
+// CHECK-NEXT:    [[CALL5_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    store i16 [[REF_TMP_SROA_0_0_I]], ptr addrspace(4) [[CALL5_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP30:![0-9]+]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental4fmaxINS0_3vecINS2_8bfloat16ELi3EEES7_Li3ELi3EEENSt9enable_ifIXaaaa24is_vec_or_swizzle_bf16_vIT_E24is_vec_or_swizzle_bf16_vIT0_EeqT1_T2_ENS5_IS6_XT1_EEEE4typeES9_SA_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP2]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP13]])
+// CHECK-NEXT:    ret void
+//
+SYCL_EXTERNAL auto TestFMax(vec<bfloat16, 3> a, vec<bfloat16, 3> b) {
+  return experimental::fmax(a, b);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z9TestIsNanN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi4EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.1") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.2") align 8 [[A:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] !srcloc [[META31:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::vec.2", align 8
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr [[A]], align 8, !tbaa [[TBAA7]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]])
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[AGG_TMP1]])
+// CHECK-NEXT:    store i64 [[TMP0]], ptr [[AGG_TMP1]], align 1
+// CHECK-NEXT:    [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP1]] to ptr addrspace(4)
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 4
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL5ISNANINS0_3VECINS2_8BFLOAT16ELI4EEELI4EEENST9ENABLE_IFIX24IS_VEC_OR_SWIZZLE_BF16_VIT_EENS5_ISXT0_EEEE4TYPEES9__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
+// CHECK-NEXT:    [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi4EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 8 dereferenceable_or_null(8) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]], !noalias [[META32]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]], !noalias [[META32]]
+// CHECK-NEXT:    [[CONV_I_I:%.*]] = zext i16 [[TMP1]] to i32
+// CHECK-NEXT:    [[AND_I_I:%.*]] = and i32 [[CONV_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp eq i32 [[AND_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I_I:%.*]] = and i32 [[CONV_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I_I:%.*]] = icmp ne i32 [[AND2_I_I]], 0
+// CHECK-NEXT:    [[TMP2:%.*]] = and i1 [[CMP_I_I]], [[TOBOOL_I_I]]
+// CHECK-NEXT:    [[CONV2_I:%.*]] = sext i1 [[TMP2]] to i16
+// CHECK-NEXT:    [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds i16, ptr addrspace(4) [[AGG_RESULT]], i64 [[I_0_I]]
+// CHECK-NEXT:    store i16 [[CONV2_I]], ptr addrspace(4) [[ARRAYIDX_I_I]], align 2, !tbaa [[TBAA10]], !alias.scope [[META32]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP35:![0-9]+]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental5isnanINS0_3vecINS2_8bfloat16ELi4EEELi4EEENSt9enable_ifIX24is_vec_or_swizzle_bf16_vIT_EENS5_IsXT0_EEEE4typeES9_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[AGG_TMP1]])
+// CHECK-NEXT:    ret void
+//
+SYCL_EXTERNAL auto TestIsNan(vec<bfloat16, 4> a) {
+  return experimental::isnan(a);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z8TestFabsN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.3") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR3]] !srcloc [[META36:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::vec.3", align 16
+// CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load <8 x i16>, ptr [[A]], align 16, !tbaa [[TBAA7]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[AGG_TMP1]])
+// CHECK-NEXT:    store <8 x i16> [[AGG_TMP_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP1]], align 1
+// CHECK-NEXT:    [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP1]] to ptr addrspace(4)
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 8
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4FABSINS0_3VECINS2_8BFLOAT16ELI8EEELI8EEENST9ENABLE_IFIX24IS_VEC_OR_SWIZZLE_BF16_VIT_EENS5_IS6_XT0_EEEE4TYPEES9__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
+// CHECK-NEXT:    [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[CONV_I_I_I:%.*]] = zext i16 [[TMP0]] to i32
+// CHECK-NEXT:    [[AND_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 32640
+// CHECK-NEXT:    [[CMP_I_I_I:%.*]] = icmp eq i32 [[AND_I_I_I]], 32640
+// CHECK-NEXT:    [[AND2_I_I_I:%.*]] = and i32 [[CONV_I_I_I]], 127
+// CHECK-NEXT:    [[TOBOOL_I_I_I:%.*]] = icmp ne i32 [[AND2_I_I_I]], 0
+// CHECK-NEXT:    [[TMP1:%.*]] = and i1 [[CMP_I_I_I]], [[TOBOOL_I_I_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[TMP0]], 32767
+// CHECK-NEXT:    [[SPEC_SELECT_I_I:%.*]] = select i1 [[TMP1]], i16 [[TMP0]], i16 [[TMP2]]
+// CHECK-NEXT:    [[CALL2_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    store i16 [[SPEC_SELECT_I_I]], ptr addrspace(4) [[CALL2_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP37:![0-9]+]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental4fabsINS0_3vecINS2_8bfloat16ELi8EEELi8EEENSt9enable_ifIX24is_vec_or_swizzle_bf16_vIT_EENS5_IS6_XT0_EEEE4typeES9_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[AGG_TMP1]])
+// CHECK-NEXT:    ret void
+//
+SYCL_EXTERNAL auto TestFabs(vec<bfloat16, 8> a) {
+  return experimental::fabs(a);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z8TestCeilN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.3") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META38:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[REF_TMP_I_I:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[AGG_TMP6_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP1:%.*]] = alloca %"class.sycl::_V1::vec.3", align 16
+// CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load <8 x i16>, ptr [[A]], align 16, !tbaa [[TBAA7]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[AGG_TMP1]])
+// CHECK-NEXT:    store <8 x i16> [[AGG_TMP_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP1]], align 1
+// CHECK-NEXT:    [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP1]] to ptr addrspace(4)
+// CHECK-NEXT:    [[REF_TMP_ASCAST_I_I:%.*]] = addrspacecast ptr [[REF_TMP_I_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP6_I]] to ptr addrspace(4)
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 8
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL4CEILINS0_3VECINS2_8BFLOAT16ELI8EEELI8EEENST9ENABLE_IFIX24IS_VEC_OR_SWIZZLE_BF16_VIT_EENS5_IS6_XT0_EEEE4TYPEES9__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
+// CHECK-NEXT:    [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP6_I]]), !noalias [[META39:![0-9]+]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META39]]
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[AGG_TMP6_I]], align 1, !noalias [[META39]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META42:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = call spir_func noundef float @_Z16__spirv_ocl_ceilf(float noundef [[CALL_I_I_I_I]]) #[[ATTR9:[0-9]+]]
+// CHECK-NEXT:    store float [[CALL_I_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA45:![0-9]+]], !noalias [[META47:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I2_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META42]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP6_I]]), !noalias [[META39]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META39]]
+// CHECK-NEXT:    [[CALL2_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi8EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 16 dereferenceable_or_null(16) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    store i16 [[CALL_I_I2_I_I]], ptr addrspace(4) [[CALL2_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP48:![0-9]+]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental4ceilINS0_3vecINS2_8bfloat16ELi8EEELi8EEENSt9enable_ifIX24is_vec_or_swizzle_bf16_vIT_EENS5_IS6_XT0_EEEE4typeES9_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[AGG_TMP1]])
+// CHECK-NEXT:    ret void
+// Returns experimental::ceil(a) for an 8-element bfloat16 vec.
+SYCL_EXTERNAL auto TestCeil(vec<bfloat16, 8> a) {
+  return experimental::ceil(a);
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z7TestFMAN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEES5_S5_(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias writable sret(%"class.sycl::_V1::vec.4") align 32 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 32 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 32 [[B:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 32 [[C:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META49:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[REF_TMP_I_I:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[AGG_TMP416_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP115_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP14_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 8
+// CHECK-NEXT:    [[AGG_TMP25:%.*]] = alloca %"class.sycl::_V1::vec.4", align 32
+// CHECK-NEXT:    [[AGG_TMP14:%.*]] = alloca %"class.sycl::_V1::vec.4", align 32
+// CHECK-NEXT:    [[AGG_TMP3:%.*]] = alloca %"class.sycl::_V1::vec.4", align 32
+// CHECK-NEXT:    [[AGG_TMP_SROA_0_0_COPYLOAD:%.*]] = load <16 x i16>, ptr [[A]], align 32, !tbaa [[TBAA7]]
+// CHECK-NEXT:    [[AGG_TMP1_SROA_0_0_COPYLOAD:%.*]] = load <16 x i16>, ptr [[B]], align 32, !tbaa [[TBAA7]]
+// CHECK-NEXT:    [[AGG_TMP2_SROA_0_0_COPYLOAD:%.*]] = load <16 x i16>, ptr [[C]], align 32, !tbaa [[TBAA7]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[AGG_TMP3]])
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[AGG_TMP14]])
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr nonnull [[AGG_TMP25]])
+// CHECK-NEXT:    store <16 x i16> [[AGG_TMP2_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP25]], align 1
+// CHECK-NEXT:    store <16 x i16> [[AGG_TMP1_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP14]], align 1
+// CHECK-NEXT:    store <16 x i16> [[AGG_TMP_SROA_0_0_COPYLOAD]], ptr [[AGG_TMP3]], align 1
+// CHECK-NEXT:    [[X_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP3]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Y_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP14]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Z_ASCAST_I:%.*]] = addrspacecast ptr [[AGG_TMP25]] to ptr addrspace(4)
+// CHECK-NEXT:    [[REF_TMP_ASCAST_I_I:%.*]] = addrspacecast ptr [[REF_TMP_I_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[X_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP14_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Y_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP115_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[Z_ASCAST_I_I:%.*]] = addrspacecast ptr [[AGG_TMP416_I]] to ptr addrspace(4)
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 16
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V13EXT6ONEAPI12EXPERIMENTAL3FMAINS0_3VECINS2_8BFLOAT16ELI16EEES7_S7_LI16ELI16ELI16EEENST9ENABLE_IFIXAAAAAAAA24IS_VEC_OR_SWIZZLE_BF16_VIT_E24IS_VEC_OR_SWIZZLE_BF16_VIT0_E24IS_VEC_OR_SWIZZLE_BF16_VIT1_EEQT2_T3_EQT3_T4_ENS5_IS6_XT2_EEEE4TYPEES9_SA_SB__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
+// CHECK-NEXT:    [[CALL_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[X_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr addrspace(4) [[CALL_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[CALL3_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[Y_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr addrspace(4) [[CALL3_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[CALL6_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[Z_ASCAST_I]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr addrspace(4) [[CALL6_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP14_I]]), !noalias [[META50:![0-9]+]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP115_I]]), !noalias [[META50]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[AGG_TMP416_I]]), !noalias [[META50]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META50]]
+// CHECK-NEXT:    store i16 [[TMP2]], ptr [[AGG_TMP416_I]], align 1, !noalias [[META50]]
+// CHECK-NEXT:    store i16 [[TMP1]], ptr [[AGG_TMP115_I]], align 1, !noalias [[META50]]
+// CHECK-NEXT:    store i16 [[TMP0]], ptr [[AGG_TMP14_I]], align 1, !noalias [[META50]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[X_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I4_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Y_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53]]
+// CHECK-NEXT:    [[CALL_I_I5_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[Z_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53]]
+// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = call spir_func noundef float @_Z15__spirv_ocl_fmafff(float noundef [[CALL_I_I_I_I]], float noundef [[CALL_I_I4_I_I]], float noundef [[CALL_I_I5_I_I]]) #[[ATTR9]]
+// CHECK-NEXT:    store float [[CALL_I_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA45]], !noalias [[META56:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I6_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META53]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP14_I]]), !noalias [[META50]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP115_I]]), !noalias [[META50]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[AGG_TMP416_I]]), !noalias [[META50]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META50]]
+// CHECK-NEXT:    [[CALL8_I:%.*]] = call spir_func noundef align 2 dereferenceable(2) ptr addrspace(4) @_ZN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEixIS4_EENSt9enable_ifIXsr3stdE9is_same_vIT_S4_EERS4_E4typeEi(ptr addrspace(4) noundef align 32 dereferenceable_or_null(32) [[AGG_RESULT]], i32 noundef [[CONV_I]]) #[[ATTR7]]
+// CHECK-NEXT:    store i16 [[CALL_I_I6_I_I]], ptr addrspace(4) [[CALL8_I]], align 2, !tbaa [[TBAA10]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP57:![0-9]+]]
+// CHECK:       _ZN4sycl3_V13ext6oneapi12experimental3fmaINS0_3vecINS2_8bfloat16ELi16EEES7_S7_Li16ELi16ELi16EEENSt9enable_ifIXaaaaaaaa24is_vec_or_swizzle_bf16_vIT_E24is_vec_or_swizzle_bf16_vIT0_E24is_vec_or_swizzle_bf16_vIT1_EeqT2_T3_eqT3_T4_ENS5_IS6_XT2_EEEE4typeES9_SA_SB_.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[AGG_TMP3]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[AGG_TMP14]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[AGG_TMP25]])
+// CHECK-NEXT:    ret void
+// Returns experimental::fma(a, b, c) for 16-element bfloat16 vecs.
+SYCL_EXTERNAL auto TestFMA(vec<bfloat16, 16> a, vec<bfloat16, 16> b,
+                           vec<bfloat16, 16> c) {
+  return experimental::fma(a, b, c);
+}
diff --git a/sycl/test/check_device_code/vector/vector_convert_bfloat.cpp b/sycl/test/check_device_code/vector/vector_convert_bfloat.cpp
new file mode 100644
index 0000000000000..bf4973f32af39
--- /dev/null
+++ b/sycl/test/check_device_code/vector/vector_convert_bfloat.cpp
@@ -0,0 +1,227 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals none --version 4
+// NOTE: ..., followed by some manual cleanup.
+
+// RUN: %clangxx -I %sycl_include -fpreview-breaking-changes -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -Xclang -disable-lifetime-markers %s -fsycl-device-only -o - | FileCheck %s
+// REQUIRES: linux
+
+#include <sycl/detail/core.hpp>
+#include <sycl/ext/oneapi/bfloat16.hpp>
+
+using namespace sycl;
+using bfloat16 = sycl::ext::oneapi::bfloat16;
+
+// CHECK-LABEL: define dso_local spir_func void @_Z18TestBFtoFDeviceRNERN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec") align 16 [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 8 dereferenceable(8) [[INP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !srcloc [[META6:![0-9]+]] !sycl_fixed_targets [[META7:![0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VEC_ADDR_I_I_I_I:%.*]] = alloca <3 x i16>, align 8
+// CHECK-NEXT:    [[DST_I_I_I_I:%.*]] = alloca [4 x float], align 4
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]])
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x i16>, ptr addrspace(4) [[INP]], align 8, !noalias [[META8]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VEC_ADDR_I_I_I_I]]), !noalias [[META8]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[DST_I_I_I_I]]), !noalias [[META8]]
+// CHECK-NEXT:    [[VEC_ADDR_ASCAST_I_I_I_I:%.*]] = addrspacecast ptr [[VEC_ADDR_I_I_I_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[DST_ASCAST_I_I_I_I:%.*]] = addrspacecast ptr [[DST_I_I_I_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[EXTRACTVEC_I_I_I_I:%.*]] = shufflevector <4 x i16> [[LOADVEC4_I_I]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i16> [[EXTRACTVEC_I_I_I_I]], ptr [[VEC_ADDR_I_I_I_I]], align 8, !tbaa [[TBAA11:![0-9]+]], !noalias [[META8]]
+// CHECK-NEXT:    call spir_func void @__devicelib_ConvertBF16ToFINTELVec3(ptr addrspace(4) noundef [[VEC_ADDR_ASCAST_I_I_I_I]], ptr addrspace(4) noundef [[DST_ASCAST_I_I_I_I]]) #[[ATTR4:[0-9]+]], !noalias [[META8]]
+// CHECK-NEXT:    [[LOADVEC4_I_I_I_I_I:%.*]] = load <4 x float>, ptr [[DST_I_I_I_I]], align 4, !noalias [[META8]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VEC_ADDR_I_I_I_I]]), !noalias [[META8]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[DST_I_I_I_I]]), !noalias [[META8]]
+// CHECK-NEXT:    [[EXTRACTVEC5_I:%.*]] = shufflevector <4 x float> [[LOADVEC4_I_I_I_I_I]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x float> [[EXTRACTVEC5_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !alias.scope [[META8]]
+// CHECK-NEXT:    ret void
+// Converts vec<bfloat16, 3> to float; no rounding mode is passed to convert.
+SYCL_EXTERNAL auto TestBFtoFDeviceRNE(vec<bfloat16, 3> &inp) {
+  return inp.template convert<float>();
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z17TestBFtoFDeviceRZRN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec") align 16 [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 8 dereferenceable(8) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META14:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VEC_ADDR_I_I_I_I:%.*]] = alloca <3 x i16>, align 8
+// CHECK-NEXT:    [[DST_I_I_I_I:%.*]] = alloca [4 x float], align 4
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x i16>, ptr addrspace(4) [[INP]], align 8, !noalias [[META15]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[VEC_ADDR_I_I_I_I]]), !noalias [[META15]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[DST_I_I_I_I]]), !noalias [[META15]]
+// CHECK-NEXT:    [[VEC_ADDR_ASCAST_I_I_I_I:%.*]] = addrspacecast ptr [[VEC_ADDR_I_I_I_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[DST_ASCAST_I_I_I_I:%.*]] = addrspacecast ptr [[DST_I_I_I_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[EXTRACTVEC_I_I_I_I:%.*]] = shufflevector <4 x i16> [[LOADVEC4_I_I]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i16> [[EXTRACTVEC_I_I_I_I]], ptr [[VEC_ADDR_I_I_I_I]], align 8, !tbaa [[TBAA11]], !noalias [[META15]]
+// CHECK-NEXT:    call spir_func void @__devicelib_ConvertBF16ToFINTELVec3(ptr addrspace(4) noundef [[VEC_ADDR_ASCAST_I_I_I_I]], ptr addrspace(4) noundef [[DST_ASCAST_I_I_I_I]]) #[[ATTR4]], !noalias [[META15]]
+// CHECK-NEXT:    [[LOADVEC4_I_I_I_I_I:%.*]] = load <4 x float>, ptr [[DST_I_I_I_I]], align 4, !noalias [[META15]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[VEC_ADDR_I_I_I_I]]), !noalias [[META15]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[DST_I_I_I_I]]), !noalias [[META15]]
+// CHECK-NEXT:    [[EXTRACTVEC5_I:%.*]] = shufflevector <4 x float> [[LOADVEC4_I_I_I_I_I]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x float> [[EXTRACTVEC5_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !alias.scope [[META15]]
+// CHECK-NEXT:    ret void
+// Converts vec<bfloat16, 3> to float with rounding_mode::rtz.
+SYCL_EXTERNAL auto TestBFtoFDeviceRZ(vec<bfloat16, 3> &inp) {
+  return inp.template convert<float, sycl::rounding_mode::rtz>();
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z19TestBFtointDeviceRZRN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.4") align 16 [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 8 dereferenceable(8) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META18:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META19:![0-9]+]])
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x i16>, ptr addrspace(4) [[INP]], align 8, !noalias [[META19]]
+// CHECK-NEXT:    [[EXTRACTVEC_I_I:%.*]] = shufflevector <4 x i16> [[LOADVEC4_I_I]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    br label [[FOR_COND_I_I_I:%.*]]
+// CHECK:       for.cond.i.i.i:
+// CHECK-NEXT:    [[RETVAL1_SROA_0_0_I_I_I:%.*]] = phi <4 x i32> [ undef, [[ENTRY:%.*]] ], [ [[RETVAL1_SROA_0_0_VECBLEND_I_I_I:%.*]], [[FOR_BODY_I_I_I:%.*]] ]
+// CHECK-NEXT:    [[I_0_I_I_I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I_I_I:%.*]], [[FOR_BODY_I_I_I]] ]
+// CHECK-NEXT:    [[CMP_I_I_I:%.*]] = icmp ult i32 [[I_0_I_I_I]], 3
+// CHECK-NEXT:    br i1 [[CMP_I_I_I]], label [[FOR_BODY_I_I_I]], label [[_ZNK4SYCL3_V13VECINS0_3EXT6ONEAPI8BFLOAT16ELI3EE7CONVERTIILNS_13ROUNDING_MODEE2EEENS1_IT_LI3EEEV_EXIT:%.*]]
+// CHECK:       for.body.i.i.i:
+// CHECK-NEXT:    [[VECEXT_I_I_I:%.*]] = extractelement <3 x i16> [[EXTRACTVEC_I_I]], i32 [[I_0_I_I_I]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef i32 @__imf_bfloat162int_rz(i16 noundef zeroext [[VECEXT_I_I_I]]) #[[ATTR4]], !noalias [[META19]]
+// CHECK-NEXT:    [[RETVAL1_SROA_0_0_VEC_EXTRACT_I_I_I:%.*]] = shufflevector <4 x i32> [[RETVAL1_SROA_0_0_I_I_I]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[VECINS_I_I_I:%.*]] = insertelement <3 x i32> [[RETVAL1_SROA_0_0_VEC_EXTRACT_I_I_I]], i32 [[CALL_I_I_I_I]], i32 [[I_0_I_I_I]]
+// CHECK-NEXT:    [[RETVAL1_SROA_0_0_VEC_EXPAND_I_I_I:%.*]] = shufflevector <3 x i32> [[VECINS_I_I_I]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[RETVAL1_SROA_0_0_VECBLEND_I_I_I]] = shufflevector <4 x i32> [[RETVAL1_SROA_0_0_VEC_EXPAND_I_I_I]], <4 x i32> [[RETVAL1_SROA_0_0_I_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+// CHECK-NEXT:    [[INC_I_I_I]] = add nuw nsw i32 [[I_0_I_I_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I_I_I]], !llvm.loop [[LOOP22:![0-9]+]]
+// CHECK:       _ZNK4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EE7convertIiLNS_13rounding_modeE2EEENS1_IT_Li3EEEv.exit:
+// CHECK-NEXT:    [[EXTRACTVEC5_I:%.*]] = shufflevector <4 x i32> [[RETVAL1_SROA_0_0_I_I_I]], <4 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i32> [[EXTRACTVEC5_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !alias.scope [[META19]]
+// CHECK-NEXT:    ret void
+// Converts vec<bfloat16, 3> to int with rounding_mode::rtz.
+SYCL_EXTERNAL auto TestBFtointDeviceRZ(vec<bfloat16, 3> &inp) {
+  return inp.template convert<int, sycl::rounding_mode::rtz>();
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z20TestBFtointDeviceRNERN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi1EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.8") align 4 [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 2 dereferenceable(2) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META24:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META25:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr addrspace(4) [[INP]], align 2, !tbaa [[TBAA11]], !noalias [[META25]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef i32 @__imf_bfloat162int_rn(i16 noundef zeroext [[TMP0]]) #[[ATTR4]], !noalias [[META25]]
+// CHECK-NEXT:    store i32 [[CALL_I_I_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !alias.scope [[META25]]
+// CHECK-NEXT:    ret void
+// Converts vec<bfloat16, 1> to int; no rounding mode is passed to convert.
+SYCL_EXTERNAL auto TestBFtointDeviceRNE(vec<bfloat16, 1> &inp) {
+  return inp.template convert<int>();
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z18TestFtoBFDeviceRNERN4sycl3_V13vecIfLi3EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.0") align 8 [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 16 dereferenceable(16) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META28:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[VEC_ADDR_I_I_I_I:%.*]] = alloca <3 x float>, align 16
+// CHECK-NEXT:    [[DST_I_I_I_I:%.*]] = alloca [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], align 2
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META29:![0-9]+]])
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x float>, ptr addrspace(4) [[INP]], align 16, !noalias [[META29]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr nonnull [[VEC_ADDR_I_I_I_I]]), !noalias [[META29]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull [[DST_I_I_I_I]]), !noalias [[META29]]
+// CHECK-NEXT:    [[VEC_ADDR_ASCAST_I_I_I_I:%.*]] = addrspacecast ptr [[VEC_ADDR_I_I_I_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[DST_ASCAST_I_I_I_I:%.*]] = addrspacecast ptr [[DST_I_I_I_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[EXTRACTVEC_I_I_I_I:%.*]] = shufflevector <4 x float> [[LOADVEC4_I_I]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x float> [[EXTRACTVEC_I_I_I_I]], ptr [[VEC_ADDR_I_I_I_I]], align 16, !tbaa [[TBAA11]], !noalias [[META29]]
+// CHECK-NEXT:    call spir_func void @__devicelib_ConvertFToBF16INTELVec3(ptr addrspace(4) noundef [[VEC_ADDR_ASCAST_I_I_I_I]], ptr addrspace(4) noundef [[DST_ASCAST_I_I_I_I]]) #[[ATTR4]], !noalias [[META29]]
+// CHECK-NEXT:    [[LOADVEC4_I_I_I_I_I:%.*]] = load <4 x i16>, ptr [[DST_I_I_I_I]], align 2, !noalias [[META29]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr nonnull [[VEC_ADDR_I_I_I_I]]), !noalias [[META29]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[DST_I_I_I_I]]), !noalias [[META29]]
+// CHECK-NEXT:    [[EXTRACTVEC5_I:%.*]] = shufflevector <4 x i16> [[LOADVEC4_I_I_I_I_I]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i16> [[EXTRACTVEC5_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META29]]
+// CHECK-NEXT:    ret void
+// Converts vec<float, 3> to bfloat16; no rounding mode is passed to convert.
+SYCL_EXTERNAL auto TestFtoBFDeviceRNE(vec<float, 3> &inp) {
+  return inp.template convert<bfloat16>();
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z17TestFtoBFDeviceRZRN4sycl3_V13vecIfLi3EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.0") align 8 [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 16 dereferenceable(16) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META32:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META33:![0-9]+]])
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x float>, ptr addrspace(4) [[INP]], align 16, !noalias [[META33]]
+// CHECK-NEXT:    [[EXTRACTVEC_I_I:%.*]] = shufflevector <4 x float> [[LOADVEC4_I_I]], <4 x float> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    br label [[FOR_COND_I_I_I:%.*]]
+// CHECK:       for.cond.i.i.i:
+// CHECK-NEXT:    [[RETVAL1_SROA_0_0_I_I_I:%.*]] = phi <4 x i16> [ undef, [[ENTRY:%.*]] ], [ [[RETVAL1_SROA_0_0_VECBLEND_I_I_I:%.*]], [[FOR_BODY_I_I_I:%.*]] ]
+// CHECK-NEXT:    [[I_0_I_I_I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I_I_I:%.*]], [[FOR_BODY_I_I_I]] ]
+// CHECK-NEXT:    [[CMP_I_I_I:%.*]] = icmp ult i32 [[I_0_I_I_I]], 3
+// CHECK-NEXT:    br i1 [[CMP_I_I_I]], label [[FOR_BODY_I_I_I]], label [[_ZNK4SYCL3_V13VECIFLI3EE7CONVERTINS0_3EXT6ONEAPI8BFLOAT16ELNS_13ROUNDING_MODEE2EEENS1_IT_LI3EEEV_EXIT:%.*]]
+// CHECK:       for.body.i.i.i:
+// CHECK-NEXT:    [[VECEXT_I_I_I:%.*]] = extractelement <3 x float> [[EXTRACTVEC_I_I]], i32 [[I_0_I_I_I]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef zeroext i16 @__imf_float2bfloat16_rz(float noundef [[VECEXT_I_I_I]]) #[[ATTR4]], !noalias [[META33]]
+// CHECK-NEXT:    [[RETVAL1_SROA_0_0_VEC_EXTRACT_I_I_I:%.*]] = shufflevector <4 x i16> [[RETVAL1_SROA_0_0_I_I_I]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[VECINS_I_I_I:%.*]] = insertelement <3 x i16> [[RETVAL1_SROA_0_0_VEC_EXTRACT_I_I_I]], i16 [[CALL_I_I_I_I]], i32 [[I_0_I_I_I]]
+// CHECK-NEXT:    [[RETVAL1_SROA_0_0_VEC_EXPAND_I_I_I:%.*]] = shufflevector <3 x i16> [[VECINS_I_I_I]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[RETVAL1_SROA_0_0_VECBLEND_I_I_I]] = shufflevector <4 x i16> [[RETVAL1_SROA_0_0_VEC_EXPAND_I_I_I]], <4 x i16> [[RETVAL1_SROA_0_0_I_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+// CHECK-NEXT:    [[INC_I_I_I]] = add nuw nsw i32 [[I_0_I_I_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I_I_I]], !llvm.loop [[LOOP36:![0-9]+]]
+// CHECK:       _ZNK4sycl3_V13vecIfLi3EE7convertINS0_3ext6oneapi8bfloat16ELNS_13rounding_modeE2EEENS1_IT_Li3EEEv.exit:
+// CHECK-NEXT:    [[EXTRACTVEC5_I:%.*]] = shufflevector <4 x i16> [[RETVAL1_SROA_0_0_I_I_I]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i16> [[EXTRACTVEC5_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META33]]
+// CHECK-NEXT:    ret void
+// Converts vec<float, 3> to bfloat16 with rounding_mode::rtz.
+SYCL_EXTERNAL auto TestFtoBFDeviceRZ(vec<float, 3> &inp) {
+  return inp.template convert<bfloat16, sycl::rounding_mode::rtz>();
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z19TestInttoBFDeviceRZRN4sycl3_V13vecIiLi3EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.0") align 8 [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 16 dereferenceable(16) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META37:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META38:![0-9]+]])
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x i32>, ptr addrspace(4) [[INP]], align 16, !noalias [[META38]]
+// CHECK-NEXT:    [[EXTRACTVEC_I_I:%.*]] = shufflevector <4 x i32> [[LOADVEC4_I_I]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    br label [[FOR_COND_I_I_I:%.*]]
+// CHECK:       for.cond.i.i.i:
+// CHECK-NEXT:    [[RETVAL1_SROA_0_0_I_I_I:%.*]] = phi <4 x i16> [ undef, [[ENTRY:%.*]] ], [ [[RETVAL1_SROA_0_0_VECBLEND_I_I_I:%.*]], [[FOR_BODY_I_I_I:%.*]] ]
+// CHECK-NEXT:    [[I_0_I_I_I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I_I_I:%.*]], [[FOR_BODY_I_I_I]] ]
+// CHECK-NEXT:    [[CMP_I_I_I:%.*]] = icmp ult i32 [[I_0_I_I_I]], 3
+// CHECK-NEXT:    br i1 [[CMP_I_I_I]], label [[FOR_BODY_I_I_I]], label [[_ZNK4SYCL3_V13VECIILI3EE7CONVERTINS0_3EXT6ONEAPI8BFLOAT16ELNS_13ROUNDING_MODEE2EEENS1_IT_LI3EEEV_EXIT:%.*]]
+// CHECK:       for.body.i.i.i:
+// CHECK-NEXT:    [[VECEXT_I_I_I:%.*]] = extractelement <3 x i32> [[EXTRACTVEC_I_I]], i32 [[I_0_I_I_I]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef zeroext i16 @__imf_int2bfloat16_rz(i32 noundef [[VECEXT_I_I_I]]) #[[ATTR4]], !noalias [[META38]]
+// CHECK-NEXT:    [[RETVAL1_SROA_0_0_VEC_EXTRACT_I_I_I:%.*]] = shufflevector <4 x i16> [[RETVAL1_SROA_0_0_I_I_I]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[VECINS_I_I_I:%.*]] = insertelement <3 x i16> [[RETVAL1_SROA_0_0_VEC_EXTRACT_I_I_I]], i16 [[CALL_I_I_I_I]], i32 [[I_0_I_I_I]]
+// CHECK-NEXT:    [[RETVAL1_SROA_0_0_VEC_EXPAND_I_I_I:%.*]] = shufflevector <3 x i16> [[VECINS_I_I_I]], <3 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    [[RETVAL1_SROA_0_0_VECBLEND_I_I_I]] = shufflevector <4 x i16> [[RETVAL1_SROA_0_0_VEC_EXPAND_I_I_I]], <4 x i16> [[RETVAL1_SROA_0_0_I_I_I]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+// CHECK-NEXT:    [[INC_I_I_I]] = add nuw nsw i32 [[I_0_I_I_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I_I_I]], !llvm.loop [[LOOP41:![0-9]+]]
+// CHECK:       _ZNK4sycl3_V13vecIiLi3EE7convertINS0_3ext6oneapi8bfloat16ELNS_13rounding_modeE2EEENS1_IT_Li3EEEv.exit:
+// CHECK-NEXT:    [[EXTRACTVEC5_I:%.*]] = shufflevector <4 x i16> [[RETVAL1_SROA_0_0_I_I_I]], <4 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i16> [[EXTRACTVEC5_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META38]]
+// CHECK-NEXT:    ret void
+// Converts vec<int, 3> to bfloat16 with rounding_mode::rtz.
+SYCL_EXTERNAL auto TestInttoBFDeviceRZ(vec<int, 3> &inp) {
+  return inp.template convert<bfloat16, sycl::rounding_mode::rtz>();
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z19TestLLtoBFDeviceRTPRN4sycl3_V13vecIxLi1EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.12") align 2 [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 8 dereferenceable(8) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META42:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META43:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load i64, ptr addrspace(4) [[INP]], align 8, !tbaa [[TBAA46:![0-9]+]], !noalias [[META43]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef zeroext i16 @__imf_ll2bfloat16_ru(i64 noundef [[TMP0]]) #[[ATTR4]], !noalias [[META43]]
+// CHECK-NEXT:    store i16 [[CALL_I_I_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 2, !alias.scope [[META43]]
+// CHECK-NEXT:    ret void
+// Converts vec<long long, 1> to bfloat16 with rounding_mode::rtp.
+SYCL_EXTERNAL auto TestLLtoBFDeviceRTP(vec<long long, 1> &inp) {
+  return inp.template convert<bfloat16, sycl::rounding_mode::rtp>();
+}
+
+// CHECK-LABEL: define dso_local spir_func void @_Z22TestShorttoBFDeviceRTNRN4sycl3_V13vecIsLi2EEE(
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.20") align 4 [[AGG_RESULT:%.*]], ptr addrspace(4) nocapture noundef readonly align 4 dereferenceable(4) [[INP:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META48:![0-9]+]] !sycl_fixed_targets [[META7]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META49:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(4) [[INP]], align 4, !tbaa [[TBAA11]], !noalias [[META49]]
+// CHECK-NEXT:    br label [[FOR_COND_I_I_I:%.*]]
+// CHECK:       for.cond.i.i.i:
+// CHECK-NEXT:    [[RETVAL1_0_I_I_I:%.*]] = phi <2 x i16> [ undef, [[ENTRY:%.*]] ], [ [[VECINS_I_I_I:%.*]], [[FOR_BODY_I_I_I:%.*]] ]
+// CHECK-NEXT:    [[I_0_I_I_I:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC_I_I_I:%.*]], [[FOR_BODY_I_I_I]] ]
+// CHECK-NEXT:    [[CMP_I_I_I:%.*]] = icmp ult i32 [[I_0_I_I_I]], 2
+// CHECK-NEXT:    br i1 [[CMP_I_I_I]], label [[FOR_BODY_I_I_I]], label [[_ZNK4SYCL3_V13VECISLI2EE7CONVERTINS0_3EXT6ONEAPI8BFLOAT16ELNS_13ROUNDING_MODEE4EEENS1_IT_LI2EEEV_EXIT:%.*]]
+// CHECK:       for.body.i.i.i:
+// CHECK-NEXT:    [[VECEXT_I_I_I:%.*]] = extractelement <2 x i16> [[TMP0]], i32 [[I_0_I_I_I]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = tail call spir_func noundef zeroext i16 @__imf_short2bfloat16_rd(i16 noundef signext [[VECEXT_I_I_I]]) #[[ATTR4]], !noalias [[META49]]
+// CHECK-NEXT:    [[VECINS_I_I_I]] = insertelement <2 x i16> [[RETVAL1_0_I_I_I]], i16 [[CALL_I_I_I_I]], i32 [[I_0_I_I_I]]
+// CHECK-NEXT:    [[INC_I_I_I]] = add nuw nsw i32 [[I_0_I_I_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I_I_I]], !llvm.loop [[LOOP52:![0-9]+]]
+// CHECK:       _ZNK4sycl3_V13vecIsLi2EE7convertINS0_3ext6oneapi8bfloat16ELNS_13rounding_modeE4EEENS1_IT_Li2EEEv.exit:
+// CHECK-NEXT:    store <2 x i16> [[RETVAL1_0_I_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !alias.scope [[META49]]
+// CHECK-NEXT:    ret void
+// Converts vec<short, 2> to bfloat16 with rounding_mode::rtn.
+SYCL_EXTERNAL auto TestShorttoBFDeviceRTN(vec<short, 2> &inp) {
+  return inp.template convert<bfloat16, sycl::rounding_mode::rtn>();
+}
diff --git a/sycl/test/check_device_code/vector/vector_math_ops.cpp b/sycl/test/check_device_code/vector/vector_math_ops.cpp
index 5d6521d725341..e42e8257d1973 100644
--- a/sycl/test/check_device_code/vector/vector_math_ops.cpp
+++ b/sycl/test/check_device_code/vector/vector_math_ops.cpp
@@ -1,7 +1,13 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
 // NOTE: ..., followed by some manual cleanup.
 
-// RUN: %clangxx -I %sycl_include -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -Xclang -disable-lifetime-markers -fsycl-device-only %s -o - | FileCheck %s
+// RUN: %clangxx -I %sycl_include -fpreview-breaking-changes -fno-discard-value-names -S -emit-llvm -fno-sycl-instrument-device-code -Xclang -disable-lifetime-markers -fsycl-device-only %s -o - | FileCheck %s
+
+// Windows and Linux differ slightly in IR generation (function argument
+// passing, long vs. long long, and name mangling), which would complicate
+// test updates without improving test coverage, so this test is limited to
+// Linux.
+// REQUIRES: linux
 
 // This test checks
 // (1) the storage type of sycl::vec on device for all data types, and
@@ -13,51 +19,51 @@ using namespace sycl;
 /*************** Binary Arithmetic Ops ******************/
 
 // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecIiLi2EEES2_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !srcloc [[META5:![0-9]+]] !sycl_fixed_targets [[META6:![0-9]+]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] !srcloc [[META6:![0-9]+]] !sycl_fixed_targets [[META7:![0-9]+]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META7:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 8, !tbaa [[TBAA10:![0-9]+]], !noalias [[META7]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[B]], align 8, !tbaa [[TBAA10]], !noalias [[META7]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META8:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i32>, ptr [[A]], align 8, !tbaa [[TBAA11:![0-9]+]], !noalias [[META8]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i32>, ptr [[B]], align 8, !tbaa [[TBAA11]], !noalias [[META8]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <2 x i32> [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    store <2 x i32> [[ADD_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA10]], !alias.scope [[META7]]
+// CHECK-NEXT:    store <2 x i32> [[ADD_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META8]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestAdd(vec<int, 2> a, vec<int, 2> b) { return a + b; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecIfLi3EEES2_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.0") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META13:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.0") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.0") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META14:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META14:![0-9]+]])
-// CHECK-NEXT:    [[LOADVEC4_I:%.*]] = load <4 x float>, ptr [[A]], align 16, !noalias [[META14]]
-// CHECK-NEXT:    [[LOADVEC42_I:%.*]] = load <4 x float>, ptr [[B]], align 16, !noalias [[META14]]
-// CHECK-NEXT:    [[TMP0:%.*]] = fadd <4 x float> [[LOADVEC4_I]], [[LOADVEC42_I]]
-// CHECK-NEXT:    [[EXTRACTVEC5_I:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x float> [[EXTRACTVEC5_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META14]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META15:![0-9]+]])
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x float>, ptr [[A]], align 16, !noalias [[META15]]
+// CHECK-NEXT:    [[LOADVEC4_I6_I:%.*]] = load <4 x float>, ptr [[B]], align 16, !noalias [[META15]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fadd <4 x float> [[LOADVEC4_I_I]], [[LOADVEC4_I6_I]]
+// CHECK-NEXT:    [[EXTRACTVEC_I8_I:%.*]] = shufflevector <4 x float> [[TMP0]], <4 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x float> [[EXTRACTVEC_I8_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !alias.scope [[META15]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestAdd(vec<float, 3> a, vec<float, 3> b) { return a + b; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecIcLi16EEES2_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.1") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.1") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.1") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META17:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.4") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META18:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META18:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META18]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[B]], align 16, !tbaa [[TBAA10]], !noalias [[META18]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META19:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 16, !tbaa [[TBAA11]], !noalias [[META19]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[B]], align 16, !tbaa [[TBAA11]], !noalias [[META19]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <16 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    store <16 x i8> [[ADD_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META18]]
+// CHECK-NEXT:    store <16 x i8> [[ADD_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !alias.scope [[META19]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestAdd(vec<char, 16> a, vec<char, 16> b) { return a + b; }
 
 // std::byte does not support '+'. Therefore, using bitwise XOR as a substitute.
 // CHECK-LABEL: define dso_local spir_func void @_Z7TestXorN4sycl3_V13vecISt4byteLi8EEES3_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.2") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.2") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.2") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META21:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.8") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.8") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.8") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META22:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META22:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 8, !tbaa [[TBAA10]], !noalias [[META22]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[B]], align 8, !tbaa [[TBAA10]], !noalias [[META22]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META23:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i8>, ptr [[A]], align 8, !tbaa [[TBAA11]], !noalias [[META23]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i8>, ptr [[B]], align 8, !tbaa [[TBAA11]], !noalias [[META23]]
 // CHECK-NEXT:    [[XOR_I:%.*]] = xor <8 x i8> [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    store <8 x i8> [[XOR_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA10]], !alias.scope [[META22]]
+// CHECK-NEXT:    store <8 x i8> [[XOR_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META23]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestXor(vec<std::byte, 8> a, vec<std::byte, 8> b) {
@@ -65,92 +71,72 @@ SYCL_EXTERNAL auto TestXor(vec<std::byte, 8> a, vec<std::byte, 8> b) {
 }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecIbLi4EEES2_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.3") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !srcloc [[META25:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable sret(%"class.sycl::_V1::vec.12") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.12") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.12") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] !srcloc [[META26:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META26:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[A]], align 4, !tbaa [[TBAA10]], !noalias [[META26]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[B]], align 4, !tbaa [[TBAA10]], !noalias [[META26]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META27:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[A]], align 4, !tbaa [[TBAA11]], !noalias [[META27]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i8>, ptr [[B]], align 4, !tbaa [[TBAA11]], !noalias [[META27]]
 // CHECK-NEXT:    [[ADD_I:%.*]] = add <4 x i8> [[TMP0]], [[TMP1]]
+// CHECK-NEXT:    store <4 x i8> [[ADD_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !alias.scope [[META27]]
 // CHECK-NEXT:    br label [[FOR_COND_I_I:%.*]]
 // CHECK:       for.cond.i.i:
-// CHECK-NEXT:    [[VECINS_I_I6_I_I:%.*]] = phi <4 x i8> [ [[ADD_I]], [[ENTRY:%.*]] ], [ [[VECINS_I_I_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ]
-// CHECK-NEXT:    [[I_0_I_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I]] ]
+// CHECK-NEXT:    [[I_0_I_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I_I:%.*]], [[FOR_BODY_I_I:%.*]] ]
 // CHECK-NEXT:    [[CMP_I_I:%.*]] = icmp ult i64 [[I_0_I_I]], 4
-// CHECK-NEXT:    br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V1PLERKNS0_3VECIBLI4EEES4__EXIT:%.*]]
+// CHECK-NEXT:    br i1 [[CMP_I_I]], label [[FOR_BODY_I_I]], label [[_ZN4SYCL3_V16DETAILPLIBEENS0_3VECIBLI4EEERKS4_S6__EXIT:%.*]]
 // CHECK:       for.body.i.i:
-// CHECK-NEXT:    [[CONV_I_I:%.*]] = trunc nuw nsw i64 [[I_0_I_I]] to i32
-// CHECK-NEXT:    [[VECEXT_I_I_I_I:%.*]] = extractelement <4 x i8> [[VECINS_I_I6_I_I]], i32 [[CONV_I_I]]
-// CHECK-NEXT:    [[TOBOOL_I_I_I_I:%.*]] = icmp ne i8 [[VECEXT_I_I_I_I]], 0
-// CHECK-NEXT:    [[FROMBOOL_I_I:%.*]] = zext i1 [[TOBOOL_I_I_I_I]] to i8
-// CHECK-NEXT:    [[VECINS_I_I_I_I]] = insertelement <4 x i8> [[VECINS_I_I6_I_I]], i8 [[FROMBOOL_I_I]], i32 [[CONV_I_I]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I_I_I:%.*]] = getelementptr inbounds [4 x i8], ptr addrspace(4) [[AGG_RESULT]], i64 0, i64 [[I_0_I_I]]
+// CHECK-NEXT:    [[TMP2:%.*]] = load i8, ptr addrspace(4) [[ARRAYIDX_I_I_I_I_I]], align 1, !tbaa [[TBAA11]], !alias.scope [[META27]]
+// CHECK-NEXT:    [[CMP3_I_I:%.*]] = icmp ne i8 [[TMP2]], 0
+// CHECK-NEXT:    [[FROMBOOL_I_I:%.*]] = zext i1 [[CMP3_I_I]] to i8
+// CHECK-NEXT:    store i8 [[FROMBOOL_I_I]], ptr addrspace(4) [[ARRAYIDX_I_I_I_I_I]], align 1, !tbaa [[TBAA30:![0-9]+]], !alias.scope [[META27]]
 // CHECK-NEXT:    [[INC_I_I]] = add nuw nsw i64 [[I_0_I_I]], 1
-// CHECK-NEXT:    br label [[FOR_COND_I_I]], !llvm.loop [[LOOP29:![0-9]+]]
-// CHECK:       _ZN4sycl3_V1plERKNS0_3vecIbLi4EEES4_.exit:
-// CHECK-NEXT:    store <4 x i8> [[VECINS_I_I6_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !alias.scope [[META26]]
+// CHECK-NEXT:    br label [[FOR_COND_I_I]], !llvm.loop [[LOOP32:![0-9]+]]
+// CHECK:       _ZN4sycl3_V16detailplIbEENS0_3vecIbLi4EEERKS4_S6_.exit:
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestAdd(vec<bool, 4> a, vec<bool, 4> b) { return a + b; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecINS0_6detail9half_impl4halfELi3EEES5_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.4") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.4") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META31:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.16") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.16") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.16") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META34:![0-9]+]] !sycl_used_aspects [[META35:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META32:![0-9]+]])
-// CHECK-NEXT:    [[LOADVEC4_I:%.*]] = load <4 x half>, ptr [[A]], align 8, !noalias [[META32]]
-// CHECK-NEXT:    [[LOADVEC42_I:%.*]] = load <4 x half>, ptr [[B]], align 8, !noalias [[META32]]
-// CHECK-NEXT:    [[TMP0:%.*]] = fadd <4 x half> [[LOADVEC4_I]], [[LOADVEC42_I]]
-// CHECK-NEXT:    [[EXTRACTVEC5_I:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x half> [[EXTRACTVEC5_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA10]], !alias.scope [[META32]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META37:![0-9]+]])
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x half>, ptr [[A]], align 8, !noalias [[META37]]
+// CHECK-NEXT:    [[LOADVEC4_I6_I:%.*]] = load <4 x half>, ptr [[B]], align 8, !noalias [[META37]]
+// CHECK-NEXT:    [[TMP0:%.*]] = fadd <4 x half> [[LOADVEC4_I_I]], [[LOADVEC4_I6_I]]
+// CHECK-NEXT:    [[EXTRACTVEC_I8_I:%.*]] = shufflevector <4 x half> [[TMP0]], <4 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x half> [[EXTRACTVEC_I8_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META37]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestAdd(vec<half, 3> a, vec<half, 3> b) { return a + b; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z7TestAddN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEES5_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable sret(%"class.sycl::_V1::vec.5") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.5") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.5") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] !srcloc [[META35:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.20") align 8 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.20") align 8 [[A:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.20") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] !srcloc [[META40:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[REF_TMP_I_I:%.*]] = alloca float, align 4
-// CHECK-NEXT:    [[REF_TMP1_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 2
-// CHECK-NEXT:    [[REF_TMP3_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 2
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META36:![0-9]+]])
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[REF_TMP1_I]])
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[REF_TMP3_I]])
-// CHECK-NEXT:    [[REF_TMP1_ASCAST_I:%.*]] = addrspacecast ptr [[REF_TMP1_I]] to ptr addrspace(4)
-// CHECK-NEXT:    [[REF_TMP3_ASCAST_I:%.*]] = addrspacecast ptr [[REF_TMP3_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
+// CHECK-NEXT:    [[B_ASCAST:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(4)
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META41:![0-9]+]])
 // CHECK-NEXT:    [[REF_TMP_ASCAST_I_I:%.*]] = addrspacecast ptr [[REF_TMP_I_I]] to ptr addrspace(4)
-// CHECK-NEXT:    [[AGG_RESULT_PROMOTED_I:%.*]] = load <3 x i16>, ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META36]]
-// CHECK-NEXT:    [[LOADVEC4_I_I_I:%.*]] = load <4 x i16>, ptr [[A]], align 8, !noalias [[META39:![0-9]+]]
-// CHECK-NEXT:    [[EXTRACTVEC_I_I_I:%.*]] = shufflevector <4 x i16> [[LOADVEC4_I_I_I]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT:    [[LOADVEC4_I_I9_I:%.*]] = load <4 x i16>, ptr [[B]], align 8, !noalias [[META44:![0-9]+]]
-// CHECK-NEXT:    [[EXTRACTVEC_I_I10_I:%.*]] = shufflevector <4 x i16> [[LOADVEC4_I_I9_I]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
 // CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
 // CHECK:       for.cond.i:
-// CHECK-NEXT:    [[VECINS_I_I12_I:%.*]] = phi <3 x i16> [ [[AGG_RESULT_PROMOTED_I]], [[ENTRY:%.*]] ], [ [[VECINS_I_I_I:%.*]], [[FOR_BODY_I:%.*]] ]
-// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY_I]] ]
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
 // CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 3
-// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V1PLERKNS0_3VECINS0_3EXT6ONEAPI8BFLOAT16ELI3EEES7__EXIT:%.*]]
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V16DETAILPLINS0_3EXT6ONEAPI8BFLOAT16EEENS0_3VECIS5_LI3EEERKS7_S9__EXIT:%.*]]
 // CHECK:       for.body.i:
-// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
-// CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META49:![0-9]+]])
-// CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META50:![0-9]+]])
-// CHECK-NEXT:    [[VECEXT_I_I_I:%.*]] = extractelement <3 x i16> [[EXTRACTVEC_I_I_I]], i32 [[CONV_I]]
-// CHECK-NEXT:    store i16 [[VECEXT_I_I_I]], ptr [[REF_TMP1_I]], align 2, !alias.scope [[META51:![0-9]+]], !noalias [[META36]]
-// CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META56:![0-9]+]])
-// CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META57:![0-9]+]])
-// CHECK-NEXT:    [[VECEXT_I_I11_I:%.*]] = extractelement <3 x i16> [[EXTRACTVEC_I_I10_I]], i32 [[CONV_I]]
-// CHECK-NEXT:    store i16 [[VECEXT_I_I11_I]], ptr [[REF_TMP3_I]], align 2, !alias.scope [[META58:![0-9]+]], !noalias [[META36]]
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META36]]
-// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[REF_TMP1_ASCAST_I]]) #[[ATTR9:[0-9]+]], !noalias [[META63:![0-9]+]]
-// CHECK-NEXT:    [[CALL_I_I2_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[REF_TMP3_ASCAST_I]]) #[[ATTR9]], !noalias [[META63]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[A_ASCAST]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I10_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[B_ASCAST]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META41]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[ARRAYIDX_I_I_I_I]]) #[[ATTR8:[0-9]+]], !noalias [[META44:![0-9]+]]
+// CHECK-NEXT:    [[CALL_I_I2_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[ARRAYIDX_I_I_I10_I]]) #[[ATTR8]], !noalias [[META44]]
 // CHECK-NEXT:    [[ADD_I_I:%.*]] = fadd float [[CALL_I_I_I_I]], [[CALL_I_I2_I_I]]
-// CHECK-NEXT:    store float [[ADD_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA66:![0-9]+]], !noalias [[META63]]
-// CHECK-NEXT:    [[CALL_I_I3_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR9]], !noalias [[META63]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META36]]
-// CHECK-NEXT:    [[VECINS_I_I_I]] = insertelement <3 x i16> [[VECINS_I_I12_I]], i16 [[CALL_I_I3_I_I]], i32 [[CONV_I]]
+// CHECK-NEXT:    store float [[ADD_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA47:![0-9]+]], !noalias [[META44]]
+// CHECK-NEXT:    [[CALL_I_I3_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR8]], !noalias [[META44]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META41]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I12_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[AGG_RESULT]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    store i16 [[CALL_I_I3_I_I]], ptr addrspace(4) [[ARRAYIDX_I_I_I12_I]], align 2, !tbaa [[TBAA49:![0-9]+]], !alias.scope [[META41]]
 // CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
-// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP68:![0-9]+]]
-// CHECK:       _ZN4sycl3_V1plERKNS0_3vecINS0_3ext6oneapi8bfloat16ELi3EEES7_.exit:
-// CHECK-NEXT:    store <3 x i16> [[VECINS_I_I12_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META36]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[REF_TMP1_I]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[REF_TMP3_I]])
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP51:![0-9]+]]
+// CHECK:       _ZN4sycl3_V16detailplINS0_3ext6oneapi8bfloat16EEENS0_3vecIS5_Li3EEERKS7_S9_.exit:
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestAdd(vec<ext::oneapi::bfloat16, 3> a,
@@ -161,46 +147,43 @@ SYCL_EXTERNAL auto TestAdd(vec<ext::oneapi::bfloat16, 3> a,
 /***************** Binary Logical Ops *******************/
 
 // CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecIiLi16EEES2_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.6") align 64 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.6") align 64 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.6") align 64 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META69:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.24") align 64 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.24") align 64 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.24") align 64 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META52:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META70:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[A]], align 64, !tbaa [[TBAA10]], !noalias [[META70]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr [[B]], align 64, !tbaa [[TBAA10]], !noalias [[META70]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META53:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i32>, ptr [[A]], align 64, !tbaa [[TBAA11]], !noalias [[META53]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i32>, ptr [[B]], align 64, !tbaa [[TBAA11]], !noalias [[META53]]
 // CHECK-NEXT:    [[CMP_I:%.*]] = icmp sgt <16 x i32> [[TMP0]], [[TMP1]]
 // CHECK-NEXT:    [[SEXT_I:%.*]] = sext <16 x i1> [[CMP_I]] to <16 x i32>
-// CHECK-NEXT:    store <16 x i32> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 64, !tbaa [[TBAA10]], !alias.scope [[META70]]
+// CHECK-NEXT:    store <16 x i32> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 64, !alias.scope [[META53]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestGreaterThan(vec<int, 16> a, vec<int, 16> b) {
   return a > b;
 }
 
-// CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecISt4byteLi3EEES3_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.7") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.8") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.8") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META73:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-LABEL: define dso_local spir_func noundef <3 x i8> @_Z15TestGreaterThanN4sycl3_V13vecISt4byteLi3EEES3_(
+// CHECK-SAME: ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.28") align 4 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.28") align 4 [[B:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] !srcloc [[META56:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META74:![0-9]+]])
-// CHECK-NEXT:    [[LOADVEC4_I:%.*]] = load <4 x i8>, ptr [[A]], align 4, !noalias [[META74]]
-// CHECK-NEXT:    [[LOADVEC42_I:%.*]] = load <4 x i8>, ptr [[B]], align 4, !noalias [[META74]]
-// CHECK-NEXT:    [[TMP0:%.*]] = icmp sgt <4 x i8> [[LOADVEC4_I]], [[LOADVEC42_I]]
-// CHECK-NEXT:    [[CMP_I:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <3 x i1> [[CMP_I]] to <3 x i8>
-// CHECK-NEXT:    [[EXTRACTVEC_I_I:%.*]] = shufflevector <3 x i8> [[SEXT_I]], <3 x i8> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x i8> [[EXTRACTVEC_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !tbaa [[TBAA10]], !alias.scope [[META74]]
-// CHECK-NEXT:    ret void
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x i8>, ptr [[A]], align 1
+// CHECK-NEXT:    [[LOADVEC4_I_I2:%.*]] = load <4 x i8>, ptr [[B]], align 1
+// CHECK-NEXT:    [[TMP0:%.*]] = icmp ugt <4 x i8> [[LOADVEC4_I_I]], [[LOADVEC4_I_I2]]
+// CHECK-NEXT:    [[CMP:%.*]] = shufflevector <4 x i1> [[TMP0]], <4 x i1> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[SEXT:%.*]] = sext <3 x i1> [[CMP]] to <3 x i8>
+// CHECK-NEXT:    ret <3 x i8> [[SEXT]]
 //
 SYCL_EXTERNAL auto TestGreaterThan(vec<std::byte, 3> a, vec<std::byte, 3> b) {
   return a > b;
 }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecIbLi2EEES2_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.9") align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.10") align 2 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.10") align 2 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META77:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.32") align 2 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.36") align 2 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.36") align 2 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META57:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META78:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr [[A]], align 2, !tbaa [[TBAA10]], !noalias [[META78]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[B]], align 2, !tbaa [[TBAA10]], !noalias [[META78]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META58:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i8>, ptr [[A]], align 2, !tbaa [[TBAA11]], !noalias [[META58]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i8>, ptr [[B]], align 2, !tbaa [[TBAA11]], !noalias [[META58]]
 // CHECK-NEXT:    [[CMP_I:%.*]] = icmp sgt <2 x i8> [[TMP0]], [[TMP1]]
 // CHECK-NEXT:    [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i8>
-// CHECK-NEXT:    store <2 x i8> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 2, !tbaa [[TBAA10]], !alias.scope [[META78]]
+// CHECK-NEXT:    store <2 x i8> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 2, !alias.scope [[META58]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestGreaterThan(vec<bool, 2> a, vec<bool, 2> b) {
@@ -208,32 +191,44 @@ SYCL_EXTERNAL auto TestGreaterThan(vec<bool, 2> a, vec<bool, 2> b) {
 }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecINS0_6detail9half_impl4halfELi8EEES5_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.11") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.12") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.12") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META81:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.40") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.44") align 16 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.44") align 16 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META61:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META82:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META82]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B]], align 16, !tbaa [[TBAA10]], !noalias [[META82]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META62:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A]], align 16, !tbaa [[TBAA11]], !noalias [[META62]]
+// CHECK-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B]], align 16, !tbaa [[TBAA11]], !noalias [[META62]]
 // CHECK-NEXT:    [[CMP_I:%.*]] = fcmp ogt <8 x half> [[TMP0]], [[TMP1]]
 // CHECK-NEXT:    [[SEXT_I:%.*]] = sext <8 x i1> [[CMP_I]] to <8 x i16>
-// CHECK-NEXT:    store <8 x i16> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META82]]
+// CHECK-NEXT:    store <8 x i16> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !alias.scope [[META62]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestGreaterThan(vec<half, 8> a, vec<half, 8> b) {
   return a > b;
 }
 
-// FIXME: We incorrectly interpret BF16 as INT16 to peform logical operation.
-// For example, vec<BF16, 2>{-0.5, 3.333} < vec<BF16, 2>{6.0, 6.666} results
-// into {-1, -1} on host but {0, -1} on device.
 // CHECK-LABEL: define dso_local spir_func void @_Z15TestGreaterThanN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi4EEES5_(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.13") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.14") align 8 [[A:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.14") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META85:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.48") align 8 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.52") align 8 [[A:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.52") align 8 [[B:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META65:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META86:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i16>, ptr [[A]], align 8, !tbaa [[TBAA10]], !noalias [[META86]]
-// CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i16>, ptr [[B]], align 8, !tbaa [[TBAA10]], !noalias [[META86]]
-// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ugt <4 x i16> [[TMP0]], [[TMP1]]
-// CHECK-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i16>
-// CHECK-NEXT:    store <4 x i16> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !tbaa [[TBAA10]], !alias.scope [[META86]]
+// CHECK-NEXT:    [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
+// CHECK-NEXT:    [[B_ASCAST:%.*]] = addrspacecast ptr [[B]] to ptr addrspace(4)
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META66:![0-9]+]])
+// CHECK-NEXT:    store i64 0, ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META66]]
+// CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
+// CHECK:       for.cond.i:
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 4
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V16DETAILGTINS0_3EXT6ONEAPI8BFLOAT16EEENS0_3VECISLI4EEERKNS6_IS5_LI4EEESA__EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[A_ASCAST]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I13_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[B_ASCAST]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[ARRAYIDX_I_I_I_I]]) #[[ATTR8]], !noalias [[META66]]
+// CHECK-NEXT:    [[CALL_I_I2_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[ARRAYIDX_I_I_I13_I]]) #[[ATTR8]], !noalias [[META66]]
+// CHECK-NEXT:    [[CMP_I_I:%.*]] = fcmp ogt float [[CALL_I_I_I_I]], [[CALL_I_I2_I_I]]
+// CHECK-NEXT:    [[CONV5_I:%.*]] = sext i1 [[CMP_I_I]] to i16
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I15_I:%.*]] = getelementptr inbounds [4 x i16], ptr addrspace(4) [[AGG_RESULT]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    store i16 [[CONV5_I]], ptr addrspace(4) [[ARRAYIDX_I_I_I15_I]], align 2, !tbaa [[TBAA49]], !alias.scope [[META66]]
+// CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP69:![0-9]+]]
+// CHECK:       _ZN4sycl3_V16detailgtINS0_3ext6oneapi8bfloat16EEENS0_3vecIsLi4EEERKNS6_IS5_Li4EEESA_.exit:
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestGreaterThan(vec<ext::oneapi::bfloat16, 4> a,
@@ -244,144 +239,128 @@ SYCL_EXTERNAL auto TestGreaterThan(vec<ext::oneapi::bfloat16, 4> a,
 /********************** Unary Ops **********************/
 
 // CHECK-LABEL: define dso_local spir_func void @_Z12TestNegationN4sycl3_V13vecIiLi3EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.15") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.15") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META89:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.55") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.55") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META70:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META90:![0-9]+]])
-// CHECK-NEXT:    [[LOADVEC4_I:%.*]] = load <4 x i32>, ptr [[A]], align 16, !noalias [[META90]]
-// CHECK-NEXT:    [[EXTRACTVEC_I:%.*]] = shufflevector <4 x i32> [[LOADVEC4_I]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
-// CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq <3 x i32> [[EXTRACTVEC_I]], zeroinitializer
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META71:![0-9]+]])
+// CHECK-NEXT:    [[LOADVEC4_I_I:%.*]] = load <4 x i32>, ptr [[A]], align 16, !noalias [[META71]]
+// CHECK-NEXT:    [[EXTRACTVEC_I_I:%.*]] = shufflevector <4 x i32> [[LOADVEC4_I_I]], <4 x i32> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq <3 x i32> [[EXTRACTVEC_I_I]], zeroinitializer
 // CHECK-NEXT:    [[SEXT_I:%.*]] = sext <3 x i1> [[CMP_I]] to <3 x i32>
-// CHECK-NEXT:    [[EXTRACTVEC_I_I:%.*]] = shufflevector <3 x i32> [[SEXT_I]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
-// CHECK-NEXT:    store <4 x i32> [[EXTRACTVEC_I_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !alias.scope [[META90]]
+// CHECK-NEXT:    [[EXTRACTVEC_I2_I:%.*]] = shufflevector <3 x i32> [[SEXT_I]], <3 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+// CHECK-NEXT:    store <4 x i32> [[EXTRACTVEC_I2_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA11]], !alias.scope [[META71]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestNegation(vec<int, 3> a) { return !a; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z9TestMinusN4sycl3_V13vecIiLi4EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.16") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.16") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META93:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.59") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.59") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META74:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META94:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META94]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META75:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[A]], align 16, !tbaa [[TBAA11]], !noalias [[META75]]
 // CHECK-NEXT:    [[SUB_I:%.*]] = sub <4 x i32> zeroinitializer, [[TMP0]]
-// CHECK-NEXT:    store <4 x i32> [[SUB_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META94]]
+// CHECK-NEXT:    store <4 x i32> [[SUB_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !alias.scope [[META75]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestMinus(vec<int, 4> a) { return -a; }
 
 // Negation is not valid for std::byte. Therefore, using bitwise negation.
 // CHECK-LABEL: define dso_local spir_func void @_Z19TestBitwiseNegationN4sycl3_V13vecISt4byteLi16EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.17") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.17") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META97:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.62") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.62") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META78:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META98:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META98]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META79:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[A]], align 16, !tbaa [[TBAA11]], !noalias [[META79]]
 // CHECK-NEXT:    [[NOT_I:%.*]] = xor <16 x i8> [[TMP0]], <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
-// CHECK-NEXT:    store <16 x i8> [[NOT_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META98]]
+// CHECK-NEXT:    store <16 x i8> [[NOT_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA11]], !alias.scope [[META79]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestBitwiseNegation(vec<std::byte, 16> a) { return ~a; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z12TestNegationN4sycl3_V13vecIbLi4EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.18") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.3") align 4 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META101:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.66") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.12") align 4 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META82:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META102:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[A]], align 4, !tbaa [[TBAA10]], !noalias [[META102]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META83:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i8>, ptr [[A]], align 4, !tbaa [[TBAA11]], !noalias [[META83]]
 // CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq <4 x i8> [[TMP0]], zeroinitializer
 // CHECK-NEXT:    [[SEXT_I:%.*]] = sext <4 x i1> [[CMP_I]] to <4 x i8>
-// CHECK-NEXT:    store <4 x i8> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !alias.scope [[META105:![0-9]+]]
+// CHECK-NEXT:    store <4 x i8> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !tbaa [[TBAA11]], !alias.scope [[META83]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestNegation(vec<bool, 4> a) { return !a; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z12TestNegationN4sycl3_V13vecINS0_6detail9half_impl4halfELi2EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.19") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.20") align 4 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META108:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.70") align 4 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.74") align 4 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META86:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META109:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr [[A]], align 4, !tbaa [[TBAA10]], !noalias [[META109]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META87:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x half>, ptr [[A]], align 4, !tbaa [[TBAA11]], !noalias [[META87]]
 // CHECK-NEXT:    [[CMP_I:%.*]] = fcmp oeq <2 x half> [[TMP0]], zeroinitializer
 // CHECK-NEXT:    [[SEXT_I:%.*]] = sext <2 x i1> [[CMP_I]] to <2 x i16>
-// CHECK-NEXT:    store <2 x i16> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !alias.scope [[META112:![0-9]+]]
+// CHECK-NEXT:    store <2 x i16> [[SEXT_I]], ptr addrspace(4) [[AGG_RESULT]], align 4, !tbaa [[TBAA11]], !alias.scope [[META87]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestNegation(vec<half, 2> a) { return !a; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z9TestMinusN4sycl3_V13vecINS0_6detail9half_impl4halfELi8EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.12") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.12") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META115:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.44") align 16 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.44") align 16 [[A:%.*]]) local_unnamed_addr #[[ATTR0]] !srcloc [[META90:![0-9]+]] !sycl_used_aspects [[META35]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META116:![0-9]+]])
-// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A]], align 16, !tbaa [[TBAA10]], !noalias [[META116]]
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META91:![0-9]+]])
+// CHECK-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A]], align 16, !tbaa [[TBAA11]], !noalias [[META91]]
 // CHECK-NEXT:    [[FNEG_I:%.*]] = fneg <8 x half> [[TMP0]]
-// CHECK-NEXT:    store <8 x half> [[FNEG_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !tbaa [[TBAA10]], !alias.scope [[META116]]
+// CHECK-NEXT:    store <8 x half> [[FNEG_I]], ptr addrspace(4) [[AGG_RESULT]], align 16, !alias.scope [[META91]]
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestMinus(vec<half, 8> a) { return -a; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z12TestNegationN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi3EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.21") align 8 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.5") align 8 [[A:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META119:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.78") align 8 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.20") align 8 [[A:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META94:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[REF_TMP1_I:%.*]] = alloca float, align 4
-// CHECK-NEXT:    [[REF_TMP2_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 2
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META120:![0-9]+]])
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP1_I]])
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[REF_TMP2_I]])
-// CHECK-NEXT:    [[REF_TMP1_ASCAST_I:%.*]] = addrspacecast ptr [[REF_TMP1_I]] to ptr addrspace(4)
-// CHECK-NEXT:    [[REF_TMP2_ASCAST_I:%.*]] = addrspacecast ptr [[REF_TMP2_I]] to ptr addrspace(4)
-// CHECK-NEXT:    [[LOADVEC4_I_I_I:%.*]] = load <4 x i16>, ptr [[A]], align 8, !noalias [[META123:![0-9]+]]
-// CHECK-NEXT:    [[EXTRACTVEC_I_I_I:%.*]] = shufflevector <4 x i16> [[LOADVEC4_I_I_I]], <4 x i16> poison, <3 x i32> <i32 0, i32 1, i32 2>
+// CHECK-NEXT:    [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META95:![0-9]+]])
+// CHECK-NEXT:    store i64 0, ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META95]]
 // CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
 // CHECK:       for.cond.i:
-// CHECK-NEXT:    [[RET_SROA_0_0_I:%.*]] = phi <3 x i16> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[VECINS_I_I_I:%.*]], [[FOR_BODY_I:%.*]] ]
-// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INC_I:%.*]], [[FOR_BODY_I]] ]
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
 // CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 3
-// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V1NTERKNS0_3VECINS0_3EXT6ONEAPI8BFLOAT16ELI3EEE_EXIT:%.*]]
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V16DETAILNTERKNS0_3VECINS0_3EXT6ONEAPI8BFLOAT16ELI3EEE_EXIT:%.*]]
 // CHECK:       for.body.i:
-// CHECK-NEXT:    [[CONV_I:%.*]] = trunc nuw nsw i64 [[I_0_I]] to i32
-// CHECK-NEXT:    [[VECEXT_I_I_I:%.*]] = extractelement <3 x i16> [[EXTRACTVEC_I_I_I]], i32 [[CONV_I]]
-// CHECK-NEXT:    store i16 [[VECEXT_I_I_I]], ptr [[REF_TMP2_I]], align 2, !tbaa [[TBAA128:![0-9]+]], !alias.scope [[META130:![0-9]+]], !noalias [[META120]]
-// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[REF_TMP2_ASCAST_I]]) #[[ATTR9]], !noalias [[META120]]
-// CHECK-NEXT:    [[CMP_I_I:%.*]] = fcmp oeq float [[CALL_I_I_I]], 0.000000e+00
-// CHECK-NEXT:    [[CONV4_I:%.*]] = uitofp i1 [[CMP_I_I]] to float
-// CHECK-NEXT:    store float [[CONV4_I]], ptr [[REF_TMP1_I]], align 4, !tbaa [[TBAA66]], !noalias [[META120]]
-// CHECK-NEXT:    [[CALL_I_I9_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP1_ASCAST_I]]) #[[ATTR9]], !noalias [[META120]]
-// CHECK-NEXT:    [[VECINS_I_I_I]] = insertelement <3 x i16> [[RET_SROA_0_0_I]], i16 [[CALL_I_I9_I]], i32 [[CONV_I]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I_I:%.*]] = getelementptr inbounds [4 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[A_ASCAST]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[ARRAYIDX_I_I_I_I]]) #[[ATTR8]], !noalias [[META95]]
+// CHECK-NEXT:    [[TOBOOL_I:%.*]] = fcmp oeq float [[CALL_I_I_I]], 0.000000e+00
+// CHECK-NEXT:    [[CONV3_I:%.*]] = sext i1 [[TOBOOL_I]] to i16
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I10_I:%.*]] = getelementptr inbounds [4 x i16], ptr addrspace(4) [[AGG_RESULT]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    store i16 [[CONV3_I]], ptr addrspace(4) [[ARRAYIDX_I_I_I10_I]], align 2, !tbaa [[TBAA49]], !alias.scope [[META95]]
 // CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
-// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP133:![0-9]+]]
-// CHECK:       _ZN4sycl3_V1ntERKNS0_3vecINS0_3ext6oneapi8bfloat16ELi3EEE.exit:
-// CHECK-NEXT:    store <3 x i16> [[RET_SROA_0_0_I]], ptr addrspace(4) [[AGG_RESULT]], align 8, !alias.scope [[META134:![0-9]+]]
-// CHECK-NEXT:    [[AGG_RESULT_SROA_IDX_I:%.*]] = getelementptr inbounds i8, ptr addrspace(4) [[AGG_RESULT]], i64 6
-// CHECK-NEXT:    store i16 0, ptr addrspace(4) [[AGG_RESULT_SROA_IDX_I]], align 2, !alias.scope [[META134]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP1_I]])
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 2, ptr nonnull [[REF_TMP2_I]])
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP98:![0-9]+]]
+// CHECK:       _ZN4sycl3_V16detailntERKNS0_3vecINS0_3ext6oneapi8bfloat16ELi3EEE.exit:
 // CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestNegation(vec<ext::oneapi::bfloat16, 3> a) { return !a; }
 
 // CHECK-LABEL: define dso_local spir_func void @_Z9TestMinusN4sycl3_V13vecINS0_3ext6oneapi8bfloat16ELi16EEE(
-// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.22") align 32 [[AGG_RESULT:%.*]], ptr nocapture noundef readonly byval(%"class.sycl::_V1::vec.22") align 32 [[A:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] !srcloc [[META137:![0-9]+]] !sycl_fixed_targets [[META6]] {
+// CHECK-SAME: ptr addrspace(4) dead_on_unwind noalias nocapture writable writeonly sret(%"class.sycl::_V1::vec.81") align 32 [[AGG_RESULT:%.*]], ptr noundef byval(%"class.sycl::_V1::vec.81") align 32 [[A:%.*]]) local_unnamed_addr #[[ATTR2]] !srcloc [[META99:![0-9]+]] !sycl_fixed_targets [[META7]] {
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[REF_TMP_I_I:%.*]] = alloca float, align 4
-// CHECK-NEXT:    [[V_I:%.*]] = alloca %"class.sycl::_V1::ext::oneapi::bfloat16", align 2
-// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META138:![0-9]+]])
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 2, ptr nonnull [[V_I]])
-// CHECK-NEXT:    [[V_ASCAST_I:%.*]] = addrspacecast ptr [[V_I]] to ptr addrspace(4)
-// CHECK-NEXT:    tail call void @llvm.memset.p4.i64(ptr addrspace(4) noundef align 32 dereferenceable(32) [[AGG_RESULT]], i8 0, i64 32, i1 false), !alias.scope [[META138]]
-// CHECK-NEXT:    [[REF_TMP_ASCAST_I_I:%.*]] = addrspacecast ptr [[REF_TMP_I_I]] to ptr addrspace(4)
+// CHECK-NEXT:    [[REF_TMP_I:%.*]] = alloca float, align 4
+// CHECK-NEXT:    [[A_ASCAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(4)
+// CHECK-NEXT:    tail call void @llvm.experimental.noalias.scope.decl(metadata [[META100:![0-9]+]])
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I]])
+// CHECK-NEXT:    [[REF_TMP_ASCAST_I:%.*]] = addrspacecast ptr [[REF_TMP_I]] to ptr addrspace(4)
+// CHECK-NEXT:    tail call void @llvm.memset.p4.i64(ptr addrspace(4) noundef align 32 dereferenceable(32) [[AGG_RESULT]], i8 0, i64 32, i1 false), !alias.scope [[META100]]
 // CHECK-NEXT:    br label [[FOR_COND_I:%.*]]
 // CHECK:       for.cond.i:
-// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_COND_I]] ]
+// CHECK-NEXT:    [[I_0_I:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC_I:%.*]], [[FOR_BODY_I:%.*]] ]
 // CHECK-NEXT:    [[CMP_I:%.*]] = icmp ult i64 [[I_0_I]], 16
-// CHECK-NEXT:    call void @llvm.assume(i1 [[CMP_I]])
-// CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i16>, ptr [[A]], i64 0, i64 [[I_0_I]]
-// CHECK-NEXT:    [[VECEXT_I:%.*]] = load i16, ptr [[TMP0]], align 2, !noalias [[META138]]
-// CHECK-NEXT:    store i16 [[VECEXT_I]], ptr [[V_I]], align 2, !tbaa [[TBAA141:![0-9]+]], !alias.scope [[META143:![0-9]+]], !noalias [[META138]]
-// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META138]]
-// CHECK-NEXT:    [[CALL_I_I:%.*]] = call spir_func float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[V_ASCAST_I]]) #[[ATTR9]], !noalias [[META146:![0-9]+]]
-// CHECK-NEXT:    [[FNEG_I_I:%.*]] = fneg float [[CALL_I_I]]
-// CHECK-NEXT:    store float [[FNEG_I_I]], ptr [[REF_TMP_I_I]], align 4, !tbaa [[TBAA66]], !noalias [[META146]]
-// CHECK-NEXT:    [[CALL_I_I_I_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I_I]]) #[[ATTR9]], !noalias [[META146]]
-// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I_I]]), !noalias [[META138]]
-// CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <16 x i16>, ptr addrspace(4) [[AGG_RESULT]], i64 0, i64 [[I_0_I]]
-// CHECK-NEXT:    store i16 [[CALL_I_I_I_I]], ptr addrspace(4) [[TMP1]], align 2, !alias.scope [[META138]]
+// CHECK-NEXT:    br i1 [[CMP_I]], label [[FOR_BODY_I]], label [[_ZN4SYCL3_V16DETAILNGERKNS0_3VECINS0_3EXT6ONEAPI8BFLOAT16ELI16EEE_EXIT:%.*]]
+// CHECK:       for.body.i:
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I_I:%.*]] = getelementptr inbounds [16 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[A_ASCAST]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    [[CALL_I_I_I:%.*]] = call spir_func noundef float @__devicelib_ConvertBF16ToFINTEL(ptr addrspace(4) noundef align 2 dereferenceable(2) [[ARRAYIDX_I_I_I_I]]) #[[ATTR8]], !noalias [[META100]]
+// CHECK-NEXT:    [[FNEG_I:%.*]] = fneg float [[CALL_I_I_I]]
+// CHECK-NEXT:    store float [[FNEG_I]], ptr [[REF_TMP_I]], align 4, !tbaa [[TBAA47]], !noalias [[META100]]
+// CHECK-NEXT:    [[ARRAYIDX_I_I_I9_I:%.*]] = getelementptr inbounds [16 x %"class.sycl::_V1::ext::oneapi::bfloat16"], ptr addrspace(4) [[AGG_RESULT]], i64 0, i64 [[I_0_I]]
+// CHECK-NEXT:    [[CALL_I_I10_I:%.*]] = call spir_func noundef zeroext i16 @__devicelib_ConvertFToBF16INTEL(ptr addrspace(4) noundef align 4 dereferenceable(4) [[REF_TMP_ASCAST_I]]) #[[ATTR8]], !noalias [[META100]]
+// CHECK-NEXT:    store i16 [[CALL_I_I10_I]], ptr addrspace(4) [[ARRAYIDX_I_I_I9_I]], align 2, !tbaa [[TBAA103:![0-9]+]], !alias.scope [[META100]]
 // CHECK-NEXT:    [[INC_I]] = add nuw nsw i64 [[I_0_I]], 1
-// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP149:![0-9]+]]
+// CHECK-NEXT:    br label [[FOR_COND_I]], !llvm.loop [[LOOP105:![0-9]+]]
+// CHECK:       _ZN4sycl3_V16detailngERKNS0_3vecINS0_3ext6oneapi8bfloat16ELi16EEE.exit:
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull [[REF_TMP_I]])
+// CHECK-NEXT:    ret void
 //
 SYCL_EXTERNAL auto TestMinus(vec<ext::oneapi::bfloat16, 16> a) { return -a; }
diff --git a/sycl/test/esimd/nbarriers.cpp b/sycl/test/esimd/nbarriers.cpp
deleted file mode 100644
index f9a0313cded2b..0000000000000
--- a/sycl/test/esimd/nbarriers.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-// RUN: %clangxx -fsycl -c -fsycl-device-only -Xclang -emit-llvm %s -o - 2>&1 | FileCheck %s
-
-#include <sycl/ext/intel/esimd.hpp>
-#include <sycl/sycl.hpp>
-
-using namespace sycl::ext::intel::esimd;
-using namespace sycl::ext::intel::experimental::esimd;
-
-template <typename name, typename Func>
-__attribute__((sycl_kernel)) void kernel(Func kernelFunc) {
-  kernelFunc();
-}
-
-void caller(int x) {
-  kernel<class kernel_esimd>([=]() SYCL_ESIMD_KERNEL {
-    __ESIMD_NS::named_barrier_init<7>();
-    __ESIMD_NS::named_barrier_wait(2);
-    // CHECK: call spir_func void @_Z13__esimd_fenceh(i8 noundef zeroext 33)
-    // CHECK-NEXT: call spir_func void @_Z23__esimd_nbarrier_arrive{{.*}}
-    __ESIMD_NS::named_barrier_signal(0, 0, 4, 4);
-  });
-}
diff --git a/sycl/test/esimd/spirv_intrins_trans.cpp b/sycl/test/esimd/spirv_intrins_trans.cpp
deleted file mode 100644
index 8228dec9edcd3..0000000000000
--- a/sycl/test/esimd/spirv_intrins_trans.cpp
+++ /dev/null
@@ -1,55 +0,0 @@
-// RUN: %clangxx -fsycl -fsycl-device-only -S -emit-llvm -x c++ %s -o %t
-// RUN: sycl-post-link -split-esimd -lower-esimd -O0 -S %t -o %t.table
-// RUN: FileCheck %s -input-file=%t_esimd_0.ll
-
-// This test checks that all LLVM-IR instructions that work with SPIR-V builtins
-// are correctly translated into GenX counterparts (implemented in
-// LowerESIMD.cpp)
-
-#include <sycl/ext/intel/esimd.hpp>
-#include <sycl/sycl.hpp>
-
-template <typename name, typename Func>
-__attribute__((sycl_kernel)) void kernel(Func kernelFunc) {
-  kernelFunc();
-}
-
-size_t caller() {
-
-  size_t DoNotOpt[1];
-  uint32_t DoNotOpt32[1];
-  size_t DoNotOptXYZ[3];
-
-  sycl::queue().submit([&](sycl::handler &cgh) {
-    auto DoNotOptimize = &DoNotOpt[0];
-    auto DoNotOptimize32 = &DoNotOpt32[0];
-
-    kernel<class kernel_SubgroupLocalInvocationId>([=]() SYCL_ESIMD_KERNEL {
-      DoNotOptimize[0] = __spirv_SubgroupLocalInvocationId();
-      DoNotOptimize32[0] = __spirv_SubgroupLocalInvocationId() + 3;
-    });
-    // CHECK-LABEL: @{{.*}}kernel_SubgroupLocalInvocationId
-    // CHECK: [[ZEXT0:%.*]] = zext i32 0 to i64
-    // CHECK: store i64 [[ZEXT0]]
-    // CHECK: add i32 0, 3
-
-    kernel<class kernel_SubgroupSize>([=]() SYCL_ESIMD_KERNEL {
-      DoNotOptimize[0] = __spirv_SubgroupSize();
-      DoNotOptimize32[0] = __spirv_SubgroupSize() + 7;
-    });
-    // CHECK-LABEL: @{{.*}}kernel_SubgroupSize
-    // CHECK: [[ZEXT0:%.*]] = zext i32 1 to i64
-    // CHECK: store i64 [[ZEXT0]]
-    // CHECK: add i32 1, 7
-
-    kernel<class kernel_SubgroupMaxSize>([=]() SYCL_ESIMD_KERNEL {
-      DoNotOptimize[0] = __spirv_SubgroupMaxSize();
-      DoNotOptimize32[0] = __spirv_SubgroupMaxSize() + 9;
-    });
-    // CHECK-LABEL: @{{.*}}kernel_SubgroupMaxSize
-    // CHECK: [[ZEXT0:%.*]] = zext i32 1 to i64
-    // CHECK: store i64 [[ZEXT0]]
-    // CHECK: add i32 1, 9
-  });
-  return DoNotOpt[0];
-}
diff --git a/sycl/test/check_device_code/group_ballot.cpp b/sycl/test/extensions/group_ballot.cpp
similarity index 100%
rename from sycl/test/check_device_code/group_ballot.cpp
rename to sycl/test/extensions/group_ballot.cpp
diff --git a/sycl/test/include_deps/sycl_accessor.hpp.cpp b/sycl/test/include_deps/sycl_accessor.hpp.cpp
index b17541012e0eb..de19e7e366560 100644
--- a/sycl/test/include_deps/sycl_accessor.hpp.cpp
+++ b/sycl/test/include_deps/sycl_accessor.hpp.cpp
@@ -76,6 +76,7 @@
 // CHECK-NEXT: range.hpp
 // CHECK-NEXT: info/info_desc.hpp
 // CHECK-NEXT: ext/oneapi/experimental/device_architecture.hpp
+// CHECK-NEXT: ext/oneapi/experimental/architectures.def
 // CHECK-NEXT: ext/oneapi/experimental/forward_progress.hpp
 // CHECK-NEXT: ext/oneapi/matrix/query-types.hpp
 // CHECK-NEXT: ext/oneapi/bfloat16.hpp
diff --git a/sycl/test/include_deps/sycl_buffer.hpp.cpp b/sycl/test/include_deps/sycl_buffer.hpp.cpp
index c10ea7bbab68f..2c5a67e477f70 100644
--- a/sycl/test/include_deps/sycl_buffer.hpp.cpp
+++ b/sycl/test/include_deps/sycl_buffer.hpp.cpp
@@ -68,6 +68,7 @@
 // CHECK-NEXT: detail/boost/mp11/detail/mp_with_index.hpp
 // CHECK-NEXT: detail/boost/mp11/integer_sequence.hpp
 // CHECK-NEXT: ext/oneapi/experimental/device_architecture.hpp
+// CHECK-NEXT: ext/oneapi/experimental/architectures.def
 // CHECK-NEXT: ext/oneapi/experimental/forward_progress.hpp
 // CHECK-NEXT: ext/oneapi/matrix/query-types.hpp
 // CHECK-NEXT: ext/oneapi/bfloat16.hpp
diff --git a/sycl/test/include_deps/sycl_detail_core.hpp.cpp b/sycl/test/include_deps/sycl_detail_core.hpp.cpp
index 8e0406259377c..9b3d9b3145108 100644
--- a/sycl/test/include_deps/sycl_detail_core.hpp.cpp
+++ b/sycl/test/include_deps/sycl_detail_core.hpp.cpp
@@ -77,6 +77,7 @@
 // CHECK-NEXT: range.hpp
 // CHECK-NEXT: info/info_desc.hpp
 // CHECK-NEXT: ext/oneapi/experimental/device_architecture.hpp
+// CHECK-NEXT: ext/oneapi/experimental/architectures.def
 // CHECK-NEXT: ext/oneapi/experimental/forward_progress.hpp
 // CHECK-NEXT: ext/oneapi/matrix/query-types.hpp
 // CHECK-NEXT: ext/oneapi/bfloat16.hpp
@@ -178,5 +179,6 @@
 // CHECK-NEXT: ext/oneapi/bindless_images_interop.hpp
 // CHECK-NEXT: ext/oneapi/bindless_images_mem_handle.hpp
 // CHECK-NEXT: ext/oneapi/experimental/use_root_sync_prop.hpp
+// CHECK-NEXT: ext/oneapi/experimental/virtual_functions.hpp
 // CHECK-NEXT: ext/oneapi/kernel_properties/properties.hpp
 // CHECK-EMPTY:
diff --git a/sycl/test/matrix/matrix-int8-test.cpp b/sycl/test/matrix/matrix-int8-test.cpp
deleted file mode 100644
index 41a8f78303fd3..0000000000000
--- a/sycl/test/matrix/matrix-int8-test.cpp
+++ /dev/null
@@ -1,184 +0,0 @@
-// RUN: %clangxx -fsycl -fsycl-device-only -O2 -S -emit-llvm -o - %s | FileCheck %s
-
-// CHECK-DAG: target("spirv.JointMatrixINTEL", i8, 12, 48, 0, 3, 0)
-// CHECK-DAG: target("spirv.JointMatrixINTEL", i32, 12, 12, 3, 3, 2)
-// CHECK-DAG: target("spirv.JointMatrixINTEL", i8, 48, 12, 2, 3, 1)
-
-// CHECK: !{!"matrix_type::sint32,use::accumulator,12,12;matrix_type::sint8,use::a,12,48;matrix_type::sint8,use::b,48,12"}
-// CHECK: !{!"matrix_type::sint8,matrix_type::sint8,matrix_type::sint32,matrix_type::sint32,12,48,12"}
-
-#include <iostream>
-#include <sycl/sycl.hpp>
-
-using namespace sycl;
-using namespace sycl::ext::oneapi::experimental::matrix;
-
-#define TILE_SZ 16
-#define TM (TILE_SZ - 4)
-#define TN (TILE_SZ - 4)
-#define TK (4 * TILE_SZ - 16)
-
-#define SG_SZ 16
-
-template <typename T, size_t NUM_ROWS, size_t NUM_COLS> struct big_matrix {
-public:
-  T *mat;
-
-public:
-  T *get_data() { return mat; }
-  void set_data(T *data) { mat = data; }
-  big_matrix(T *data) : mat(data) {}
-};
-
-template <typename T1, typename T2, size_t NUM_ROWS_A, size_t NUM_COLS_A,
-          size_t NUM_ROWS_B, size_t NUM_COLS_B, size_t NUM_ROWS_C,
-          size_t NUM_COLS_C>
-void matrix_multiply(big_matrix<T1, NUM_ROWS_C, NUM_COLS_C> &C,
-                     big_matrix<T2, NUM_ROWS_A, NUM_COLS_A> &A,
-                     big_matrix<T2, NUM_ROWS_B, NUM_COLS_B> &B) {
-  size_t M = NUM_ROWS_C;
-  size_t N = NUM_COLS_C;
-  size_t K = NUM_COLS_A;
-  // B => K/4 x N*4, A => M x K, C => M, N
-  // stride should be X's cols, e.g., B's stirde = N*4
-  assert(NUM_ROWS_C == NUM_ROWS_A && NUM_COLS_A == NUM_ROWS_B * 4);
-  size_t NDRangeM = M / TM;
-  size_t NDRangeN = N / TN;
-  buffer<int8_t, 2> bufA(A.get_data(), range<2>(M, K));
-  buffer<int8_t, 2> bufB(B.get_data(), range<2>(K, N));
-  buffer<int32_t, 2> bufC(C.get_data(), range<2>(M, N));
-
-  queue q;
-  q.submit([&](handler &cgh) {
-     auto accC = bufC.get_access<access::mode::read_write>(cgh);
-     auto accA = bufA.get_access<access::mode::read_write>(cgh);
-     auto accB = bufB.get_access<access::mode::read_write>(cgh);
-
-     cgh.parallel_for<class imatrix>(
-         nd_range<2>({NDRangeM, NDRangeN * SG_SZ}, {1, 1 * SG_SZ}),
-         [accA, accB, accC, M, N, K](nd_item<2> spmd_item)
-             [[intel::reqd_sub_group_size(SG_SZ)]]
-
-         {
-           // The submatrix API has to be accessed by all the workitems in a
-           // subgroup these functions will be called once by the subgroup no
-           // code divergence between the workitems
-           const auto global_idx = spmd_item.get_global_id(0);
-           const auto global_idy = spmd_item.get_global_id(1);
-           const auto sg_startx = global_idx - spmd_item.get_local_id(0);
-           const auto sg_starty = global_idy - spmd_item.get_local_id(1);
-
-           sycl::sub_group sg = spmd_item.get_sub_group();
-           joint_matrix<sycl::sub_group, int8_t, use::a, TM, TK,
-                        layout::row_major>
-               sub_a;
-           // For B, since current implementation does not support non-packed
-           // layout, users need to specify the updated VNNI sizes along with
-           // the packed_b layout. By default, the layout is row_major and size
-           // is (TK, TN).
-           joint_matrix<sycl::sub_group, int8_t, use::b, TK, TN,
-                        layout::ext_intel_packed>
-               sub_b;
-           joint_matrix<sycl::sub_group, int32_t, use::accumulator, TM, TN>
-               sub_c;
-
-           // AMX: 8 register tiles : 1k byte size, SMmaxxSKmax =16x64
-           // strideX = X's cols, so strideC = N, strideA = K, strideB = N*4
-           joint_matrix_fill(sg, sub_c, 0);
-           for (int k = 0; k < K / TK; k += 1) {
-             joint_matrix_load(
-                 sg, sub_a,
-                 accA.template get_multi_ptr<sycl::access::decorated::no>() +
-                     (sg_startx * TM) * K + k * TK,
-                 K);
-             // Assuming B data is already in VNNI format.
-             joint_matrix_load(
-                 sg, sub_b,
-                 accB.template get_multi_ptr<sycl::access::decorated::no>() +
-                     (k * TK / 4) * (N * 4) + sg_starty / SG_SZ * TN * 4,
-                 N * 4);
-             joint_matrix_mad(sg, sub_c, sub_a, sub_b, sub_c);
-           }
-           joint_matrix_store(
-               sg, sub_c,
-               accC.template get_multi_ptr<sycl::access::decorated::no>() +
-                   (sg_startx * TM) * N + sg_starty / SG_SZ * TN,
-               N, layout::row_major);
-         }); // parallel for
-   }).wait();
-}
-
-static constexpr size_t MATRIX_M = TM * 2;
-static constexpr size_t MATRIX_N = TN * 2;
-static constexpr size_t MATRIX_K = TK * 2;
-int8_t A[MATRIX_M][MATRIX_K];
-int8_t B[MATRIX_K / 4][MATRIX_N * 4];
-int32_t C[MATRIX_M][MATRIX_N];
-int32_t D[MATRIX_M][MATRIX_N];
-
-void matrix_multiply_ref(int32_t *A_mem, int32_t *B_mem, int32_t *C_mem, int M,
-                         int N, int K) {
-  // tiling
-  for (int m = 0; m < M; m++)
-    for (int n = 0; n < N; n++) {
-      for (int k = 0; k < K; k++) {
-        char *va = (char *)(A_mem + m * K + k);
-        char *vb = (char *)(B_mem + k * N + n);
-        int acc = *(C_mem + m * N + n);
-        for (int i = 0; i < 4; i++) {
-          acc += (va[i] * vb[i]);
-        }
-        *(C_mem + m * N + n) = acc;
-      }
-    }
-}
-
-int main() {
-  for (int i = 0; i < MATRIX_M; i++) {
-    for (int j = 0; j < MATRIX_K; j++) {
-      A[i][j] = i + 2 * j;
-    }
-  }
-  for (int i = 0; i < MATRIX_K / 4; i++) {
-    for (int j = 0; j < MATRIX_N * 4; j++) {
-      B[i][j] = i + j;
-    }
-  }
-  for (int i = 0; i < MATRIX_M; i++) {
-    for (int j = 0; j < MATRIX_N; j++) {
-      C[i][j] = 0;
-      D[i][j] = 0;
-    }
-  }
-
-  big_matrix<int32_t, MATRIX_M, MATRIX_N> MC((int32_t *)&C);
-  big_matrix<int32_t, MATRIX_M, MATRIX_N> MD((int32_t *)&D);
-  big_matrix<int8_t, MATRIX_M, MATRIX_K> MA((int8_t *)&A);
-  big_matrix<int8_t, MATRIX_K / 4, MATRIX_N * 4> MB((int8_t *)&B);
-  matrix_multiply(MC, MA, MB);
-  matrix_multiply_ref((int32_t *)A, (int32_t *)B, (int32_t *)D, MATRIX_M,
-                      MATRIX_N, MATRIX_K / 4);
-
-  bool res = true;
-  for (int i = 0; i < MATRIX_M; i++) {
-    for (int j = 0; j < MATRIX_N; j++) {
-      if (C[i][j] != D[i][j])
-        res = false;
-    }
-  }
-  if (res)
-    std::cout << "passed\n";
-  else
-    std::cout << "failed\n";
-  for (int i = 0; i < MATRIX_M; i++) {
-    for (int j = 0; j < MATRIX_N; j++)
-      std::cout << C[i][j] << ", ";
-    std::cout << "\n";
-  }
-  std::cout << std::endl;
-  for (int i = 0; i < MATRIX_M; i++) {
-    for (int j = 0; j < MATRIX_N; j++)
-      std::cout << D[i][j] << ", ";
-    std::cout << "\n";
-  }
-}
diff --git a/sycl/test/optional_kernel_features/atomic_ref-atomic64-aspect.cpp b/sycl/test/optional_kernel_features/atomic_ref-atomic64-aspect.cpp
index 2e11af832949f..8dd950e5e7232 100644
--- a/sycl/test/optional_kernel_features/atomic_ref-atomic64-aspect.cpp
+++ b/sycl/test/optional_kernel_features/atomic_ref-atomic64-aspect.cpp
@@ -1,10 +1,11 @@
 // RUN: %clangxx %s -S -o %t.ll -fsycl-device-only -Xclang -disable-llvm-passes
 // RUN: FileCheck %s --input-file %t.ll
 
-// CHECK: !sycl_types_that_use_aspects = !{![[#MDNUM1:]], ![[#MDNUM2:]], ![[#MDNUM3:]]}
-// CHECK: ![[#MDNUM1]] = !{!"class.sycl::_V1::detail::atomic_ref_impl", i32 [[#ASPECT_NUM:]]}
-// CHECK: ![[#MDNUM2]] = !{!"class.sycl::_V1::detail::atomic_ref_impl.2", i32 [[#ASPECT_NUM:]]}
-// CHECK: ![[#MDNUM3]] = !{!"class.sycl::_V1::detail::atomic_ref_impl.7", i32 [[#ASPECT_NUM:]]}
+// CHECK: !sycl_types_that_use_aspects = !{![[#MDNUM1:]], ![[#MDNUM2:]], ![[#MDNUM3:]], ![[#MDNUM4:]]}
+// CHECK: ![[#MDNUM1]] = !{!"class.sycl::_V1::detail::atomic_ref_impl.20", i32 [[#ASPECT_NUM:]]}
+// CHECK-NEXT: ![[#MDNUM2]] = !{!"class.sycl::_V1::detail::atomic_ref_impl", i32 [[#ASPECT_NUM:]]}
+// CHECK-NEXT: ![[#MDNUM3]] = !{!"class.sycl::_V1::detail::atomic_ref_impl.2", i32 [[#ASPECT_NUM:]]}
+// CHECK-NEXT: ![[#MDNUM4]] = !{!"class.sycl::_V1::detail::atomic_ref_impl.7", i32 [[#ASPECT_NUM:]]}
 // CHECK: !{{.*}} = !{!"atomic64", i32 [[#ASPECT_NUM]]}
 
 #include <sycl/sycl.hpp>
@@ -46,6 +47,12 @@ int main() {
           sycl::atomic_ref<int, sycl::memory_order_acq_rel,
                            sycl::memory_scope_device,
                            sycl::access::address_space::local_space>(val_int);
+
+      double *ptr = nullptr;
+      auto ref_double_ptr =
+          sycl::atomic_ref<double *, sycl::memory_order_acq_rel,
+                           sycl::memory_scope_device,
+                           sycl::access::address_space::local_space>(ptr);
     });
   });
   return 0;
diff --git a/sycl/test/virtual-functions/calls-indirectly-ir.cpp b/sycl/test/virtual-functions/calls-indirectly-ir.cpp
new file mode 100644
index 0000000000000..1f81142fc5e70
--- /dev/null
+++ b/sycl/test/virtual-functions/calls-indirectly-ir.cpp
@@ -0,0 +1,57 @@
+// RUN: %clangxx -fsycl -fsycl-device-only -emit-llvm -S %s -o %t.ll
+// RUN: FileCheck %s < %t.ll
+//
+// This test is intended to check integration between SYCL headers and SYCL FE,
+// i.e. to make sure that setting properties related to virtual functions will
+// result in the right LLVM IR.
+//
+// This test is specifically focused on the calls_indirectly property.
+//
+// CHECK: define {{.*}}KEmpty{{.*}} #[[#ATTR_SET_DEFAULT:]]
+// CHECK: define {{.*}}KInt{{.*}} #[[#ATTR_SET_INT:]]
+// CHECK: define {{.*}}KVoid{{.*}} #[[#ATTR_SET_DEFAULT]]
+// CHECK: define {{.*}}KUserDefined{{.*}} #[[#ATTR_SET_USER_DEFINED:]]
+// TODO: update the check below
+// As of now, calls_indirectly_property takes into account only the first
+// template argument and ignores the rest. This will be fixed in follow-up
+// patches, and the test should be updated to reflect that, because the
+// current behavior is not correct.
+// CHECK: define {{.*}}KMultiple{{.*}} #[[#ATTR_SET_INT]]
+//
+// CHECK-DAG: attributes #[[#ATTR_SET_DEFAULT]] {{.*}} "calls-indirectly"="_ZTSv"
+// CHECK-DAG: attributes #[[#ATTR_SET_INT]] {{.*}} "calls-indirectly"="_ZTSi"
+// CHECK-DAG: attributes #[[#ATTR_SET_USER_DEFINED]] {{.*}} "calls-indirectly"="_ZTS12user_defined"
+
+#include <sycl/sycl.hpp>
+
+namespace oneapi = sycl::ext::oneapi::experimental;
+
+struct user_defined {
+  int a;
+  float b;
+};
+
+class KEmpty;
+class KInt;
+class KVoid;
+class KUserDefined;
+class KMultiple;
+
+int main() {
+  sycl::queue q;
+
+  oneapi::properties props_empty{oneapi::calls_indirectly<>};
+  oneapi::properties props_int{oneapi::calls_indirectly<int>};
+  oneapi::properties props_void{oneapi::calls_indirectly<void>};
+  oneapi::properties props_user_defined{oneapi::calls_indirectly<user_defined>};
+  oneapi::properties props_multiple{
+      oneapi::calls_indirectly<int, user_defined>};
+
+  q.single_task<KEmpty>(props_empty, [=]() {});
+  q.single_task<KInt>(props_int, [=]() {});
+  q.single_task<KVoid>(props_void, [=]() {});
+  q.single_task<KUserDefined>(props_user_defined, [=]() {});
+  q.single_task<KMultiple>(props_multiple, [=]() {});
+
+  return 0;
+}
diff --git a/sycl/test/virtual-functions/indirectly-callable-ir.cpp b/sycl/test/virtual-functions/indirectly-callable-ir.cpp
new file mode 100644
index 0000000000000..bd528a44b69a3
--- /dev/null
+++ b/sycl/test/virtual-functions/indirectly-callable-ir.cpp
@@ -0,0 +1,66 @@
+// RUN: %clangxx -fsycl -fsycl-device-only -emit-llvm -S -Xclang -fsycl-allow-virtual-functions %s -o %t.ll
+// RUN: FileCheck %s < %t.ll
+//
+// This test is intended to check integration between SYCL headers and SYCL FE,
+// i.e. to make sure that setting properties related to virtual functions will
+// result in the right LLVM IR.
+//
+// This test is specifically focused on the indirectly_callable property.
+//
+// CHECK: define {{.*}} @_ZN4Base3fooEv{{.*}} #[[#ATTR_SET_DEFAULT:]]
+// CHECK: define {{.*}} @_ZN7Derived3fooEv{{.*}} #[[#ATTR_SET_DEFAULT]]
+// CHECK: define {{.*}} @_ZN7Derived3barEv{{.*}} #[[#ATTR_SET_DEFAULT]]
+// CHECK: define {{.*}} @_ZN10SubDerived3barEv{{.*}} #[[#ATTR_SET_INT:]]
+// CHECK: define {{.*}} @_ZN13SubSubDerived3foo{{.*}} #[[#ATTR_SET_DEFAULT]]
+// CHECK: define {{.*}} @_ZN13SubSubDerived3barEv{{.*}} #[[#ATTR_SET_BASE:]]
+//
+// CHECK-DAG: attributes #[[#ATTR_SET_DEFAULT]] {{.*}} "indirectly-callable"="_ZTSv"
+// CHECK-DAG: attributes #[[#ATTR_SET_INT]] {{.*}} "indirectly-callable"="_ZTSi"
+// CHECK-DAG: attributes #[[#ATTR_SET_BASE]] {{.*}} "indirectly-callable"="_ZTS4Base"
+
+#include <sycl/sycl.hpp>
+
+namespace oneapi = sycl::ext::oneapi::experimental;
+
+class Base {
+public:
+  SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(oneapi::indirectly_callable<>)
+  virtual int foo();
+};
+
+int Base::foo() { return 42; }
+
+class Derived : public Base {
+public:
+  SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(oneapi::indirectly_callable<void>)
+  virtual int bar();
+  SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(oneapi::indirectly_callable<>)
+  int foo() override;
+};
+
+int Derived::foo() { return 43; }
+
+int Derived::bar() { return 0; }
+
+class SubDerived : public Derived {
+public:
+  int foo() override { return 44; }
+
+  SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(oneapi::indirectly_callable<int>)
+  int bar() override;
+};
+
+int SubDerived::bar() { return 1; }
+
+class SubSubDerived : public SubDerived {
+public:
+  SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(oneapi::indirectly_callable<>)
+  int foo() override;
+
+  SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(oneapi::indirectly_callable<Base>)
+  int bar() override;
+};
+
+int SubSubDerived::foo() { return 45; }
+
+int SubSubDerived::bar() { return 2; }
diff --git a/sycl/test/virtual-functions/properties-negative.cpp b/sycl/test/virtual-functions/properties-negative.cpp
new file mode 100644
index 0000000000000..a26f62755641c
--- /dev/null
+++ b/sycl/test/virtual-functions/properties-negative.cpp
@@ -0,0 +1,30 @@
+// RUN: %clangxx %fsycl-host-only -fsyntax-only -Xclang -verify -Xclang -verify-ignore-unexpected=note,warning %s
+
+#include <sycl/sycl.hpp>
+
+namespace oneapi = sycl::ext::oneapi::experimental;
+
+struct user_defined {
+  int a;
+  float b;
+};
+
+int main() {
+  sycl::queue q;
+
+  oneapi::properties props_empty{oneapi::indirectly_callable<>};
+  oneapi::properties props_void{oneapi::indirectly_callable<void>};
+  oneapi::properties props_int{oneapi::indirectly_callable<int>};
+  oneapi::properties props_user{oneapi::indirectly_callable<user_defined>};
+
+  // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}}
+  q.single_task(props_empty, [=]() {});
+  // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}}
+  q.single_task(props_void, [=]() {});
+  // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}}
+  q.single_task(props_int, [=]() {});
+  // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement {{.*}} indirectly_callable property cannot be applied to SYCL kernels}}
+  q.single_task(props_user, [=]() {});
+
+  return 0;
+}
diff --git a/sycl/test/virtual-functions/properties-positive.cpp b/sycl/test/virtual-functions/properties-positive.cpp
new file mode 100644
index 0000000000000..f254e5d2a6df9
--- /dev/null
+++ b/sycl/test/virtual-functions/properties-positive.cpp
@@ -0,0 +1,64 @@
+// RUN: %clangxx -fsycl -fsyntax-only -Xclang -verify %s
+//
+// This test is intended to check that we can successfully compile code that
+// uses new properties from the virtual functions extension.
+//
+// expected-no-diagnostics
+
+#include <sycl/sycl.hpp>
+
+namespace oneapi = sycl::ext::oneapi::experimental;
+
+class Base {
+public:
+  SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(oneapi::indirectly_callable<>)
+  virtual void foo() {}
+};
+
+class Derived : public Base {
+public:
+  SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(oneapi::indirectly_callable<>)
+  void foo() override {}
+
+  SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(oneapi::indirectly_callable<void>)
+  virtual void bar() {}
+};
+
+class SubDerived : public Derived {
+public:
+  void foo() override {}
+
+  SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(oneapi::indirectly_callable<int>)
+  void bar() override {}
+};
+
+class SubSubDerived : public SubDerived {
+public:
+  SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(oneapi::indirectly_callable<>)
+  void foo() override {}
+
+  SYCL_EXT_ONEAPI_FUNCTION_PROPERTY(oneapi::indirectly_callable<Base>)
+  void bar() override {}
+};
+
+int main() {
+  sycl::queue q;
+
+  static_assert(
+      oneapi::is_property_key<oneapi::indirectly_callable_key>::value);
+  static_assert(oneapi::is_property_key<oneapi::calls_indirectly_key>::value);
+
+  oneapi::properties props_empty{oneapi::calls_indirectly<>};
+  oneapi::properties props_void{oneapi::calls_indirectly<void>};
+  oneapi::properties props_int{oneapi::calls_indirectly<int>};
+  oneapi::properties props_base{oneapi::calls_indirectly<Base>};
+  oneapi::properties props_multiple{oneapi::calls_indirectly<int, Base>};
+
+  q.single_task(props_empty, [=]() {});
+  q.single_task(props_void, [=]() {});
+  q.single_task(props_int, [=]() {});
+  q.single_task(props_base, [=]() {});
+  q.single_task(props_multiple, [=]() {});
+
+  return 0;
+}
diff --git a/sycl/tools/sycl-ls/sycl-ls.cpp b/sycl/tools/sycl-ls/sycl-ls.cpp
index f71221840d397..91a2aa3c84e74 100644
--- a/sycl/tools/sycl-ls/sycl-ls.cpp
+++ b/sycl/tools/sycl-ls/sycl-ls.cpp
@@ -69,6 +69,21 @@ std::string getDeviceTypeName(const device &Device) {
   }
 }
 
+const char *getArchName(const device &Device) {
+  namespace syclex = sycl::ext::oneapi::experimental;
+  auto arch = Device.get_info<syclex::info::device::architecture>();
+  switch (arch) {
+#define __SYCL_ARCHITECTURE(ARCH, VAL)                                         \
+  case syclex::architecture::ARCH:                                             \
+    return #ARCH;
+#define __SYCL_ARCHITECTURE_ALIAS(ARCH, VAL)
+#include <sycl/ext/oneapi/experimental/architectures.def>
+#undef __SYCL_ARCHITECTURE
+#undef __SYCL_ARCHITECTURE_ALIAS
+  }
+  return "unknown";
+}
+
 template <typename RangeTy, typename ElemTy>
 bool contains(RangeTy &&Range, const ElemTy &Elem) {
   return std::find(Range.begin(), Range.end(), Elem) != Range.end();
@@ -153,6 +168,8 @@ static void printDeviceInfo(const device &Device, bool Verbose,
     for (auto size : sg_sizes)
       std::cout << " " << size;
     std::cout << std::endl;
+    std::cout << Prepend << "Architecture: " << getArchName(Device)
+              << std::endl;
   } else {
     std::cout << Prepend << ", " << DeviceName << " " << DeviceVersion << " ["
               << DeviceDriverVersion << "]" << std::endl;
diff --git a/sycl/unittests/Extensions/CommandGraph/Regressions.cpp b/sycl/unittests/Extensions/CommandGraph/Regressions.cpp
index 17b58f542d760..94b8549ed7c04 100644
--- a/sycl/unittests/Extensions/CommandGraph/Regressions.cpp
+++ b/sycl/unittests/Extensions/CommandGraph/Regressions.cpp
@@ -58,3 +58,30 @@ TEST_F(CommandGraphTest, AccessorModeRegression) {
   EXPECT_EQ(NodeC.get_predecessors().size(), 0ul);
   EXPECT_EQ(NodeC.get_successors().size(), 0ul);
 }
+
+TEST_F(CommandGraphTest, QueueRecordBarrierMultipleGraph) {
+  // Test that barriers can be recorded from the same queue into
+  // different graphs.
+
+  Graph.begin_recording(Queue);
+  auto NodeKernel = Queue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+  Queue.ext_oneapi_submit_barrier({NodeKernel});
+  Graph.end_recording(Queue);
+
+  experimental::command_graph<experimental::graph_state::modifiable> GraphB{
+      Queue};
+  GraphB.begin_recording(Queue);
+  auto NodeKernelB = Queue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+  Queue.ext_oneapi_submit_barrier({NodeKernelB});
+  GraphB.end_recording(Queue);
+
+  experimental::command_graph<experimental::graph_state::modifiable> GraphC{
+      Queue};
+  GraphC.begin_recording(Queue);
+  auto NodeKernelC = Queue.submit(
+      [&](sycl::handler &cgh) { cgh.single_task<TestKernel<>>([]() {}); });
+  Queue.ext_oneapi_submit_barrier();
+  GraphC.end_recording(Queue);
+}
diff --git a/sycl/unittests/buffer/BufferReleaseBase.cpp b/sycl/unittests/buffer/BufferReleaseBase.cpp
index 83c9bb45c9c46..27c45edc983d7 100644
--- a/sycl/unittests/buffer/BufferReleaseBase.cpp
+++ b/sycl/unittests/buffer/BufferReleaseBase.cpp
@@ -256,9 +256,8 @@ TEST_F(BufferDestructionCheck, ReadyToReleaseLogic) {
 
   sycl::buffer<int, 1> Buf(1);
   sycl::detail::Requirement MockReq = getMockRequirement(Buf);
-  std::vector<sycl::detail::Command *> AuxCmds;
   sycl::detail::MemObjRecord *Rec = MockSchedulerPtr->getOrInsertMemObjRecord(
-      sycl::detail::getSyclObjImpl(Q), &MockReq, AuxCmds);
+      sycl::detail::getSyclObjImpl(Q), &MockReq);
 
   std::shared_ptr<sycl::detail::context_impl> CtxImpl =
       sycl::detail::getSyclObjImpl(Context);
diff --git a/sycl/unittests/buffer/BufferReleaseBase.hpp b/sycl/unittests/buffer/BufferReleaseBase.hpp
index 5f46604827302..b35d73cb3909c 100644
--- a/sycl/unittests/buffer/BufferReleaseBase.hpp
+++ b/sycl/unittests/buffer/BufferReleaseBase.hpp
@@ -58,9 +58,8 @@ class BufferDestructionCheckCommon : public ::testing::Test {
   template <typename Buffer>
   MockCmdWithReleaseTracking *addCommandToBuffer(Buffer &Buf, sycl::queue &Q) {
     sycl::detail::Requirement MockReq = getMockRequirement(Buf);
-    std::vector<sycl::detail::Command *> AuxCmds;
     sycl::detail::MemObjRecord *Rec = MockSchedulerPtr->getOrInsertMemObjRecord(
-        sycl::detail::getSyclObjImpl(Q), &MockReq, AuxCmds);
+        sycl::detail::getSyclObjImpl(Q), &MockReq);
     MockCmdWithReleaseTracking *MockCmd = new MockCmdWithReleaseTracking(
         sycl::detail::getSyclObjImpl(Q), MockReq);
     std::vector<sycl::detail::Command *> ToEnqueue;
diff --git a/sycl/unittests/helpers/PiMockPlugin.hpp b/sycl/unittests/helpers/PiMockPlugin.hpp
index d9f18d9008f0d..56803e7eab5bb 100644
--- a/sycl/unittests/helpers/PiMockPlugin.hpp
+++ b/sycl/unittests/helpers/PiMockPlugin.hpp
@@ -502,6 +502,8 @@ inline pi_result mock_piextMemUnsampledImageCreate(
   return PI_SUCCESS;
 }
 
+[[deprecated("This function has been deprecated in favor of "
+             "`piextImportExternalMemory`")]]
 inline pi_result
 mock_piextMemImportOpaqueFD(pi_context context, pi_device device, size_t size,
                             int file_descriptor,
@@ -524,12 +526,28 @@ inline pi_result mock_piextMemReleaseInterop(pi_context context,
   return PI_SUCCESS;
 }
 
+[[deprecated("This function has been deprecated in favor of "
+             "`piextImportExternalSemaphore`")]]
 inline pi_result mock_piextImportExternalSemaphoreOpaqueFD(
     pi_context context, pi_device device, int file_descriptor,
     pi_interop_semaphore_handle *ret_handle) {
   return PI_SUCCESS;
 }
 
+inline pi_result mock_piextImportExternalSemaphore(
+    pi_context context, pi_device device,
+    pi_external_semaphore_descriptor *sem_descriptor,
+    pi_interop_semaphore_handle *ret_handle) {
+  return PI_SUCCESS;
+}
+
+inline pi_result
+mock_piextImportExternalMemory(pi_context context, pi_device device,
+                               pi_external_mem_descriptor *mem_descriptor,
+                               pi_interop_mem_handle *ret_handle) {
+  return PI_SUCCESS;
+}
+
 inline pi_result
 mock_piextDestroyExternalSemaphore(pi_context context, pi_device device,
                                    pi_interop_semaphore_handle sem_handle) {
@@ -538,13 +556,14 @@ mock_piextDestroyExternalSemaphore(pi_context context, pi_device device,
 
 inline pi_result mock_piextWaitExternalSemaphore(
     pi_queue command_queue, pi_interop_semaphore_handle sem_handle,
-    pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
-    pi_event *event) {
+    bool has_wait_value, uint64_t wait_value, pi_uint32 num_events_in_wait_list,
+    const pi_event *event_wait_list, pi_event *event) {
   return PI_SUCCESS;
 }
 
 inline pi_result mock_piextSignalExternalSemaphore(
     pi_queue command_queue, pi_interop_semaphore_handle sem_handle,
+    bool has_signal_value, uint64_t signal_value,
     pi_uint32 num_events_in_wait_list, const pi_event *event_wait_list,
     pi_event *event) {
   return PI_SUCCESS;
diff --git a/sycl/unittests/kernel-and-program/OutOfResources.cpp b/sycl/unittests/kernel-and-program/OutOfResources.cpp
index fe6f18b53e23e..e8c04bf8796c6 100644
--- a/sycl/unittests/kernel-and-program/OutOfResources.cpp
+++ b/sycl/unittests/kernel-and-program/OutOfResources.cpp
@@ -68,6 +68,7 @@ static sycl::unittest::PiImageArray<2> ImgArray{Img};
 
 static int nProgramCreate = 0;
 static volatile bool outOfResourcesToggle = false;
+static volatile bool outOfHostMemoryToggle = false;
 
 static pi_result redefinedProgramCreate(pi_context context, const void *il,
                                         size_t length,
@@ -80,6 +81,17 @@ static pi_result redefinedProgramCreate(pi_context context, const void *il,
   return PI_SUCCESS;
 }
 
+static pi_result
+redefinedProgramCreateOutOfHostMemory(pi_context context, const void *il,
+                                      size_t length, pi_program *res_program) {
+  ++nProgramCreate;
+  if (outOfHostMemoryToggle) {
+    outOfHostMemoryToggle = false;
+    return PI_ERROR_OUT_OF_HOST_MEMORY;
+  }
+  return PI_SUCCESS;
+}
+
 TEST(OutOfResourcesTest, piProgramCreate) {
   sycl::unittest::PiMock Mock;
   Mock.redefineBefore<detail::PiApiKind::piProgramCreate>(
@@ -141,6 +153,70 @@ TEST(OutOfResourcesTest, piProgramCreate) {
   }
 }
 
+TEST(OutOfHostMemoryTest, piProgramCreate) {
+  // Reset to zero.
+  nProgramCreate = 0;
+
+  sycl::unittest::PiMock Mock;
+  Mock.redefineBefore<detail::PiApiKind::piProgramCreate>(
+      redefinedProgramCreateOutOfHostMemory);
+
+  sycl::platform Plt{Mock.getPlatform()};
+  sycl::context Ctx{Plt};
+  auto CtxImpl = detail::getSyclObjImpl(Ctx);
+  queue q(Ctx, default_selector_v);
+
+  int runningTotal = 0;
+  // Cache is empty, so one piProgramCreate call.
+  q.single_task<class OutOfResourcesKernel1>([] {});
+  EXPECT_EQ(nProgramCreate, runningTotal += 1);
+
+  // Now, we make the next piProgramCreate call fail with
+  // PI_ERROR_OUT_OF_HOST_MEMORY. The caching mechanism should catch this,
+  // clear the cache, and retry the piProgramCreate.
+  outOfHostMemoryToggle = true;
+  q.single_task<class OutOfResourcesKernel2>([] {});
+  EXPECT_FALSE(outOfHostMemoryToggle);
+  EXPECT_EQ(nProgramCreate, runningTotal += 2);
+  {
+    detail::KernelProgramCache::ProgramCache &Cache =
+        CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
+    EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache";
+  }
+
+  // The next piProgramCreate call will fail with
+  // PI_ERROR_OUT_OF_HOST_MEMORY. But OutOfResourcesKernel2 is in
+  // the cache, so we expect no new piProgramCreate calls.
+  outOfHostMemoryToggle = true;
+  q.single_task<class OutOfResourcesKernel2>([] {});
+  EXPECT_TRUE(outOfHostMemoryToggle);
+  EXPECT_EQ(nProgramCreate, runningTotal);
+
+  // OutOfResourcesKernel1 is not in the cache, so we have to
+  // build it. From what we set before, this call will fail,
+  // the cache will clear out, and will try again.
+  q.single_task<class OutOfResourcesKernel1>([] {});
+  EXPECT_FALSE(outOfHostMemoryToggle);
+  EXPECT_EQ(nProgramCreate, runningTotal += 2);
+  {
+    detail::KernelProgramCache::ProgramCache &Cache =
+        CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
+    EXPECT_EQ(Cache.size(), 1U) << "Expected 1 program in the cache";
+  }
+
+  // Finally, OutOfResourcesKernel1 will be in the cache, but
+  // OutOfResourcesKernel2 will not, so one more piProgramCreate.
+  // Toggle is not set, so this should succeed.
+  q.single_task<class OutOfResourcesKernel1>([] {});
+  q.single_task<class OutOfResourcesKernel2>([] {});
+  EXPECT_EQ(nProgramCreate, runningTotal += 1);
+  {
+    detail::KernelProgramCache::ProgramCache &Cache =
+        CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
+    EXPECT_EQ(Cache.size(), 2U) << "Expected 2 program in the cache";
+  }
+}
+
 static int nProgramLink = 0;
 
 static pi_result
@@ -158,6 +234,20 @@ redefinedProgramLink(pi_context context, pi_uint32 num_devices,
   return PI_SUCCESS;
 }
 
+static pi_result redefinedProgramLinkOutOfHostMemory(
+    pi_context context, pi_uint32 num_devices, const pi_device *device_list,
+    const char *options, pi_uint32 num_input_programs,
+    const pi_program *input_programs,
+    void (*pfn_notify)(pi_program program, void *user_data), void *user_data,
+    pi_program *ret_program) {
+  ++nProgramLink;
+  if (outOfHostMemoryToggle) {
+    outOfHostMemoryToggle = false;
+    return PI_ERROR_OUT_OF_HOST_MEMORY;
+  }
+  return PI_SUCCESS;
+}
+
 TEST(OutOfResourcesTest, piProgramLink) {
   sycl::unittest::PiMock Mock;
   Mock.redefineBefore<detail::PiApiKind::piProgramLink>(redefinedProgramLink);
@@ -191,4 +281,43 @@ TEST(OutOfResourcesTest, piProgramLink) {
         CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
     EXPECT_EQ(Cache.size(), 0u) << "Expect no programs in the cache";
   }
-}
\ No newline at end of file
+}
+
+TEST(OutOfHostMemoryTest, piProgramLink) {
+  // Reset to zero.
+  nProgramLink = 0;
+
+  sycl::unittest::PiMock Mock;
+  Mock.redefineBefore<detail::PiApiKind::piProgramLink>(
+      redefinedProgramLinkOutOfHostMemory);
+
+  sycl::platform Plt{Mock.getPlatform()};
+  sycl::context Ctx{Plt};
+  auto CtxImpl = detail::getSyclObjImpl(Ctx);
+  queue q(Ctx, default_selector_v);
+  // Put some programs in the cache
+  q.single_task<class OutOfResourcesKernel1>([] {});
+  q.single_task<class OutOfResourcesKernel2>([] {});
+  {
+    detail::KernelProgramCache::ProgramCache &Cache =
+        CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
+    EXPECT_EQ(Cache.size(), 2U) << "Expect 2 programs in the cache";
+  }
+
+  auto b1 = sycl::get_kernel_bundle<OutOfResourcesKernel1,
+                                    sycl::bundle_state::object>(Ctx);
+  auto b2 = sycl::get_kernel_bundle<OutOfResourcesKernel2,
+                                    sycl::bundle_state::object>(Ctx);
+  outOfHostMemoryToggle = true;
+  EXPECT_EQ(nProgramLink, 0);
+  auto b3 = sycl::link({b1, b2});
+  EXPECT_FALSE(outOfHostMemoryToggle);
+  // One restart due to out of host memory, one link per each of b1 and b2.
+  EXPECT_EQ(nProgramLink, 3);
+  // No programs should be in the cache due to out of host memory.
+  {
+    detail::KernelProgramCache::ProgramCache &Cache =
+        CtxImpl->getKernelProgramCache().acquireCachedPrograms().get();
+    EXPECT_EQ(Cache.size(), 0u) << "Expect no programs in the cache";
+  }
+}
diff --git a/sycl/unittests/scheduler/AllocaLinking.cpp b/sycl/unittests/scheduler/AllocaLinking.cpp
index a77995a203da3..dfb51edcaf13e 100644
--- a/sycl/unittests/scheduler/AllocaLinking.cpp
+++ b/sycl/unittests/scheduler/AllocaLinking.cpp
@@ -67,9 +67,8 @@ TEST_F(SchedulerTest, AllocaLinking) {
     buffer<int, 1> Buf(range<1>(1));
     detail::Requirement Req = getMockRequirement(Buf);
 
+    detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
     std::vector<detail::Command *> AuxCmds;
-    detail::MemObjRecord *Record =
-        MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds);
     detail::AllocaCommandBase *NonHostAllocaCmd =
         MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds);
     detail::AllocaCommandBase *HostAllocaCmd =
@@ -84,9 +83,8 @@ TEST_F(SchedulerTest, AllocaLinking) {
         range<1>(1), {ext::oneapi::property::buffer::use_pinned_host_memory()});
     detail::Requirement Req = getMockRequirement(Buf);
 
+    detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
     std::vector<detail::Command *> AuxCmds;
-    detail::MemObjRecord *Record =
-        MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds);
     detail::AllocaCommandBase *NonHostAllocaCmd =
         MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds);
     detail::AllocaCommandBase *HostAllocaCmd =
@@ -101,9 +99,8 @@ TEST_F(SchedulerTest, AllocaLinking) {
     buffer<int, 1> Buf(range<1>(1));
     detail::Requirement Req = getMockRequirement(Buf);
 
+    detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
     std::vector<detail::Command *> AuxCmds;
-    detail::MemObjRecord *Record =
-        MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds);
     detail::AllocaCommandBase *NonHostAllocaCmd =
         MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds);
     detail::AllocaCommandBase *HostAllocaCmd =
diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp
index fc816d1a4f3af..414f58c6f177c 100644
--- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp
+++ b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp
@@ -165,7 +165,13 @@ class DependsOnTests : public ::testing::Test {
   };
 };
 
+#ifdef _WIN32
+// Disabled on Windows due to flaky behavior
+// https://github.com/intel/llvm/issues/14060
+TEST_F(DependsOnTests, DISABLED_EnqueueNoMemObjTwoHostTasks) {
+#else
 TEST_F(DependsOnTests, EnqueueNoMemObjTwoHostTasks) {
+#endif
   // Checks enqueue of two dependent host tasks
   detail::QueueImplPtr QueueHostImpl = MS.getDefaultHostQueue();
   std::vector<EventImplPtr> Events;
diff --git a/sycl/unittests/scheduler/GraphCleanup.cpp b/sycl/unittests/scheduler/GraphCleanup.cpp
index 3389769569e5e..9bf4e37eea0db 100644
--- a/sycl/unittests/scheduler/GraphCleanup.cpp
+++ b/sycl/unittests/scheduler/GraphCleanup.cpp
@@ -78,13 +78,12 @@ static void checkCleanupOnEnqueue(MockScheduler &MS,
                                   buffer<int, 1> &Buf,
                                   detail::Requirement &MockReq) {
   bool CommandDeleted = false;
-  std::vector<detail::Command *> AuxCmds;
   std::vector<detail::Command *> ToCleanUp;
   std::vector<detail::Command *> ToEnqueue;
   detail::MemObjRecord *Record =
-      MS.getOrInsertMemObjRecord(QueueImpl, &MockReq, AuxCmds);
+      MS.getOrInsertMemObjRecord(QueueImpl, &MockReq);
   detail::AllocaCommandBase *AllocaCmd =
-      MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, AuxCmds);
+      MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, ToEnqueue);
   std::function<void()> Callback = [&CommandDeleted]() {
     CommandDeleted = true;
   };
@@ -176,13 +175,12 @@ static void checkCleanupOnLeafUpdate(
     detail::Requirement &MockReq,
     std::function<void(detail::MemObjRecord *)> SchedulerCall) {
   bool CommandDeleted = false;
-  std::vector<detail::Command *> AuxCmds;
   std::vector<detail::Command *> ToCleanUp;
   std::vector<detail::Command *> ToEnqueue;
   detail::MemObjRecord *Record =
-      MS.getOrInsertMemObjRecord(QueueImpl, &MockReq, AuxCmds);
+      MS.getOrInsertMemObjRecord(QueueImpl, &MockReq);
   detail::AllocaCommandBase *AllocaCmd =
-      MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, AuxCmds);
+      MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, ToEnqueue);
   std::function<void()> Callback = [&CommandDeleted]() {
     CommandDeleted = true;
   };
@@ -405,8 +403,7 @@ TEST_F(SchedulerTest, AuxiliaryResourcesDeallocation) {
     auto BufPtr = std::make_shared<buffer<char, 1>>(
         MockAuxResourcePtr->getDataPtr(), range<1>{1});
     detail::Requirement MockReq = getMockRequirement(*BufPtr);
-    std::vector<detail::Command *> AuxCmds;
-    MSPtr->getOrInsertMemObjRecord(QueueImplPtr, &MockReq, AuxCmds);
+    MSPtr->getOrInsertMemObjRecord(QueueImplPtr, &MockReq);
     MockCGH.use_kernel_bundle(ExecBundle);
     MockCGH.addReduction(std::move(MockAuxResourcePtr));
     MockCGH.addReduction(std::move(BufPtr));
diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp
index 337ef2ef3d403..049131d661779 100644
--- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp
+++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp
@@ -88,9 +88,9 @@ TEST_F(SchedulerTest, InOrderQueueDeps) {
   buffer<int, 1> Buf(&val, range<1>(1));
   detail::Requirement Req = getMockRequirement(Buf);
 
-  std::vector<detail::Command *> AuxCmds;
   detail::MemObjRecord *Record =
-      MS.getOrInsertMemObjRecord(InOrderQueueImpl, &Req, AuxCmds);
+      MS.getOrInsertMemObjRecord(InOrderQueueImpl, &Req);
+  std::vector<detail::Command *> AuxCmds;
   MS.getOrCreateAllocaForReq(Record, &Req, InOrderQueueImpl, AuxCmds);
   MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds);
 
diff --git a/sycl/unittests/scheduler/LeafLimit.cpp b/sycl/unittests/scheduler/LeafLimit.cpp
index 36d8f459a324a..a2533cceda138 100644
--- a/sycl/unittests/scheduler/LeafLimit.cpp
+++ b/sycl/unittests/scheduler/LeafLimit.cpp
@@ -47,9 +47,8 @@ TEST_F(SchedulerTest, LeafLimit) {
 
   MockDepCmd =
       std::make_unique<MockCommand>(detail::getSyclObjImpl(Q), MockReq);
-  std::vector<detail::Command *> AuxCmds;
   detail::MemObjRecord *Rec =
-      MS.getOrInsertMemObjRecord(detail::getSyclObjImpl(Q), &MockReq, AuxCmds);
+      MS.getOrInsertMemObjRecord(detail::getSyclObjImpl(Q), &MockReq);
 
   // Create commands that will be added as leaves exceeding the limit by 1
   for (std::size_t i = 0; i < Rec->MWriteLeaves.genericCommandsCapacity() + 1;
diff --git a/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp b/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp
index 38d9ac784c09f..61e3de6671fb1 100644
--- a/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp
+++ b/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp
@@ -52,12 +52,12 @@ TEST_F(SchedulerTest, LeafLimitDiffContexts) {
           AllocaCmd(nullptr) {}
 
     void InitializeUtils(detail::Requirement &MockReq, MockScheduler &MS) {
-      std::vector<detail::Command *> ToEnqueue;
-      Rec = MS.getOrInsertMemObjRecord(detail::getSyclObjImpl(Queue), &MockReq,
-                                       ToEnqueue);
+
+      Rec = MS.getOrInsertMemObjRecord(detail::getSyclObjImpl(Queue), &MockReq);
       // Creating Alloca on both - device and host contexts (will be created in
       // real case in insertMemMove for example) It is done to avoid extra
       // AllocCmd insertion during ConnectCmd insertion
+      std::vector<detail::Command *> ToEnqueue;
       AllocaCmd = MS.getOrCreateAllocaForReq(
           Rec, &MockReq, detail::getSyclObjImpl(Queue), ToEnqueue);
       std::ignore = MS.getOrCreateAllocaForReq(
diff --git a/sycl/unittests/scheduler/MemObjCommandCleanup.cpp b/sycl/unittests/scheduler/MemObjCommandCleanup.cpp
index aeeb815b2db3a..e89f5ac18c517 100644
--- a/sycl/unittests/scheduler/MemObjCommandCleanup.cpp
+++ b/sycl/unittests/scheduler/MemObjCommandCleanup.cpp
@@ -24,9 +24,8 @@ TEST_F(SchedulerTest, MemObjCommandCleanupAllocaUsers) {
   buffer<int, 1> BufB(range<1>(1));
   detail::Requirement MockReqA = getMockRequirement(BufA);
   detail::Requirement MockReqB = getMockRequirement(BufB);
-  std::vector<detail::Command *> AuxCmds;
   detail::MemObjRecord *RecA =
-      MS.getOrInsertMemObjRecord(detail::getSyclObjImpl(Q), &MockReqA, AuxCmds);
+      MS.getOrInsertMemObjRecord(detail::getSyclObjImpl(Q), &MockReqA);
 
   // Create 2 fake allocas, one of which will be cleaned up
   detail::AllocaCommand *MockAllocaA =
@@ -66,9 +65,8 @@ TEST_F(SchedulerTest, MemObjCommandCleanupAllocaDeps) {
   MockScheduler MS;
   buffer<int, 1> Buf(range<1>(1));
   detail::Requirement MockReq = getMockRequirement(Buf);
-  std::vector<detail::Command *> AuxCmds;
   detail::MemObjRecord *MemObjRec =
-      MS.getOrInsertMemObjRecord(detail::getSyclObjImpl(Q), &MockReq, AuxCmds);
+      MS.getOrInsertMemObjRecord(detail::getSyclObjImpl(Q), &MockReq);
 
   // Create a fake alloca.
   detail::AllocaCommand *MockAllocaCmd =
diff --git a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp
index 635a8e9c3389c..d52a257f3603b 100644
--- a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp
+++ b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp
@@ -103,9 +103,8 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
     buffer<int, 1> Buf(&val, range<1>(1));
     detail::Requirement Req = getMockRequirement(Buf);
 
+    detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
     std::vector<detail::Command *> AuxCmds;
-    detail::MemObjRecord *Record =
-        MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds);
     detail::AllocaCommandBase *NonHostAllocaCmd =
         MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds);
 
@@ -133,9 +132,8 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
 
     // No need to create a host allocation in this case since the data can be
     // discarded.
+    detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
     std::vector<detail::Command *> AuxCmds;
-    detail::MemObjRecord *Record =
-        MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds);
     MS.getOrCreateAllocaForReq(Record, &DiscardReq, QImpl, AuxCmds);
     EXPECT_EQ(Record->MAllocaCommands.size(), 1U);
   }
@@ -146,9 +144,8 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
 
     // No need to create a host allocation in this case since there's no data to
     // initialize the buffer with.
+    detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
     std::vector<detail::Command *> AuxCmds;
-    detail::MemObjRecord *Record =
-        MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds);
     MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds);
     EXPECT_EQ(Record->MAllocaCommands.size(), 1U);
   }
@@ -160,9 +157,9 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
 
     // No special handling required: alloca commands are created one after
     // another and the transfer is done via a write operation.
-    std::vector<detail::Command *> AuxCmds;
     detail::MemObjRecord *Record =
-        MS.getOrInsertMemObjRecord(DefaultHostQueue, &Req, AuxCmds);
+        MS.getOrInsertMemObjRecord(DefaultHostQueue, &Req);
+    std::vector<detail::Command *> AuxCmds;
     detail::AllocaCommandBase *HostAllocaCmd =
         MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds);
     EXPECT_EQ(Record->MAllocaCommands.size(), 1U);
@@ -186,9 +183,8 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
     detail::Requirement DiscardReq = getMockRequirement(Buf);
     DiscardReq.MAccessMode = access::mode::discard_read_write;
 
+    detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
     std::vector<detail::Command *> AuxCmds;
-    detail::MemObjRecord *Record =
-        MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds);
     MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds);
     MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds);
 
@@ -217,9 +213,9 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) {
 
     detail::Requirement Req = getMockRequirement();
     Req.MSYCLMemObj = BufI.get();
+
+    detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req);
     std::vector<detail::Command *> AuxCmds;
-    detail::MemObjRecord *Record =
-        MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds);
     detail::AllocaCommandBase *InteropAlloca =
         MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds);
     detail::EnqueueResultT Res;
diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
index 88ced1f25904a..1d7fa2075d0da 100644
--- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp
+++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp
@@ -109,9 +109,8 @@ class MockScheduler : public sycl::detail::Scheduler {
 
   sycl::detail::MemObjRecord *
   getOrInsertMemObjRecord(const sycl::detail::QueueImplPtr &Queue,
-                          sycl::detail::Requirement *Req,
-                          std::vector<sycl::detail::Command *> &ToEnqueue) {
-    return MGraphBuilder.getOrInsertMemObjRecord(Queue, Req, ToEnqueue);
+                          sycl::detail::Requirement *Req) {
+    return MGraphBuilder.getOrInsertMemObjRecord(Queue, Req);
   }
 
   void decrementLeafCountersForRecord(sycl::detail::MemObjRecord *Rec) {